# AI-generated text checker

In [8]:
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [9]:
import nltk
from nltk import FreqDist
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
import numpy as np
import pandas as pd


# Load dataset 
#df = pd.read_csv('df_with_factcheck.csv')
df = pd.read_csv('main_df2.csv')



In [10]:
df['preprocessed_caption'] = df['preprocessed_caption'].astype(str)


First AI Text Checker

In [11]:
import nltk
from collections import Counter
from textblob import TextBlob
import numpy as np

text = ' '.join(df['preprocessed_caption'])  # Join the preprocessed captions into a single string

# 1. N-gram Analysis
def ngram_analysis(text, n=2):
    n_grams = list(nltk.ngrams(text.split(), n))
    freq_dist = nltk.FreqDist(n_grams)
    print(freq_dist)

ngram_analysis(text)

# 2. Perplexity
def perplexity(text):
    prob_dist = nltk.FreqDist(text.split())
    entropy = -1 * sum([p * np.log2(p) for p in prob_dist.values()])
    return np.power(2, entropy)

print(perplexity(text))

# 3. Burstiness
def burstiness(text):
    word_counts = Counter(text.split())
    burstiness = len(word_counts) / np.std(list(word_counts.values()))
    return burstiness

print(burstiness(text))

# 4. Stylometry
def stylometry(text):
    blob = TextBlob(text)
    avg_sentence_length = sum(len(sentence.words) for sentence in blob.sentences) / len(blob.sentences)
    passive_voice = text.lower().count('was') + text.lower().count('were')
    vocabulary_richness = len(set(text.split())) / len(text.split())
    return avg_sentence_length, passive_voice, vocabulary_richness

print(stylometry(text))

# 5. Consistency and Coherence Analysis
def consistency(text):
    sentences = text.split(".")
    topics = [sentence.split()[0] for sentence in sentences if sentence]
    topic_changes = len(set(topics))
    return topic_changes

print(consistency(text))


<FreqDist with 169333 samples and 239304 outcomes>
0.0
272.5753070663207
(221308.0, 140, 0.11852238774785316)
1


Second AI Checker

In [12]:
!pip install transformers==4.30.2

Collecting transformers==4.30.2
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m82.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.14.1
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 KB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.3.1
  Downloading safetensors-0.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-

In [13]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from difflib import SequenceMatcher

# Load the pre-trained GPT-2 model and tokenizer
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Select the preprocessed captions from your DataFrame
preprocessed_captions = df['preprocessed_caption']

# Iterate over each preprocessed caption
for preprocessed_caption in preprocessed_captions:
    # Preprocess and tokenize the caption
    tokenized_caption = tokenizer.encode(preprocessed_caption, return_tensors='pt')

    # Generate captions using the language model
    generated_captions = model.generate(tokenized_caption, max_length=144, num_return_sequences=1)

    # Convert generated captions to text
    generated_captions_text = [tokenizer.decode(caption, skip_special_tokens=True) for caption in generated_captions]

    # Calculate similarity between preprocessed caption and generated captions
    similarity_scores = [SequenceMatcher(None, preprocessed_caption, caption).ratio() for caption in generated_captions_text]

    # Set a threshold to determine if the caption is AI-generated
    threshold = 0.9
    is_ai_generated = any(score > threshold for score in similarity_scores)

    if is_ai_generated:
        print("The preprocessed caption '{}' is likely written by AI.".format(preprocessed_caption))
    else:
        print("The preprocessed caption '{}' is likely not written by AI.".format(preprocessed_caption))


  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 40.8kB/s]
Downloading model.safetensors: 100%|██████████| 548M/548M [00:05<00:00, 96.6MB/s]
Downloading (…)neration_config.json: 100%|██████████| 124/124 [00:00<00:00, 20.0kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 101MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 135MB/s]
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 144, but `max_length` is set to 144. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
2023-06-29 15:50:36.702379: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized

KeyboardInterrupt: 

In [14]:
!pip install transformers==4.30.2

You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained GPT-2 model and tokenizer
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set the device to GPU if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Function to generate captions using the GPT-2 model
def generate_captions(text):
    # Tokenize the input text
    inputs = tokenizer.encode(text, return_tensors='pt').to(device)

    # Generate captions using the GPT-2 model
    outputs = model.generate(inputs, max_length=200, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

    # Decode the generated captions
    generated_captions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

    return generated_captions

# Apply caption generation on the 'preprocessed_caption' column of the DataFrame
df['generated_captions'] = df['preprocessed_caption'].apply(generate_captions)

# Set a threshold to determine if the caption is AI-generated
threshold = 0.9
ai_generated_captions = []
non_ai_captions = []

# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    preprocessed_caption = row['preprocessed_caption']
    generated_captions = row['generated_captions']
    
    # Calculate similarity between preprocessed caption and generated captions
    similarity_scores = [calculate_similarity(preprocessed_caption, caption) for caption in generated_captions]

    # Set a threshold to determine if the caption is AI-generated
    is_ai_generated = any(score > threshold for score in similarity_scores)

    if is_ai_generated:
        ai_generated_captions.extend(generated_captions[:5])
    else:
        non_ai_captions.extend(generated_captions[:5])

# Print the top 5 AI-generated captions
print("AI-generated captions:")
for caption in ai_generated_captions[:5]:
    print(caption)
print()

# Print the top 5 non AI-generated captions
print("Non AI-generated captions:")
for caption in non_ai_captions[:5]:
    print(caption)
print()

# Print the overall count
print("AI-generated captions count:", len(ai_generated_captions))
print("Non AI-generated captions count:", len(non_ai_captions))


In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained GPT-2 model and tokenizer
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set the device to GPU if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Function to generate captions using the GPT-2 model
def generate_captions(text):
    # Tokenize the input text
    inputs = tokenizer.encode(text, return_tensors='pt').to(device)

    # Generate captions using the GPT-2 model
    outputs = model.generate(inputs, max_length=300, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

    # Decode the generated captions
    generated_captions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

    return generated_captions

# Remove rows with missing values in 'preprocessed_caption' column
df.dropna(subset=['preprocessed_caption'], inplace=True)

# Apply caption generation on the 'preprocessed_caption' column of the DataFrame
df['generated_captions'] = df['preprocessed_caption'].apply(generate_captions)

# Set a threshold to determine if the caption is AI-generated
threshold = 0.9
ai_generated_captions = []
non_ai_captions = []

# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    preprocessed_caption = row['preprocessed_caption']
    generated_captions = row['generated_captions']
    
    # Calculate similarity between preprocessed caption and generated captions
    similarity_scores = [calculate_similarity(preprocessed_caption, caption) for caption in generated_captions]

    # Set a threshold to determine if the caption is AI-generated
    is_ai_generated = any(score > threshold for score in similarity_scores)

    if is_ai_generated:
        ai_generated_captions.extend(generated_captions[:5])
    else:
        non_ai_captions.extend(generated_captions[:5])

# Print the top 5 AI-generated captions
print("AI-generated captions:")
for caption in ai_generated_captions[:5]:
    print(caption)
print()

# Print the top 5 non AI-generated captions
print("Non AI-generated captions:")
for caption in non_ai_captions[:5]:
    print(caption)
print()

# Print the overall count
print("AI-generated captions count:", len(ai_generated_captions))
print("Non AI-generated captions count:", len(non_ai_captions))


KeyboardInterrupt: 

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=4c8af7b1-f3b8-45ab-bbdc-6a32713107d1' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>