In [1]:
import numpy as np
import re 
import math
import random
from collections import defaultdict

In [2]:

def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^a-zA-Z0-9,.?! ]', '', text)  # Removing punctuation except ,.?! and space
    text = re.sub(r'\s+', ' ', text).strip()  #Removing extra whitespaces
    return text

# tokenization
def add_tokens(text):
    return "<START> " + text + " <END>"

# Loading data
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.readlines()
    return data

# Cleaning and tokenizing
def clean_tokenize_data(data):
    cleaned_data = [clean_text(doc) for doc in data]
    tokenized_data = [add_tokens(doc) for doc in cleaned_data]
    return tokenized_data

# Partitioning
def partition_data(data, train_percent):
    total_docs = len(data)
    train_size = int(total_docs * train_percent)
    train_data = data[:train_size]
    test_data = data[train_size:]
    return train_data, test_data


human_file = 'hum.txt'
gpt_file = 'gpt.txt'


human_data = load_data(human_file)
gpt_data = load_data(gpt_file)

cleaned_human_data = clean_tokenize_data(human_data)
cleaned_gpt_data = clean_tokenize_data(gpt_data)

all_data = cleaned_human_data + cleaned_gpt_data




In [3]:
# Partition into training and test sets
train_data_human, test_data_human = partition_data(cleaned_human_data, train_percent=0.9)

train_data_gpt, test_data_gpt = partition_data(cleaned_gpt_data, train_percent=0.9)

test_data = test_data_human+ test_data_gpt
train_data = train_data_human + train_data_gpt



**Building Vocabulary**

In [4]:
def build_vocabulary(documents):
    vocabulary = set()
    for doc in documents:
        vocabulary.update(doc.split())
    return vocabulary

v = build_vocabulary(all_data)
v_list = list(v)

size_of_vocab=len(v_list)

**Counting**

In [5]:

def count_unigram(data):
    word_frequency = {}
    for document in data:
     
        for word in document.split():
            # Count occurrences of each word
            if word in word_frequency:
                word_frequency[word] += 1
            else:
                word_frequency[word] = 1
    return word_frequency


In [6]:
human_count_unigram_train = count_unigram(train_data_human)
gpt_count_unigram_train = count_unigram(train_data_gpt)


In [7]:
def count_bigram(data):
    word_pairs_frequency = {}
    for document in data:
        words = document.split()
        
        for i in range(len(words) - 1):
            word_pair = (words[i], words[i+1])
            
            if word_pair in word_pairs_frequency:
                word_pairs_frequency[word_pair] += 1
            else:
                word_pairs_frequency[word_pair]=1
    return word_pairs_frequency

In [8]:
human_count_bigram_train = count_bigram(train_data_human)
gpt_count_bigram_train = count_bigram(train_data_gpt)

bigrams = count_bigram(all_data)
bigram_human = count_bigram(cleaned_human_data)
bigram_gpt = count_bigram(cleaned_gpt_data)


In [9]:
def count_trigram(data):
    word_trigrams_frequency = {}
    for document in data:
        words = document.split()
        
        for i in range(len(words) - 2):
            word_trigram = (words[i], words[i+1], words[i+2])
            if word_trigram in word_trigrams_frequency:
            # Count occurrences of each trigram
                word_trigrams_frequency[word_trigram] += 1
            else:
                word_trigrams_frequency[word_trigram] = 1
    return word_trigrams_frequency

In [10]:
human_count_trigram_train = count_trigram(train_data_human)
gpt_count_trigram_train= count_trigram(train_data_gpt)
trigram_human = count_trigram(cleaned_human_data)
trigram_gpt = count_trigram(cleaned_gpt_data)

**OOV Rate**

In [11]:
train_trigrams = count_trigram(train_data)
train_bigrams = count_bigram(train_data)
test_trigrams = count_trigram(test_data)
test_bigrams = count_bigram(test_data)

In [12]:
def calculate_oov_rate(test_bigrams, test_trigrams, train_bigrams, train_trigrams):
    #Total Count
    total_test_bigrams_with_repeats = sum(test_bigrams.values())
    total_test_trigrams_with_repeats = sum(test_trigrams.values())
    
    # Converting to set
    train_bigrams_set = set(train_bigrams.keys())
    train_trigrams_set = set(train_trigrams.keys())
    
    oov_bigrams = 0
    oov_trigrams = 0
    
    # Count OOV bigrams
    for bigram in test_bigrams:
        if bigram not in train_bigrams_set:
            oov_bigrams += test_bigrams[bigram]  # Count with repeats
    
    # Count OOV trigrams
    for trigram in test_trigrams:
        if trigram not in train_trigrams_set:
            oov_trigrams += test_trigrams[trigram]  # Count with repeats
    
    # Calculating OOV rates
    oov_rate_bigrams = (oov_bigrams / total_test_bigrams_with_repeats) * 100
    oov_rate_trigrams = (oov_trigrams / total_test_trigrams_with_repeats) * 100
    
    return oov_rate_bigrams, oov_rate_trigrams


oov_rate_bigrams, oov_rate_trigrams = calculate_oov_rate(test_bigrams, test_trigrams, train_bigrams, train_trigrams)
print("OOV rate for bigrams:", oov_rate_bigrams)
print("OOV rate for trigrams:", oov_rate_trigrams)


OOV rate for bigrams: 9.61161054240112
OOV rate for trigrams: 35.245720112894254


**Bigram Classifier**

In [13]:
#Step 1 with laplacian smoothing
def laplacian_smoothing_bigram(unigram_count, bigram_count):
    conditional_probablity = (bigram_count + 1)/(unigram_count + size_of_vocab)
    return conditional_probablity


In [14]:
#Step 2 of bigram
#Taking log for faster computation
def prob_of_document_bigram(document ,unigram_corpus, bigram_corpus):
    words = document.split()
    log_prob = 0
    for i in range(len(words) - 1):
        unigram_count = unigram_corpus.get((words[i+1]),0)
        bigram_count = bigram_corpus.get((words[i],words[i+1]),0)
        conditional_probablity_word = laplacian_smoothing_bigram(unigram_count, bigram_count)
        log_prob += math.log(conditional_probablity_word)
        
    return log_prob

In [15]:
total_gpt = len(cleaned_gpt_data)

total_human = len(cleaned_human_data)

total = len(all_data)

prob_of_human = total_human/total
prob_of_gpt = total_gpt/total

In [16]:
def bigram_classifier(document):
    
    max_class = None
    log_prob_gpt = math.log(prob_of_gpt)+prob_of_document_bigram(document, gpt_count_unigram_train, gpt_count_bigram_train)
    log_prob_human = math.log(prob_of_human) + prob_of_document_bigram(document, human_count_unigram_train, human_count_bigram_train)
    
    if log_prob_gpt >=log_prob_human:
            max_class = "GPT"
    else:
          max_class = "Human"
    return max_class
    

In [17]:
correct_predictions = 0
total_predictions = len(test_data)
for document in test_data:
    predicted_class = bigram_classifier(document)
    actual_class = "Human" if document in cleaned_human_data else "GPT"
    if predicted_class == actual_class:
        correct_predictions += 1

accuracy = correct_predictions / total_predictions
print("Accuracy of Bigram:", accuracy)


Accuracy of Bigram: 0.9592481703260146


**Trigram**

In [18]:
#Step 1 of trigram
def laplacian_smoothing_trigram(bigram_count, trigram_count):
    conditional_probability = (trigram_count+1)/ (bigram_count+size_of_vocab)
    return conditional_probability

In [19]:
#Step 2 of trigram
#Taking Log to avoid zero hence imporve the quality of the classifier
def prob_of_document_trigram(document, bigram_corpus, trigram_corpus):
    words = document.split()
    log_prob = 0
    for i in range(len(words) - 2):
        bigram_count = bigram_corpus.get((words[i+1], words[i+2]), 0)
        trigram_count = trigram_corpus.get((words[i], words[i+1], words[i+2]), 0)
        conditional_probability_word = laplacian_smoothing_trigram(bigram_count, trigram_count)
        log_prob += math.log(conditional_probability_word)
    return log_prob


In [20]:
def trigram_classifier(document):
    max_class = None
    log_prob_gpt = math.log(prob_of_gpt) + prob_of_document_trigram(document, gpt_count_bigram_train, gpt_count_trigram_train)
    log_prob_human = math.log(prob_of_human) + prob_of_document_trigram(document, human_count_bigram_train, human_count_trigram_train)
    
    if log_prob_gpt >= log_prob_human:
        max_class = "GPT"
    else:
        max_class = "Human"
    
    return max_class

In [21]:
correct_predictions = 0
total_predictions = len(test_data)
for document in test_data:
    predicted_class = trigram_classifier(document)
    actual_class = "Human" if document in cleaned_human_data else "GPT"
    if predicted_class == actual_class:
        correct_predictions += 1

accuracy = correct_predictions / total_predictions
print("Accuracy of Trigram:", accuracy)

Accuracy of Trigram: 0.9421157684630739


**Text Generation**

In [22]:
import numpy as np

def probability_distribution_bigram(word, T, bigrams):
    denominator = 0
    unnormalized_probs = {}

    for bigram, count in bigrams.items():
        if word in bigram:
            
            denominator += np.exp(count / T)  
            
    for bigram, count in bigrams.items():
        #Handle overflow error
        if word in bigram:
            try:
                numerator = np.exp(count / T)
            except OverflowError:
                numerator = 0.0
            
            
            
            start_index = bigram.index(word)
            
            other_word = bigram[1 - start_index]
            #Handle Ovelflow error
            try:
                unnormalized_probs[other_word] = numerator / denominator
            except OverflowError:
                unnormalized_probs[other_word] = 0.0
            
    total_prob = sum(unnormalized_probs.values())
    
    if total_prob == 0.0:
        # If total probability is 0, assign equal probabilities to all words
        num_words = len(unnormalized_probs)
        probabilities = {word: 1 / num_words for word in unnormalized_probs.keys()}
    else:
        # Otherwise, normalize the probabilities
        probabilities = {bigram: prob / total_prob for bigram, prob in unnormalized_probs.items()}
        
    return probabilities


#prob = probability_distribution_bigram('<START>', T=50, bigrams)
#print(prob)


In [23]:
#Probability distribution trigram
def probability_distribution_trigram(word, T, trigrams):
    denominator = 0
    unnormalized_probs = {}

    for trigram, count in trigrams.items():
        if word in trigram:
            
            denominator += np.exp(count / T)  
            
    for trigram, count in trigrams.items():
        if word in trigram:
            
            numerator = np.exp(count / T)
           
            
            start_index = trigram.index(word)
        
            other_word = trigram[1 - start_index]
            
            unnormalized_probs[other_word] = numerator / denominator
            
    total_prob = sum(unnormalized_probs.values())
    
    
    probabilities = {bigram: prob / total_prob for bigram, prob in unnormalized_probs.items()}
        
    return probabilities

#T = 50
#prob = probability_distribution_trigram('<START>', T, trigram_human)
#print(prob)


In [24]:
#Defining the constant vaiable
T= 50

def generate_sentence( probability_distribution_function,ngram_dict,max_length=20):
    sentence = ['<START>']  
    current_word = '<START>'
    
    # Generating words until reaching the maximum length or encountering the '<END>' token
    for _ in range(max_length):
        # Check if the current word is '<END>', if so, stop generating
        if current_word == '<END>':
            break
        
        # Getting the prob distribution for the current word
        dist = probability_distribution_function(current_word, T,ngram_dict)
        
        # choosing the next word based by randomly sampling from np.random
        next_word = np.random.choice(list(dist.keys()), p=list(dist.values()))
        
        
        sentence.append(next_word)
        
        #To avoid <START> being generated in the middle of the sentence
        if current_word == '<START>':
            continue
        else:
            current_word = next_word
    
    # Ensure the sentence ends with the '<END>' token
    if sentence[-1] != '<END>':
        sentence.append('<END>')
    
    # Return the generated sentence as a string
    return ' '.join(sentence)





# Generating 5 sentences for bigram_human
for _ in range(5):
    generated_sentence = generate_sentence(probability_distribution_bigram, bigram_human, max_length=20)
    print("Generated sentence for bigram_human:", generated_sentence)

# Generating 5 sentences for bigram_gpt
for _ in range(5):
    generated_sentence = generate_sentence(probability_distribution_bigram, bigram_gpt, max_length=20)
    print("Generated sentence for bigram_gpt:", generated_sentence)

# Generating 5 sentences for trigram_human
for _ in range(5):
    generated_sentence = generate_sentence(probability_distribution_trigram, trigram_human, max_length=20)
    print("Generated sentence for trigram_human:", generated_sentence)

# Generating 5 sentences for trigram_gpt
for _ in range(5):
    generated_sentence = generate_sentence(probability_distribution_trigram, trigram_gpt, max_length=20)
    print("Generated sentence for trigram_gpt:", generated_sentence)



Generated sentence for bigram_human: <START> the the the the the the the the the the the the the the the the the the the the <END>
Generated sentence for bigram_human: <START> the the the the the the the the the the the the the the the the the the the the <END>
Generated sentence for bigram_human: <START> the the the the the the the the the the the the the the the the the the the the <END>
Generated sentence for bigram_human: <START> the the the the the the the the the the the the the the the the the the the the <END>
Generated sentence for bigram_human: <START> the the the the the the the the the the the the the the the the the the the the <END>
Generated sentence for bigram_gpt: <START> the the the the the the the the the the the the the the the the the the the the <END>
Generated sentence for bigram_gpt: <START> the the the the the the the the the the the the the the the the the the the the <END>
Generated sentence for bigram_gpt: <START> the the the the the the the the the the the 