In [None]:
# import libraries and data
import pandas as pd
from collections import Counter
import numpy as np
from sbnltk.Tokenizer import sentenceTokenizer, wordTokenizer

# Read the training data (path is relative to the notebook's working directory).
# Later cells read the 'label' and 'text' columns of this frame.
data = pd.read_csv('data/macroTrain.csv')

In [None]:
# calculate TTR
def calculate_ttr(text, tokenizer=None):
    """
    Calculate the Type-Token Ratio (TTR) for a given text.

    TTR is the number of distinct tokens divided by the total number of tokens.

    Input:
    - text: str, the input text
    - tokenizer: optional object exposing ``customized_tokenizer(text)``.
      When omitted, a new ``wordTokenizer()`` is created. Pass one in to
      reuse a single tokenizer across many texts.
    Output:
    - ttr: float, the Type-Token Ratio; 0.0 when the text yields no tokens
    """
    if tokenizer is None:
        tokenizer = wordTokenizer()  # Default word tokenizer

    tokens = tokenizer.customized_tokenizer(text)  # Tokenize the text

    # An empty or whitespace-only text produces no tokens; report 0.0 rather
    # than raising ZeroDivisionError.
    if len(tokens) == 0:
        return 0.0

    return len(set(tokens)) / len(tokens)

In [None]:
# create character bigram model using MLE and Add-One smoothing
def create_char_bigram(texts):
    """
    Create a character bigram model with Add-One (Laplace) smoothing.

    Counts are pooled over all texts and each bigram gets a single
    probability:

        P(c2 | c1) = (count(c1 c2) + 1) / (count(c1) + V)

    where V is the number of distinct characters seen in `texts`. Bigrams are
    formed within each text only, never across two texts. For a single text
    this is the same value the per-text formula gives; for several texts the
    result stays a probability instead of a sum that grows with corpus size.

    Input:
    - texts: iterable of str, the texts to build the model from
    Output:
    - char_bigram_model: Counter mapping each observed two-character string
      to its smoothed probability (empty when no text has two characters)
    """
    bigram_counts = Counter()  # occurrences of each two-character window
    char_counts = Counter()    # occurrences of each single character

    for text in texts:
        char_counts.update(text)
        bigram_counts.update(text[i:i + 2] for i in range(len(text) - 1))

    v = len(char_counts)  # V = number of unique characters

    char_bigram_model = Counter()
    for bigram, count in bigram_counts.items():
        wi_minus_1 = bigram[0]  # First character of the bigram
        # Assign (not accumulate) the smoothed probability
        char_bigram_model[bigram] = (count + 1) / (char_counts[wi_minus_1] + v)

    return char_bigram_model

In [None]:
# create word bigram model with Add-One smoothing
def create_word_bigram(author_texts, sentence_tokenizer=None, word_tokenizer=None):
    """
    Create a word bigram model with Add-One (Laplace) smoothing.

    Each sentence is wrapped in '<s>' ... '</s>' markers and bigrams are
    formed inside a sentence only, so no ('</s>', '<s>') bigram is produced.
    Empty-string tokens are dropped and sentences with no remaining tokens
    are skipped. Every observed bigram gets a single probability:

        P(w2 | w1) = (count(w1, w2) + 1) / (count(w1) + V)

    where V is the number of distinct tokens (including the two markers).

    Input:
    - author_texts: iterable of str, texts authored by a specific author
    - sentence_tokenizer: optional object exposing
      ``customized_tokenizer(text)``; defaults to ``sentenceTokenizer()``
    - word_tokenizer: optional object exposing
      ``customized_tokenizer(sentence)``; defaults to ``wordTokenizer()``
    Output:
    - word_bigram_model: Counter mapping (w1, w2) tuples to their smoothed
      probability (empty when no sentence contains a token)
    """
    sentT = sentence_tokenizer if sentence_tokenizer is not None else sentenceTokenizer()
    Wt = word_tokenizer if word_tokenizer is not None else wordTokenizer()

    unigram_counts = Counter()  # occurrences of each token, markers included
    bigram_counts = Counter()   # occurrences of each within-sentence bigram

    for text in author_texts:
        for sentence in sentT.customized_tokenizer(text):
            tokens = [word for word in Wt.customized_tokenizer(sentence) if word != '']
            if not tokens:
                continue  # nothing but markers would remain

            words = ['<s>'] + tokens + ['</s>']
            unigram_counts.update(words)
            bigram_counts.update(zip(words, words[1:]))

    v = len(unigram_counts)  # vocabulary size

    word_bigram_model = Counter()
    for bigram, count_bigram in bigram_counts.items():
        count_wi_minus_1 = unigram_counts[bigram[0]]
        # Assign the smoothed probability once per distinct bigram
        word_bigram_model[bigram] = (count_bigram + 1) / (count_wi_minus_1 + v)

    return word_bigram_model



In [None]:
# Build one TTR value, one character bigram model and one word bigram model
# per distinct label found in the training frame.
authors = data['label'].unique()
author_ttr_models, author_char_bigram_models, author_word_bigram_models = {}, {}, {}

for author in authors:
    # Restrict the training frame to this author's rows
    author_data = data.loc[data['label'] == author]
    author_texts = author_data['text']

    # Mean of the per-text TTR values for this author
    per_text_ttr = author_texts.apply(calculate_ttr)
    author_ttr_models[author] = np.mean(per_text_ttr)

    # Fit both bigram models on the same texts, then register them
    char_bigram_model = create_char_bigram(author_texts)
    word_bigram_model = create_word_bigram(author_texts)
    author_char_bigram_models[author] = char_bigram_model
    author_word_bigram_models[author] = word_bigram_model



In [None]:
# predict author using TTR
def predict_author_ttr(test_text, author_ttr_models):
    """
    Pick the author whose stored TTR is closest to the TTR of `test_text`.

    Input:
    - test_text: str, the test text
    - author_ttr_models: dict mapping each author to a TTR value
    Output:
    - predicted_author: the key of `author_ttr_models` with the smallest
      absolute TTR difference (the first such key on ties)
    """
    observed_ttr = calculate_ttr(test_text)

    # Absolute gap between the observed TTR and each author's value
    gaps = {
        name: abs(observed_ttr - reference_ttr)
        for name, reference_ttr in author_ttr_models.items()
    }

    return min(gaps, key=gaps.get)

# Predict author using Character Bigram Model
def predict_author_char_bigram(test_text, author_char_bigram_models):
    """
    Pick the author whose character bigram model is closest to the model
    built from `test_text`.

    Closeness is the sum, over the bigrams of the test model, of the absolute
    difference between the test value and the author's value (0 when the
    author's model lacks the bigram).

    Input:
    - test_text: str, the test text
    - author_char_bigram_models: dict, character bigram models for each author
    Output:
    - predicted_author: the key with the smallest total difference
      (the first such key on ties)
    """
    observed = create_char_bigram([test_text])

    def _distance(reference):
        # Plain running total, added in the test model's iteration order
        total = 0
        for gram, value in observed.items():
            total += abs(value - reference.get(gram, 0))
        return total

    distances = {
        name: _distance(reference)
        for name, reference in author_char_bigram_models.items()
    }

    return min(distances, key=distances.get)

# predict author using Word Bigram Model
def predict_author_word_bigram(test_text, author_word_bigram_models):
    """
    Pick the author whose word bigram model is closest to the model built
    from `test_text`.

    For every author the score is the running total of
    |test value - author value| over the bigrams of the test model, using 0
    for bigrams the author's model does not contain.

    Input:
    - test_text: str, the test text
    - author_word_bigram_models: dict, word bigram models for each author
    Output:
    - predicted_author: the key with the lowest score (the first such key
      on ties)
    """
    observed_items = list(create_word_bigram([test_text]).items())

    scores = {}
    for name in author_word_bigram_models:
        reference = author_word_bigram_models[name]
        running = 0
        for pair, value in observed_items:
            running += abs(value - reference.get(pair, 0))
        scores[name] = running

    best_author = min(scores, key=scores.get)
    return best_author


In [None]:
# Load the held-out test set.
# NOTE(review): this rebinds `data`, which the training cell also uses; re-running
# the model-building cell after this one would fit on the test rows.
# NOTE(review): the training CSV is read from 'data/'; confirm this path is intended.
data = pd.read_csv('macroTest.csv')

# Per-model tallies of correct predictions
match_ttr = match_char_bigram = match_word_bigram = 0

# Score every test row with each of the three models
for index, row in data.iterrows():
    actual_author, test_text = row['label'], row['text']

    predicted_author_ttr = predict_author_ttr(test_text, author_ttr_models)
    predicted_author_char_bigram = predict_author_char_bigram(test_text, author_char_bigram_models)
    predicted_author_word_bigram = predict_author_word_bigram(test_text, author_word_bigram_models)

    # Add one to a tally only when that model's prediction is right
    match_ttr += 1 if predicted_author_ttr == actual_author else 0
    match_char_bigram += 1 if predicted_author_char_bigram == actual_author else 0
    match_word_bigram += 1 if predicted_author_word_bigram == actual_author else 0

# Accuracy = correct predictions / number of test rows
total_texts = len(data)
accuracy_ttr = match_ttr / total_texts
accuracy_char_bigram = match_char_bigram / total_texts
accuracy_word_bigram = match_word_bigram / total_texts

# Report one line per model
print("Accuracy using TTR:", accuracy_ttr)
print("Accuracy using Character Bigram Model:", accuracy_char_bigram)
print("Accuracy using Word Bigram Model:", accuracy_word_bigram)
