In [8]:
import spacy
from collections import Counter
import torch
from statistics import mean
import seaborn as sns
import matplotlib.pyplot as plt
import textstat
from sklearn.preprocessing import normalize
import pandas as pd
import tiktoken
from transformers import RobertaTokenizer, RobertaForMaskedLM
import argparse
import os
from pathlib import Path
import time
import torch
from scipy.spatial.distance import cosine

# Constants
nlp = spacy.load('en_core_web_sm')
FUNCTION_WORDS = {'a', 'in', 'of', 'the'}


def remove_prefix(data):
    """
    This function removes a predefined prefix from each text in a given dataset.

    Args:
    data (list of tuples): The data from the dataset. Each element of the list is a tuple, where the first element
    is the text and the second element is its label.

    Returns:
    texts (list): The list of texts after the prefix has been removed.
    labels (list): The list of labels corresponding to the texts.
    """

    texts, labels = zip(*data)

    prefixes = ["Answer:", "Story:", "Article:"]

    for prefix in prefixes:
        texts = [text.split(prefix, 1)[1].strip() if prefix in text else text for text in texts]

    return list(texts), list(labels)


def count_pos_tags_and_special_elements(text):
    # CHECKED
    """
      This function counts the frequency of POS (Part of Speech) tags, punctuation marks, and function words in a given text.
      It uses the SpaCy library for POS tagging.

      Args:
      text (str): The text for which to count POS tags and special elements.

      Returns:
      pos_counts (dict): A dictionary where keys are POS tags and values are their corresponding count.
      punctuation_counts (dict): A dictionary where keys are punctuation marks and values are their corresponding count.
      function_word_counts (dict): A dictionary where keys are function words and values are their corresponding count.

    """
    # Use SpaCy to parse the text
    doc = nlp(text)

    # Create a counter of POS tags
    pos_counts = Counter(token.pos_ for token in doc)

    # Create a counter of punctuation marks
    punctuation_counts = Counter(token.text for token in doc if token.pos_ == 'PUNCT')

    # Create a counter of function words
    function_word_counts = Counter(token.text for token in doc if token.lower_ in FUNCTION_WORDS)

    return dict(pos_counts), dict(punctuation_counts), dict(function_word_counts)


def calculate_readability_scores(text):
    """
    This function calculates the Flesch Reading Ease and Flesch-Kincaid Grade Level of a text using the textstat library.

    Args:
    text (str): The text to score.

    Returns:
    flesch_reading_ease (float): The Flesch Reading Ease score of the text.
    flesch_kincaid_grade_level (float): The Flesch-Kincaid Grade Level of the text.

    """
    flesch_reading_ease = textstat.flesch_reading_ease(text)
    flesch_kincaid_grade_level = textstat.flesch_kincaid_grade(text)

    return flesch_reading_ease, flesch_kincaid_grade_level


def load_and_count(dataset_name, data):
    """
       This function loads the texts from the dataset and calculates the frequency of POS tags, punctuation marks,
       and function words.

       Args:
       dataset_name (str): The name of the dataset.
       data (list of tuples): The data from the dataset. Each element of the list is a tuple, where the first element
       is the text and the second element is its label.

       Returns:
       overall_pos_counts (Counter): A Counter object of POS tag frequencies.
       overall_punctuation_counts (Counter): A Counter object of punctuation mark frequencies.
       overall_function_word_counts (Counter): A Counter object of function word frequencies.
    """

    # CHECKED
    # Extract texts
    texts, labels = remove_prefix(dataset_name, data)

    # Calculate POS tag frequencies for the texts
    pos_frequencies, punctuation_frequencies, function_word_frequencies = zip(
        *[count_pos_tags_and_special_elements(text) for text in texts])

    # Then, sum the dictionaries to get the overall frequencies
    overall_pos_counts = Counter()
    for pos_freq in pos_frequencies:
        overall_pos_counts += Counter(pos_freq)

    overall_punctuation_counts = Counter()
    for punct_freq in punctuation_frequencies:
        overall_punctuation_counts += Counter(punct_freq)

    overall_function_word_counts = Counter()
    for function_word_freq in function_word_frequencies:
        overall_function_word_counts += Counter(function_word_freq)

    return overall_pos_counts, overall_punctuation_counts, overall_function_word_counts


def load_model():
    # CHECKED
    """
      This function loads a pre-trained model and its corresponding tokenizer from the Hugging Face model hub.

      Returns:
      model: The loaded model.
      tokenizer: The tokenizer corresponding to the model.

    """
    # model_name = 'allenai/scibert_scivocab_uncased'
    # model = AutoModelForMaskedLM.from_pretrained(model_name)
    # tokenizer = AutoTokenizer.from_pretrained(model_name)

    model_name = 'roberta-base'
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    model = RobertaForMaskedLM.from_pretrained(model_name)
    return model, tokenizer


def calculate_average_word_length(texts):
    """
     This function calculates the average word length of a list of texts using the SpaCy library.

     Args:
     texts (list): The list of texts.

     Returns:
     (float): The average word length.

    """

    word_lengths = []

    for text in texts:
        doc = nlp(text)
        for token in doc:
            if not token.is_punct:  # ignore punctuation
                word_lengths.append(len(token.text))

    return mean(word_lengths)


def calculate_average_sentence_length(texts):
    # CHEKCED
    """
    This function calculates the average sentence length of a list of texts using the SpaCy library.

    Args:
    texts (list): The list of texts.

    Returns:
    avg_sentence_length (float): The average sentence length.
    """
    sentence_lengths = []

    for text in texts:
        doc = nlp(text)
        for sent in doc.sents:
            sentence_lengths.append(len(sent))

    return mean(sentence_lengths)


def calculate_perplexity(text, model, tokenizer):
    """
    Calculates the perplexity of a text using a language model and tokenizer.

    Args:
    text (str): The text for which perplexity will be calculated.
    model: The language model used to calculate perplexity.
    tokenizer: The tokenizer used to tokenize the text.

    Returns:
    perplexity (float or None): The calculated perplexity of the text, or None if the text is too long.
    """

    try:
        input_ids = tokenizer.encode(text, return_tensors='pt')
        # Truncate the text to the first 512 tokens
        input_ids = input_ids[:, :512]

        with torch.no_grad():
            outputs = model(input_ids, labels=input_ids)
            loss = outputs.loss
            perplexity = torch.exp(loss)
        return perplexity.item()
    except Exception as e:
        print(f"An error occurred in calculate_perplexity: {e}")
        return None


def calculate_cosine_similarity(text1, text2, model, tokenizer):
    """
    This function calculates cosine similarity between two texts.

    Args:
    text1 (str): The first text.
    text2 (str): The second text.
    model: The language model used to generate word embeddings.
    tokenizer: The tokenizer used to tokenize the text.

    Returns:
    cosine_similarity (float): The cosine similarity between the word embeddings of the two texts.
    """

    # Tokenize the texts
    input_ids1 = tokenizer.encode(text1, return_tensors="pt")
    input_ids2 = tokenizer.encode(text2, return_tensors="pt")

    # Generate word embeddings for the texts
    embeddings1 = model.roberta(input_ids1)[0].mean(dim=1).squeeze().detach()
    embeddings2 = model.roberta(input_ids2)[0].mean(dim=1).squeeze().detach()

    # Convert embeddings to numpy arrays
    embeddings1_np = embeddings1.numpy()
    embeddings2_np = embeddings2.numpy()

    # Apply L2 normalization to the embeddings
    normalized_embeddings1 = normalize(embeddings1_np.reshape(1, -1)).squeeze()
    normalized_embeddings2 = normalize(embeddings2_np.reshape(1, -1)).squeeze()

    # Convert back to torch tensors
    normalized_embeddings1 = torch.from_numpy(normalized_embeddings1)
    normalized_embeddings2 = torch.from_numpy(normalized_embeddings2)

    # Calculate cosine similarity
    cosine_similarity = 1 - cosine(embeddings1.numpy(), embeddings2.numpy())

    return cosine_similarity

def extract_prompts_and_texts(data):
    """
    This function extracts prompts and texts from the data.

    Args:
    data (list of tuples): The data. Each tuple consists of a text (including prompt) and a label.

    Returns:
    prompts_and_texts (list of tuples): The list of tuples where each tuple contains a prompt and a text.
    """

    prompts_and_texts = []

    full_texts, _ = zip(*data)
    texts, labels = remove_prefix(data)

    starting_points = ["Question:", "Prompt:", "Article:"]
    end_points = ["Answer:", "Story:", "Summary:"]

    for full_text, text in zip(full_texts, texts):
        prompt = None
        for start, end in zip(starting_points, end_points):
            if start in full_text and end in full_text:
                _, temp_prompt = full_text.split(start, 1)
                prompt, _ = temp_prompt.split(end, 1)
                prompt = prompt.strip()
                break

        if prompt is None:
            print(f"WARNING: No prompt extracted for text: {text}")
            prompt = ""  # use an empty string if no prompt is found

        prompts_and_texts.append((prompt, text))  # append the prompt and text to the list

    return prompts_and_texts


def calculate_cosine_similarities_for_dataset(dataset_name, model, tokenizer):
    """
    This function calculates cosine similarities for all (prompt, text) pairs in a dataset.

    Args:
    dataset_name (str): The name of the dataset.
    model: The language model used to generate word embeddings.
    tokenizer: The tokenizer used to tokenize the text.

    Returns:
    cosine_similarities (list of floats): The list of cosine similarities.
    """

    prompts_and_texts = extract_prompts_and_texts(dataset_name, data)

    cosine_similarities = []
    for prompt, text in prompts_and_texts:
        cosine_similarity = calculate_cosine_similarity(prompt, text, model, tokenizer)
        cosine_similarities.append(cosine_similarity)

    return cosine_similarities


def calculate_cosine_similarities_for_sentences_in_text(text, model, tokenizer):
    """
    This function calculates cosine similarities for all consecutive pairs of sentences in a single text.

    Args:
    text (str): The text for which to calculate cosine similarities.
    model: The language model used to generate word embeddings.
    tokenizer: The tokenizer used to tokenize the text.

    Returns:
    cosine_similarities (list of floats): The list of cosine similarities.
    """

    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    cosine_similarities = []

    for i in range(len(sentences) - 1):
        cosine_similarity = calculate_cosine_similarity(sentences[i], sentences[i + 1], model, tokenizer)
        cosine_similarities.append(cosine_similarity)

    return cosine_similarities


In [12]:
import pandas as pd
import os
import spacy

# Constants
nlp = spacy.load('en_core_web_sm')
FUNCTION_WORDS = {'a', 'in', 'of', 'the'}


def combine_data_files(data_files):
    """
    This function reads all the files from the given list of file paths and
    combines them into a large DataFrame.

    Args:
    data_files (list of str): The list of file paths.

    Returns:
    combined_data (DataFrame): A DataFrame combining all the data from the files.
    """
    # Initialize an empty list to store individual DataFrames
    data_frames = []

    for file in data_files:
        # Check if the file exists
        if os.path.isfile(file):
            # Load the file into a DataFrame and append it to the list
            data_frames.append(pd.read_csv(file))
        else:
            print(f"The file '{file}' does not exist.")

    # Concatenate all the DataFrames in the list
    combined_data = pd.concat(data_frames, ignore_index=True)

    return combined_data



def prepare_data_for_regression(data_file):
    """
    This function prepares the data for regression analysis by extracting features and labels from the data.

    Args:
    data_file (str): The path to the full_data.csv file.

    Returns:
    data_matrix (DataFrame): A DataFrame where each row represents a text, each column represents a feature,
                            and the last column is the label.
    """
    # Load the data
    data = pd.read_csv(data_file)

    # Convert the DataFrame to a list of tuples
    data = list(data.itertuples(index=False, name=None))

    # Initialize lists to store features and labels
    feature_list = []

    # Load the model and tokenizer
    model, tokenizer = load_model()

    # Remove prefixes
    texts, labels = remove_prefix(data)
    prompts_and_texts = extract_prompts_and_texts(data)

    for (prompt, text), label in zip(prompts_and_texts, labels):
        # Count POS tags in the text
        pos_counts, punctuation_counts, function_word_counts = count_pos_tags_and_special_elements(text)

        # Calculate the Flesch Reading Ease and Flesch-Kincaid Grade Level
        flesch_reading_ease, flesch_kincaid_grade_level = calculate_readability_scores(text)

        # Calculate the average word length
        avg_word_length = calculate_average_word_length([text])

        # Calculate the average sentence length
        avg_sentence_length = calculate_average_sentence_length([text])

        # Calculate the perplexity of the text and average sentence perplexity
        # Truncate the text to the first 512 tokens
        text_encoded = tokenizer.encode(text, truncation=True, max_length=510)
        text = tokenizer.decode(text_encoded)
        text = text.replace('<s>', '').replace('</s>', '')

        text_perplexity = calculate_perplexity(text, model, tokenizer)
        sentence_perplexities = [calculate_perplexity(sentence.text, model, tokenizer) for sentence in nlp(text).sents]
        sentence_perplexities = [p for p in sentence_perplexities if p is not None]
        avg_sentence_perplexity = sum(sentence_perplexities) / len(
            sentence_perplexities) if sentence_perplexities else None

        # Calculate the frequency of uppercase letters
        uppercase_freq = sum(1 for char in text if char.isupper()) / len(text)

        # Calculate the cosine similarity for the prompt and text
        prompt_text_cosine_similarity = calculate_cosine_similarity(prompt, text, model, tokenizer)

        # Calculate the average cosine similarity for sentences in the text
        sentence_cosine_similarities = calculate_cosine_similarities_for_sentences_in_text(text, model, tokenizer)
        avg_sentence_cosine_similarity = None
        if sentence_cosine_similarities:
            avg_sentence_cosine_similarity = sum(sentence_cosine_similarities) / len(sentence_cosine_similarities)
        else:
            print("WARNING: No sentence cosine similarities calculated for text:", text)

        # Prepare a dictionary to append to the feature list
        features = {
            'ADJ': pos_counts.get('ADJ', 0),
            'ADV': pos_counts.get('ADV', 0),
            'CONJ': pos_counts.get('CONJ', 0),
            'NOUN': pos_counts.get('NOUN', 0),
            'NUM': pos_counts.get('NUM', 0),
            'VERB': pos_counts.get('VERB', 0),
            'COMMA': punctuation_counts.get(',', 0),
            'FULLSTOP': punctuation_counts.get('.', 0),
            'SPECIAL-': punctuation_counts.get('-', 0),
            'FUNCTION-A': function_word_counts.get('a', 0),
            'FUNCTION-IN': function_word_counts.get('in', 0),
            'FUNCTION-OF': function_word_counts.get('of', 0),
            'FUNCTION-THE': function_word_counts.get('the', 0),
            'uppercase_freq': uppercase_freq,  # new feature
            'flesch_reading_ease': flesch_reading_ease,
            'flesch_kincaid_grade_level': flesch_kincaid_grade_level,
            'avg_word_length': avg_word_length,
            'avg_sentence_length': avg_sentence_length,
            'text_perplexity': text_perplexity,
            'avg_sentence_perplexity': avg_sentence_perplexity,
            'prompt_text_cosine_similarity': prompt_text_cosine_similarity,  # new feature
            'avg_sentence_cosine_similarity': avg_sentence_cosine_similarity,  # new feature
            'label': label
        }

        # Add the feature dictionary to the feature list
        feature_list.append(features)

    # Convert the list of dictionaries into a DataFrame
    data_matrix = pd.DataFrame(feature_list).fillna(0)

    # Check if the file already exists
    if os.path.exists('data_matrix.csv'):
        overwrite = input('File data_matrix.csv already exists. Do you want to overwrite it? (y/n): ')
        if overwrite.lower() == 'y':
            data_matrix.to_csv('data_matrix.csv', index=False)
    else:
        data_matrix.to_csv('data_matrix.csv', index=False)

    return data_matrix

In [13]:
data_matrix = prepare_data_for_regression('full_data_test.csv')



In [11]:
print(data_matrix)

   ADJ  ADV  CONJ  NOUN  NUM  VERB  COMMA  FULLSTOP  SPECIAL-  FUNCTION-A  \
0   11    3     0    24    0    10      5         4         0           3   
1    6    2     0     7    0     6      2         2         0           1   
2    3    0     0     3    0     0      0         1         0           1   
3   12    3     0    18    1     7      1         3         2           1   
4   12    5     0    34    0    14      8         6         1           0   

   ...  uppercase_freq  flesch_reading_ease  flesch_kincaid_grade_level  \
0  ...        0.029173                38.35                        13.9   
1  ...        0.025974                52.19                        10.7   
2  ...        0.013514                68.77                         6.4   
3  ...        0.023641                42.00                        12.5   
4  ...        0.013959                28.33                        13.7   

   avg_word_length  avg_sentence_length  text_perplexity  \
0         5.278351        