In [1]:
import datasets
import re
import pandas as pd
import os
import random
import tiktoken
import argparse
import glob

# Constants
DATASETS = ['pubmed_qa', 'writingprompts', 'cnn_dailymail', 'gpt']
DATA_PATH = './data/writingPrompts'
NUM_EXAMPLES = 200
TAGS = ['[ WP ]', '[ OT ]', '[ IP ]', '[ HP ]', '[ TT ]', '[ Punch ]', '[ FF ]', '[ CW ]', '[ EU ]', '[ CC ]', '[ RF ]',
        '[ wp ]', '[ Wp ]', '[ RF ]', '[ WP/MP ]']
directory = 'Labelled_Data/'


def strip_newlines(text):
    """
    Removes newline characters from a string.

    Args:
        text (str): Input text string.

    Returns:
        str: Text with newline characters removed.
    """
    return ' '.join(text.split())


def replace_text(text, replacements):
    """
    Performs a series of replacements in a string.

    Args:
        text (str): Input text string.
        replacements (dict): Dictionary mapping old substring to new substring.

    Returns:
        str: Text with specified replacements made.
    """
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text


def remove_whitespace_before_punctuations(text):
    """
    Removes whitespace before punctuation marks in a string.

    Args:
        text (str): Input text string.

    Returns:
        str: Text with whitespace removed before punctuation marks.
    """
    return re.sub(r'\s([?.!,:;](?:\s|$))', r'\1', text)


def load_pubmed(num_examples=NUM_EXAMPLES):
    """
    Loads the PubMed QA dataset.

    Args:
        num_examples (int, optional): Number of examples to load. Defaults to NUM_EXAMPLES.

    Returns:
        list: List of tuples where each tuple is a question-answer pair and a label (always 0).
    """
    data = datasets.load_dataset('pubmed_qa', 'pqa_labeled', split=f'train[:{num_examples}]')
    data = [(f'Question: {q} Answer: {a}', 0) for q, a in zip(data['question'], data['long_answer'])]
    return data


def load_gpt(dataset_name='gpt'):
    """
    Loads the GPT preprocessed dataset.

    Args:
        dataset_name (str, optional): Name of the preprocessed GPT dataset. Defaults to 'gpt'.

    Returns:
        list: List of tuples where each tuple is a text-label pair.
    """
    file_path = 'Labelled_Data/t1_preprocessed.csv'
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file '{file_path}' does not exist.")

    df = pd.read_csv(file_path)
    data = [(row['Text'], row['Label']) for index, row in df.iterrows()]

    return data


def load_writingPrompts(data_path=DATA_PATH, num_examples=NUM_EXAMPLES):
    """
    Loads the WritingPrompts dataset. Combines Prompts and Stories with additional formatting.

    Args:
        data_path (str, optional): Path to the dataset. Defaults to DATA_PATH.
        num_examples (int, optional): Number of examples to load. Defaults to NUM_EXAMPLES.

    Returns:
        list: List of tuples where each tuple is a prompt-story pair and a label (always 0).
    """
    with open(f'{data_path}/valid.wp_source', 'r', encoding='utf-8') as f:
        prompts = f.readlines()[:num_examples]
    with open(f'{data_path}/valid.wp_target', 'r', encoding='utf-8') as f:
        stories = f.readlines()[:num_examples]

    prompt_replacements = {tag: '' for tag in TAGS}
    prompts = [replace_text(prompt, prompt_replacements) for prompt in prompts]
    prompts = [remove_whitespace_before_punctuations(prompt) for prompt in prompts]

    story_replacements = {
        ' ,': ',',
        ' .': '.',
        ' ?': '?',
        ' !': '!',
        ' ;': ';',
        ' \'': '\'',
        ' ’ ': '\'',
        ' :': ':',
        '<newline>': '\n',
        '`` ': '"',
        ' \'\'': '"',
        '\'\'': '"',
        '.. ': '... ',
        ' )': ')',
        '( ': '(',
        ' n\'t': 'n\'t',
        ' i ': ' I ',
        ' i\'': ' I\'',
        '\\\'': '\'',
        '\n ': '\n',
    }
    stories = [replace_text(story, story_replacements).strip() for story in stories]
    joined = ["Prompt:" + prompt + " Story: " + story for prompt, story in zip(prompts, stories)]
    filtered = [story for story in joined if 'nsfw' not in story.lower()]
    data = [(story, 0) for story in filtered]
    return data


def load_cnn_daily_mail(num_examples=NUM_EXAMPLES):
    """
    Loads the CNN/Daily Mail dataset. Combines article and summary with additional formatting.

    Args:
        num_examples (int, optional): Number of examples to load. Defaults to NUM_EXAMPLES.

    Returns:
        list: List of tuples where each tuple is a summary-article pair and a label (always 0).
    """
    data = datasets.load_dataset('cnn_dailymail', '3.0.0', split=f'train[:{num_examples}]')

    processed_data = []
    for a, s in zip(data['article'], data['highlights']):
        # remove the string and the '--' from the start of the articles
        a = re.sub('^[^-]*--', '', a).strip()

        # remove the string 'E-mail to a friend.' from the articles, if present
        a = a.replace('E-mail to a friend .', '')
        s = s.replace('NEW:', '')
        a = a.replace(
            'Copyright 2007 Reuters. All rights reserved.This material may not be published, broadcast, rewritten, '
            'or redistributed.',
            '')

        # remove whitespace before punctuation marks in both article and summary
        a = remove_whitespace_before_punctuations(a)
        s = remove_whitespace_before_punctuations(s)

        processed_data.append((f'Summary: {s} Article: {a}', 0))
        data = processed_data

    return data


def load_data(dataset_name):
    """
       Loads a dataset based on its name.

       Args:
           dataset_name (str): Name of the dataset to load.

       Returns:
           list: List of data from the specified dataset.

       Raises:
           ValueError: If the dataset_name is not recognized.
    """
    if dataset_name == 'pubmed_qa':
        return load_pubmed()
    elif dataset_name == 'writingprompts':
        return load_writingPrompts()
    elif dataset_name == 'cnn_dailymail':
        return load_cnn_daily_mail()
    elif dataset_name == 'gpt':
        return load_gpt()
    else:
        raise ValueError(f"Dataset name {dataset_name} not recognized.")


def preprocess_data(dataset):
    """
        Preprocesses a dataset.

        Args:
            dataset (str): Name of the dataset to preprocess.

        Returns:
            list: List of preprocessed data from the specified dataset.

        Raises:
            ValueError: If the dataset_name is not recognized.
    """
    if dataset not in DATASETS:
        raise ValueError(f"Dataset name {dataset} not recognized.")

    data = load_data(dataset)
    data = list(dict.fromkeys(data))
    data = [(strip_newlines(q).strip(), a) for q, a in data]

    # Getting long-enough data, not done for PubMed due to most of respones being fairly short.
    if dataset == 'writingprompts' or dataset == 'cnn_dailymail':
        long_data = [(x, y) for x, y in data if len(x.split()) > 250]
        if len(long_data) > 0:
            data = long_data
        print(f"Loaded and pre-processed {len(data)} entries from the dataset {dataset}")  # debug
        # print
    else:
        print(f"Loaded and pre-processed {len(data)} entries from the dataset {dataset}")

    return data



In [2]:
import spacy
from collections import Counter
import torch
from statistics import mean
import seaborn as sns
import matplotlib.pyplot as plt
import textstat
import pandas as pd
import tiktoken
from transformers import RobertaTokenizer, RobertaForMaskedLM
import argparse
import os
from pathlib import Path
import time


# ------------------------------------------------------------------------------------------#
# Constants
nlp = spacy.load('en_core_web_sm')
FUNCTION_WORDS = {'a', 'in', 'of', 'the'}

def remove_prefix(dataset_name, data):
    """
    This function removes a predefined prefix from each text in a given dataset.

    Args:
    dataset_name (str): The name of the dataset.
    data (list of tuples): The data from the dataset. Each element of the list is a tuple, where the first element
    is the text and the second element is its label.

    Returns:
    texts (list): The list of texts after the prefix has been removed.
    labels (list): The list of labels corresponding to the texts.
    """

    texts, labels = zip(*data)

    if dataset_name == 'pubmed_qa':
        texts = [text.split("Answer:", 1)[1].strip() for text in texts if "Answer:" in text]
    elif dataset_name == 'writingprompts':
        texts = [text.split("Story:", 1)[1].strip() for text in texts if "Story:" in text]
    elif dataset_name == 'cnn_dailymail':
        texts = [text.split("Article:", 1)[1].strip() for text in texts if "Article:" in text]
    elif dataset_name == 'gpt':
        texts = [text.split("Answer:", 1)[1].strip() if "Answer:" in text else text for text in texts]
        texts = [text.split("Story:", 1)[1].strip() if "Story:" in text else text for text in texts]
        texts = [text.split("Article:", 1)[1].strip() if "Article:" in text else text for text in texts]

    return list(texts), list(labels)



def average_token_count(dataset_name, data):
    """
    Calculates the average number of tokens in the answers of a dataset.

    Returns:
        float: Average number of tokens in the answers of a dataset
    """
    texts, labels = remove_prefix(dataset_name, data)

    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

    total_tokens = 0

    for text in texts:
        num_tokens = len(encoding.encode(text))
        total_tokens += num_tokens

    average_tokens = total_tokens / len(texts)

    return average_tokens


# PUBMED = 54
# WP = 780
# CNN = 794


def count_pos_tags_and_special_elements(text):
    # CHECKED
    """
      This function counts the frequency of POS (Part of Speech) tags, punctuation marks, and function words in a given text.
      It uses the SpaCy library for POS tagging.

      Args:
      text (str): The text for which to count POS tags and special elements.

      Returns:
      pos_counts (dict): A dictionary where keys are POS tags and values are their corresponding count.
      punctuation_counts (dict): A dictionary where keys are punctuation marks and values are their corresponding count.
      function_word_counts (dict): A dictionary where keys are function words and values are their corresponding count.

    """
    # Use SpaCy to parse the text
    doc = nlp(text)

    # Create a counter of POS tags
    pos_counts = Counter(token.pos_ for token in doc)

    # Create a counter of punctuation marks
    punctuation_counts = Counter(token.text for token in doc if token.pos_ == 'PUNCT')

    # Create a counter of function words
    function_word_counts = Counter(token.text for token in doc if token.lower_ in FUNCTION_WORDS)

    return dict(pos_counts), dict(punctuation_counts), dict(function_word_counts)


def calculate_readability_scores(text):
    """
    This function calculates the Flesch Reading Ease and Flesch-Kincaid Grade Level of a text using the textstat library.

    Args:
    text (str): The text to score.

    Returns:
    flesch_reading_ease (float): The Flesch Reading Ease score of the text.
    flesch_kincaid_grade_level (float): The Flesch-Kincaid Grade Level of the text.

    """
    flesch_reading_ease = textstat.flesch_reading_ease(text)
    flesch_kincaid_grade_level = textstat.flesch_kincaid_grade(text)

    return flesch_reading_ease, flesch_kincaid_grade_level


def load_and_count(dataset_name, data):
    """
       This function loads the texts from the dataset and calculates the frequency of POS tags, punctuation marks,
       and function words.

       Args:
       dataset_name (str): The name of the dataset.
       data (list of tuples): The data from the dataset. Each element of the list is a tuple, where the first element
       is the text and the second element is its label.

       Returns:
       overall_pos_counts (Counter): A Counter object of POS tag frequencies.
       overall_punctuation_counts (Counter): A Counter object of punctuation mark frequencies.
       overall_function_word_counts (Counter): A Counter object of function word frequencies.
    """

    # CHECKED
    # Extract texts
    texts, labels = remove_prefix(dataset_name, data)

    # Calculate POS tag frequencies for the texts
    pos_frequencies, punctuation_frequencies, function_word_frequencies = zip(
        *[count_pos_tags_and_special_elements(text) for text in texts])

    # Then, sum the dictionaries to get the overall frequencies
    overall_pos_counts = Counter()
    for pos_freq in pos_frequencies:
        overall_pos_counts += Counter(pos_freq)

    overall_punctuation_counts = Counter()
    for punct_freq in punctuation_frequencies:
        overall_punctuation_counts += Counter(punct_freq)

    overall_function_word_counts = Counter()
    for function_word_freq in function_word_frequencies:
        overall_function_word_counts += Counter(function_word_freq)

    return overall_pos_counts, overall_punctuation_counts, overall_function_word_counts


def load_model():
    # CHECKED
    """
      This function loads a pre-trained model and its corresponding tokenizer from the Hugging Face model hub.

      Returns:
      model: The loaded model.
      tokenizer: The tokenizer corresponding to the model.

    """
    # model_name = 'allenai/scibert_scivocab_uncased'
    # model = AutoModelForMaskedLM.from_pretrained(model_name)
    # tokenizer = AutoTokenizer.from_pretrained(model_name)

    model_name = 'roberta-base'
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    model = RobertaForMaskedLM.from_pretrained(model_name)
    return model, tokenizer


def calculate_average_word_length(texts):
    """
     This function calculates the average word length of a list of texts using the SpaCy library.

     Args:
     texts (list): The list of texts.

     Returns:
     (float): The average word length.

    """

    word_lengths = []

    for text in texts:
        doc = nlp(text)
        for token in doc:
            if not token.is_punct:  # ignore punctuation
                word_lengths.append(len(token.text))

    return mean(word_lengths)


def calculate_average_sentence_length(texts):
    # CHEKCED
    """
    This function calculates the average sentence length of a list of texts using the SpaCy library.

    Args:
    texts (list): The list of texts.

    Returns:
    avg_sentence_length (float): The average sentence length.
    """
    sentence_lengths = []

    for text in texts:
        doc = nlp(text)
        for sent in doc.sents:
            sentence_lengths.append(len(sent))

    return mean(sentence_lengths)


def calculate_perplexity(text, model, tokenizer):
    """
    Calculates the perplexity of a text using a language model and tokenizer.

    Args:
    text (str): The text for which perplexity will be calculated.
    model: The language model used to calculate perplexity.
    tokenizer: The tokenizer used to tokenize the text.

    Returns:
    perplexity (float or None): The calculated perplexity of the text, or None if the text is too long.
    """

    try:
        input_ids = tokenizer.encode(text, return_tensors='pt')
        # Truncate the text to the first 512 tokens
        input_ids = input_ids[:, :512]

        with torch.no_grad():
            outputs = model(input_ids, labels=input_ids)
            loss = outputs.loss
            perplexity = torch.exp(loss)
        return perplexity.item()
    except Exception as e:
        print(f"An error occurred in calculate_perplexity: {e}")
        return None


def summary_statistics(dataset_name, data):
    # CHECKED
    """
       Calculates various summary statistics for a dataset.

       Args:
       dataset_name (str): The name of the dataset.
       data (dict): The data from the dataset.

       Returns:
       dict: A dictionary containing various summary statistics of the data.
   """
    texts, labels = remove_prefix(dataset_name, data)

    model, tokenizer = load_model()
    overall_pos_counts, overall_punctuation_counts, overall_function_word_counts = load_and_count(dataset_name, data)
    readability_scores = [calculate_readability_scores(text) for text in texts]
    average_flesch_reading_ease = mean(score[0] for score in readability_scores)
    average_flesch_kincaid_grade_level = mean(score[1] for score in readability_scores)
    average_word_length = calculate_average_word_length(texts)
    average_sentence_length = calculate_average_sentence_length(texts)
    text_perplexities = [calculate_perplexity(text, model, tokenizer) for text in texts]
    text_perplexities = [p for p in text_perplexities if p is not None]
    average_text_perplexity = sum(text_perplexities) / len(text_perplexities)
    sentences = [sentence.text for text in texts for sentence in nlp(text).sents]
    sentence_perplexities = [calculate_perplexity(sentence, model, tokenizer) for sentence in sentences]
    sentence_perplexities = [p for p in sentence_perplexities if p is not None]
    average_sentence_perplexity = sum(sentence_perplexities) / len(sentence_perplexities)
    return {
        'pos_freqs': overall_pos_counts,
        'punctuation_freqs': overall_punctuation_counts,
        'function_word_freqs': overall_function_word_counts,
        'average_word_length': average_word_length,
        'average_flesch_reading_ease': average_flesch_reading_ease,
        'average_flesch_kincaid_grade_level': average_flesch_kincaid_grade_level,
        'average_sentence_length': average_sentence_length,
        'average_text_perplexity': average_text_perplexity,
        'average_sentence_perplexity': average_sentence_perplexity,
        'sentence_perplexities': sentence_perplexities,  # added this
        'text_perplexities': text_perplexities  # and this
    }


def print_statistics(statistics):
    # CHECKED
    pos_freqs = statistics['pos_freqs']
    punctuation_freqs = statistics['punctuation_freqs']
    function_word_freqs = statistics['function_word_freqs']

    print(f"Frequency of adjectives: {pos_freqs.get('ADJ', 0)}")
    print(f"Frequency of adverbs: {pos_freqs.get('ADV', 0)}")
    print(f"Frequency of conjunctions: {pos_freqs.get('CCONJ', 0)}")
    print(f"Frequency of nouns: {pos_freqs.get('NOUN', 0)}")
    print(f"Frequency of numbers: {pos_freqs.get('NUM', 0)}")
    print(f"Frequency of pronouns: {pos_freqs.get('PRON', 0)}")
    print(f"Frequency of verbs: {pos_freqs.get('VERB', 0)}")
    print(f"Frequency of commas: {punctuation_freqs.get(',', 0)}")
    print(f"Frequency of fullstops: {punctuation_freqs.get('.', 0)}")
    print(f"Frequency of special character '-': {punctuation_freqs.get('-', 0)}")
    print(f"Frequency of function word 'a': {function_word_freqs.get('a', 0)}")
    print(f"Frequency of function word 'in': {function_word_freqs.get('in', 0)}")
    print(f"Frequency of function word 'of': {function_word_freqs.get('of', 0)}")
    print(f"Frequency of function word 'the': {function_word_freqs.get('the', 0)}")
    print(f"Average Flesch Reading Ease: {statistics['average_flesch_reading_ease']}")
    print(f"Average Flesch-Kincaid Grade Level: {statistics['average_flesch_kincaid_grade_level']}")
    print(f"Average word length: {statistics['average_word_length']}")
    print(f"Average sentence length: {statistics['average_sentence_length']}")
    print(f"Average sentence perplexity: {statistics['average_sentence_perplexity']}")
    print(f"Average text perplexity: {statistics['average_text_perplexity']}")



In [3]:
import torch
from scipy.spatial.distance import cosine

In [4]:
from sklearn.preprocessing import normalize

def calculate_cosine_similarity(text1, text2, model, tokenizer):
    """
    This function calculates cosine similarity between two texts.
    
    Args:
    text1 (str): The first text.
    text2 (str): The second text.
    model: The language model used to generate word embeddings.
    tokenizer: The tokenizer used to tokenize the text.

    Returns:
    cosine_similarity (float): The cosine similarity between the word embeddings of the two texts.
    """
    
    # Tokenize the texts
    input_ids1 = tokenizer.encode(text1, return_tensors="pt")
    input_ids2 = tokenizer.encode(text2, return_tensors="pt")

    # Generate word embeddings for the texts
    embeddings1 = model.roberta(input_ids1)[0].mean(dim=1).squeeze().detach()
    embeddings2 = model.roberta(input_ids2)[0].mean(dim=1).squeeze().detach()
    
    # Convert embeddings to numpy arrays
    embeddings1_np = embeddings1.numpy()
    embeddings2_np = embeddings2.numpy()

    # Apply L2 normalization to the embeddings
    normalized_embeddings1 = normalize(embeddings1_np.reshape(1, -1)).squeeze()
    normalized_embeddings2 = normalize(embeddings2_np.reshape(1, -1)).squeeze()

    # Convert back to torch tensors
    normalized_embeddings1 = torch.from_numpy(normalized_embeddings1)
    normalized_embeddings2 = torch.from_numpy(normalized_embeddings2)

    # Calculate cosine similarity
    cosine_similarity = 1 - cosine(embeddings1.numpy(), embeddings2.numpy())
    
    return cosine_similarity


In [5]:
def extract_prompts_and_texts(dataset_name, data):
    """
    This function extracts prompts and texts from the data for a specified dataset.

    Args:
    dataset_name (str): The name of the dataset.
    data (list of tuples): The data. Each tuple consists of a text (including prompt) and a label.

    Returns:
    prompts_and_texts (list of tuples): The list of tuples where each tuple contains a prompt and a text.
    """

    prompts_and_texts = []

    full_texts, _ = zip(*data)
    texts, labels = remove_prefix(dataset_name, data)

    starting_points = ["Question:", "Prompt:", "Article:"]
    end_points = ["Answer:", "Story:", "Summary:"]

    for full_text, text in zip(full_texts, texts):
        # Split the full_text depending on the dataset
        if dataset_name == 'pubmed_qa':
            split_text = full_text.split("Question:", 1)
            if len(split_text) == 2:
                _, temp_prompt = split_text
                prompt, _ = temp_prompt.split("Answer:", 1)
        elif dataset_name == 'writingprompts':
            split_text = full_text.split("Prompt:", 1)
            if len(split_text) == 2:
                _, temp_prompt = split_text
                prompt, _ = temp_prompt.split("Story:", 1)
        elif dataset_name == 'cnn_dailymail':
            split_text = full_text.split("Article:", 1)
            if len(split_text) == 2:
                _, temp_prompt = split_text
                prompt, _ = temp_prompt.split("Summary:", 1)
        elif dataset_name == 'gpt':
            # Identify the starting point for each entry in the 'gpt' dataset
            for starting_point in starting_points:
                if starting_point in full_text:
                    split_text = full_text.split(starting_point, 1)
                    if len(split_text) == 2:
                        _, temp_prompt = split_text
                        for end_point in end_points:
                            if end_point in temp_prompt:
                                prompt, _ = temp_prompt.split(end_point, 1)
                                break
                    break

        prompt = prompt.strip()  # remove leading and trailing whitespaces
        prompts_and_texts.append((prompt, text))  # append the prompt and text to the list

    return prompts_and_texts

In [6]:
def calculate_cosine_similarities_for_dataset(dataset_name, model, tokenizer):
    """
    This function calculates cosine similarities for all (prompt, text) pairs in a dataset.

    Args:
    dataset_name (str): The name of the dataset.
    model: The language model used to generate word embeddings.
    tokenizer: The tokenizer used to tokenize the text.

    Returns:
    cosine_similarities (list of floats): The list of cosine similarities.
    """
    
    prompts_and_texts = extract_prompts_and_texts(dataset_name, data)

    cosine_similarities = []
    for prompt, text in prompts_and_texts:
        cosine_similarity = calculate_cosine_similarity(prompt, text, model, tokenizer)
        cosine_similarities.append(cosine_similarity)

    return cosine_similarities

In [7]:
def calculate_cosine_similarities_for_sentences_in_text(text, model, tokenizer):
    """
    This function calculates cosine similarities for all consecutive pairs of sentences in a single text.

    Args:
    text (str): The text for which to calculate cosine similarities.
    model: The language model used to generate word embeddings.
    tokenizer: The tokenizer used to tokenize the text.

    Returns:
    cosine_similarities (list of floats): The list of cosine similarities.
    """
    
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    cosine_similarities = []

    for i in range(len(sentences) - 1):
        cosine_similarity = calculate_cosine_similarity(sentences[i], sentences[i+1], model, tokenizer)
        cosine_similarities.append(cosine_similarity)

    return cosine_similarities


In [8]:

def summary_statistics(dataset_name, data):
    # CHECKED
    """
       Calculates various summary statistics for a dataset.

       Args:
       dataset_name (str): The name of the dataset.
       data (dict): The data from the dataset.

       Returns:
       dict: A dictionary containing various summary statistics of the data.
       
   """
    
    model, tokenizer = load_model()

    prompts_and_responses = extract_prompts_and_texts(dataset_name, data)
    prompt_to_text_cosine_similarities = calculate_cosine_similarities_for_dataset(dataset_name, model, tokenizer)
    
    sentence_cosine_similarities = []
    for _, text in prompts_and_responses:
        sentence_cosine_similarities.extend(calculate_cosine_similarities_for_sentences_in_text(text, model, tokenizer))

    texts, labels = remove_prefix(dataset_name, data)

    overall_pos_counts, overall_punctuation_counts, overall_function_word_counts = load_and_count(dataset_name, data)
    readability_scores = [calculate_readability_scores(text) for text in texts]
    average_flesch_reading_ease = mean(score[0] for score in readability_scores)
    average_flesch_kincaid_grade_level = mean(score[1] for score in readability_scores)
    average_word_length = calculate_average_word_length(texts)
    average_sentence_length = calculate_average_sentence_length(texts)
    text_perplexities = [calculate_perplexity(text, model, tokenizer) for text in texts]
    text_perplexities = [p for p in text_perplexities if p is not None]
    average_text_perplexity = sum(text_perplexities) / len(text_perplexities)
    sentences = [sentence.text for text in texts for sentence in nlp(text).sents]
    sentence_perplexities = [calculate_perplexity(sentence, model, tokenizer) for sentence in sentences]
    sentence_perplexities = [p for p in sentence_perplexities if p is not None]
    average_sentence_perplexity = sum(sentence_perplexities) / len(sentence_perplexities)
    return {
        'pos_freqs': overall_pos_counts,
        'punctuation_freqs': overall_punctuation_counts,
        'function_word_freqs': overall_function_word_counts,
        'average_word_length': average_word_length,
        'average_flesch_reading_ease': average_flesch_reading_ease,
        'average_flesch_kincaid_grade_level': average_flesch_kincaid_grade_level,
        'average_sentence_length': average_sentence_length,
        'average_text_perplexity': average_text_perplexity,
        'average_sentence_perplexity': average_sentence_perplexity,
        'sentence_perplexities': sentence_perplexities,  # added this
        'text_perplexities': text_perplexities,  # and this
        'average_prompt_to_text_cosine_similarity': mean(prompt_to_text_cosine_similarities),
        'average_sentence_cosine_similarity': mean(sentence_cosine_similarities),
    }


def print_statistics(statistics):
    # CHECKED
    pos_freqs = statistics['pos_freqs']
    punctuation_freqs = statistics['punctuation_freqs']
    function_word_freqs = statistics['function_word_freqs']

    print(f"Frequency of adjectives: {pos_freqs.get('ADJ', 0)}")
    print(f"Frequency of adverbs: {pos_freqs.get('ADV', 0)}")
    print(f"Frequency of conjunctions: {pos_freqs.get('CCONJ', 0)}")
    print(f"Frequency of nouns: {pos_freqs.get('NOUN', 0)}")
    print(f"Frequency of numbers: {pos_freqs.get('NUM', 0)}")
    print(f"Frequency of pronouns: {pos_freqs.get('PRON', 0)}")
    print(f"Frequency of verbs: {pos_freqs.get('VERB', 0)}")
    print(f"Frequency of commas: {punctuation_freqs.get(',', 0)}")
    print(f"Frequency of fullstops: {punctuation_freqs.get('.', 0)}")
    print(f"Frequency of special character '-': {punctuation_freqs.get('-', 0)}")
    print(f"Frequency of function word 'a': {function_word_freqs.get('a', 0)}")
    print(f"Frequency of function word 'in': {function_word_freqs.get('in', 0)}")
    print(f"Frequency of function word 'of': {function_word_freqs.get('of', 0)}")
    print(f"Frequency of function word 'the': {function_word_freqs.get('the', 0)}")
    print(f"Average Flesch Reading Ease: {statistics['average_flesch_reading_ease']}")
    print(f"Average Flesch-Kincaid Grade Level: {statistics['average_flesch_kincaid_grade_level']}")
    print(f"Average word length: {statistics['average_word_length']}")
    print(f"Average sentence length: {statistics['average_sentence_length']}")
    print(f"Average sentence perplexity: {statistics['average_sentence_perplexity']}")
    print(f"Average text perplexity: {statistics['average_text_perplexity']}")
    print(f"Average prompt to text cosine simiarity: {statistics['average_prompt_to_text_cosine_similarity']}")
    print(f"Average sentence cosine simiarity: {statistics['average_sentence_cosine_similarity']}")

    


In [9]:
data = preprocess_data('pubmed_qa')


Found cached dataset pubmed_qa (C:/Users/atana/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924)


Loaded and pre-processed 200 entries from the dataset pubmed_qa


In [10]:
x = summary_statistics('pubmed_qa',data)


KeyboardInterrupt



In [None]:
print_statistics(x)

In [43]:
print_statistics(x)

Frequency of adjectives: 1189
Frequency of adverbs: 265
Frequency of conjunctions: 288
Frequency of nouns: 2642
Frequency of numbers: 58
Frequency of pronouns: 199
Frequency of verbs: 851
Frequency of commas: 264
Frequency of fullstops: 402
Frequency of special character '-': 112
Frequency of function word 'a': 156
Frequency of function word 'in': 236
Frequency of function word 'of': 372
Frequency of function word 'the': 342
Average Flesch Reading Ease: 30.3669
Average Flesch-Kincaid Grade Level: 14.3805
Average word length: 5.606309370235722
Average sentence length: 23.304455445544555
Average sentence perplexity: 1.085525385223993
Average text perplexity: 1.0855632001161575
Average prompt to text cosine simiarity: 0.9767077764868737
Average sentence cosine simiarity: 0.979069270047487


In [21]:
def prepare_data_for_regression(data, dataset_name):
    """
    This function prepares the data for regression analysis by extracting features and labels from the data.

    Args:
    data (list of tuples): The data from the dataset. Each element of the list is a tuple, where the first element
    is the text and the second element is its label.

    Returns:
    data_matrix (DataFrame): A DataFrame where each row represents a text, each column represents a feature,
                            and the last column is the label.
    """
    # Initialize lists to store features and labels
    feature_list = []

    # Load the model and tokenizer
    model, tokenizer = load_model()

    # Remove prefixes
    texts, labels = remove_prefix(dataset_name, data)
    prompts_and_texts = extract_prompts_and_texts(dataset_name, data)

    for (prompt, text), label in zip(prompts_and_texts, labels):
        # Count POS tags in the text
        pos_counts, punctuation_counts, function_word_counts = count_pos_tags_and_special_elements(text)

        # Calculate the Flesch Reading Ease and Flesch-Kincaid Grade Level
        flesch_reading_ease, flesch_kincaid_grade_level = calculate_readability_scores(text)

        # Calculate the average word length
        avg_word_length = calculate_average_word_length([text])

        # Calculate the average sentence length
        avg_sentence_length = calculate_average_sentence_length([text])

        # Calculate the perplexity of the text and average sentence perplexity
        # Truncate the text to the first 512 tokens
        text_encoded = tokenizer.encode(text, truncation=True, max_length=510)
        text = tokenizer.decode(text_encoded)
        text = text.replace('<s>', '').replace('</s>', '')


        text_perplexity = calculate_perplexity(text, model, tokenizer)
        sentence_perplexities = [calculate_perplexity(sentence.text, model, tokenizer) for sentence in nlp(text).sents]
        sentence_perplexities = [p for p in sentence_perplexities if p is not None]
        avg_sentence_perplexity = sum(sentence_perplexities) / len(
            sentence_perplexities) if sentence_perplexities else None

        # Calculate the frequency of uppercase letters
        uppercase_freq = sum(1 for char in text if char.isupper()) / len(text)
        
        # Calculate the cosine similarity for the prompt and text
        prompt_text_cosine_similarity = calculate_cosine_similarity(prompt, text, model, tokenizer)
        
        # Calculate the average cosine similarity for sentences in the text
        sentence_cosine_similarities = calculate_cosine_similarities_for_sentences_in_text(text, model, tokenizer)
        avg_sentence_cosine_similarity = None
        if sentence_cosine_similarities:
            avg_sentence_cosine_similarity = sum(sentence_cosine_similarities) / len(sentence_cosine_similarities)
        else:
            print("WARNING: No sentence cosine similarities calculated for text:", text)


        # Prepare a dictionary to append to the feature list
        features = {
            'ADJ': pos_counts.get('ADJ', 0),
            'ADV': pos_counts.get('ADV', 0),
            'CONJ': pos_counts.get('CONJ', 0),
            'NOUN': pos_counts.get('NOUN', 0),
            'NUM': pos_counts.get('NUM', 0),
            'VERB': pos_counts.get('VERB', 0),
            'COMMA': punctuation_counts.get(',', 0),
            'FULLSTOP': punctuation_counts.get('.', 0),
            'SPECIAL-': punctuation_counts.get('-', 0),
            'FUNCTION-A': function_word_counts.get('a', 0),
            'FUNCTION-IN': function_word_counts.get('in', 0),
            'FUNCTION-OF': function_word_counts.get('of', 0),
            'FUNCTION-THE': function_word_counts.get('the', 0),
            'uppercase_freq': uppercase_freq,  # new feature
            'flesch_reading_ease': flesch_reading_ease,
            'flesch_kincaid_grade_level': flesch_kincaid_grade_level,
            'avg_word_length': avg_word_length,
            'avg_sentence_length': avg_sentence_length,
            'text_perplexity': text_perplexity,
            'avg_sentence_perplexity': avg_sentence_perplexity,
            'prompt_text_cosine_similarity': prompt_text_cosine_similarity,  # new feature
            'avg_sentence_cosine_similarity': avg_sentence_cosine_similarity,  # new feature
            'label': label
        }

        # Add the feature dictionary to the feature list
        feature_list.append(features)

    # Convert the list of dictionaries into a DataFrame
    data_matrix = pd.DataFrame(feature_list).fillna(0)

    return data_matrix

In [22]:
y = prepare_data_for_regression(data,'pubmed_qa')





In [23]:
y.to_csv('output.csv', index=False)
