In [1]:
import datasets
import re
import pandas as pd
import os

# Module-level configuration shared by the loaders below
DATASETS = ['pubmed_qa', 'writingprompts', 'cnn_dailymail']  # names accepted by preprocess_data
DATA_PATH = 'data/writingPrompts'  # directory containing valid.wp_source / valid.wp_target
NUM_EXAMPLES = 150  # default number of examples each loader reads
# Bracketed tags that are stripped from WritingPrompts prompts, e.g. '[ WP ]'
TAGS = [f'[ {code} ]' for code in ('WP', 'OT', 'IP', 'HP', 'TT', 'Punch', 'FF', 'CW', 'EU')]


def strip_newlines(text):
    """
    Collapses every run of whitespace (newlines included) into a single space.

    Leading and trailing whitespace disappears as well, because the text is
    split on whitespace and the pieces are re-joined with one space.
    """
    words = text.split()
    return ' '.join(words)


def process_text(text, replacements):
    """
    Applies every substitution in `replacements` to `text`, in mapping order.

    Keys are literal substrings (not regular expressions). Each substitution
    operates on the result of the previous one, so the order of the mapping matters.
    """
    result = text
    for target in replacements:
        result = result.replace(target, replacements[target])
    return result


def remove_whitespace_before_punctuations(text):
    """
    Drops the single whitespace character directly preceding one of ? . ! , : ;

    Only punctuation that is itself followed by whitespace or the end of the
    string is affected, so sequences such as ' .b' are left untouched.
    """
    space_then_punct = re.compile(r'\s([?.!,:;](?:\s|$))')
    return space_then_punct.sub(r'\1', text)


def load_pubmed(num_examples=NUM_EXAMPLES):
    """
    Loads question/answer pairs from the labelled PubMed QA training split.

    Returns a list of (text, 0) tuples, where text has the form
    'Question: <question> Answer: <long_answer>' and the label is always 0.
    """
    split = f'train[:{num_examples}]'
    records = datasets.load_dataset('pubmed_qa', 'pqa_labeled', split=split)

    questions = records['question']
    answers = records['long_answer']

    pairs = []
    for question, answer in zip(questions, answers):
        pairs.append((f'Question: {question} Answer: {answer}', 0))
    return pairs


def load_writingPrompts(data_path=DATA_PATH, num_examples=NUM_EXAMPLES):
    """
    Loads prompt/story pairs from the WritingPrompts validation files.

    Reads the first `num_examples` lines of `valid.wp_source` (prompts) and
    `valid.wp_target` (stories) under `data_path`, cleans both, joins them as
    'Prompt:<prompt> Story: <story>' and drops any pair mentioning 'nsfw'.
    Returns a list of (text, 0) tuples.
    """
    def read_head(filename):
        # The whole file is read and then truncated to the first num_examples lines.
        with open(f'{data_path}/{filename}', 'r', encoding='utf-8') as handle:
            return handle.readlines()[:num_examples]

    raw_prompts = read_head('valid.wp_source')
    raw_stories = read_head('valid.wp_target')

    # Strip the bracketed tags, then tighten spacing before punctuation.
    tag_removals = dict.fromkeys(TAGS, '')
    cleaned_prompts = []
    for raw_prompt in raw_prompts:
        untagged = process_text(raw_prompt, tag_removals)
        cleaned_prompts.append(remove_whitespace_before_punctuations(untagged))

    # Literal substitutions applied to each story, in this order; later entries
    # see the output of earlier ones, so the order must not be changed.
    story_fixes = {
        ' ,': ',',
        ' .': '.',
        ' ?': '?',
        ' !': '!',
        ' ;': ';',
        ' \'': '\'',
        ' ’ ': '\'',
        ' :': ':',
        '<newline>': '\n',
        '`` ': '"',
        ' \'\'': '"',
        '\'\'': '"',
        '.. ': '... ',
        ' )': ')',
        '( ': '(',
        ' n\'t': 'n\'t',
        ' i ': ' I ',
        ' i\'': ' I\'',
        '\\\'': '\'',
        '\n ': '\n',
    }

    examples = []
    for prompt, raw_story in zip(cleaned_prompts, raw_stories):
        story = process_text(raw_story, story_fixes).strip()
        text = "Prompt:" + prompt + " Story: " + story
        if 'nsfw' in text.lower():
            continue  # skip pairs flagged as NSFW anywhere in prompt or story
        examples.append((text, 0))
    return examples


def load_cnn_daily_mail(num_examples=NUM_EXAMPLES):
    """
    Loads summary/article pairs from the CNN/Daily Mail (3.0.0) training split.

    Each article has its leading dateline and a few boilerplate strings removed,
    and both article and highlights have whitespace before punctuation tightened.

    Returns a list of (text, 0) tuples, where text has the form
    'Summary: <highlights> Article: <article>'. The list is empty when the
    requested split contains no rows.
    """
    data = datasets.load_dataset('cnn_dailymail', '3.0.0', split=f'train[:{num_examples}]')

    processed_data = []
    for article, summary in zip(data['article'], data['highlights']):
        # Remove everything up to and including the first '--' (the dateline),
        # as long as no other '-' occurs before it.
        # NOTE(review): for an article without a dateline this would also cut
        # body text preceding its first '--' -- confirm this is acceptable.
        article = re.sub('^[^-]*--', '', article).strip()

        # Remove the boilerplate 'E-mail to a friend .' (note the space before the period).
        article = article.replace('E-mail to a friend .', '')
        summary = summary.replace('NEW:', '')
        article = article.replace(
            'Copyright 2007 Reuters. All rights reserved.This material may not be published, broadcast, rewritten, '
            'or redistributed.',
            '')

        # Remove whitespace before punctuation marks in both article and summary.
        article = remove_whitespace_before_punctuations(article)
        summary = remove_whitespace_before_punctuations(summary)

        processed_data.append((f'Summary: {summary} Article: {article}', 0))

    # Return the processed list directly. Previously `data` was rebound inside
    # the loop, so an empty split returned the raw dataset object instead of [].
    return processed_data


def load_data(dataset_name):
    """
    Dispatches to the loader matching `dataset_name` and returns its result.

    Raises ValueError when the name is not one of 'pubmed_qa',
    'writingprompts' or 'cnn_dailymail'.
    """
    if dataset_name == 'cnn_dailymail':
        return load_cnn_daily_mail()
    if dataset_name == 'writingprompts':
        return load_writingPrompts()
    if dataset_name == 'pubmed_qa':
        return load_pubmed()
    raise ValueError(f"Dataset name {dataset_name} not recognized.")


def preprocess_data(dataset, min_words=250):
    """
    Loads a dataset, removes duplicate examples and normalises whitespace.

    Args:
    dataset (str): One of the names listed in DATASETS.
    min_words (int): For 'writingprompts' and 'cnn_dailymail', only texts with
    more than this many whitespace-separated words are kept. If no text passes
    the filter, the unfiltered data is returned instead. Defaults to 250.

    Returns:
    list of (text, label) tuples.

    Raises:
    ValueError: If `dataset` is not in DATASETS.
    """
    if dataset not in DATASETS:
        raise ValueError(f"Dataset name {dataset} not recognized.")

    data = load_data(dataset)
    # dict.fromkeys drops exact duplicate (text, label) pairs while keeping first-seen order.
    data = list(dict.fromkeys(data))
    data = [(strip_newlines(q).strip(), a) for q, a in data]
    if dataset == 'pubmed_qa':
        print(f"Loaded and pre-processed {len(data)} questions from the dataset")  # debug print

    # Keep only long-enough texts; fall back to everything if none qualifies.
    if dataset in ('writingprompts', 'cnn_dailymail'):
        long_data = [(x, y) for x, y in data if len(x.split()) > min_words]
        if long_data:
            data = long_data
        print(f"Loaded and pre-processed {len(data)} prompts/stories[summaries/articles] from the dataset")  # debug print

    return data


def convert_to_csv(data, dataset_name, directory='Labelled_Data'):
    """
    Saves (text, label) pairs as '<directory>/<dataset_name>_Human_data.csv'.

    Args:
    data (list of tuples): Each element is a (text, label) pair.
    dataset_name (str): Used as the prefix of the output file name.
    directory (str): Output directory; created (with parents) if missing.

    Returns:
    None. The CSV has the columns 'text' and 'label' and no index column.
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists first.
    os.makedirs(directory, exist_ok=True)

    # Convert data to DataFrame
    df = pd.DataFrame(data, columns=['text', 'label'])

    # Write DataFrame to CSV
    df.to_csv(os.path.join(directory, f'{dataset_name}_Human_data.csv'), index=False)

In [33]:
def prepare_data_for_regression(data, dataset_name):
    """
    Prepares the data for regression analysis by extracting features and labels.

    Args:
    data (list of tuples): The data from the dataset. Each element of the list is a tuple, where the first element
    is the text and the second element is its label.
    dataset_name (str): Name of the dataset `data` came from; it is passed to `remove_prefix`, which is expected to
    strip the dataset-specific prefixes from the texts (helper defined outside this cell -- confirm).

    Returns:
    feature_matrix (DataFrame): A DataFrame where each row represents a text and each column represents a feature.
    Missing values -- including perplexities that could not be computed -- are filled with 0.
    label_vector (Series): A Series where each element is the label of a text.
    """
    # Initialize lists to store features and labels
    feature_list = []
    label_list = []

    # Load the model and tokenizer (done once per call, not per text)
    model, tokenizer = load_model()

    # Remove prefixes
    texts, labels = remove_prefix(dataset_name, data)
    # Debug preview of the first text; guarded so that empty input no longer raises IndexError.
    if len(texts) > 0:
        print(texts[0])

    for text, label in zip(texts, labels):
        # Count POS tags, punctuation marks and function words in the text
        pos_counts, punctuation_counts, function_word_counts = count_pos_tags_and_special_elements(text)

        # Calculate the Flesch Reading Ease and Flesch-Kincaid Grade Level
        flesch_reading_ease, flesch_kincaid_grade_level = calculate_readability_scores(text)

        # Calculate the average word length (the helper is given a one-element list)
        avg_word_length = calculate_average_word_length([text])

        # Calculate the average sentence length (the helper is given a one-element list)
        avg_sentence_length = calculate_average_sentence_length([text])

        # Calculate the perplexity of the whole text and the mean perplexity over its sentences
        text_perplexity = calculate_perplexity(text, model, tokenizer)
        sentence_perplexities = [calculate_perplexity(sentence.text, model, tokenizer) for sentence in nlp(text).sents]
        # Sentences for which no perplexity was returned are left out of the mean.
        sentence_perplexities = [p for p in sentence_perplexities if p is not None]
        avg_sentence_perplexity = sum(sentence_perplexities) / len(
            sentence_perplexities) if sentence_perplexities else None

        # Merge all count dictionaries and scalar features into one row;
        # on a key collision the later dictionary/entry wins.
        features = {**pos_counts, **punctuation_counts, **function_word_counts,
                    'flesch_reading_ease': flesch_reading_ease,
                    'flesch_kincaid_grade_level': flesch_kincaid_grade_level,
                    'avg_word_length': avg_word_length, 'avg_sentence_length': avg_sentence_length,
                    'text_perplexity': text_perplexity, 'avg_sentence_perplexity': avg_sentence_perplexity}

        # Add the feature dictionary and the label to their respective lists
        feature_list.append(features)
        label_list.append(label)

    # Convert the list of dictionaries into a DataFrame; keys absent from a row become 0
    feature_matrix = pd.DataFrame(feature_list).fillna(0)

    # Convert the list of labels into a Series
    label_vector = pd.Series(label_list)

    return feature_matrix, label_vector

In [34]:
# Load PubMed QA: the following cell treats `data` as 'pubmed_qa', and the
# recorded output of this cell shows that dataset being loaded.
data = preprocess_data('pubmed_qa')

Found cached dataset pubmed_qa (C:/Users/atana/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924)


Loaded and pre-processed 150 questions from the dataset


In [35]:
# Build the feature matrix and label vector, treating `data` as PubMed QA texts.
feature, label = prepare_data_for_regression(data, 'pubmed_qa')

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Results depicted mitochondrial dynamics in vivo as PCD progresses within the lace plant, and highlight the correlation of this organelle with other organelles during developmental PCD. To the best of our knowledge, this is the first report of mitochondria and chloroplasts moving on transvacuolar strands to form a ring structure surrounding the nucleus during developmental PCD. Also, for the first time, we have shown the feasibility for the use of CsA in a whole plant system. Overall, our findings implicate the mitochondria as playing a critical and early role in developmentally regulated PCD in the lace plant.


In [36]:
# Preview the first rows; as the last expression of the cell the DataFrame gets
# the notebook's rich table display instead of a wrapped plain-text dump.
feature.head()

   NOUN  VERB  ADJ  ADP  SCONJ  PROPN   DET  PUNCT  CCONJ  PRON  ...    (  \
0    24  10.0   11   17    1.0    7.0  14.0      9    3.0   4.0  ...  0.0   
1     7   6.0    6    5    0.0    4.0   5.0      4    0.0   1.0  ...  0.0   
2     3   0.0    3    1    0.0    0.0   2.0      3    0.0   0.0  ...  0.0   
3    18   7.0   12    5    1.0    2.0   9.0      9    1.0   1.0  ...  1.0   
4    34  14.0   12   12    3.0    3.0   8.0     17    0.0   2.0  ...  1.0   

     )    ;  The   ):  SYM    :    A   In    X  
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
3  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 38 columns]


In [37]:
# (number of texts, number of feature columns) of the feature matrix
feature.shape

(150, 38)

In [38]:
# Length of the label vector; should equal the number of rows in `feature`
label.shape

(150,)

In [40]:
# WritingPrompts: load and clean the prompt/story pairs, then extract features.
data2 = preprocess_data('writingprompts')
feature2, label2 = prepare_data_for_regression(data2, 'writingprompts')


Loaded and pre-processed 125 prompts/stories[summaries/articles] from the dataset


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Clancy Marguerian, 154, private first class of the 150+ army, sits in his foxhole. Tired cold, wet and hungry, the only thing preventing him from laying down his rifle and walking towards the enemy lines in surrender is the knowledge that however bad he has it here, life as a 50-100 POW is surely much worse. He's fighting to keep his eyes open and his rifle ready when the mortar shells start landing near him. He hunkers lower. After a few minutes under the barrage, Marguerian hears hurried footsteps, a grunt, and a thud as a soldier leaps into the foxhole. The man's uniform is tan, he must be a 50-100. The two men snarl and grab at eachother, grappling in the small foxhole. Abruptly, their faces come together. "Clancy?" "Rob?" Rob Hall, 97, Corporal in the 50-100 army grins, as the situation turns from life or death struggle, to a meeting of two college friends. He lets go of Marguerian's collar. "Holy shit Clancy, you're the last person I expected to see here" "Yeah" "Shit man, I didn

In [41]:
# Preview the first rows; as the last expression of the cell the DataFrame gets
# the notebook's rich table display instead of a wrapped plain-text dump.
feature2.head()

   PROPN  PUNCT   NUM  ADJ  NOUN  ADP  DET  VERB  PRON  CCONJ  ...   ..  \
0   27.0  107.0  16.0   33    92   45   44    61    67     12  ...  0.0   
1   40.0  151.0   3.0   51   130   81   76    82    64     20  ...  0.0   
2    0.0   33.0   0.0   25    48   34   25    35    38      9  ...  0.0   
3   10.0   82.0   2.0   25    58   32   30    59    58     21  ...  0.0   
4    9.0   42.0   1.0   35    63   28   31    56    26     23  ...  0.0   

   .......  Herep-    ~  ......  'Get  'Tim  'And  'You   =)  
0      0.0     0.0  0.0     0.0   0.0   0.0   0.0   0.0  0.0  
1      0.0     0.0  0.0     0.0   0.0   0.0   0.0   0.0  0.0  
2      0.0     0.0  0.0     0.0   0.0   0.0   0.0   0.0  0.0  
3      0.0     0.0  0.0     0.0   0.0   0.0   0.0   0.0  0.0  
4      0.0     0.0  0.0     0.0   0.0   0.0   0.0   0.0  0.0  

[5 rows x 78 columns]


In [43]:
# CNN/Daily Mail: load and clean the summary/article pairs, then extract features.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# feature3/label3 may never have been assigned -- re-run before relying on them.
data3 = preprocess_data('cnn_dailymail')
feature3, label3 = prepare_data_for_regression(data3, 'cnn_dailymail')

Found cached dataset cnn_dailymail (C:/Users/atana/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


Loaded and pre-processed 139 prompts/stories[summaries/articles] from the dataset


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how he'll mark his landmark birt

KeyboardInterrupt: 