In [6]:
import datasets
import re
import pandas as pd
import os
import random

# Constants
DATASETS = ['pubmed_qa', 'writingprompts', 'cnn_dailymail']
DATA_PATH = '../data/writingPrompts'
NUM_EXAMPLES = 150
TAGS = ['[ WP ]', '[ OT ]', '[ IP ]', '[ HP ]', '[ TT ]', '[ Punch ]', '[ FF ]', '[ CW ]', '[ EU ]', '[ CC ]', '[ RF ]',
        '[ wp ]', '[ Wp ]', '[ RF ]', '[ WP/MP ]']


def strip_newlines(text):
    """
    Removes newline characters from a string.

    Args:
        text (str): Input text string.

    Returns:
        str: Text with newline characters removed.
    """
    return ' '.join(text.split())


def process_text(text, replacements):
    """
    Performs a series of replacements in a string.

    Args:
        text (str): Input text string.
        replacements (dict): Dictionary mapping old substring to new substring.

    Returns:
        str: Text with specified replacements made.
    """
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text


def remove_whitespace_before_punctuations(text):
    """
    Removes whitespace before punctuation marks in a string.

    Args:
        text (str): Input text string.

    Returns:
        str: Text with whitespace removed before punctuation marks.
    """
    return re.sub(r'\s([?.!,:;](?:\s|$))', r'\1', text)


def load_pubmed(num_examples=NUM_EXAMPLES):
    """
    Loads the PubMed QA dataset.

    Args:
        num_examples (int, optional): Number of examples to load. Defaults to NUM_EXAMPLES.

    Returns:
        list: List of tuples where each tuple is a question-answer pair and a label (always 0).
    """
    data = datasets.load_dataset('pubmed_qa', 'pqa_labeled', split=f'train[:{num_examples}]')
    data = [(f'Question: {q} Answer: {a}', 0) for q, a in zip(data['question'], data['long_answer'])]
    return data


def load_writingPrompts(data_path=DATA_PATH, num_examples=NUM_EXAMPLES):
    """
    Loads the WritingPrompts dataset.

    Args:
        data_path (str, optional): Path to the dataset. Defaults to DATA_PATH.
        num_examples (int, optional): Number of examples to load. Defaults to NUM_EXAMPLES.

    Returns:
        list: List of tuples where each tuple is a prompt-story pair and a label (always 0).
    """
    with open(f'{data_path}/valid.wp_source', 'r', encoding='utf-8') as f:
        prompts = f.readlines()[:num_examples]
    with open(f'{data_path}/valid.wp_target', 'r', encoding='utf-8') as f:
        stories = f.readlines()[:num_examples]

    prompt_replacements = {tag: '' for tag in TAGS}
    prompts = [process_text(prompt, prompt_replacements) for prompt in prompts]
    prompts = [remove_whitespace_before_punctuations(prompt) for prompt in prompts]

    story_replacements = {
        ' ,': ',',
        ' .': '.',
        ' ?': '?',
        ' !': '!',
        ' ;': ';',
        ' \'': '\'',
        ' ’ ': '\'',
        ' :': ':',
        '<newline>': '\n',
        '`` ': '"',
        ' \'\'': '"',
        '\'\'': '"',
        '.. ': '... ',
        ' )': ')',
        '( ': '(',
        ' n\'t': 'n\'t',
        ' i ': ' I ',
        ' i\'': ' I\'',
        '\\\'': '\'',
        '\n ': '\n',
    }
    stories = [process_text(story, story_replacements).strip() for story in stories]
    joined = ["Prompt:" + prompt + " Story: " + story for prompt, story in zip(prompts, stories)]
    filtered = [story for story in joined if 'nsfw' not in story.lower()]
    data = [(story, 0) for story in filtered]
    return data


def load_cnn_daily_mail(num_examples=NUM_EXAMPLES):
    """
    Loads the CNN/Daily Mail dataset.

    Args:
        num_examples (int, optional): Number of examples to load. Defaults to NUM_EXAMPLES.

    Returns:
        list: List of tuples where each tuple is a summary-article pair and a label (always 0).
    """
    data = datasets.load_dataset('cnn_dailymail', '3.0.0', split=f'train[:{num_examples}]')

    processed_data = []
    for a, s in zip(data['article'], data['highlights']):
        # remove the string and the '--' from the start of the articles
        a = re.sub('^[^-]*--', '', a).strip()

        # remove the string 'E-mail to a friend.' from the articles, if present
        a = a.replace('E-mail to a friend .', '')
        s = s.replace('NEW:', '')
        a = a.replace(
            'Copyright 2007 Reuters. All rights reserved.This material may not be published, broadcast, rewritten, '
            'or redistributed.',
            '')

        # remove whitespace before punctuation marks in both article and summary
        a = remove_whitespace_before_punctuations(a)
        s = remove_whitespace_before_punctuations(s)

        processed_data.append((f'Summary: {s} Article: {a}', 0))
        data = processed_data

    return data


def load_data(dataset_name):
    """
       Loads a dataset based on its name.

       Args:
           dataset_name (str): Name of the dataset to load.

       Returns:
           list: List of data from the specified dataset.

       Raises:
           ValueError: If the dataset_name is not recognized.
    """
    if dataset_name == 'pubmed_qa':
        return load_pubmed()
    elif dataset_name == 'writingprompts':
        return load_writingPrompts()
    elif dataset_name == 'cnn_dailymail':
        return load_cnn_daily_mail()
    else:
        raise ValueError(f"Dataset name {dataset_name} not recognized.")


def preprocess_data(dataset):
    """
        Preprocesses a dataset.

        Args:
            dataset (str): Name of the dataset to preprocess.

        Returns:
            list: List of preprocessed data from the specified dataset.

        Raises:
            ValueError: If the dataset_name is not recognized.
    """
    if dataset not in DATASETS:
        raise ValueError(f"Dataset name {dataset} not recognized.")

    data = load_data(dataset)
    data = list(dict.fromkeys(data))
    data = [(strip_newlines(q).strip(), a) for q, a in data]
    if dataset == 'pubmed_qa':
        print(f"Loaded and pre-processed {len(data)} questions from the dataset")  # debug print

    # Getting long-enough prompts, can do the same for the articles as well
    if dataset == 'writingprompts' or dataset == 'cnn_dailymail':
        long_data = [(x, y) for x, y in data if len(x.split()) > 250]
        if len(long_data) > 0:
            data = long_data
        print(f"Loaded and pre-processed {len(data)} prompts/stories[summaries/articles] from the dataset")  # debug
        # print

    return data


def convert_to_csv(data, dataset_name, directory='Labelled_Data'):
    """
        Converts the data to a DataFrame and saves it to a CSV file in the specified directory.

        Args:
            data (list): List of data to be converted to CSV.
            dataset_name (str): Name of the dataset.
            directory (str, optional): Name of the directory to save the CSV file. Defaults to 'Labelled_Data'.

        Returns:
            None
    """
    # Check if directory exists, if not, create it
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Convert data to DataFrame
    df = pd.DataFrame(data, columns=['text', 'label'])

    # Write DataFrame to CSVv
    df.to_csv(f'{directory}/{dataset_name}_Human_data.csv', index=False)


def combine_datasets(datasets=DATASETS, extract_prompts=False, directory='Labelled_Data'):
    """
    Combines data from multiple datasets into a single dataset. If specified, extracts prompts based on dataset names,
    and saves the result to a CSV file.

    Args:
        directory: Where the file will be saved
        datasets (list, optional): List of datasets to combine. Defaults to DATASETS.
        extract_prompts (bool, optional): Whether to extract prompts from the combined data. Defaults to False.

    Returns:
        None
    """
    # Initialize a list to store the combined data
    combined_data = []

    # If specified, also store the extracted prompts
    extracted_prompts = [] if extract_prompts else None

    # Load and preprocess data from each dataset
    for dataset in datasets:
        data = preprocess_data(dataset)
        combined_data.extend(data)

        # If specified, extract prompts
        if extract_prompts:
            extracted_prompts.extend(extract_prompt(data, dataset))

    # Shuffle the combined data to ensure a mix of data from all datasets
    # random.shuffle(combined_data)
    # random.shuffle(extracted_prompts) if extract_prompts else None

    # Save the combined data to a CSV file
    convert_to_csv(combined_data, 'combined')

    # If specified, save the extracted prompts to a CSV file
    if extract_prompts:
        df = pd.DataFrame(extracted_prompts, columns=['text'])
        df.to_csv(f'{directory}/prompts.csv', index=False)


def extract_prompt(data, dataset_name):
    """
    Extracts the prompts from a preprocessed dataset.

    Args:
        data (list): Preprocessed data.
        dataset_name (str): Name of the dataset the data is from.

    Returns:
        list: List of extracted prompts.
    """
    prompts = []
    if dataset_name == 'pubmed_qa':
        prompts = [text.split('Answer:')[0] + 'Answer:' for text, label in data]
    elif dataset_name == 'cnn_dailymail':
        # Split the text into article and summary, then only append the summary
        prompts = [
            'Write a news article based on the following summary: ' + text.split('Summary:')[1].split('Article:')[
                0].strip() for text, label in data]
    elif dataset_name == 'writingprompts':
        prompts = [text.replace('Prompt:', '').split('Story:')[0].strip() + ' Continue the story:' for text, label in data]
    return prompts




In [7]:
combine_datasets(extract_prompts=True)

Found cached dataset pubmed_qa (C:/Users/atana/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924)


Loaded and pre-processed 150 questions from the dataset
Loaded and pre-processed 125 prompts/stories[summaries/articles] from the dataset


Found cached dataset cnn_dailymail (C:/Users/atana/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


Loaded and pre-processed 139 prompts/stories[summaries/articles] from the dataset


In [9]:


# After combining the datasets into a single DataFrame
combined_df = pd.read_csv("Labelled_Data/prompts.csv")


# Check if combined_df contains all entries from the datasets
actual_length = len(combined_df)

print(f"Actual length: {actual_length}")



Actual length: 414


In [15]:
x= preprocess_data('pubmed_qa')

Found cached dataset pubmed_qa (C:/Users/atana/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924)


Loaded and pre-processed 150 questions from the dataset


In [17]:
print(x[2][0])

Question: Syncope during bathing in infants, a pediatric form of water-induced urticaria? Answer: "Aquagenic maladies" could be a pediatric form of the aquagenic urticaria.


In [12]:
df = pd.read_csv("Labelled_Data/prompts.csv")
prompts = df['text'].tolist()

# Prepare data for new CSV file
data = []

# For each prompt, send it to the GPT-3 model and get a response
for prompt in prompts:
    print(prompt)

Question: Do mitochondria play a role in remodelling lace plant leaves during programmed cell death? Answer:
Question: Landolt C and snellen e acuity: differences in strabismus amblyopia? Answer:
Question: Syncope during bathing in infants, a pediatric form of water-induced urticaria? Answer:
Question: Are the long-term results of the transanal pull-through equal to those of the transabdominal pull-through? Answer:
Question: Can tailored interventions increase mammography use among HMO women? Answer:
Question: Double balloon enteroscopy: is it efficacious and safe in a community setting? Answer:
Question: 30-Day and 1-year mortality in emergency general surgery laparotomies: an area of concern and need for improvement? Answer:
Question: Is adjustment for reporting heterogeneity necessary in sleep disorders? Answer:
Question: Do mutations causing low HDL-C promote increased carotid intima-media thickness? Answer:
Question: A short stay or 23-hour ward in a general and academic children'

In [13]:
import tiktoken


In [25]:
def token_count(csv_files):
    """
    Counts the number of tokens in a CSV file.

    Args:
        csv_file (str): Path to the CSV file.

    Returns:
        None
    """

    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

    for csv_file in csv_files:
        # Load prompts from CSV file
        df = pd.read_csv(csv_file)
        prompts = df['text'].tolist()

        # Initialize a counter for total tokens
        total_tokens = 0

        for prompt in prompts:
            num_tokens = len(encoding.encode(prompt))
            total_tokens += num_tokens

        print(f"File '{csv_file}' has {total_tokens} tokens.")

        # Estimate cost
        if csv_file == 'Labelled_Data/prompts.csv':
            cost = (total_tokens / 1000) * 0.003
            print(f"Estimated cost for '{csv_file}' is ${cost:.2f}")
       

In [27]:
token_count(['Labelled_Data/prompts.csv'])





File 'Labelled_Data/prompts.csv' has 16220 tokens.
Estimated cost for 'Labelled_Data/prompts.csv' is $0.05


In [35]:
data = preprocess_data('pubmed_qa')

texts, labels = zip(*data)

texts = [text.split("Answer:", 1)[1].strip() for text in texts]  # Strip the 'Answer:' prefix'
   
    
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

for text in texts:
    num_tokens = len(encoding.encode(text))
    total_tokens += num_tokens
    
average_tokens_pub = total_tokens / len(texts)
print(average_tokens_pub)
    


Found cached dataset pubmed_qa (C:/Users/atana/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924)


Loaded and pre-processed 150 questions from the dataset
215.34666666666666


150


In [58]:
def average_token_count(dataset_name,data):
    """
    Calculates the average number of tokens in the answers of the pubmed_qa dataset.

    Returns:
        float: Average number of tokens in the answers of the pubmed_qa dataset.
    """
    texts, labels = remove_prefix(dataset_name,data)
    
    print(texts[0])
   
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

    total_tokens = 0

    for text in texts:
        num_tokens = len(encoding.encode(text))
        total_tokens += num_tokens

    average_tokens = total_tokens / len(texts)
    print(len(texts))

    return average_tokens

def remove_prefix(dataset_name, data):
    """
    This function removes a predefined prefix from each text in a given dataset.

    Args:
    dataset_name (str): The name of the dataset.
    data (list of tuples): The data from the dataset. Each element of the list is a tuple, where the first element
    is the text and the second element is its label.

    Returns:
    texts (list): The list of texts after the prefix has been removed.
    labels (list): The list of labels corresponding to the texts.
    """
    texts, labels = zip(*data)

    if dataset_name == 'pubmed_qa':
        texts = [text.split("Answer:", 1)[1].strip() for text in texts]  # Strip the 'Answer:' prefix'
    elif dataset_name == 'writingprompts':
        texts = [text.split("Story:", 1)[1].strip() for text in texts]  # Stripping the 'Story: ' string
    elif dataset_name == 'cnn_dailymail':
        texts = [text.split("Article:", 1)[1].strip() for text in texts]  # Stripping the 'Article: ' string

    return texts, labels


In [59]:
data = preprocess_data('pubmed_qa')

data2 = preprocess_data('writingprompts')


data3 = preprocess_data('cnn_dailymail')


Found cached dataset pubmed_qa (C:/Users/atana/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924)


Loaded and pre-processed 150 questions from the dataset
Loaded and pre-processed 125 prompts/stories[summaries/articles] from the dataset


Found cached dataset cnn_dailymail (C:/Users/atana/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


Loaded and pre-processed 139 prompts/stories[summaries/articles] from the dataset


In [60]:
print(average_token_count('pubmed_qa',data))

Results depicted mitochondrial dynamics in vivo as PCD progresses within the lace plant, and highlight the correlation of this organelle with other organelles during developmental PCD. To the best of our knowledge, this is the first report of mitochondria and chloroplasts moving on transvacuolar strands to form a ring structure surrounding the nucleus during developmental PCD. Also, for the first time, we have shown the feasibility for the use of CsA in a whole plant system. Overall, our findings implicate the mitochondria as playing a critical and early role in developmentally regulated PCD in the lace plant.
150
53.60666666666667


In [56]:
print(average_token_count('writingprompts',data2))

Clancy Marguerian, 154, private first class of the 150+ army, sits in his foxhole. Tired cold, wet and hungry, the only thing preventing him from laying down his rifle and walking towards the enemy lines in surrender is the knowledge that however bad he has it here, life as a 50-100 POW is surely much worse. He's fighting to keep his eyes open and his rifle ready when the mortar shells start landing near him. He hunkers lower. After a few minutes under the barrage, Marguerian hears hurried footsteps, a grunt, and a thud as a soldier leaps into the foxhole. The man's uniform is tan, he must be a 50-100. The two men snarl and grab at eachother, grappling in the small foxhole. Abruptly, their faces come together. "Clancy?" "Rob?" Rob Hall, 97, Corporal in the 50-100 army grins, as the situation turns from life or death struggle, to a meeting of two college friends. He lets go of Marguerian's collar. "Holy shit Clancy, you're the last person I expected to see here" "Yeah" "Shit man, I didn

In [57]:
print(average_token_count('cnn_dailymail',data3))

Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how he'll mark his landmark birt

In [53]:
print(data[0])

('Question: Do mitochondria play a role in remodelling lace plant leaves during programmed cell death? Answer: Results depicted mitochondrial dynamics in vivo as PCD progresses within the lace plant, and highlight the correlation of this organelle with other organelles during developmental PCD. To the best of our knowledge, this is the first report of mitochondria and chloroplasts moving on transvacuolar strands to form a ring structure surrounding the nucleus during developmental PCD. Also, for the first time, we have shown the feasibility for the use of CsA in a whole plant system. Overall, our findings implicate the mitochondria as playing a critical and early role in developmentally regulated PCD in the lace plant.', 0)
