In [7]:
# Vijay Venkatesan
# CS 6120 (Natural Language Processing)

In [164]:
# libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from tqdm import tqdm

## Part 1

In [21]:
# Read in each of the csv files and store it in a dataframe
# Add a column to each dataframe called "RealNews?"
# Concatenate both dataframes into a new dataframe called df

df_real = pd.read_csv('True.csv')
df_real['RealNews?'] = True
df_fake = pd.read_csv('Fake.csv')
df_fake['RealNews?'] = False
df = pd.concat([df_real, df_fake], ignore_index = True)

In [23]:
# Creation of a new column called 'document' which combines the content in the 'title' and 'text' columns

df['document'] = df[['title', 'text']].agg(' '.join, axis = 1)

In [25]:
# Normalization of the data in the 'document' column by lowercasing the text which helps reduce the feature space

df['document'] = df['document'].apply(lambda x : x.lower())

In [21]:
# The dataset contains examples of both fake news/real news where the label associated with each example indicates whether it is real or fake

In [51]:
# Split the dataset into train, test, and validation sets using an 80/10/10 split

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_validation, df_test = train_test_split(df_test, test_size=0.5, random_state=42)

In [63]:
# function to prepare the first n examples from df_train into a prompt that will be returned to the caller
# The returned prompt will be used for few-shot training

def prepare(n):
    df_train_first_n_rows = df_train.iloc[0:n]
    
    prepared_text = []
    
    for _, row in df_train_first_n_rows.iterrows():
        text = row['document']
        label = str(row['RealNews?'])
        concatenated_text = " => ".join([text, label])
        prepared_text.append(concatenated_text)

    return "\n".join(prepared_text)

In [65]:
# Calling the prepare function with a small value of n (to verify it works properly) 

print(prepare(2))

hillary lies: remember when hillary disclosed she was named after a famous person? this story is from 2006 and is the first in a long series of hillary s lies that we ll be exposing over the next couple of months for more than a decade, one piece of senator hillary rodham clinton s informal biography has been that she was named for sir edmund hillary, the conqueror of mount everest. the story was even recounted in bill clinton s autobiography.but yesterday, mrs. clinton s campaign said she was not named for sir edmund after all. it was a sweet family story her mother shared to inspire greatness in her daughter, to great results i might add,  said jennifer hanley, a spokeswoman for the campaign.in may 1953, sir edmund and his sherpa guide, tenzing norgay, became the first men to reach the summit of mount everest. in 1995, shortly after meeting sir edmund, mrs. clinton said that her mother, dorothy rodham, had long told her she was named for the famous mountaineer.  it had two l s, which

In [83]:
# function to append a row from validation to the prompt and return the complete prompt to the caller

def generate_prompt(n, validation_row):
    prompt = prepare(n)
    text = validation_row['document'].iloc[0]
    text = " => ".join([text, ''])
    return "\n".join([prompt, text])

In [85]:
# Calling the generate_prompt function to ensure it works properly

n = 2
validation_row = df_validation.sample(n=1)

prompt = generate_prompt(n, validation_row)

print(prompt)

hillary lies: remember when hillary disclosed she was named after a famous person? this story is from 2006 and is the first in a long series of hillary s lies that we ll be exposing over the next couple of months for more than a decade, one piece of senator hillary rodham clinton s informal biography has been that she was named for sir edmund hillary, the conqueror of mount everest. the story was even recounted in bill clinton s autobiography.but yesterday, mrs. clinton s campaign said she was not named for sir edmund after all. it was a sweet family story her mother shared to inspire greatness in her daughter, to great results i might add,  said jennifer hanley, a spokeswoman for the campaign.in may 1953, sir edmund and his sherpa guide, tenzing norgay, became the first men to reach the summit of mount everest. in 1995, shortly after meeting sir edmund, mrs. clinton said that her mother, dorothy rodham, had long told her she was named for the famous mountaineer.  it had two l s, which

In [89]:
# Loading the pretrained model and its corresponding tokenizer

model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [117]:
# function which passes the prompt to the pretrained model, gpt2, and returns the corresponding prediction

def generate_label(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    outputs = model(**inputs)
    
    logits = outputs.logits
    last_token_logits = logits[:, -1, :]
    
    true_token_id = tokenizer.encode("True", add_special_tokens=False)[0]
    false_token_id = tokenizer.encode("False", add_special_tokens=False)[0]
    
    true_logit = last_token_logits[0, true_token_id].item()
    false_logit = last_token_logits[0, false_token_id].item()
    
    predicted_label = "True" if true_logit >= false_logit else "False"
    return predicted_label
    

In [119]:
# calling the generate_label function to ensure it works properly

generate_label(prompt)

'True'

In [174]:
# function to compute the F1 score on df_test
# Compares the predicted label to the ground truth (gold label)

def score(n, df_test):
    
    predicted_labels = []
    
    for index, row in tqdm(df_test.iterrows(), total=len(df_test), desc="Processing Rows"):
        row = row.to_frame().T
        
        prompt = generate_prompt(n, row)
        predicted_label = generate_label(prompt)
        predicted_labels.append(predicted_label)

    y_true = df_test['RealNews?'].astype("string").tolist()
    y_pred = predicted_labels

    f1_result = f1_score(y_true, y_pred, pos_label="True")

    return f1_result

In [185]:
# Using a subset of the rows from df_validation to use in the score function

df_validation_sample = df_validation.sample(n=100, random_state=42)

In [187]:
# Calculating the F1 score for n rangining between 1 and 5 inclusive to analyze differences in performance

for n in range(1, 6):
    f1_result = score(n, df_validation_sample)
    print(f"F1 score : {f1_result} when n equals {n} on df_validation_sample")
    
    

Processing Rows: 100%|██████████| 100/100 [01:07<00:00,  1.48it/s]


F1 score : 0.47863247863247865 when n equals 1 on df_validation_sample


Processing Rows: 100%|██████████| 100/100 [01:14<00:00,  1.34it/s]


F1 score : 0.5588235294117647 when n equals 2 on df_validation_sample


Processing Rows: 100%|██████████| 100/100 [01:25<00:00,  1.17it/s]


F1 score : 0.6111111111111112 when n equals 3 on df_validation_sample


Processing Rows: 100%|██████████| 100/100 [01:30<00:00,  1.10it/s]


F1 score : 0.6111111111111112 when n equals 4 on df_validation_sample


Processing Rows: 100%|██████████| 100/100 [01:27<00:00,  1.14it/s]

F1 score : 0.6111111111111112 when n equals 5 on df_validation_sample





In [189]:
# Using a subset of the rows from df_test to use in the score function

df_test_sample = df_test.sample(n=100, random_state=42)

In [191]:
# Calling the score function on df_test_sample

n = 3

f1_result = score(n, df_test_sample)
print(f"F1 score : {f1_result} when n equals {n} on df_test_sample") 

Processing Rows: 100%|██████████| 100/100 [01:26<00:00,  1.16it/s]

F1 score : 0.6754966887417219 when n equals 3 on df_test_sample





## Part 2