# "Truth"seeker Project Recreation

In [29]:
# Load the autoreload extension
%load_ext autoreload

# Set autoreload to automatically reload all modules
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
import sys
from pathlib import Path

# Add the src directory (relative to the current working directory) to sys.path.
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate entries on sys.path.
SRC_DIR = str(Path().resolve() / "src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Import util
import util

In [31]:
import pandas as pd
from torch import cuda
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Use the GPU when torch reports CUDA as available; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Echo the selected device as the cell output.
device

'cpu'

In [32]:
DATA_PATH = "../data/truthseeker.csv"
# Fraction of rows to keep. This small sample should run quickly on the CPU
# during development; set to None to use the whole file for the actual execution.
SAMPLE_FRAC = 0.0001

df = pd.read_csv(DATA_PATH)
if SAMPLE_FRAC is not None:
    df = df.sample(frac=SAMPLE_FRAC, random_state=27)

# Drop the "Unnamed: 0" column, then remove rows whose 5-label majority answer
# is "NO MAJORITY" or "Unrelated".
df = df.drop(columns=["Unnamed: 0"])
keep_rows = ~df["5_label_majority_answer"].isin(["NO MAJORITY", "Unrelated"])
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered slice of the sampled frame.
df = df[keep_rows].copy()

# Clean the text columns element by element. Series.map is used because
# DataFrame.applymap is deprecated in recent pandas releases.
clean_cols = ["statement", "tweet"]
for col in clean_cols:
    df[col] = df[col].map(util.clean_text)

print(df.shape)
df.head()

(10, 8)


  df[clean_cols] = df[clean_cols].applymap(util.clean_text)


Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,3_label_majority_answer
58262,April Hunt,Unlike marijuana medical cannabis oil cannot g...,True,1.0,"medical canabis, cannot, high",You cannot open a medical cannabis dispensary...,Agree,Agree
40146,Louis Jacobson,There are more words in the IRS code than the...,True,1.0,"More words, IRS code, Bible",There are more words in the IRS code than the...,Mostly Disagree,Disagree
77627,Ciara O'Rourke,In 38 days Pelosi and Schiff are up for reelec...,False,0.0,"Pelosi, Schiff, reelection",All of the players in this coup need to be inv...,Agree,Agree
103400,Jon Greenberg,21 of people are having serious adverse events...,False,0.0,"21%,adverse,moderna",_AskMD Not hypothesis Judge forced FDA to FOI...,Disagree,Disagree
52117,Tom Kertscher,Says Donald Trump won Arizona.,False,0.0,trump Arizona win,Meghan you have been removed from the Republi...,Agree,Agree


In [33]:
# Build the training text for each row: the "ground truth" value first,
# followed by the statement and then the tweet.
truth_part = df['target'].astype(str)
statement_part = ' Statement: ' + df['statement']
tweet_part = '| Tweet: ' + df['tweet']

df['full'] = truth_part + statement_part + tweet_part
df['full']

58262     True Statement: Unlike marijuana medical canna...
40146     True Statement:  There are more words in the I...
77627     False Statement: In 38 days Pelosi and Schiff ...
103400    False Statement: 21 of people are having serio...
52117     False Statement: Says Donald Trump won Arizona...
6117      True Statement: Murders this year have spiked ...
59604     True Statement: More people are apprehended af...
1211      True Statement: In Afghanistan over 100 billio...
55551     False Statement:  Records suggest more than 10...
69439     True Statement:  Texas has the highest rate of...
Name: full, dtype: object

In [34]:
# Label column and target value passed to util.compute_consensus.
# The 3-label variant is active; for the 5-label variant use
# CATEGORY = "5_label_majority_answer" with TARGET = 4.
CATEGORY = "3_label_majority_answer"
TARGET = 2

# Row-wise consensus: each row is handed to util.compute_consensus together
# with CATEGORY and TARGET.
df["consensus"] = df.apply(
    lambda row: util.compute_consensus(row, CATEGORY, TARGET),
    axis=1,
)

# Integer label per row, taken from the pandas category codes of CATEGORY.
category_values = df[CATEGORY].astype("category")
df["labels"] = category_values.cat.codes
df.head(3)

Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,3_label_majority_answer,full,consensus,labels
58262,April Hunt,Unlike marijuana medical cannabis oil cannot g...,True,1.0,"medical canabis, cannot, high",You cannot open a medical cannabis dispensary...,Agree,Agree,True Statement: Unlike marijuana medical canna...,True,0
40146,Louis Jacobson,There are more words in the IRS code than the...,True,1.0,"More words, IRS code, Bible",There are more words in the IRS code than the...,Mostly Disagree,Disagree,True Statement: There are more words in the I...,False,1
77627,Ciara O'Rourke,In 38 days Pelosi and Schiff are up for reelec...,False,0.0,"Pelosi, Schiff, reelection",All of the players in this coup need to be inv...,Agree,Agree,False Statement: In 38 days Pelosi and Schiff ...,False,0


In [35]:
# Split on the unique statements rather than on rows, so all rows that share a
# statement end up on the same side of the split.
statements = df['statement'].unique()

# 80/20 train/test split of the statements with a fixed seed.
train_statements, test_statements = train_test_split(
    statements,
    test_size=0.2,
    random_state=27,
)

# Select the rows belonging to each group of statements.
in_train = df['statement'].isin(train_statements)
in_test = df['statement'].isin(test_statements)
train_df = df[in_train]
test_df = df[in_test]

train_df.head(3)

Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,3_label_majority_answer,full,consensus,labels
58262,April Hunt,Unlike marijuana medical cannabis oil cannot g...,True,1.0,"medical canabis, cannot, high",You cannot open a medical cannabis dispensary...,Agree,Agree,True Statement: Unlike marijuana medical canna...,True,0
40146,Louis Jacobson,There are more words in the IRS code than the...,True,1.0,"More words, IRS code, Bible",There are more words in the IRS code than the...,Mostly Disagree,Disagree,True Statement: There are more words in the I...,False,1
103400,Jon Greenberg,21 of people are having serious adverse events...,False,0.0,"21%,adverse,moderna",_AskMD Not hypothesis Judge forced FDA to FOI...,Disagree,Disagree,False Statement: 21 of people are having serio...,True,1


In [36]:
# Turn each DataFrame into a Hugging Face Dataset and tokenize it in batched
# mode with util.tokenize_function.
train_dataset = Dataset.from_pandas(train_df).map(util.tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(util.tokenize_function, batched=True)

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [None]:
# Output name handed to util.train. "2_class" is active; "4_class" is the
# alternative setting.
# NOTE(review): presumably "2_class" pairs with TARGET = 2 and "4_class" with
# TARGET = 4 -- confirm against util.train.
OUT = "2_class"
util.train(
    train_dataset,
    test_dataset,
    OUT,
)