# Data Preprocessing

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from transformers import DistilBertTokenizer, DistilBertModel, Trainer, TrainingArguments
from datasets import Dataset

import torch
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

device

'cpu'

In [None]:
PATH = "../data/truthseeker.csv"
df = pd.read_csv(PATH)
# Temporarily scale down dataset to ensure pipeline is functional
# df = df.sample(frac=0.001, random_state=27)
df.drop(columns=["Unnamed: 0"], inplace=True)
df = df[~df["5_label_majority_answer"].isin(["NO MAJORITY", "Unrelated"])]
df.head()

Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,3_label_majority_answer
58262,April Hunt,"Unlike marijuana, medical cannabis oil cannot ...",True,1.0,"medical canabis, cannot, high",@G19106 @PhillyInquirer You cannot open a medi...,Agree,Agree
40146,Louis Jacobson,"""There are more words in the IRS code than the...",True,1.0,"More words, IRS code, Bible","""There are more words in the IRS code than the...",Mostly Disagree,Disagree
77627,Ciara O'Rourke,In 38 days Pelosi and Schiff are up for reelec...,False,0.0,"Pelosi, Schiff, reelection",All of the players in this coup need to be inv...,Agree,Agree
103400,Jon Greenberg,21% of people are having serious adverse event...,False,0.0,"21%,adverse,moderna",@LauraM_AskMD Not hypothesis\n\nJudge forced F...,Disagree,Disagree
52117,Tom Kertscher,Says Donald Trump won Arizona.,False,0.0,trump Arizona win,@MeghanMcCain Meghan you have been removed fro...,Agree,Agree


In [27]:
df["3_label_majority_answer"].unique()

array(['Agree', 'Disagree'], dtype=object)

In [None]:
df = df.drop(columns=["3_label_majority_answer"])
df.rename(columns={"5_label_majority_answer": "majority_answer"}, inplace=True)

def compute_truthfulness(row):
    if row["BinaryNumTarget"] == 1:  # Statement is True
        if row["majority_answer"] in ["Agree"]:
            return "True"
        elif row["majority_answer"] in ["Mostly Agree"]:
            return "Mostly True"
        elif row["majority_answer"] in ["Disagree"]:
            return "False"
        elif row["majority_answer"] in ["Mostly Disagree"]:
            return "Mostly False"
    elif row["BinaryNumTarget"] == 0:  # Statement is False
        if row["majority_answer"] in ["Agree"]:
            return "False"
        elif row["majority_answer"] in ["Mostly Agree"]:
            return "Mostly False"
        elif row["majority_answer"] in ["Disagree"]:
            return "True"
        elif row["majority_answer"] in ["Mostly Disagree"]:
            return "Mostly True"

df["credibility_value"] = df.apply(compute_truthfulness, axis=1)

df["labels"] = df["credibility_value"].astype("category").cat.codes
df.head()


Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,majority_answer,agreement_value,labels
58262,April Hunt,"Unlike marijuana, medical cannabis oil cannot ...",True,1.0,"medical canabis, cannot, high",@G19106 @PhillyInquirer You cannot open a medi...,Agree,True,1
40146,Louis Jacobson,"""There are more words in the IRS code than the...",True,1.0,"More words, IRS code, Bible","""There are more words in the IRS code than the...",Disagree,False,0
77627,Ciara O'Rourke,In 38 days Pelosi and Schiff are up for reelec...,False,0.0,"Pelosi, Schiff, reelection",All of the players in this coup need to be inv...,Agree,False,0
103400,Jon Greenberg,21% of people are having serious adverse event...,False,0.0,"21%,adverse,moderna",@LauraM_AskMD Not hypothesis\n\nJudge forced F...,Disagree,True,1
52117,Tom Kertscher,Says Donald Trump won Arizona.,False,0.0,trump Arizona win,@MeghanMcCain Meghan you have been removed fro...,Agree,False,0


In [None]:
statements = df['statement'].unique()

# Split data into train/test (80/20 split)
train_statements,  test_statements = train_test_split(statements, test_size=0.2, random_state=27)

train_df = df[df['statement'].isin(train_statements)]
test_df = df[df['statement'].isin(test_statements)]

train_df.head()

Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,majority_answer,agreement_value,labels
58262,April Hunt,"Unlike marijuana, medical cannabis oil cannot ...",True,1.0,"medical canabis, cannot, high",@G19106 @PhillyInquirer You cannot open a medi...,Agree,True,1
40146,Louis Jacobson,"""There are more words in the IRS code than the...",True,1.0,"More words, IRS code, Bible","""There are more words in the IRS code than the...",Disagree,False,0
77627,Ciara O'Rourke,In 38 days Pelosi and Schiff are up for reelec...,False,0.0,"Pelosi, Schiff, reelection",All of the players in this coup need to be inv...,Agree,False,0
103400,Jon Greenberg,21% of people are having serious adverse event...,False,0.0,"21%,adverse,moderna",@LauraM_AskMD Not hypothesis\n\nJudge forced F...,Disagree,True,1
52117,Tom Kertscher,Says Donald Trump won Arizona.,False,0.0,trump Arizona win,@MeghanMcCain Meghan you have been removed fro...,Agree,False,0


In [None]:
# Convert train and test DataFrames into Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# Load DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the tweet text
def tokenize_function(examples):
    # TODO: Include statement as well?
    return tokenizer(examples['tweet'], padding="max_length", truncation=True)

# Define compute_metrics function to calculate accuracy
def compute_metrics(p):
    preds = p.predictions.argmax(axis=1)  # Get predicted labels
    labels = p.label_ids  # Get true labels
    acc = accuracy_score(labels, preds)  # Compute accuracy
    return {"accuracy": acc}

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

In [None]:
# Define a custom model class that adds a classification head for binary classification
class DistilBertClassifier(torch.nn.Module):
    def __init__(self):
        super(DistilBertClassifier, self).__init__()
        self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased", max_length=410, num_labels=4)
        self.classifier = torch.nn.Linear(self.distilbert.config.hidden_size, 1)
    
    def forward(self, input_ids, attention_mask, labels=None):
        # Get hidden states from DistilBERT
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state
        # Use the [CLS] token's embedding for classification
        pooled_output = hidden_state[:, 0]  # First token is the [CLS] token
        logits = self.classifier(pooled_output)
        
        if labels is not None:
            # Binary crossentropy loss
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1), labels.view(-1).float())  # Ensure labels are float for BCE loss
            return loss, logits
        else:
            return logits


# Load DistilBERT model
model = DistilBertClassifier()
model.cuda()
model

DistilBertClassifier(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (li

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results-4class',          # output directory
    fp16=True,
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=32,   # batch size for training
    per_device_eval_batch_size=32,    # batch size for evaluation
    warmup_steps=100,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs-4class',            # directory for storing logs
    logging_steps=10,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Initialize Trainer
trainer = Trainer(
    model=model,                         # the model to be trained
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    tokenizer=tokenizer,                 # tokenizer
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

  trainer3 = Trainer(


  0%|          | 0/42 [00:00<?, ?it/s]

{'loss': 0.7118, 'grad_norm': 2.99511981010437, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.71}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.7011153697967529, 'eval_accuracy': 0.36363636363636365, 'eval_runtime': 15.2791, 'eval_samples_per_second': 1.44, 'eval_steps_per_second': 0.262, 'epoch': 1.0}
{'loss': 0.6987, 'grad_norm': 3.0389857292175293, 'learning_rate': 2.0000000000000003e-06, 'epoch': 1.43}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.6983593106269836, 'eval_accuracy': 0.36363636363636365, 'eval_runtime': 15.0149, 'eval_samples_per_second': 1.465, 'eval_steps_per_second': 0.266, 'epoch': 2.0}
{'loss': 0.6997, 'grad_norm': 2.785733699798584, 'learning_rate': 3e-06, 'epoch': 2.14}
{'loss': 0.6856, 'grad_norm': 1.3125437498092651, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.86}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.6904094815254211, 'eval_accuracy': 0.36363636363636365, 'eval_runtime': 15.1363, 'eval_samples_per_second': 1.453, 'eval_steps_per_second': 0.264, 'epoch': 3.0}
{'train_runtime': 464.4177, 'train_samples_per_second': 0.53, 'train_steps_per_second': 0.09, 'train_loss': 0.697171443984622, 'epoch': 3.0}


TrainOutput(global_step=42, training_loss=0.697171443984622, metrics={'train_runtime': 464.4177, 'train_samples_per_second': 0.53, 'train_steps_per_second': 0.09, 'total_flos': 0.0, 'train_loss': 0.697171443984622, 'epoch': 3.0})

In [None]:
# Evaluate the model
results = trainer.evaluate()
results

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

({'eval_loss': 0.6812520623207092,
  'eval_accuracy': 0.36363636363636365,
  'eval_runtime': 15.0546,
  'eval_samples_per_second': 1.461,
  'eval_steps_per_second': 0.266,
  'epoch': 3.0},
 {'eval_loss': 0.6812520623207092,
  'eval_accuracy': 0.36363636363636365,
  'eval_runtime': 15.1231,
  'eval_samples_per_second': 1.455,
  'eval_steps_per_second': 0.264,
  'epoch': 3.0})