In [None]:
# NOTE(review): `!` hands this line to a separate shell command, so activating the
# virtual environment here presumably does not change the interpreter this kernel
# is already running on -- confirm, and if so select the venv as the notebook kernel.
!.\venv\Scripts\activate

In [2]:
import pandas as pd

# Read the three processed LIAR2 splits in train / val / test order.
split_paths = (
    "../data/processed/liar2_train.csv",
    "../data/processed/liar2_val.csv",
    "../data/processed/liar2_test.csv",
)
train_df, val_df, test_df = map(pd.read_csv, split_paths)

# Size of the training split, then its first rows.
print(train_df.shape)
train_df.head()


(18369, 16)


Unnamed: 0,id,label,statement,date,subject,speaker,speaker_description,state_info,true_counts,mostly_true_counts,half_true_counts,mostly_false_counts,false_counts,pants_on_fire_counts,context,justification
0,13847,5,"90 percent of Americans ""support universal bac...","October 2, 2017",government regulation;polls and public opinion...,chris abele,"Chris Abele is Milwaukee County Executive, a p...",wisconsin,1,4,5,3,5,2,a tweet,"""Universal"" is the term for background checks ..."
1,13411,1,Last year was one of the deadliest years ever ...,"May 19, 2017",after the fact;congress;criminal justice;histo...,thom tillis,Thom Tillis is a Republican who serves as U.S....,north carolina,0,2,7,3,2,0,a press release supporting the Back The Blue A...,"Sen. Thom Tillis, a North Carolina Republican,..."
2,10882,0,"Bernie Sanders's plan is ""to raise your taxes ...","October 28, 2015",taxes,chris christie,"Chris Christie announced June 6, 2023 that he ...",national,21,20,27,11,17,8,"Boulder, Colo","Christie said that Sanders’s plan is ""to raise..."
3,20697,4,Voter ID is supported by an overwhelming major...,"December 8, 2021",voter id laws,lee zeldin,Lee Zeldin is a Republican representing New Yo...,new york,1,2,0,0,0,0,a Tweet,Zeldin claimed voter identification requiremen...
4,6095,2,"Says Barack Obama ""robbed Medicare (of) $716 b...","August 12, 2012",federal budget;history;medicare;retirement,mitt romney,Mitt Romney is a U.S. senator from Utah. He ra...,national,31,33,58,35,32,19,"an interview on ""60 Minutes""","Romney said, ""There's only one president that ..."


In [3]:
# Show the column labels of the training frame.
train_df.columns


Index(['id', 'label', 'statement', 'date', 'subject', 'speaker',
       'speaker_description', 'state_info', 'true_counts',
       'mostly_true_counts', 'half_true_counts', 'mostly_false_counts',
       'false_counts', 'pants_on_fire_counts', 'context', 'justification'],
      dtype='object')

In [4]:
# Number of training rows per label id, listed in label-id order.
label_counts = train_df["label"].value_counts()
label_counts.sort_index()


label
0    2425
1    5284
2    2882
3    2967
4    2743
5    2068
Name: count, dtype: int64

In [5]:
# Label id -> readable name; the position in the list is the label id.
label_map = dict(enumerate([
    "pants_on_fire",
    "false",
    "barely_true",
    "half_true",
    "mostly_true",
    "true",
]))

# Attach the readable name next to each numeric label.
label_names = train_df["label"].map(label_map)
train_df["label_str"] = label_names
train_df[["statement", "label_str"]].head()


Unnamed: 0,statement,label_str
0,"90 percent of Americans ""support universal bac...",true
1,Last year was one of the deadliest years ever ...,false
2,"Bernie Sanders's plan is ""to raise your taxes ...",pants_on_fire
3,Voter ID is supported by an overwhelming major...,mostly_true
4,"Says Barack Obama ""robbed Medicare (of) $716 b...",barely_true


In [6]:
import re

def clean_text(text):
    """Lowercase a statement and drop every character that is not a-z, 0-9 or whitespace.

    Args:
        text: Raw statement. Values that are not ``str`` (for example the float
            NaN pandas produces for an empty CSV cell, or ``None``) are treated
            as empty text instead of raising ``AttributeError``.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); map them to an empty statement.
        return ""
    lowered = text.lower()
    # Characters are removed, not replaced, so "sanders's" becomes "sanderss".
    return re.sub(r"[^a-z0-9\s]", "", lowered)

# Clean every statement, keeping the raw text alongside for comparison.
cleaned_statements = train_df["statement"].apply(clean_text)
train_df["clean_statement"] = cleaned_statements
train_df[["statement", "clean_statement"]].head()


Unnamed: 0,statement,clean_statement
0,"90 percent of Americans ""support universal bac...",90 percent of americans support universal back...
1,Last year was one of the deadliest years ever ...,last year was one of the deadliest years ever ...
2,"Bernie Sanders's plan is ""to raise your taxes ...",bernie sanderss plan is to raise your taxes to...
3,Voter ID is supported by an overwhelming major...,voter id is supported by an overwhelming major...
4,"Says Barack Obama ""robbed Medicare (of) $716 b...",says barack obama robbed medicare of 716 billi...


___
Step 2: Tokenization and vocabulary building

In [7]:
# Whitespace-split each cleaned statement into a list of tokens.
token_lists = train_df["clean_statement"].apply(str.split)
train_df["tokens"] = token_lists
train_df["tokens"].head()


0    [90, percent, of, americans, support, universa...
1    [last, year, was, one, of, the, deadliest, yea...
2    [bernie, sanderss, plan, is, to, raise, your, ...
3    [voter, id, is, supported, by, an, overwhelmin...
4    [says, barack, obama, robbed, medicare, of, 71...
Name: tokens, dtype: object

In [8]:
from collections import Counter

# Gather every token of every training statement into one flat list.
all_tokens = []
for tokens in train_df["tokens"]:
    all_tokens.extend(tokens)

# Frequency table; iterating it yields tokens in first-seen order.
token_counts = Counter(all_tokens)

# Ids 0 and 1 are reserved for the special tokens; real tokens start at 2.
vocab = {"<PAD>": 0, "<UNK>": 1}
vocab.update((token, idx) for idx, token in enumerate(token_counts, start=2))

# Vocabulary size, special tokens included.
len(vocab)


19061

In [9]:
def encode_tokens(tokens, vocab):
    """Translate tokens into vocabulary ids.

    Args:
        tokens: Iterable of token strings.
        vocab: Mapping from token to id; must contain "<UNK>" whenever
            ``tokens`` is non-empty.

    Returns:
        List of ids, one per token; tokens absent from ``vocab`` get the
        "<UNK>" id.
    """
    ids = []
    for token in tokens:
        ids.append(vocab.get(token, vocab["<UNK>"]))
    return ids

# Encode each token list; `vocab` is forwarded to encode_tokens as a keyword argument.
encoded_ids = train_df["tokens"].apply(encode_tokens, vocab=vocab)
train_df["input_ids"] = encoded_ids
train_df[["tokens", "input_ids"]].head()


Unnamed: 0,tokens,input_ids
0,"[90, percent, of, americans, support, universa...","[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]"
1,"[last, year, was, one, of, the, deadliest, yea...","[13, 14, 15, 16, 4, 17, 18, 19, 20, 10, 21, 22..."
2,"[bernie, sanderss, plan, is, to, raise, your, ...","[24, 25, 26, 27, 28, 29, 30, 31, 28, 2, 3]"
3,"[voter, id, is, supported, by, an, overwhelmin...","[32, 33, 27, 34, 35, 36, 37, 38, 4, 39, 40, 41..."
4,"[says, barack, obama, robbed, medicare, of, 71...","[48, 49, 50, 51, 52, 4, 53, 54, 28, 55, 10, 56]"


In [10]:
import numpy as np

# Fixed sequence length: pad_sequence is applied with this value as max_len,
# so every encoded statement ends up with exactly this many ids.
MAX_LEN = 30

def pad_sequence(seq, max_len, pad_id=None):
    """Return a copy of ``seq`` that is padded or truncated to ``max_len`` items.

    Args:
        seq: List of token ids. It is not modified.
        max_len: Target length.
        pad_id: Id appended to short sequences. When ``None`` (the default)
            the notebook-level ``vocab["<PAD>"]`` is used, which is what this
            function always did before the parameter existed.

    Returns:
        A new list: ``seq`` followed by padding when it is shorter than
        ``max_len``, otherwise its first ``max_len`` items.
    """
    missing = max_len - len(seq)
    if missing <= 0:
        return seq[:max_len]
    if pad_id is None:
        # Only resolved when padding is needed, so truncation never touches the global.
        pad_id = vocab["<PAD>"]
    return seq + [pad_id] * missing

# Pad or truncate every id sequence to MAX_LEN; max_len is forwarded to pad_sequence.
padded_ids = train_df["input_ids"].apply(pad_sequence, max_len=MAX_LEN)
train_df["padded_ids"] = padded_ids


In [11]:
# Show the first padded sequence and, on the next line, its length (should be 30).
print(train_df["padded_ids"].iloc[0], len(train_df["padded_ids"].iloc[0]), sep="\n")


[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
30


___
Step 3: PyTorch Dataset and DataLoader

In [13]:
import torch
from torch.utils.data import Dataset

class Liar2Dataset(Dataset):
    """Dataset that pairs encoded statements with their integer labels.

    Args:
        inputs: Indexable collection of token-id sequences.
        labels: Indexable collection of integer labels, one per entry of
            ``inputs``.

    Raises:
        ValueError: If ``inputs`` and ``labels`` differ in length. Without this
            check the mismatch would only surface later as an ``IndexError``
            during iteration, or surplus labels would be ignored silently.
    """

    def __init__(self, inputs, labels):
        if len(inputs) != len(labels):
            raise ValueError(
                f"inputs and labels must have the same length, "
                f"got {len(inputs)} and {len(labels)}"
            )
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        """Number of (input, label) pairs."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as two ``torch.long`` tensors."""
        input_tensor = torch.tensor(self.inputs[idx], dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        return input_tensor, label_tensor


In [14]:
from torch.utils.data import DataLoader

# Use padded_ids and label
train_dataset = Liar2Dataset(train_df["padded_ids"].tolist(), train_df["label"].tolist())

# Seed the shuffle so the batch order is the same on every Restart & Run All;
# an unseeded shuffle=True yields a different order on each kernel start.
SHUFFLE_SEED = 42
shuffle_generator = torch.Generator().manual_seed(SHUFFLE_SEED)

train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    generator=shuffle_generator,
)


In [15]:
# Peek at the first batch only and report the shapes of its two tensors.
for batch_inputs, batch_labels in train_loader:
    shape_report = (
        ("Batch inputs shape:", batch_inputs.shape),
        ("Batch labels shape:", batch_labels.shape),
    )
    for caption, shape in shape_report:
        print(caption, shape)
    break


Batch inputs shape: torch.Size([32, 30])
Batch labels shape: torch.Size([32])


___
Step 4
