### **Assignment 1: Text Analysis**



In [1]:
# Install & import libraries
!pip install -q transformers datasets accelerate scikit-learn pandas matplotlib seaborn joblib

In [2]:
# Import Python libraries
import json
from pathlib import Path
import pandas as pd

# Step 1: Load datasets
def load_jsonl_to_df(path):
    """Read a JSON-lines file into a DataFrame, one row per non-blank line.

    Args:
        path: Location of a UTF-8 text file holding one JSON document per line.

    Returns:
        A pandas DataFrame built from the parsed documents, in file order.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        # Strip each raw line first so whitespace-only lines are skipped too.
        stripped_lines = (raw.strip() for raw in handle)
        records = [json.loads(entry) for entry in stripped_lines if entry]
    return pd.DataFrame(records)

# Locations of the manually uploaded JSON-lines files
train_path = "/content/z639_assignment1_training.json"
test_path  = "/content/z639_assignment1_test.json"

# Read both splits (training first, then test) into pandas DataFrames
train_df, test_df = (load_jsonl_to_df(p) for p in (train_path, test_path))

# Sanity-check the row counts, then preview the first training rows
print("Train rows:", len(train_df), "Test rows:", len(test_df))
train_df.head()


Train rows: 4000 Test rows: 500


Unnamed: 0,text,parent_comment,article_title,article_url,platform,platform_id,composite_toxic
0,"WTF, y'all never made MRE fart balloons in the...",,Triangular UFO hovers over California military...,https://www.dailymail.co.uk/news/article-12112...,reddit,jlcm021,"[[False, 74], [True, 323], [False, 1028], [Fal..."
1,No apologies !! McCall has balls ! Ccp is not...,,China sentences elderly US citizen to life in ...,https://www.cnn.com/2023/05/15/china/china-jai...,youtube,Ugws8gNW7eJyE9VHeM14AaABAg,"[[False, 216], [False, 197], [False, 1039], [F..."
2,What ever you need to tell yourself to sleep a...,I wonder how many undercover agents will be go...,Jan. 6 defendant who put foot on desk in Pelos...,https://www.cbsnews.com/news/richard-barnett-j...,youtube,UgxHlqwNcVssLHUr4yF4AaABAg.9q7kOunSlu-9q7lHH4he6S,"[[True, 192], [True, 193], [True, 260], [True,..."
3,@exZACKly @CBSNews Fuck off Nazi,@NCmylo @CBSNews Lol. Stop choosing to be an ...,19-year-old Missouri man arrested in U-Haul cr...,https://www.cbsnews.com/news/u-haul-crash-lafa...,twitter,1661025155047637000,"[[True, 92], [False, 218], [True, 69], [True, ..."
4,Texas is a republican sponsored killing ground...,,At Least 8 Killed After Driver Plows Car Into ...,https://www.nytimes.com/2023/05/07/us/car-pede...,youtube,UgwpAfn9RIV0cHfhp4R4AaABAg,"[[False, 56], [True, 207], [False, 218], [Fals..."


In [3]:
# Process labels (majority vote)
def majority_vote(composite, min_votes=3):
    """Collapse several annotator votes into a single boolean toxicity label.

    Args:
        composite: Iterable of two-element entries whose first element is the
            vote (``True``/``False`` or the strings ``"true"``/``"false"`` in
            any casing); the second element is ignored. ``None`` or a float
            NaN (how pandas represents a missing cell) means "no annotations".
        min_votes: Number of positive votes required for a positive label.
            The default of 3 is a strict majority of 5 annotations.

    Returns:
        ``True`` when at least ``min_votes`` entries voted toxic, else ``False``.
        Missing annotations yield ``False``.
    """
    import math

    # A missing cell may arrive as None or as float NaN; iterating NaN would raise.
    if composite is None or (isinstance(composite, float) and math.isnan(composite)):
        return False
    # Compare on the lower-cased string form so booleans and strings are both accepted.
    true_count = sum(1 for lab, _ in composite if str(lab).lower() == 'true')
    return true_count >= min_votes

# Reduce each row's annotator votes to one boolean, then store it in both forms
toxic_flags = train_df['composite_toxic'].apply(majority_vote)
train_df['label'] = toxic_flags
train_df['label_int'] = toxic_flags.astype(int)  # 1 = toxic (true), 0 = not toxic

# Make sure every comment is a string (missing text becomes '')
train_df['text'] = train_df['text'].fillna('').astype(str)

# Class balance, followed by a reproducible random sample of labelled comments
print(train_df['label'].value_counts())
train_df[['text', 'label']].sample(5, random_state=42)


label
False    2974
True     1026
Name: count, dtype: int64


Unnamed: 0,text,label
555,@realTuckFrumper Let him rot. I hope someone p...,True
3491,Bro he lied about so much shit he did this to ...,False
527,2 cops were shot.,False
3925,@guardian .@guardian Putin is a nobody. Garbag...,False
2989,@IvryGrrl88 @kkdumez You're going to need a ci...,False


In [4]:
# Text cleaning
import re
def clean_text(s):
    """Strip URLs and normalise whitespace in a piece of text.

    Args:
        s: The raw string to clean.

    Returns:
        ``s`` with every ``http...`` token removed, each run of whitespace
        replaced by a single space, and no leading or trailing whitespace.
    """
    # URLs go first: removing them after collapsing whitespace would leave a
    # double space (or a trailing space) where the URL used to be.
    s = re.sub(r'http\S+', '', s)            # remove URLs
    s = re.sub(r'\s+', ' ', s)               # collapse whitespace
    return s.strip()


In [5]:
# Combine article_title, parent_comment, text
def combine_text(row):
    """Join a row's article title, parent comment and comment text into one string.

    Args:
        row: Mapping-like object supporting ``.get`` (a dict or a pandas
            Series) with optional keys ``article_title``, ``parent_comment``
            and ``text``.

    Returns:
        The present, non-empty string fields in that order, separated by
        ``" [SEP] "``. Returns ``""`` when none of them is usable.
    """
    parts = []
    for field in ("article_title", "parent_comment", "text"):
        value = row.get(field)
        # Keep only non-empty strings. A missing value stored as NaN is truthy,
        # so a plain truthiness check would let it through and break the join.
        if isinstance(value, str) and value:
            parts.append(value)
    return " [SEP] ".join(parts)

# For each split: build the combined text column, then its cleaned counterpart
for frame in (train_df, test_df):
    frame['full_text'] = frame.apply(combine_text, axis=1)
    frame['full_text_clean'] = frame['full_text'].apply(clean_text)


In [6]:
# Baseline TF-IDF + Logistic Regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix
import joblib
import numpy as np

# Stratified 80/20 split so both parts keep the same class ratio
train, val = train_test_split(
    train_df,
    test_size=0.2,
    stratify=train_df['label_int'],
    random_state=42,
)
y_train, y_val = train['label_int'], val['label_int']

# TF-IDF over unigrams and bigrams; the vocabulary is fitted on the training part only
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=5, max_features=50000)
X_train = vectorizer.fit_transform(train['full_text_clean'])
X_val = vectorizer.transform(val['full_text_clean'])

# Logistic regression with class weights balancing the two classes
lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
lr.fit(X_train, y_train)

# Score the held-out part: hard predictions plus probability of the positive class
y_val_pred = lr.predict(X_val)
y_val_prob = lr.predict_proba(X_val)[:,1]
acc = accuracy_score(y_val, y_val_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_val, y_val_pred, average='binary', pos_label=1)
roc = roc_auc_score(y_val, y_val_prob)

print("Baseline LR — Acc {:.4f} Prec {:.4f} Rec {:.4f} F1 {:.4f} ROC {:.4f}".format(acc, prec, rec, f1, roc))
print("Confusion matrix:\n", confusion_matrix(y_val, y_val_pred))

# Persist the fitted vectorizer and classifier
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
joblib.dump(lr, "baseline_lr.joblib")


Baseline LR — Acc 0.7575 Prec 0.5242 Rec 0.5805 F1 0.5509 ROC 0.7554
Confusion matrix:
 [[487 108]
 [ 86 119]]


['baseline_lr.joblib']

In [7]:
!pip install -U transformers datasets accelerate



In [8]:
# Prepare Hugging Face Dataset & Tokenizer

from transformers import AutoTokenizer
from datasets import Dataset
import torch

# Pretrained tokenizer matching the BERT base (uncased) checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_batch(batch):
    """Tokenize the ``text`` field of a batch, padding/truncating to 128 tokens."""
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(batch["text"], **tokenizer_options)

def _as_hf_dataset(frame):
    """Return the text/label_int columns of a DataFrame as a Dataset with a `label` column."""
    return Dataset.from_pandas(
        frame[['text', 'label_int']].rename(columns={'label_int': 'label'})
    )

# Wrap the train / validation splits as Hugging Face Datasets
hf_train = _as_hf_dataset(train)
hf_val = _as_hf_dataset(val)

# Tokenize both datasets in batches
hf_train = hf_train.map(tokenize_batch, batched=True)
hf_val = hf_val.map(tokenize_batch, batched=True)

# Expose only the model inputs and the label, as torch tensors
cols = ['input_ids', 'attention_mask', 'label']
for split in (hf_train, hf_val):
    split.set_format(type='torch', columns=cols)

# Batch the datasets; only the training batches are shuffled
from torch.utils.data import DataLoader

train_loader = DataLoader(hf_train, batch_size=16, shuffle=True)
val_loader = DataLoader(hf_val, batch_size=16)

print("✅ Tokenization complete. DataLoaders ready!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

✅ Tokenization complete. DataLoaders ready!


In [9]:
# Load BERT model & prepare for training
!pip install -q transformers torch datasets scikit-learn

import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Seed PyTorch before the model is created so the newly initialised classifier
# head, the shuffling of training batches and dropout all start from a fixed
# state. Uses the same value as the random_state of the split and the baseline.
# NOTE: some GPU kernels are non-deterministic, so results may still vary slightly.
torch.manual_seed(42)

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Detect the correct text column name: the first candidate present in hf_train wins.
# The `else` branch of the loop only runs when no candidate matched (no `break`).
possible_text_cols = ["text", "text_clean", "comment", "content"]
for c in possible_text_cols:
    if c in hf_train.column_names:
        TEXT_COL = c
        break
else:
    raise KeyError("No valid text column found in hf_train!")

print(f"✅ Using text column: {TEXT_COL}")

# Convert Hugging Face dataset to regular Python list of dicts (text + label only);
# tokenization happens per batch in collate_fn below.
train_data = hf_train.to_dict()
val_data   = hf_val.to_dict()

train_examples = [{TEXT_COL: train_data[TEXT_COL][i], "label": train_data["label"][i]} for i in range(len(train_data[TEXT_COL]))]
val_examples   = [{TEXT_COL: val_data[TEXT_COL][i], "label": val_data["label"][i]} for i in range(len(val_data[TEXT_COL]))]

# Tokenization function for DataLoader
def collate_fn(batch):
    """Turn a list of {TEXT_COL, "label"} dicts into a model-ready batch.

    Texts are tokenized together, padded to the longest sequence in the batch
    and truncated to at most 128 tokens; the labels are attached under the
    ``labels`` key so the model call also returns a loss.
    """
    texts = [x[TEXT_COL] for x in batch]
    labels = torch.tensor([x["label"] for x in batch])
    encodings = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=128
    )
    encodings["labels"] = labels
    return encodings

# Prepare DataLoaders (only the training batches are shuffled)
train_loader = DataLoader(train_examples, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_examples, batch_size=16, shuffle=False, collate_fn=collate_fn)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop for fine-tuning BERT
epochs = 2
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
    # Mean training loss over the batches of this epoch
    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}")

# Evaluate on the validation split (no gradients, dropout disabled via eval mode)
model.eval()
preds, true_labels = [], []
with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1).cpu().numpy()
        preds.extend(predictions)
        true_labels.extend(batch["labels"].cpu().numpy())

accuracy = accuracy_score(true_labels, preds)
f1 = f1_score(true_labels, preds)
print(f"\n✅ Validation Accuracy: {accuracy:.4f}")
print(f"✅ F1 Score: {f1:.4f}")

# Save fine-tuned model locally
model.save_pretrained("./bert_finetuned_toxic_manual")
tokenizer.save_pretrained("./bert_finetuned_toxic_manual")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Using text column: text


Epoch 1/2: 100%|██████████| 200/200 [00:56<00:00,  3.54it/s]


Epoch 1 Loss: 0.5127


Epoch 2/2: 100%|██████████| 200/200 [00:56<00:00,  3.51it/s]


Epoch 2 Loss: 0.3642

✅ Validation Accuracy: 0.8075
✅ F1 Score: 0.6131


('./bert_finetuned_toxic_manual/tokenizer_config.json',
 './bert_finetuned_toxic_manual/special_tokens_map.json',
 './bert_finetuned_toxic_manual/vocab.txt',
 './bert_finetuned_toxic_manual/added_tokens.json',
 './bert_finetuned_toxic_manual/tokenizer.json')

In [10]:
import torch
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from tqdm import tqdm

# Reload the test set directly from the uploaded JSON-lines file
test_df = pd.read_json("/content/z639_assignment1_test.json", lines=True)

# Prefer the "text" column; otherwise fall back to the first column of the frame
if "text" in test_df.columns:
    TEXT_COL = "text"
else:
    TEXT_COL = test_df.columns[0]
print(f"✅ Using text column: {TEXT_COL}")

# Fresh tokenizer; `model` is the fine-tuned network left in memory by the training cell
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model.eval()

def collate_fn(batch):
    """Tokenize the text of a list of record dicts into padded tensors (at most 128 tokens)."""
    return tokenizer(
        [record[TEXT_COL] for record in batch],
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=128,
    )

# One dict per test row, served in the original order
test_data = test_df.to_dict(orient="records")
test_loader = DataLoader(test_data, batch_size=16, shuffle=False, collate_fn=collate_fn)

# Run the model on GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

all_preds = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        preds = outputs.logits.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())

# Attach the predicted class ids and write everything to CSV
test_df["predicted_label"] = all_preds
test_df.to_csv("/content/bert_test_predictions.csv", index=False)

print("\n🎯 Predictions complete! Saved to /content/bert_test_predictions.csv")
test_df.head()


✅ Using text column: text


Predicting: 100%|██████████| 32/32 [00:03<00:00,  9.80it/s]



🎯 Predictions complete! Saved to /content/bert_test_predictions.csv


Unnamed: 0,text,parent_comment,article_title,article_url,platform,platform_id,predicted_label
0,Ukrainian Bullshit.,Russian Propaganda,Kremlin drone: Zelensky denies Ukraine attacke...,https://www.bbc.com/news/world-europe-65471904,youtube,UgxjV6HRpnD6FUmw8aV4AaABAg.9pH-CgX5yEH9pH7BMIfAz5,1
1,@LibDems No one likes you.\nYou denied democra...,,"UK economy shrank 0.3% in March, ONS figures show",https://news.sky.com/story/uk-economy-shrank-0...,twitter,1657052099564150784,1
2,@EPurpera @BBCNews POS terrorist dictator Putr...,@BBCNews They should make peace talk.,Ukraine war: Kyiv hit by new massive Russian d...,https://www.bbc.com/news/world-65736730,twitter,1662672469205958656,1
3,@howardfineman @darkblue714 Bullshit. CNN set ...,,Opinion | Why CNN's Trump town hall was always...,https://www.msnbc.com/opinion/msnbc-opinion/cn...,twitter,1656508255454019587,0
4,"The war will be won by who ""wins"" the race bet...",What is the pope gonna do? Pray and throw a co...,Zelenskyy to meet with Pope Francis at Vatican...,https://apnews.com/article/zelenskyy-italy-vis...,reddit,jk1pm1m,0


In [13]:
# ---------------------------
# Test set predictions (True/False) - Fixed
# ---------------------------
import pandas as pd
from torch.utils.data import DataLoader
from tqdm import tqdm
from google.colab import files

# Load test dataset
test_df = pd.read_json("/content/z639_assignment1_test.json", lines=True)

# Build the model input exactly the way the training input was built.
# The BERT model in memory was fine-tuned on the raw `text` column only (with
# missing values replaced by ''), not on article_title / parent_comment / text
# joined together. Feeding the combined string here would give the model inputs
# that look different from anything it saw during training.
# NOTE(review): to use the title / parent-comment context, fine-tune the model on
# the combined text first and then build the same combined text here.
test_texts = test_df['text'].fillna('').astype(str).tolist()

# Tokenize the whole test set once (padded to the longest text, at most 128 tokens)
encodings = tokenizer(
    test_texts,
    padding=True,
    truncation=True,
    return_tensors="pt",
    max_length=128
)

# Create test DataLoader: it batches row indices, and collate_fn slices the
# pre-tokenized tensors with those indices.
test_loader = DataLoader(
    list(range(len(test_texts))),
    batch_size=16,
    shuffle=False,
    collate_fn=lambda idx: {k: v[idx] for k, v in encodings.items()}
)

# Run predictions
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

all_preds = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())

# Convert numeric predictions to True/False strings (class 1 = toxic)
test_df["prediction"] = ["true" if p == 1 else "false" for p in all_preds]

# Prepare submission DataFrame
submission_df = test_df[["platform_id", "prediction"]]

# Define output path
output_path = "/content/Vedika-Assignment1-Prediction.csv"

# Save CSV
submission_df.to_csv(output_path, index=False)

# Download in Colab
files.download(output_path)

print(f"\n🎯 Predictions complete! Saved to: {output_path}")


Predicting: 100%|██████████| 32/32 [00:03<00:00,  8.81it/s]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🎯 Predictions complete! Saved to: /content/Vedika-Assignment1-Prediction.csv
