In [4]:
import os
import sys
import pandas as pd


In [5]:
import os, sys
import pandas as pd


In [6]:
# Raw tweet dump: no header row, latin-1 encoded, six columns per record.
DATA_PATH = "../data/raw/training.1600000.processed.noemoticon.csv"
RAW_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]

# Only the label and the text are used, so name all six columns but parse
# just those two instead of loading the whole file and discarding most of it.
df = pd.read_csv(
    DATA_PATH,
    encoding="latin-1",
    header=None,
    names=RAW_COLUMNS,
    usecols=["target", "text"],
).dropna()

# Collapse the raw target codes (0 and 4) into a 0/1 "sentiment" column.
# NOTE(review): presumably 0 = negative and 4 = positive -- confirm against the dataset docs.
df["sentiment"] = df["target"].map({0: 0, 4: 1})
# Series.map turns any unmapped code into NaN; fail loudly rather than let
# those rows silently vanish in the later groupby.
assert df["sentiment"].notna().all(), "unexpected value in 'target' column"
df = df.drop(columns=["target"])

df.shape


(1600000, 2)

In [7]:
# Balanced subsample: draw the same number of rows from each sentiment class
# with a fixed seed, then renumber the rows from zero.
by_sentiment = df.groupby("sentiment", group_keys=False)
df_small = by_sentiment.sample(n=10000, random_state=42).reset_index(drop=True)

df_small.shape, df_small["sentiment"].value_counts()


((20000, 2),
 sentiment
 0    10000
 1    10000
 Name: count, dtype: int64)

In [11]:
import os, sys

# Absolute path of the parent of the current working directory; adding it to
# sys.path is what lets `src.*` imports resolve from this notebook.
PROJECT_ROOT = os.path.abspath("..")

# Guarded append: re-running this cell must not stack duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


In [12]:
from src.preprocessing import split_data


def _to_frame(texts, labels):
    """Pair one split's texts with its labels in a two-column DataFrame."""
    return pd.DataFrame({"text": texts.values, "label": labels.values})


# split_data is a project helper; its six return values are unpacked as
# train/val/test inputs followed by train/val/test labels.
X_train, X_val, X_test, y_train, y_val, y_test = split_data(df_small)

train_df = _to_frame(X_train, y_train)
val_df = _to_frame(X_val, y_val)
test_df = _to_frame(X_test, y_test)

train_df.shape, val_df.shape, test_df.shape


((16000, 2), (2000, 2), (2000, 2))

In [13]:
from datasets import Dataset
from transformers import AutoTokenizer

# Checkpoint identifier used to load the tokenizer.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap each pandas split in a `datasets.Dataset`, in train/val/test order.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

def tokenize_fn(batch):
    """Tokenize the "text" entries of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 64 tokens, so all
    encoded rows share one length.

    Args:
        batch: mapping with a "text" key holding the raw strings.

    Returns:
        Whatever the tokenizer call returns for those strings.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, max_length=64, padding="max_length", truncation=True)
    return encoded

# Tokenize every split in batches; the raw "text" column is removed so only
# the tokenizer outputs and the label column remain.
train_ds, val_ds, test_ds = (
    split.map(tokenize_fn, batched=True, remove_columns=["text"])
    for split in (train_ds, val_ds, test_ds)
)

# Switch all three datasets to the "torch" output format.
for split in (train_ds, val_ds, test_ds):
    split.set_format("torch")




Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [14]:
import numpy as np
import evaluate
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the two `evaluate` metrics that compute_metrics reads as globals.
acc, f1 = (evaluate.load(metric_name) for metric_name in ("accuracy", "f1"))

def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into accuracy and binary F1.

    Args:
        eval_pred: two-item sequence of (logits, labels); the predicted class
            is the argmax over the last logits axis.

    Returns:
        dict with "accuracy" and "f1" entries taken from the module-level
        `acc` and `f1` metric objects.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    accuracy_result = acc.compute(predictions=predicted, references=labels)
    f1_result = f1.compute(predictions=predicted, references=labels, average="binary")

    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

from transformers import set_seed

# The classification head is not part of the checkpoint and is randomly
# initialised when the model is built, which happens before the Trainer gets a
# chance to apply its own seed. Seed all RNGs first so runs are repeatable.
SEED = 42
set_seed(SEED)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

args = TrainingArguments(
    output_dir="../outputs/distilbert_sentiment_20k",
    eval_strategy="epoch",  # run validation once at the end of every epoch
    save_strategy="no",  # no intermediate checkpoints are written
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=25,
    seed=SEED,  # same seed as above, stated explicitly
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4393,0.399856,0.8205,0.821482


TrainOutput(global_step=500, training_loss=0.4708407897949219, metrics={'train_runtime': 172.0335, 'train_samples_per_second': 93.005, 'train_steps_per_second': 2.906, 'total_flos': 264934797312000.0, 'train_loss': 0.4708407897949219, 'epoch': 1.0})

In [15]:
# Score the fine-tuned model on the test split instead of the validation split
# the Trainer was constructed with.
trainer.evaluate(eval_dataset=test_ds)




{'eval_loss': 0.41608726978302,
 'eval_accuracy': 0.8035,
 'eval_f1': 0.805541810984661,
 'eval_runtime': 5.5334,
 'eval_samples_per_second': 361.443,
 'eval_steps_per_second': 5.783,
 'epoch': 1.0}

In [16]:
# Write the model and the tokenizer files into the same directory.
best_model_dir = "../outputs/distilbert_sentiment_20k/best_model"
trainer.save_model(best_model_dir)
tokenizer.save_pretrained(best_model_dir)


('../outputs/distilbert_sentiment_20k/best_model/tokenizer_config.json',
 '../outputs/distilbert_sentiment_20k/best_model/special_tokens_map.json',
 '../outputs/distilbert_sentiment_20k/best_model/vocab.txt',
 '../outputs/distilbert_sentiment_20k/best_model/added_tokens.json',
 '../outputs/distilbert_sentiment_20k/best_model/tokenizer.json')

In [17]:
import torch
from transformers import TextClassificationPipeline

# NOTE(review): device=-1 is passed here, yet the recorded run logged
# "Device set to use mps:0" -- confirm which device is actually intended.
pipe = TextClassificationPipeline(model=trainer.model, tokenizer=tokenizer, device=-1)

# Spot-check a fixed random subset of 200 test rows through the pipeline.
sample = test_df.sample(200, random_state=42).reset_index(drop=True)
preds = pipe(sample["text"].tolist(), truncation=True, max_length=64)

# Convert the pipeline's string labels back to class ids through the model
# config. Matching on the trailing character only works for the default
# "LABEL_<i>" names with fewer than ten classes.
label2id = trainer.model.config.label2id
pred_labels = [label2id[p["label"]] for p in preds]
sample["pred"] = pred_labels
sample["correct"] = (sample["pred"] == sample["label"])

# Show the first ten misclassified rows.
sample[~sample["correct"]].head(10)


Device set to use mps:0


Unnamed: 0,text,label,pred,correct
6,@bonnie_booo it ain't gonna happen...i'v got g...,1,0,False
9,Chillin @ jareds and corys apartment. Level 32...,0,1,False
11,@RobyLa i saw this hotpink G-shock &amp; i was...,0,1,False
13,Heading to the Carpathians in about 1.5 hours....,0,1,False
15,Reading about quitting smoking. I smoke a pack...,0,1,False
16,"@tommcfly mmm Tasty, you'r making me hungry now",0,1,False
22,yeah Twitter is back to work after maintenance,1,0,False
24,In Atlanta waiting for my flight to Montreal.,0,1,False
27,@Gooshy1 So tempting to come back tonight,0,1,False
43,@rakelgerero coaching???? haha i knew you'd th...,0,1,False


In [18]:
import numpy as np
import pandas as pd

pred_out = trainer.predict(test_ds)
logits = pred_out.predictions

# Numerically stable softmax: subtracting each row's maximum leaves the result
# unchanged mathematically but keeps np.exp from overflowing on large logits.
# It also computes the exponentials once instead of twice.
shifted = logits - logits.max(axis=1, keepdims=True)
exp_shifted = np.exp(shifted)
probs = exp_shifted / exp_shifted.sum(axis=1, keepdims=True)
pred = logits.argmax(axis=1)

# One row per test example: predicted class, whether it matches the label,
# probability of class 1, and probability of the predicted class.
results = test_df.copy().reset_index(drop=True)
results["pred"] = pred
results["correct"] = (results["pred"] == results["label"])
results["prob_pos"] = probs[:, 1]
results["confidence"] = probs.max(axis=1)

results.head()




Unnamed: 0,text,label,pred,correct,prob_pos,confidence
0,@mileycyrus If there is anything I can do to h...,0,1,False,0.719917,0.719917
1,F1 teams set to breakaway from the championshi...,0,1,False,0.64276,0.64276
2,Sooo...one day on the beach and i am completel...,0,0,True,0.448236,0.551764
3,@iBANG im goin to try my best,1,1,True,0.817355,0.817355
4,@xXAudioMonkXx hey it's us.... Thought it was...,1,1,True,0.938284,0.938284


In [19]:
# Fixed random sample of 30 misclassified test rows. The boolean "correct"
# mask is negated with ~ rather than compared to False.
misclassified = results[~results["correct"]]
misclassified.sample(30, random_state=42)[["text","label","pred","confidence","prob_pos"]]


Unnamed: 0,text,label,pred,confidence,prob_pos
369,@Dancegurl91 haha it's actually summer vacatio...,1,0,0.850805,0.149195
1405,@brandiev great list! i'd add one: have a defi...,0,1,0.761142,0.761142
1268,loungin' on the deck I could do this for the r...,1,0,0.730161,0.269839
280,@weremoo me too today *sends hugs*,0,1,0.860405,0.860405
1980,@BelieverNLove they were!!,0,1,0.800119,0.800119
1000,Bustling about the office...then TWO new music...,1,0,0.929457,0.070543
229,"@aissuperbodoh Hey its noon here, and i was ta...",1,0,0.854694,0.145306
1174,"just so you guys know, im not a miley hater an...",1,0,0.795621,0.204379
685,I do believe my teaching certificate came today.,1,0,0.792456,0.207544
555,None of friends ever/want to sleepover so when...,1,0,0.543245,0.456755
