# Import necessary libraries

## Drive + pip install

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [11]:
file_pathh = "/content/drive/My Drive/Colab Notebooks/DL final"
data_path = "/content/drive/My Drive/Colab Notebooks/DL final/Data"

In [3]:
!pip install datasets evaluate
!pip install accelerate -U

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6

## Other libraries

In [20]:
import torch
from transformers import TrainerCallback
import matplotlib.pyplot as plt
import json
import pandas as pd
from datasets import Dataset,  DatasetDict
from sklearn.model_selection import train_test_split

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Config

In [6]:
max_token_length = 128
epochs = 2
sample_used = 10
testing = True

batch_size = 64
learning_rate = 2e-5
weight_decay = 1e-3

# Data preprocessing

In [59]:
df = pd.read_csv(data_path+"/data.csv")

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115719 entries, 0 to 115718
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   comment    115719 non-null  object
 1   sentiment  115719 non-null  object
dtypes: object(2)
memory usage: 1.8+ MB


In [61]:
df["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
neutral,38573
positive,38573
negative,38573


In [62]:
samples_per_label = 1000
balanced_df = (
    df.groupby("sentiment", group_keys=False)
    .apply(lambda x: x.sample(n=samples_per_label, random_state=42))
)

print(balanced_df["sentiment"].value_counts())

sentiment
negative    1000
neutral     1000
positive    1000
Name: count, dtype: int64


  .apply(lambda x: x.sample(n=samples_per_label, random_state=42))


In [63]:
balanced_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3000 entries, 60937 to 109561
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   comment    3000 non-null   object
 1   sentiment  3000 non-null   object
dtypes: object(2)
memory usage: 70.3+ KB


In [64]:
df = balanced_df

In [65]:
df = df.rename(columns={"comment": "sentence", "sentiment": "label"})

label_mapping = {"positive": 2, "neutral": 1, "negative": 0}
df["label"] = df["label"].map(label_mapping)

temp_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
train_df, val_df = train_test_split(temp_df, test_size=0.2, random_state=42, stratify=temp_df['label'])

train_dataset = Dataset.from_pandas(train_df)
validation_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

In [66]:
dataset = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset,
})

In [67]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', '__index_level_0__'],
        num_rows: 1920
    })
    validation: Dataset({
        features: ['sentence', 'label', '__index_level_0__'],
        num_rows: 480
    })
    test: Dataset({
        features: ['sentence', 'label', '__index_level_0__'],
        num_rows: 600
    })
})


In [68]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [69]:
# Tokenization function with truncation and padding
def tokenize_function(examples):
    return tokenizer(
        examples["sentence"],
        padding="max_length",  # Pad sequences to the model's max length
        truncation=True,       # Truncate sequences longer than max length
        max_length=128,        # Set max length (adjust based on your model)
        return_tensors="pt"    # Return PyTorch tensors
    )

# Apply tokenization to the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1920 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [71]:
tokenized_train = dataset["train"].map(preprocess_function, batched=True)
tokenized_val = dataset["validation"].map(preprocess_function, batched=True)
tokenized_test = dataset["test"].map(preprocess_function, batched=True)

Map:   0%|          | 0/1920 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [70]:
tokenized_train

Dataset({
    features: ['sentence', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 74060
})

In [72]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length", max_length=max_token_length)

# Model

In [73]:
id2label = {0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"}
label2id = {"NEGATIVE": 0, "NEUTRAL": 1, "POSITIVE": 2}

In [74]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",  # Use the correct model path or name
    num_labels=3,              # Set the number of labels to 3
    id2label=id2label,
    label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Metrics

In [75]:
import evaluate

accuracy = evaluate.load("accuracy")

In [76]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

# Train

In [78]:
from transformers import TrainingArguments, Trainer
loss_history = LossHistoryCallback()
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    report_to="wandb",  # Optional: Log results to W&B
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [79]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.66493
2,No log,0.707197
3,0.609500,0.862121


TrainOutput(global_step=720, training_loss=0.48985797034369577, metrics={'train_runtime': 4565.0165, 'train_samples_per_second': 1.262, 'train_steps_per_second': 0.158, 'total_flos': 190756455874560.0, 'train_loss': 0.48985797034369577, 'epoch': 3.0})

# Final evaluation

In [82]:
predictions, labels, metrics = trainer.predict(tokenized_val)

In [83]:
data_dict = compute_metrics((predictions, labels))

In [84]:
data_dict_new = trainer.evaluate()
for key, item in data_dict_new.items():
    data_dict[key] = item

with open("./eval/final_eval.json", "w") as file:
    json.dump(data_dict, file)

data_dict

{'accuracy': 0.7270833333333333,
 'eval_loss': 0.862121045589447,
 'eval_runtime': 100.5482,
 'eval_samples_per_second': 4.774,
 'eval_steps_per_second': 0.597,
 'epoch': 3.0}

# Inference

In [85]:
text = "fucking bitch"

In [86]:
from transformers import pipeline

model_name = "stevhliu/my_awesome_model"

def inference_from_pipeline(text, model_name):
    classifier = pipeline("sentiment-analysis", model=model_name)
    return classifier(text)

def inference_directly(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt")

    # Ensure the inputs are on the correct device
    inputs = {k: v.to(trainer.model.device) for k, v in inputs.items()}

    # Generate predictions
    with torch.no_grad():
        outputs = trainer.model(**inputs)
        logits = outputs.logits

    predictions = np.argmax(logits, axis=1)
    pred_class = trainer.model.config.id2label[int(predictions)]

    #decoded_preds = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return pred_class, logits


In [87]:
inference_directly(text)

('NEGATIVE', tensor([[ 3.6345, -2.4733, -2.3136]]))

In [90]:
pred_classes = np.argmax(predictions, axis=1)


In [91]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

precision = precision_score(labels, pred_classes, average="weighted")
recall = recall_score(labels, pred_classes, average="weighted")
f1 = f1_score(labels, pred_classes, average="weighted")

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Precision: 0.7287279632849136
Recall: 0.7270833333333333
F1 Score: 0.7266440636145937


In [92]:
report = classification_report(labels, pred_classes, target_names=list(trainer.model.config.id2label.values()))
print(report)


              precision    recall  f1-score   support

    NEGATIVE       0.82      0.76      0.79       160
     NEUTRAL       0.66      0.62      0.64       160
    POSITIVE       0.71      0.79      0.75       160

    accuracy                           0.73       480
   macro avg       0.73      0.73      0.73       480
weighted avg       0.73      0.73      0.73       480

