<a href="https://colab.research.google.com/github/udayPatil45/LS_Intro_to_ML_NLP/blob/main/sentiment_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ASSIGNMENT 3

In [2]:
import os
import pandas as pd
import numpy as np
from datasets import Dataset, load_metric
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import evaluate
import torch

In [3]:
# -------------------- Step 1: Load Dataset from .parquet --------------------
# Load all 3 splits
train_df = pd.read_parquet("/content/train-00000-of-00001.parquet")
val_df = pd.read_parquet("/content/validation-00000-of-00001.parquet")
test_df = pd.read_parquet("/content/test-00000-of-00001.parquet")

# Convert to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

train_df.head(5)

Unnamed: 0,id,verse_text,label
0,0,with pale blue berries. in these peaceful shad...,1
1,1,"it flows so long as falls the rain,",2
2,2,"and that is why, the lonesome day,",0
3,3,"when i peruse the conquered fame of heroes, an...",3
4,4,of inward strife for truth and liberty.,3


In [4]:
# -------------------- Step 2: Tokenization --------------------
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Identify the correct text column
print("Train dataset columns:", train_dataset.column_names)

# Replace 'text' with the actual column name (likely 'content')
def tokenize_function(example):
    return tokenizer(
        example["verse_text"],  # <-- change if column name is different
        padding="max_length",
        truncation=True,
        max_length=128
    )

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
columns_to_remove = ["id", "verse_text"]
if "__index_level_0__" in train_dataset.column_names:
    columns_to_remove.append("__index_level_0__")

tokenized_train = tokenized_train.remove_columns(columns_to_remove)
tokenized_val = tokenized_val.remove_columns(columns_to_remove)
tokenized_test = tokenized_test.remove_columns(columns_to_remove)

# Set PyTorch format
tokenized_train.set_format("torch")
tokenized_val.set_format("torch")
tokenized_test.set_format("torch")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Train dataset columns: ['id', 'verse_text', 'label']


Map:   0%|          | 0/892 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

In [5]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# -------------------- Step 4: Define Metrics --------------------
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    }

In [7]:
# -------------------- Step 5: Training Configuration --------------------
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
    save_total_limit=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

In [8]:
!pip install -U transformers
import transformers
print(transformers.__version__)


4.53.0


In [9]:
# -------------------- Step 6: Train the Model --------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)



In [10]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

In [12]:
# -------------------- Step 7: Evaluate on Test Set --------------------
metrics = trainer.evaluate(tokenized_test)
print("Test set metrics:", metrics)


[34m[1mwandb[0m: Currently logged in as: [33mudayanrajepatil45[0m ([33mudayanrajepatil45-iit-bombay[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Test set metrics: {'eval_loss': 0.9107636213302612, 'eval_model_preparation_time': 0.0062, 'eval_accuracy': 0.6538461538461539, 'eval_f1': 0.5276653171390013, 'eval_runtime': 49.8901, 'eval_samples_per_second': 2.085, 'eval_steps_per_second': 0.261}


In [13]:
# -------------------- Step 8: Save Model --------------------
model.save_pretrained("poem_sentiment_bert")
tokenizer.save_pretrained("poem_sentiment_bert")


('poem_sentiment_bert/tokenizer_config.json',
 'poem_sentiment_bert/special_tokens_map.json',
 'poem_sentiment_bert/vocab.txt',
 'poem_sentiment_bert/added_tokens.json')

In [14]:
# -------------------- Step 9: Inference Example --------------------
from transformers import pipeline

# Load pipeline from saved model
sentiment_pipeline = pipeline("sentiment-analysis", model="poem_sentiment_bert", tokenizer="poem_sentiment_bert")

# Try on a new sentence
example_text = "This poem is absolutely beautiful and uplifting."
print(sentiment_pipeline(example_text))

Device set to use cpu


[{'label': 'LABEL_2', 'score': 0.5556082725524902}]
