## **Overview**
This project uses **DistilBERT**, a smaller, faster, and lighter version of BERT, for detecting emotions from text. The model is fine-tuned on a text dataset and deployed as an interactive web application using **Streamlit**.


## **Dataset:** [dair-ai/emotion](https://huggingface.co/datasets/dair-ai/emotion)

## **Importing Libraries**

In [None]:
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets
!pip install -U bertviz
!pip install -U umap-learn

In [None]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report
import torch

In [None]:
import pandas as pd
from datasets import load_dataset

# Load the "split" configuration of the dair-ai/emotion dataset.
ds = load_dataset(path="dair-ai/emotion", name="split")

In [None]:
# Inspect the loaded dataset object (its 'train', 'validation' and 'test'
# entries are accessed in the cells below).
ds

In [None]:
# Switch the dataset's output format to pandas so that slicing a split
# (e.g. ds['train'][:]) can be explored as a DataFrame below.
ds.set_format(type='pandas')

## **Training Data**

In [None]:
# Materialise the whole training split and preview its first rows.
df = ds['train'][:]
df.head()

In [None]:
# Summary of the training split.
ds['train']

In [None]:
# Feature (column) schema of the training split.
ds['train'].features

In [None]:
# Class names attached to the 'label' feature; indexed by the integer label
# to decode it into an emotion name.
classes = ds['train'].features['label'].names
classes

In [None]:
# Decode every integer label into its emotion name via the `classes` lookup.
df['label_name'] = df['label'].map(lambda label_id: classes[label_id])

In [None]:
# Preview the training frame again, now with the decoded 'label_name' column.
df.head()

## **Validation and Test Data**

In [None]:
# Materialise the validation split and preview its first rows.
df_valid = ds['validation'][:]
df_valid.head()

In [None]:
# (rows, columns) of the validation split.
df_valid.shape

In [None]:
# Materialise the test split and preview its first rows.
df_test = ds['test'][:]
df_test.head()

In [None]:
# (rows, columns) of the test split.
df_test.shape

## **Dataset Analysis**

In [None]:
# Column dtypes and non-null counts of the training frame.
df.info()

In [None]:
# Number of training examples per emotion, sorted from rarest to most frequent.
df['label_name'].value_counts(ascending = True)

In [None]:
# Share of each emotion in the training set, in percent (ascending order).
df['label_name'].value_counts(ascending=True).div(len(df)).mul(100)

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the class frequencies, sorted in ascending order.
label_counts = df['label_name'].value_counts(ascending=True)
ax = label_counts.plot.barh()
ax.set_title('Frequency of Classes')
plt.show()

In [None]:
# Split each text on whitespace and store the number of resulting words.
words_per_text = df['text'].str.split().map(len)
df['Words Per Tweet'] = words_per_text

In [None]:
# Preview the training frame with the added 'Words Per Tweet' column.
df.head()

In [None]:
# Word count of the longest training text.
df['Words Per Tweet'].max()

In [None]:
# Distribution of text length (in words) for each emotion.
df.boxplot(column="Words Per Tweet", by='label_name')

## **Text to Tokens Conversion**

In [None]:
from transformers import AutoTokenizer
# Checkpoint name shared by the tokenizer and the models loaded later on.
model_ckpt = "distilbert-base-uncased"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Vocabulary size and maximum model input length reported by the tokenizer.
tokenizer.vocab_size, tokenizer.model_max_length

## **Tokenization of the Emotion Data**

In [None]:
# Undo the pandas output format set earlier, before the splits are tokenized.
ds.reset_format()

In [None]:
# Inspect the dataset object again after resetting its output format.
ds

In [None]:
def tokenize(batch):
  """Tokenize the ``'text'`` entries of ``batch`` with the notebook-level ``tokenizer``.

  Padding and truncation are both switched on. The tokenizer's output is
  returned unchanged.
  """
  texts = batch['text']
  return tokenizer(texts, padding=True, truncation=True)

# Quick check: tokenize the first five training examples and print the result.
print(tokenize(ds["train"][:5]))

In [None]:
# Apply `tokenize` to every split in batched mode.
# NOTE(review): per the `datasets` documentation, batch_size=None hands each
# split to `tokenize` as one single batch, so padding is computed per split —
# confirm against the installed version.
emotions_encoded = ds.map(tokenize, batched=True, batch_size=None)

In [None]:
# Inspect the dataset after tokenization.
emotions_encoded

In [None]:
# Longest tokenized input (number of input ids) in each split of emotions_encoded.
def _longest_input(split):
  """Return the largest ``input_ids`` length found among the rows of ``split``."""
  return max(len(row['input_ids']) for row in split)

max_length_train = _longest_input(emotions_encoded['train'])
max_length_validation = _longest_input(emotions_encoded['validation'])
max_length_test = _longest_input(emotions_encoded['test'])

In [None]:
# Maximum tokenized length for the train, validation and test splits.
max_length_train , max_length_validation , max_length_test

## **Model Building**

In [None]:
# Checkpoint name the models below are loaded from.
model_ckpt

In [None]:
from transformers import AutoModel
import torch

# Load the pre-trained checkpoint with the generic AutoModel class so its
# architecture can be printed in the next cell. `model` is re-assigned to the
# sequence-classification model in the fine-tuning section.
model = AutoModel.from_pretrained(model_ckpt)

In [None]:
# Print the architecture of the loaded model.
model

## **Fine-Tuning DistilBERT**

In [None]:
from transformers import AutoModelForSequenceClassification

# One output class per emotion.
num_labels = len(classes)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the checkpoint with a sequence-classification head, then move it to `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)

In [None]:
# Device selected for training ("cuda" when available, else "cpu").
device

In [None]:
from transformers import TrainingArguments

In [None]:
batch_size = 64
model_name = "distilbert-finetuned-emotion"

# Hyper-parameters of the fine-tuning run; `model_name` is used as the output directory.
# NOTE: no evaluation strategy and no run name are configured here — the
# original `evaluation_strategy="epoch"` and `run_name=...` arguments were
# left commented out.
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    disable_tqdm=False,
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
  """Compute accuracy and weighted F1 for a prediction object.

  ``pred.predictions`` is reduced with an argmax over its last axis to obtain
  predicted class ids, which are scored against ``pred.label_ids``.

  Returns:
    A dict with the keys ``"accuracy"`` and ``"f1"``.
  """
  y_true = pred.label_ids
  y_hat = pred.predictions.argmax(-1)
  return {
      "accuracy": accuracy_score(y_true, y_hat),
      "f1": f1_score(y_true, y_hat, average='weighted'),
  }

In [None]:
from transformers import Trainer

import inspect

# Recent transformers releases take the tokenizer through `processing_class`
# (the `tokenizer` keyword is deprecated there); older releases only know
# `tokenizer`. Because this notebook installs the latest transformers, pick
# whichever keyword the installed Trainer actually accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Wire model, hyper-parameters, metric function and the encoded train/validation
# splits into a Trainer.
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=emotions_encoded['train'],
                  eval_dataset=emotions_encoded['validation'],
                  **{_tokenizer_kwarg: tokenizer})

In [None]:
!pip install wandb -qU
import wandb
wandb.login()
# Before calling trainer.train()
wandb.init(project="my-emotion-finetuning-run", name="distilbert-finetuned-emotion")

# Now call the train method
trainer.train()

In [None]:
# Run the fine-tuned model on the encoded test split and show the resulting metrics.
preds_outputs = trainer.predict(test_dataset=emotions_encoded['test'])
preds_outputs.metrics

## **Saving the Model**

In [None]:
# Write the fine-tuned model to a local directory so it can be reloaded below.
trainer.save_model("./distilbert-finetuned-emotion-saved")

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reload the fine-tuned model and tokenizer from the saved directory.
# NOTE: this rebinds the notebook-level `model` and `tokenizer` names, so every
# later cell works with these reloaded objects rather than the ones used for training.
model_path = "./distilbert-finetuned-emotion-saved"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Re-save the model in safetensors format, split into shards of at most 100MB,
# together with the tokenizer.
# NOTE(review): no precision change (e.g. an fp16/half conversion) is applied
# anywhere in this cell, so this step does not by itself reduce the model size.
model.save_pretrained(model_path, max_shard_size="100MB", safe_serialization=True)
tokenizer.save_pretrained(model_path)

## **Loading the Model**

In [None]:
# Load the saved model and tokenizer back from `model_path` to confirm that
# the directory on disk is loadable.
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_path)
loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)

# Print the architecture of the reloaded model.
print(loaded_model)

## **Classification Report**

In [None]:
import numpy as np
# Predicted class id per test example: index of the largest score in each row.
y_preds = preds_outputs.predictions.argmax(axis=1)

# Ground-truth labels of the test split.
test_rows = emotions_encoded['test'][:]
y_true = test_rows['label']

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the test split.
# `target_names` labels each row with its emotion name instead of a bare
# integer id (so the class list no longer needs to be printed separately), and
# `labels` pins the rows to the class indices so the names stay aligned even
# if a class is absent from the test labels or predictions.
print(classification_report(y_true, y_preds,
                            labels=list(range(len(classes))),
                            target_names=classes))

In [None]:
# Training-set class frequencies (computed in the Dataset Analysis section),
# shown next to the report for reference.
label_counts

## **Prediction on Single Text**

In [None]:
# Device that was selected before training.
device

In [None]:
text = 'i want to kill you'
input_encoded = tokenizer(text, return_tensors='pt')
# Move the inputs to whichever device the model's weights are on. A hard-coded
# 'cpu' only works when `model` happens to be the CPU-reloaded copy and fails
# when the model is still on the GPU.
input_encoded = input_encoded.to(model.device)

# Make sure dropout is disabled, and skip gradient tracking for inference.
model.eval()
with torch.no_grad():
  outputs = model(**input_encoded)

# Highest-scoring class id for the single input, plus its emotion name.
logits = outputs.logits
pred = torch.argmax(logits, dim=1).item()
pred, classes[pred]

In [None]:
# Raw model output for the single-text prediction above (contains the logits).
outputs