In [None]:
!pip install transformers datasets accelerate bitsandbytes peft trl wandb

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [None]:
from google.colab import files

# Opens the Colab file picker so kaggle.json (the Kaggle API token) can be uploaded.
# The result is bound to a name on purpose: `files.upload()` returns the uploaded
# file contents, and leaving the call as the cell's last expression would echo
# the API key into the notebook output.
uploaded_files = files.upload()

In [None]:
# Move the uploaded kaggle.json into ~/.kaggle (created if missing) so the
# `kaggle` command used in the next cell can find it.
# NOTE(review): `mv` fails if this cell is re-run after the file has already been moved.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d pashupatigupta/emotion-detection-from-text
!unzip emotion-detection-from-text.zip


Dataset URL: https://www.kaggle.com/datasets/pashupatigupta/emotion-detection-from-text
License(s): CC0-1.0
Downloading emotion-detection-from-text.zip to /content
100% 1.56M/1.56M [00:01<00:00, 1.72MB/s]
100% 1.56M/1.56M [00:01<00:00, 1.56MB/s]
Archive:  emotion-detection-from-text.zip
  inflating: tweet_emotions.csv      


In [None]:
from datasets import load_dataset

# Read the extracted CSV as a DatasetDict with a single "train" split.
csv_files = {"train": "/content/tweet_emotions.csv"}
dataset = load_dataset("csv", data_files=csv_files)

# Show the inferred column names and dtypes.
train_features = dataset["train"].features
print(train_features)

Generating train split: 0 examples [00:00, ? examples/s]

{'tweet_id': Value(dtype='int64', id=None), 'sentiment': Value(dtype='string', id=None), 'content': Value(dtype='string', id=None)}


In [None]:
# Drop the identifier column and switch to the `text` / `label` column names
# that the rest of the notebook reads.
dataset = dataset.remove_columns(["tweet_id"]).rename_columns(
    {"content": "text", "sentiment": "label"}
)

In [None]:
# Spot-check one training row after the column clean-up.
dataset["train"][2]

{'label': 'sadness', 'text': 'Funeral ceremony...gloomy friday...'}

In [None]:
from datasets import ClassLabel

# Distinct emotion names, alphabetically ordered so the ids are stable across runs.
unique_labels = sorted(set(dataset["train"]["label"]))

class_label = ClassLabel(num_classes=len(unique_labels), names=unique_labels)

# Lookup tables in both directions between emotion names and integer ids.
id2label = dict(enumerate(unique_labels))
label2id = {name: idx for idx, name in id2label.items()}

print("Label to ID mapping:", label2id)
print("ID to Label mapping:", id2label)

Label to ID mapping: {'anger': 0, 'boredom': 1, 'empty': 2, 'enthusiasm': 3, 'fun': 4, 'happiness': 5, 'hate': 6, 'love': 7, 'neutral': 8, 'relief': 9, 'sadness': 10, 'surprise': 11, 'worry': 12}
ID to Label mapping: {0: 'anger', 1: 'boredom', 2: 'empty', 3: 'enthusiasm', 4: 'fun', 5: 'happiness', 6: 'hate', 7: 'love', 8: 'neutral', 9: 'relief', 10: 'sadness', 11: 'surprise', 12: 'worry'}


In [None]:
def _encode_label(example):
    """Replace the string emotion name in ``label`` with its integer class id."""
    return {"label": class_label.str2int(example["label"])}


dataset = dataset.map(_encode_label)


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

In [None]:
!pip install transformers datasets evaluate torch

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, BitsAndBytesConfig
from datasets import load_dataset
import evaluate

In [None]:
# Use a small transformer model for classification
model_name = "distilbert-base-uncased"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize one batch of rows.

    Args:
        examples: Batch dict supplied by ``Dataset.map(batched=True)``; its
            ``"text"`` entry holds the strings to encode.

    Returns:
        The tokenizer output for the batch, truncated to the model's maximum
        length and left unpadded.
    """
    # Padding is deliberately not done here. Padding every tweet to the model
    # maximum up front makes each training batch as long as the longest possible
    # input. The Trainer in this notebook is given the tokenizer, so batches are
    # padded at collation time to the longest sequence they actually contain.
    return tokenizer(examples["text"], truncation=True)


# Apply tokenization to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

In [None]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the split, and
# every number computed from it, reproducible across kernel restarts.
train_test_split = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

In [None]:
# Confirm a CUDA GPU is visible to PyTorch before loading the model.
torch.cuda.is_available()

True

In [None]:
from peft import LoraConfig, get_peft_model

use_cuda = torch.cuda.is_available()
num_labels = len(unique_labels)

# The 4-bit quantization config is only built when CUDA is present; without a
# GPU the model is loaded unquantized (quantization_config=None).
bnb_config = None
if use_cuda:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  # Enables 4-bit quantization
        bnb_4bit_compute_dtype=torch.float16,  # Uses 16-bit precision for computations
        bnb_4bit_use_double_quant=True,  # Uses double quantization for better efficiency
        # Keep the freshly initialised classification head out of 4-bit packing
        # so its weights stay ordinary floating-point tensors that can be trained.
        llm_int8_skip_modules=["pre_classifier", "classifier"],
    )

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    # Store the emotion names in the model config so the saved checkpoint
    # reports readable labels instead of LABEL_0 ... LABEL_n.
    id2label=id2label,
    label2id=label2id,
    quantization_config=bnb_config,
)
# No explicit `.to("cuda")`: a bitsandbytes-quantized model is placed on the GPU
# while it is loaded, and some transformers versions reject `.to()` on such
# models. The unquantized CPU path has nothing to move.


`low_cpu_mem_usage` was None, now default to True since model is quantized.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
lora_config = LoraConfig(
    r=8,  # Rank of LoRA matrix (smaller values = less memory)
    lora_alpha=16,  # Scaling factor for LoRA updates
    lora_dropout=0.1,  # Dropout for regularization
    target_modules=["q_lin", "v_lin"],  # Apply LoRA to attention layers
)
qlora_model = get_peft_model(model, lora_config)

# get_peft_model leaves only the LoRA matrices trainable, so the randomly
# initialised classification head would stay frozen and the model could not
# learn the emotion classes. Re-enable gradients for the head weights that are
# regular floating-point tensors (4-bit packed weights cannot take gradients).
for param_name, param in qlora_model.named_parameters():
    if "classifier" in param_name and param.is_floating_point():
        param.requires_grad_(True)
# NOTE(review): these head weights are not declared via `modules_to_save`;
# confirm they are included when the adapter checkpoint is written.

In [None]:
# Baseline for the comparison at the end of the notebook: the same checkpoint
# with a fresh classification head, loaded without quantization or LoRA.
baseline_kwargs = {"num_labels": num_labels}
untrained_model = AutoModelForSequenceClassification.from_pretrained(model_name, **baseline_kwargs)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Report how many parameters of the PEFT-wrapped model will receive gradients.
qlora_model.print_trainable_parameters()


trainable params: 147,456 || all params: 67,110,925 || trainable%: 0.2197


In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Increase if needed
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # The PEFT wrapper hides the base model's `labels` argument from the Trainer,
    # so it must be named explicitly; otherwise evaluation computes no loss and
    # the "Validation Loss" column stays at "No log".
    label_names=["labels"],
)



In [None]:
# Wire the LoRA-wrapped model, the train/eval splits and the tokenizer together.
trainer = Trainer(
    model=qlora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `tokenizer=` is deprecated for Trainer.__init__ (it raises a FutureWarning);
    # `processing_class=` is its replacement and is used the same way.
    processing_class=tokenizer,
)

  trainer = Trainer(


In [None]:
import os
# Switch off Weights & Biases logging through its environment variable.
# NOTE(review): TrainingArguments already uses report_to="none" and the Trainer
# is created before this cell runs — confirm whether this is still needed.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Run fine-tuning with the settings from `training_args`; the returned
# TrainOutput (step count, loss, runtime metrics) is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.0812,No log
2,2.0219,No log
3,2.0656,No log


TrainOutput(global_step=12000, training_loss=2.1260364583333335, metrics={'train_runtime': 1133.9303, 'train_samples_per_second': 84.661, 'train_steps_per_second': 10.583, 'total_flos': 1.2762851475456e+16, 'train_loss': 2.1260364583333335, 'epoch': 3.0})

In [None]:
# Write the trainer's model and the tokenizer files into the same directory.
output_dir = "./fine_tuned_qlora_model"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

('./fine_tuned_qlora_model/tokenizer_config.json',
 './fine_tuned_qlora_model/special_tokens_map.json',
 './fine_tuned_qlora_model/vocab.txt',
 './fine_tuned_qlora_model/added_tokens.json',
 './fine_tuned_qlora_model/tokenizer.json')

In [None]:
def predict_mood(model, tokenizer, text, label_map=None):
    """Predict the emotion label for one text or a list of texts.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Callable that turns ``text`` into a dict of PyTorch tensors.
        text: A single string, or a list of strings to classify as one batch.
        label_map: Optional mapping from class id to label name. Defaults to the
            notebook-level ``id2label`` mapping.

    Returns:
        The predicted label name when ``text`` is a string, otherwise a list
        with one label name per input text.
    """
    # Use the GPU only when one is actually present instead of assuming CUDA.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    id_to_name = id2label if label_map is None else label_map

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model.to(device)
    model.eval()  # Set to evaluation mode

    with torch.no_grad():
        logits = model(**inputs).logits

    # Arg-max over the class axis, one result per row. A flattened arg-max would
    # return an index into the whole batch and pick a wrong class for lists.
    predicted_ids = logits.argmax(dim=-1).tolist()
    predicted_labels = [id_to_name[class_id] for class_id in predicted_ids]

    return predicted_labels[0] if isinstance(text, str) else predicted_labels

In [None]:
# Separate tokenizer instance for the baseline model, loaded from the same checkpoint name.
untrained_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
test_sentence = "I am so scared"

# Run the same sentence through the baseline and through the fine-tuned model.
untrained_prediction = predict_mood(untrained_model, untrained_tokenizer, test_sentence)
fine_tuned_prediction = predict_mood(model, tokenizer, test_sentence)

# Report both results side by side.
print(f"Untrained Model Prediction: {untrained_prediction}")
print(f"Fine-Tuned Model Prediction: {fine_tuned_prediction}")

Untrained Model Prediction: hate
Fine-Tuned Model Prediction: worry
