In [1]:
# Import required libraries
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset, DatasetDict

import json
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from collections import Counter
import panel as pn
import warnings; warnings.filterwarnings('ignore')
import plotly.express as px

from transformers import Trainer, TrainingArguments

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [2]:
# Step 1: Load and preprocess the dataset

# Read data
data = []
with open('/kaggle/input/dm-2024-isa-5810-lab-2-homework/tweets_DM.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))
 
f.close()
emotion = pd.read_csv('/kaggle/input/dm-2024-isa-5810-lab-2-homework/emotion.csv')
data_id = pd.read_csv('/kaggle/input/dm-2024-isa-5810-lab-2-homework/data_identification.csv')


# Create DataFrame from JSON
df = pd.DataFrame(data)
_source = df['_source'].apply(lambda x: x['tweet'])
df = pd.DataFrame({
    'tweet_id': _source.apply(lambda x: x['tweet_id']),
    'text': _source.apply(lambda x: x['text']),
})

# Merge emotion and data_identification
df = df.merge(emotion, on='tweet_id', how='left')  # Add emotion column
df = df.merge(data_id, on='tweet_id', how='left')  # Add identification column

# Display resulting DataFrame
# Check if all columns are included: tweet_id, text, emotion, identification
print(df.head())  

# Split into train_data and test_data
train_data = df[df['identification'] == 'train']
test_data = df[df['identification'] == 'test']

# Verify splits
print(f"Train data: {len(train_data)} rows")
print(f"Test data: {len(test_data)} rows")

   tweet_id                                               text       emotion  \
0  0x376b20  People who post "add me on #Snapchat" must be ...  anticipation   
1  0x2d5350  @brianklaas As we see, Trump is dangerous to #...       sadness   
2  0x28b412  Confident of your obedience, I write to you, k...           NaN   
3  0x1cd5b0                Now ISSA is stalking Tasha 😂😂😂 <LH>          fear   
4  0x2de201  "Trust is not the same as faith. A friend is s...           NaN   

  identification  
0          train  
1          train  
2           test  
3          train  
4           test  
Train data: 1455563 rows
Test data: 411972 rows


In [5]:

# Encode labels
class_names = ['anger', 'anticipation', 'disgust', 'fear', 'sadness', 'surprise', 'trust', 'joy']
train_data['label'] = train_data['emotion'].astype("category").cat.codes



In [8]:
# Reset index before conversion to remove the index column
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# Convert to Hugging Face DatasetDict
emotions = DatasetDict({
    "train": Dataset.from_pandas(train_data[['text', 'label']]),
    "test": Dataset.from_pandas(test_data[['text']])
})

emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1455563
    })
    test: Dataset({
        features: ['text'],
        num_rows: 411972
    })
})

In [10]:
# Step 2: Tokenize the datasets
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=32)

emotions = emotions.map(tokenize, batched=True)
emotions = emotions.rename_column("label", "labels")
emotions.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/1455563 [00:00<?, ? examples/s]

Map:   0%|          | 0/411972 [00:00<?, ? examples/s]

ValueError: Original column name label not in the dataset. Current columns in the dataset: ['text', 'input_ids', 'attention_mask']

In [None]:
# Step 3: Load the model
num_labels = len(class_names)  # Number of classes updated to 8
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Step 4: Define performance metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

In [None]:
# Step 5: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    log_level="error",
    report_to="none",
)

In [None]:
# Step 6: Fine-tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=emotions["train"],
    tokenizer=tokenizer
)

trainer.train()

In [None]:
# Step 7: Predict on the test set
emotions["test"] = emotions["test"].map(tokenize, batched=True)
predictions = trainer.predict(emotions["test"])

In [None]:
# Step 8: Prepare submission
test_data["emotion"] = [class_names[pred] for pred in predictions.predictions.argmax(-1)]
test_data[["id", "emotion"]].to_csv("submission.csv", index=False)
print("Submission file created: submission.csv")