In [None]:
import torch
import re
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from datasets import Dataset as HFDataset
import pandas as pd
from datasets import Dataset

In [None]:
# NOTE(review): kagglehub is imported in this cell rather than in the import cell at the top.
import kagglehub

# Download latest version
# The returned value is kept in `path` and printed below as the dataset location.
path = kagglehub.dataset_download("datatattle/covid-19-nlp-text-classification")

print("Path to dataset files:", path)

In [None]:
# NOTE(review): this is a hard-coded absolute path and the file name says "test";
# confirm this is the intended file and location for the whole experiment.
file = "/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv"
df = pd.read_csv(file)
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Show the column names of the loaded frame as a plain list.
print("columns:", df.columns.tolist())

In [None]:
# Keep only the tweet text and the sentiment label, drop rows where either is
# missing, and renumber the index from zero. This rebinds `df`, so any other
# columns are no longer available afterwards.
df = df[['OriginalTweet', 'Sentiment']].dropna().reset_index(drop=True)
df.head()

In [None]:
def clean_text(text):
  """Normalize a raw tweet into lowercase, single-space-separated words.

  Steps, in order: remove URLs, remove @mentions, drop the '#' of hashtags
  while keeping the tag word, replace every run of characters other than
  ASCII letters, digits and apostrophes with one space, then collapse
  whitespace, strip and lowercase.

  Args:
    text: The raw tweet as a string.

  Returns:
    The cleaned string; it may be empty (e.g. a tweet that was only a URL).
  """
  text = re.sub(r'http\S+', '', text)
  text = re.sub(r'@\w+', '', text)
  # The pattern must use \w (word characters). A bare `w` only matched hashtags
  # beginning with the letter "w", and glued those to the preceding word.
  # Replacing '#' with a space keeps every hashtag as its own token.
  text = re.sub(r'#(\w+)', r' \1', text)
  text = re.sub(r"[^A-Za-z0-9']+", " ", text)
  text = re.sub(r"\s+", " ", text).strip().lower()
  return text

In [None]:
# Add a `clean_text` column holding the result of clean_text() for every tweet.
df['clean_text'] = df['OriginalTweet'].apply(clean_text)
df.head()

In [None]:
# Sentiment names in sorted order; a name's position in this list is its integer id.
labels = sorted(df['Sentiment'].unique())
label2id = dict(zip(labels, range(len(labels))))
# Numeric target column used by the dataset and the loss.
df['label_id'] = df['Sentiment'].map(label2id)
print("Label Mapping:", label2id)

In [None]:
# 80% of the rows go to training; the held-out 20% is then halved into
# validation and test. Both splits use a fixed random_state.
# NOTE(review): the splits are not stratified by label — confirm that is intended.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [None]:
def tokenize(text):
  """Split text into tokens: runs of word characters, or single non-space symbols."""
  token_pattern = re.compile(r"\w+|[^\w\s]")
  return token_pattern.findall(text)

# Token frequencies, counted over the training split only.
counter = Counter(
  token
  for tweet in train_df["clean_text"]
  for token in tokenize(tweet)
)

In [None]:
# Index-to-string table: the two special tokens first, then every token that
# was counted at least twice.
itos = ['<pad>', '<unk>']
itos.extend(word for word, count in counter.items() if count >= 2)
# String-to-index lookup, the inverse of `itos`.
stoi = {word: idx for idx, word in enumerate(itos)}

In [None]:
def encode(text):
  """Turn text into a list of vocabulary ids, using the <unk> id for unknown tokens."""
  ids = []
  for token in tokenize(text):
    ids.append(stoi.get(token, stoi["<unk>"]))
  return ids

class TweetDataset(torch.utils.data.Dataset):
  """Map-style PyTorch dataset yielding (token-id tensor, label tensor) pairs.

  The base class is spelled out as torch.utils.data.Dataset because the bare
  name `Dataset` is rebound by the later `from datasets import Dataset`
  import, which would make this class inherit from the Hugging Face dataset
  type instead of the PyTorch one.
  """

  def __init__(self, df):
    """Copy the `clean_text` and `label_id` columns of `df` into Python lists."""
    self.text = df['clean_text'].tolist()
    self.labels = df['label_id'].tolist()

  def __len__(self):
    """Return the number of tweets."""
    return len(self.text)

  def __getitem__(self, index):
    """Return (token ids, label) for the tweet at `index`, both as int64 tensors."""
    # A tweet that cleaned down to an empty string has no tokens; substitute a
    # single <unk> so the sequence is never zero-length.
    token_ids = encode(self.text[index]) or [stoi["<unk>"]]
    # Explicit dtype: torch.tensor([]) would otherwise be float32, which the
    # embedding lookup rejects.
    return (
      torch.tensor(token_ids, dtype=torch.long),
      torch.tensor(self.labels[index], dtype=torch.long),
    )

def collate(batch):
  """Combine (token ids, label) pairs into a padded id matrix and a label vector.

  Sequences are right-padded with the <pad> id to the longest one in the
  batch; the result is batch-first.
  """
  sequences = [pair[0] for pair in batch]
  targets = [pair[1] for pair in batch]
  padded = pad_sequence(sequences, batch_first=True, padding_value=stoi["<pad>"])
  return padded, torch.stack(targets)

In [None]:
# One DataLoader per split; only the training loader is shuffled. All three
# build their batches with `collate`. Training uses batches of 32, validation
# and test use batches of 64.
train_rnn = DataLoader(TweetDataset(train_df), batch_size=32, shuffle=True, collate_fn=collate)
val_rnn = DataLoader(TweetDataset(val_df), batch_size=64, collate_fn=collate)
test_rnn = DataLoader(TweetDataset(test_df), batch_size=64, collate_fn=collate)
print(f"Vocab size: {len(itos)}")

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

In [None]:
class LSTM(nn.Module):
  """LSTM text classifier: embedding -> LSTM -> max-pool over time -> linear.

  Args:
    vocab_size: Number of rows in the embedding table.
    embedding_dim: Size of each token embedding.
    hidden_dim: Hidden size of the LSTM, per direction.
    output_dim: Number of output logits (classes).
    bidirectional: Whether the LSTM reads the sequence in both directions.
    n_layers: Number of stacked LSTM layers.
    dropout: Dropout probability, applied to the embeddings, between LSTM
      layers, and to the pooled features.
    pad_idx: Vocabulary id of the padding token, passed to the embedding as
      `padding_idx`.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, bidirectional,n_layers, dropout, pad_idx):
    super().__init__()
    bidirectional = bool(bidirectional)
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
    self.rnn = nn.LSTM(
      embedding_dim,
      hidden_dim,
      # Honour the constructor argument; it was previously ignored and the
      # LSTM was always built bidirectional.
      bidirectional=bidirectional,
      num_layers=n_layers,
      # nn.LSTM applies dropout only between stacked layers, so with a single
      # layer it has no effect and triggers a warning.
      dropout=dropout if n_layers > 1 else 0.0,
      batch_first=True,
    )
    # A bidirectional LSTM concatenates the two directions, doubling the
    # feature size seen by the classifier head.
    num_directions = 2 if bidirectional else 1
    self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, text):
    """Return logits of shape (batch, output_dim) for a (batch, seq_len) id tensor."""
    embedded = self.dropout(self.embedding(text))
    output, _ = self.rnn(embedded)
    # Max over the time dimension. NOTE(review): padded positions are included
    # in the max, so the pooled features can depend on how long the batch is
    # padded.
    pooled = torch.max(output, dim=1)[0]
    pooled = self.dropout(pooled)
    return self.fc(pooled)

In [None]:
# Seed PyTorch so weight initialisation, dropout and DataLoader shuffling are
# repeatable from a fresh kernel (matches the random_state used for the split).
torch.manual_seed(42)

# All label ids that exist in the data; this fixes the length of the weight vector.
classes = np.unique(df['label_id'])
# Balanced class weights are estimated from the TRAINING labels only, so the
# validation/test label distribution does not leak into the loss. This raises
# if some class never occurs in the training split.
weights = compute_class_weight("balanced", classes=classes, y=train_df['label_id'])
weights = torch.tensor(weights, dtype=torch.float).to(device)
n_layers = 3
# Positional arguments: vocab size, embedding dim 16, hidden dim 128, number of
# classes, bidirectional, number of layers, dropout 0.2, padding id.
model = LSTM(len(itos), 16, 128, len(classes), True,n_layers, 0.2, stoi["<pad>"]).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss(weight = weights)

In [None]:
def training(loader):
  """Run one training epoch over `loader` and return (mean loss, accuracy).

  Uses the module-level `model`, `optimizer`, `criterion` and `device`.

  Args:
    loader: Iterable of (texts, labels) batches, e.g. a DataLoader.

  Returns:
    A tuple (average loss per example, fraction of correct predictions).
  """
  model.train()
  total_loss, correct, total = 0, 0, 0
  # Iterate the loader that was passed in; the global training loader used to
  # be hard-coded here, which silently ignored the argument.
  for texts, labels in loader:
    texts, labels = texts.to(device), labels.to(device)

    # The embedding lookup needs integer ids.
    texts = texts.long()

    optimizer.zero_grad()
    logits = model(texts)
    loss = criterion(logits, labels)
    loss.backward()
    optimizer.step()

    # criterion returns a batch mean, so scale by the batch size before summing.
    total_loss += loss.item() * texts.size(0)
    preds = logits.argmax(1)
    correct += (preds == labels).sum().item()
    total += labels.size(0)
  return total_loss / total, correct / total

In [None]:
def evaluate(loader):
    """Score the module-level `model` on `loader` without updating it.

    Uses the module-level `model`, `criterion` and `device`.

    Args:
        loader: Iterable of (texts, labels) batches, e.g. a DataLoader.

    Returns:
        A tuple (average loss per example, fraction of correct predictions).
    """
    model.eval()
    total_loss, correct, total = 0, 0, 0
    with torch.no_grad():
        for texts, labels in loader:
            texts, labels = texts.to(device), labels.to(device)
            # Same cast as in training(): the embedding lookup needs integer
            # ids, and a padded batch can arrive as float when its first
            # sequence is empty.
            texts = texts.long()
            logits = model(texts)
            loss = criterion(logits, labels)
            # criterion returns a batch mean, so scale by the batch size.
            total_loss += loss.item() * texts.size(0)
            preds = logits.argmax(1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return total_loss / total, correct / total

In [None]:
# Number of full passes over the training data.
NUM_EPOCHS = 75

for epoch in range(NUM_EPOCHS):
    # One optimisation pass, then a no-gradient pass over the validation split.
    train_loss, train_acc = training(train_rnn)
    val_loss, val_acc = evaluate(val_rnn)
    print(f"Epoch {epoch+1}: Train Loss {train_loss:.4f}, Acc {train_acc:.3f} | Val Loss {val_loss:.4f}, Acc {val_acc:.3f}")

In [None]:
# Final evaluation on the held-out test split, using the model as it stands
# after the last training epoch. Only the accuracy is reported.
test_loss, test_acc = evaluate(test_rnn)
print(f"Test Accuracy: {test_acc:.3f}")