In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm

In [None]:
# Read the train and test CSV files of the brand-sentiment dataset from the
# Kaggle input directory into `df_train` and `df_test`.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/brand-sentiment-analysis-dataset/Dataset - Train.csv",
        "/kaggle/input/brand-sentiment-analysis-dataset/Dataset - Test.csv",
    )
)

In [None]:
# Merge the "I can't tell" annotations into the
# "No emotion toward brand or product" label, so both share one class.
df_train["is_there_an_emotion_directed_at_a_brand_or_product"] = (
    df_train["is_there_an_emotion_directed_at_a_brand_or_product"].replace(
        to_replace="I can't tell",
        value="No emotion toward brand or product",
    )
)


In [None]:

# Map each sentiment label string to an integer id. The fitted encoder is kept
# in `label_encoder` because later cells read `label_encoder.classes_`.
label_encoder = LabelEncoder()
df_train["sentiment_encoded"] = label_encoder.fit_transform(
    df_train["is_there_an_emotion_directed_at_a_brand_or_product"]
)

In [None]:
# Show how many training rows fall into each encoded sentiment class.
df_train["sentiment_encoded"].value_counts()

In [None]:
!pip install nlpaug
!pip install nltk

In [None]:
import nltk

# Download only the NLTK resources used by the WordNet synonym augmenter,
# rather than the very large 'all' collection.
# NOTE(review): the resource list is based on nlpaug's WordNet / POS-tagger
# usage; the "_eng" tagger id exists only in newer NLTK releases (an unknown id
# is reported by the downloader, not raised) -- confirm against the installed
# versions.
for resource in (
    "wordnet",
    "omw-1.4",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(resource, quiet=True)

In [None]:
import nlpaug.augmenter.word as naw

# Balance the classes by oversampling: every class smaller than the largest one
# is topped up with WordNet-synonym paraphrases of its own tweets until all
# classes have `max_class` rows.
#
# NOTE(review): augmentation runs on the full training frame *before* the
# train/validation split performed later in the notebook, so paraphrases of a
# tweet can end up in validation while the original is in training (or vice
# versa). Validation scores are therefore likely optimistic; consider splitting
# first and augmenting only the training part.
augmenter = naw.SynonymAug(aug_src='wordnet', lang='eng')
class_counts = df_train["sentiment_encoded"].value_counts()
max_class = class_counts.max()
balanced_data = []

for label in class_counts.index:
    class_df = df_train[df_train['sentiment_encoded'] == label]
    samples_needed = max_class - len(class_df)
    balanced_data.append(class_df)

    if samples_needed <= 0:
        continue

    # Only real strings can be augmented; missing tweets (NaN) are skipped.
    source_texts = class_df['tweet_text'].dropna().astype(str).tolist()

    augmented_texts = []
    while len(augmented_texts) < samples_needed:
        count_before_pass = len(augmented_texts)

        for text in source_texts:
            aug_result = augmenter.augment(text)
            # Depending on the nlpaug version, augment() returns either a plain
            # string or a list of strings; normalise to a single string so the
            # "unchanged" check below compares like with like.
            if isinstance(aug_result, (list, tuple)):
                aug_text = aug_result[0] if aug_result else text
            else:
                aug_text = aug_result

            if isinstance(aug_text, str) and aug_text != text:  # Avoid unchanged sentences
                augmented_texts.append(aug_text)
            if len(augmented_texts) >= samples_needed:
                break

        # A full pass that produced nothing new would repeat forever; stop and
        # leave this class short instead.
        if len(augmented_texts) == count_before_pass:
            print(f"Warning: no augmentable text for class {label}; "
                  f"{samples_needed - len(augmented_texts)} samples short")
            break

    aug_df = pd.DataFrame({
        'tweet_text': augmented_texts,
        'sentiment_encoded': [label] * len(augmented_texts)
    })
    balanced_data.append(aug_df)

balanced_df = pd.concat(balanced_data, ignore_index=True)

# Check final balance
print(balanced_df['sentiment_encoded'].value_counts())
        

In [None]:
# Hold out 20% of the balanced rows for validation; the fixed random_state
# makes the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    balanced_df["tweet_text"].values,
    balanced_df["sentiment_encoded"].values,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
class ClassificationDataset(Dataset):
    """Torch dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable collection of raw texts; each item is cast to ``str``.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it together with its label.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer
            output with its leading batch axis removed) and ``"labels"`` (a
            ``torch.long`` scalar tensor).
        """
        text = str(self.texts[idx])  # Ensure text is a string
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            # Honour the configured length; this was previously hard-coded to
            # 512, which silently ignored the `max_len` constructor argument.
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {
            # squeeze(0) drops only the batch axis added by return_tensors="pt";
            # a bare squeeze() would also drop a sequence axis of length 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Settings handed to the dataset / loader constructors below.
max_len = 128
batch_size = 16

# Wrap the train and validation splits in ClassificationDataset objects.
train_dataset = ClassificationDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_len=max_len,
)
val_dataset = ClassificationDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
    max_len=max_len,
)

# Create DataLoaders: training batches are shuffled, validation batches are not.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

In [None]:
# Run on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pretrained BERT with a classification head sized to the number of classes
# known to the fitted label encoder, moved onto the selected device.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_encoder.classes_),
).to(device)

# Optimizer
# NOTE(review): 7e-7 is well below the 2e-5 to 5e-5 range commonly used for
# BERT fine-tuning -- confirm this value is intentional.
optimizer = AdamW(model.parameters(), lr=7e-7)

In [None]:
# Fine-tune the model for a fixed number of epochs, reporting the mean batch
# loss and the training accuracy after each epoch.
epochs = 18
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    loop = tqdm(train_loader, leave=True)
    # The description does not change within an epoch, so set it once here
    # instead of on every batch.
    loop.set_description(f"Epoch {epoch}")

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        # Read the scalar loss once per step and reuse it for the running
        # total and the progress bar.
        batch_loss = loss.item()
        total_loss += batch_loss

        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

        loop.set_postfix(loss=batch_loss)

    # Epoch-level accuracy is only needed once all batches have been seen.
    accuracy = correct / total

    print(f"Epoch {epoch} Loss: {total_loss / len(train_loader)}")
    print(f"Epoch {epoch} accuracy: {accuracy}")

In [None]:
from sklearn.metrics import f1_score

# Evaluate the model on the validation loader: accuracy plus weighted and
# micro-averaged F1 over all validation predictions.
model.eval()
correct = 0
total = 0
all_preds = []
all_labels = []

with torch.no_grad():  # inference only; no gradients are needed
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

        correct += (predictions == labels).sum().item()
        total += labels.size(0)

        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Fail with a clear message instead of a bare ZeroDivisionError.
if total == 0:
    raise ValueError("val_loader yielded no samples; nothing to evaluate")

accuracy = correct / total
f1 = f1_score(all_labels, all_preds, average='weighted')
f1_micro = f1_score(all_labels, all_preds, average='micro')
print(f"Validation Accuracy: {accuracy:.4f}")
# The two F1 variants were previously printed under the same label, which made
# the output ambiguous; name the averaging mode explicitly.
print(f"Weighted F1 score: {f1:.4f}")
print(f"Micro F1 score: {f1_micro:.4f}")


In [None]:
# Write the fine-tuned model and its tokenizer into the same output directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("bert_resume_classifier")

# Store the fitted label encoder so predicted ids can be mapped back to labels.
# NOTE(review): torch.save pickles the whole LabelEncoder object, so loading it
# needs a compatible scikit-learn install and must only be done on trusted files.
torch.save(label_encoder, "label_encoder.pth")