In [48]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from transformers import BertTokenizerFast
from transformers import BertForSequenceClassification, AdamW  
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm import tqdm

In [50]:
# Read the pre-split CSV files into DataFrames, in train / validation / test order.
train_df, val_df, test_df = map(
    pd.read_csv,
    ("../data/train.csv", "../data/validation.csv", "../data/test.csv"),
)

In [53]:
# Tokenizer loaded from the 'prajjwal1/bert-mini' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('prajjwal1/bert-mini')


def tokenize(data, max_length=87):
    """Tokenize the ``Comment_Adj`` column of ``data`` into fixed-length tensors.

    Args:
        data: Table-like object whose ``"Comment_Adj"`` column supports
            ``.tolist()`` (the notebook passes pandas DataFrames).
        max_length: Length every sequence is truncated or padded to.

    Returns:
        The result of calling the module-level ``tokenizer`` on the comments
        with truncation, ``"max_length"`` padding and ``return_tensors="pt"``.
    """
    comments = data["Comment_Adj"].tolist()
    tokenizer_options = dict(
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    return tokenizer(comments, **tokenizer_options)

class CommentsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per example.

    Args:
        encodings: Mapping from field name to an indexable container with one
            entry per example (the notebook passes the tokenizer output, whose
            values are tensors because ``return_tensors="pt"`` is used).
        labels: Per-example labels; a list, tensor, or an object exposing
            ``to_numpy()`` such as a pandas Series.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = self._as_tensor(labels)

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a new tensor that does not share memory with it."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a copy-construct UserWarning on every
            # call; clone().detach() is the equivalent PyTorch recommends.
            return value.clone().detach()
        if hasattr(value, "to_numpy"):
            # Convert array-likes such as a pandas Series by position, so the
            # result does not depend on the object's index labels.
            value = value.to_numpy()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict holding every encoding field for example ``idx`` plus ``'labels'``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

# Tokenize every split, in train / validation / test order.
train_encodings, val_encodings, test_encodings = (
    tokenize(split_df) for split_df in (train_df, val_df, test_df)
)

# Pair each split's encodings with its `Result_Bin` label column.
train_dataset, val_dataset, test_dataset = (
    CommentsDataset(split_encodings, split_df['Result_Bin'])
    for split_encodings, split_df in (
        (train_encodings, train_df),
        (val_encodings, val_df),
        (test_encodings, test_df),
    )
)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=10, shuffle=True)
val_loader, test_loader = (
    DataLoader(dataset=held_out, batch_size=10)
    for held_out in (val_dataset, test_dataset)
)

In [None]:
# Sequence-classification model with a 2-label head, initialised from the
# 'prajjwal1/bert-mini' checkpoint.
model = AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-mini', num_labels=2)

# Use PyTorch's own AdamW: the AdamW class exported by `transformers` is
# deprecated in favour of torch.optim.AdamW. `eps` and `weight_decay` are set
# explicitly to the deprecated class's defaults (1e-6 and 0.0) because
# torch.optim.AdamW's defaults differ, keeping the optimizer configuration
# as close as possible to the previous one.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()  # put the model in training mode before the loop

total_epochs = 6

for epoch in range(total_epochs):
    # One progress bar per epoch; the postfix shows the most recent batch loss.
    with tqdm(train_loader, unit="batch", desc=f"Epoch {epoch + 1}/{total_epochs}") as pbar:
        for batch in pbar:
            # Move every tensor of the batch (inputs and labels) to the model's device.
            batch = {k: v.to(device) for k, v in batch.items()}
            # The batch contains 'labels', so the model output carries a loss.
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            # Clear gradients so they do not accumulate into the next batch.
            optimizer.zero_grad()

            pbar.set_postfix(loss=f"{loss.item():.4f}")

print("Training completed.")

# Persist the fine-tuned weights and config to ./bert_pth.
model.save_pretrained('./bert_pth')

In [41]:
# Evaluate the model on the test split and report precision / recall / F1.
model.eval()  # switch the model to evaluation mode
predictions, true_labels = [], []
with torch.no_grad():  # no gradients are needed for inference
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        # Predicted class = index of the largest logit along the last dimension.
        predictions.extend(torch.argmax(logits, dim=-1).tolist())
        true_labels.extend(batch['labels'].tolist())

precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
print("Precision on Test:", round(precision,3))
print("Recall on Test:", round(recall,3))
# The closing parenthesis was missing on this call, which made the cell a syntax error.
print("F1 Score on Test:", round(f1,3))

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Precision on Test: 0.657
Recall on Test: 0.705
F1 Score on Test: 0.68
