In [81]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.metrics import accuracy_score

In [82]:
class IndicSentimentDataset(Dataset):
    """Dataset pairing per-sample encoded features with a label.

    Args:
        encodings: Mapping from feature name to an indexable collection that
            holds one entry per sample.
        labels: Indexable collection holding one label per sample; its length
            defines the length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict has one tensor per feature in ``encodings`` plus the sample's
        label stored under the ``'labels'`` key.
        """
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            sample[feature_name] = torch.tensor(feature_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [83]:
import pandas as pd

# Read the review dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='dataset_test.csv')

# Show which columns are available before picking the ones to train on
print(df.columns)

Index(['GENERIC CATEGORIES', 'CATEGORY', 'SUB-CATEGORY', 'PRODUCT', 'BRAND',
       'ASPECTS', 'ASPECT COMBO', 'ENGLISH REVIEW', 'LABEL', 'INDIC REVIEW'],
      dtype='object')


In [84]:
# Pull the label column and the Indic-language review text out as plain lists
labels = df['LABEL'].tolist()
reviews = df['INDIC REVIEW'].tolist()

# Fit the encoder on the labels, then turn them into integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(labels)
labels_encoded = label_encoder.transform(labels)

In [85]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
reviews_train, reviews_test, labels_train, labels_test = train_test_split(
    reviews,
    labels_encoded,
    test_size=0.2,
    random_state=42,
)

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize each split separately with the same truncation/padding settings
encodings_train, encodings_test = (
    tokenizer(split_reviews, truncation=True, padding=True)
    for split_reviews in (reviews_train, reviews_test)
)

# Wrap encodings and labels so a DataLoader can batch them
dataset_train = IndicSentimentDataset(encodings=encodings_train, labels=labels_train)
dataset_test = IndicSentimentDataset(encodings=encodings_test, labels=labels_test)

In [86]:
# Load the multilingual BERT checkpoint with one output per encoded label class.
# The classifier weights are not part of the checkpoint and start out newly
# initialized (see the load warning), so the model has to be trained before use.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=len(label_encoder.classes_),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [87]:
def train(model, dataloader, optimizer, device, scheduler):
    """Run one pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Callable module; invoked with ``input_ids`` plus
            ``attention_mask`` and ``labels`` keywords and expected to return
            an object exposing ``.loss``.
        dataloader: Sized iterable of batches, each a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.
        scheduler: Learning-rate scheduler stepped once per batch, right after
            the optimizer.

    Returns:
        Sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0
    for batch in dataloader:
        # Start every step from clean gradients.
        optimizer.zero_grad()

        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
        batch_loss = outputs.loss
        batch_loss.backward()
        running_loss += batch_loss.item()

        # Update the weights first, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()
    return running_loss / len(dataloader)

def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` and return its accuracy.

    Args:
        model: Callable module; invoked with ``input_ids`` plus
            ``attention_mask`` and ``labels`` keywords and expected to return
            an object exposing ``.logits``.
        dataloader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The value of ``accuracy_score`` computed over every sample seen, where
        the prediction for a sample is the argmax of its logits along dim 1.
    """
    model.eval()
    predicted, expected = [], []
    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in dataloader:
            targets = batch['labels'].to(device)
            outputs = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=targets,
            )
            batch_predictions = outputs.logits.argmax(dim=1)
            predicted += batch_predictions.tolist()
            expected += targets.tolist()
    return accuracy_score(expected, predicted)

In [88]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Fine-tuning hyperparameters.
batch_size = 16
epochs = 5
learning_rate = 2e-5
warmup_steps = 100

# Only the training batches are shuffled; evaluation order is kept fixed.
train_dataloader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

# `transformers.AdamW` is deprecated and has been dropped from recent
# Transformers releases, so the optimizer comes from PyTorch instead.
# `eps` and `weight_decay` are pinned to the values the transformers
# implementation used by default, which keeps the optimisation unchanged.
# NOTE(review): this cell no longer needs the `AdamW` name from the
# `transformers` import at the top of the notebook; that import fails on
# releases where the class was removed.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

# The scheduler is stepped once per batch, so the schedule length is the total
# number of batches over all epochs.
num_training_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    avg_train_loss = train(model, train_dataloader, optimizer, device, scheduler)
    # Accuracy is measured on the held-out split after every epoch.
    accuracy = evaluate(model, test_dataloader, device)
    print(f"Train Loss: {avg_train_loss:.2f}")
    print(f"Test Accuracy: {accuracy:.2f}")




Epoch 1/5


KeyboardInterrupt: 