BERT Base Multilingual Cased

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Read the two per-party comment exports and stack them into a single frame
bjp_data = pd.read_csv('BJP.csv', encoding='ISO-8859-1')
congress_data = pd.read_csv('congress.csv', encoding='ISO-8859-1')
combined_data = pd.concat([bjp_data, congress_data], ignore_index=True)

# Keep only rows whose 'commentText' is present and is an actual string
combined_data = combined_data.dropna(subset=['commentText'])
has_text = combined_data['commentText'].apply(lambda value: isinstance(value, str))
combined_data = combined_data[has_text]

# Hold out 20% of the rows for validation; the fixed random_state keeps the split stable
train_data, val_data = train_test_split(combined_data, test_size=0.2, random_state=42)

# Tokenize each split with padding and truncation enabled, returning PyTorch tensors
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
train_tokenized = tokenizer(
    train_data['commentText'].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
)
val_tokenized = tokenizer(
    val_data['commentText'].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Labels come from the 'Label' column, in the same row order as the tokenized text
train_labels = torch.tensor(train_data['Label'].tolist())
val_labels = torch.tensor(val_data['Label'].tolist())

# Each dataset item is an (input_ids, attention_mask, label) triple
train_dataset = TensorDataset(
    train_tokenized['input_ids'],
    train_tokenized['attention_mask'],
    train_labels,
)
val_dataset = TensorDataset(
    val_tokenized['input_ids'],
    val_tokenized['attention_mask'],
    val_labels,
)

# Batches of 8; only the training split is shuffled
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=8)

# Seed PyTorch's global RNG (same seed as the train/validation split) so that
# anything drawing from it afterwards -- e.g. the shuffled training DataLoader --
# is repeatable across runs on the same setup.
torch.manual_seed(42)

# Load pre-trained BERT model with a two-class head; the downstream plotting and
# inference code treats class 0 as Negative and class 1 as Positive.
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)

# Set device: use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are spelled out because the PyTorch defaults (1e-8 / 1e-2) differ
# from the values the transformers implementation used (1e-6 / 0.0).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# StepLR multiplies the learning rate by gamma every step_size calls to scheduler.step()
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)

# Training loop: each epoch runs one pass over the training batches followed by
# a full evaluation on the validation batches.
epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0  # running SUM of per-batch losses for this epoch (not an average)
    for batch in train_dataloader:
        # Every batch is an (input_ids, attention_mask, labels) triple; move all to the device
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        # Labels are passed in so that the loss can be read back from outputs.loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Advance the learning-rate schedule once per epoch, after the optimizer
    # updates. Without this call the scheduler object never changes the LR.
    scheduler.step()

    # Validation loop: no gradients, collect predictions and labels for scoring.
    # The two lists are rebuilt every epoch, so after training they hold the
    # results of the final epoch only.
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # Predicted class = index of the largest logit in each row
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss}, Validation Accuracy: {accuracy}')

# Write the model's state dict (not the model object itself) to disk
trained_weights = model.state_dict()
torch.save(trained_weights, 'bert_model.pth')


ModuleNotFoundError: No module named 'transformers.utils'

In [None]:
import numpy as np

# Function to plot confusion matrix with labels
def plot_confusion_matrix(true_labels, predicted_labels):
    """Draw an annotated heatmap of the binary confusion matrix.

    Args:
        true_labels: Ground-truth class ids, expected to be 0 or 1.
        predicted_labels: Predicted class ids, in the same order as
            ``true_labels``.

    The matrix is always computed over the fixed class order ``[0, 1]`` so it
    is 2x2 even when one of the classes does not occur in the inputs; the
    TN/FP/FN/TP counts shown in the title are read from that matrix.
    """
    class_ids = [0, 1]
    labels = ['Negative', 'Positive']  # Assuming 0 is Negative and 1 is Positive

    # Pinning `labels` guarantees a 2x2 result, so the 4-way unpack below
    # cannot fail when only one class is present in the data.
    cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)

    # Row-major order of a 2x2 matrix with rows = true class, cols = predicted class
    tn, fp, fn, tp = cm.ravel()

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix\nTN={}, FP={}, FN={}, TP={}'.format(tn, fp, fn, tp))
    plt.show()

# Show the confusion matrix for the labels/predictions collected by the most
# recent validation pass
plot_confusion_matrix(true_labels=all_labels, predicted_labels=all_preds)


In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics, one entry per epoch (five epochs), typed in by hand.
# These are example values -- replace them with the numbers recorded during
# your own training run.
# NOTE(review): they appear to be shortened copies of the figures in the log
# text kept below; confirm before relying on the plot.
train_losses = [
    0.033,
    0.035,
    0.025,
    0.017,
    0.028,
]
val_losses = [
    0.675,
    0.829,
    0.815,
    0.824,
    0.70,
]
val_accuracies = [
    0.81,
    0.83,
    0.84,
    0.82,
    0.84,
]
'''
Epoch 1/5, Loss: 100.22155168652534, Validation Accuracy: 0.8184143222506394
Epoch 2/5, Loss: 62.78327962011099, Validation Accuracy: 0.8363171355498721
Epoch 3/5, Loss: 37.99849247466773, Validation Accuracy: 0.8465473145780051
Epoch 4/5, Loss: 30.744250578805804, Validation Accuracy: 0.8260869565217391
Epoch 5/5, Loss: 16.05345555371605, Validation Accuracy: 0.8465473145780051
Epoch 1/5, Average Training Loss: 0.03302575466763561, Average Validation Loss: 0.675539287966581
Epoch 2/5, Average Training Loss: 0.035095677736312704, Average Validation Loss: 0.8297827266577195
Epoch 3/5, Average Training Loss: 0.02527293006501075, Average Validation Loss: 0.815096401910022
Epoch 4/5, Average Training Loss: 0.01761852964157196, Average Validation Loss: 0.8244812604472307
Epoch 5/5, Average Training Loss: 0.028966635979096673, Average Validation Loss: 0.7092047963167863
'''

def plot_learning_curve(train_losses, val_losses, val_accuracies):
    """Plot training loss, validation loss and validation accuracy per epoch.

    All three sequences are drawn on one shared axis against epoch numbers
    starting at 1; the number of epochs is taken from ``len(train_losses)``.
    """
    epoch_numbers = range(1, len(train_losses) + 1)
    curves = (
        (train_losses, 'Train Loss'),
        (val_losses, 'Validation Loss'),
        (val_accuracies, 'Validation Accuracy'),
    )

    fig, ax = plt.subplots(figsize=(10, 5))
    for values, curve_label in curves:
        ax.plot(epoch_numbers, values, label=curve_label)

    ax.set_title('Learning Curve')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Value')
    ax.legend()
    ax.grid(True)
    plt.show()

# Render the learning curve from the per-epoch metric lists
plot_learning_curve(
    train_losses=train_losses,
    val_losses=val_losses,
    val_accuracies=val_accuracies,
)


Input-Output Code: Classifying a Single Text


In [None]:
# Function to classify input text
def classify_text(text):
    """Return "Positive" or "Negative" for ``text``.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``. The text
    is tokenized, run through the model in eval mode without gradients, and
    the class with the largest logit is mapped to a sentiment name
    (1 -> "Positive", anything else -> "Negative").
    """
    # Encode the text and move the tensors to the model's device
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    # Forward pass in inference mode
    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
        pred_label = logits.argmax(dim=1).item()

    # Translate the predicted class id into its sentiment name
    if pred_label == 1:
        return "Positive"
    return "Negative"

# Example usage: classify one sample comment and print the text with its predicted sentiment
input_text = "modi ne mst kaam kiya hai!"
output_sentiment = classify_text(text=input_text)
print("Input Text:", input_text)
print("Predicted Sentiment:", output_sentiment)
