<a href="https://colab.research.google.com/github/yashdubey20/Digiplus-Assessment/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ---------------------------------------------------------------------------
# Data preparation: read both comment dumps, clean them, split, tokenize and
# wrap everything in DataLoaders.
# ---------------------------------------------------------------------------

# Both CSV files are read with the ISO-8859-1 (Latin-1) encoding.
bjp_data = pd.read_csv('BJP.csv', encoding='ISO-8859-1')
congress_data = pd.read_csv('congress.csv', encoding='ISO-8859-1')

# Stack the two frames on top of each other under a fresh 0..n-1 index.
combined_data = pd.concat([bjp_data, congress_data], ignore_index=True)

# Keep only rows whose 'commentText' is present and is an actual string.
combined_data = (
    combined_data
    .dropna(subset=['commentText'])
    .loc[lambda frame: frame['commentText'].map(lambda value: isinstance(value, str))]
)

# 80/20 train/validation split with a fixed seed.
train_data, val_data = train_test_split(
    combined_data,
    test_size=0.2,
    random_state=42,
)

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')


def _encode_comments(frame):
    """Tokenize the 'commentText' column of ``frame`` into padded/truncated PyTorch tensors."""
    return tokenizer(
        frame['commentText'].tolist(),
        padding=True,
        truncation=True,
        return_tensors="pt",
    )


train_tokenized = _encode_comments(train_data)
val_tokenized = _encode_comments(val_data)

# Labels come from the 'Label' column, one per comment, in the same row order
# as the tokenized texts.
train_labels = torch.tensor(train_data['Label'].tolist())
val_labels = torch.tensor(val_data['Label'].tolist())

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(
    train_tokenized['input_ids'],
    train_tokenized['attention_mask'],
    train_labels,
)
val_dataset = TensorDataset(
    val_tokenized['input_ids'],
    val_tokenized['attention_mask'],
    val_labels,
)

# Only the training loader shuffles; validation keeps a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Seed PyTorch's global RNG before anything random happens. The classification
# head is freshly initialized (it is not part of the pre-trained checkpoint) and
# the training DataLoader shuffles, so without a seed every run yields different
# numbers. 42 matches the random_state already used for the train/val split.
torch.manual_seed(42)

# Load pre-trained multilingual BERT with a sequence-classification head.
# No num_labels is passed, so the library default is used; classify_text treats
# the output as binary (label 1 vs. everything else).
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and learning-rate schedule.
# NOTE: a StepLR schedule only takes effect when scheduler.step() is called;
# with step_size=5 the rate is multiplied by gamma=0.9 after every 5 such calls.
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)

# Track the best validation accuracy seen so far and the weights that produced it.
best_accuracy = 0.0
best_model_state = None

def _snapshot_state_dict(module):
    """Return a detached CPU copy of ``module``'s state dict.

    ``state_dict()`` hands back references to the live parameter tensors, so
    storing it directly would silently follow every later optimizer step and
    the "best" checkpoint would end up holding the final weights. Cloning each
    tensor freezes the weights as they are at the moment of the call.
    """
    return {
        name: tensor.detach().cpu().clone()
        for name, tensor in module.state_dict().items()
    }


# Training loop
epochs = 5
for epoch in range(epochs):
    # ---- Train for one pass over the training set ----
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        # Each batch is (input_ids, attention_mask, labels); move all to the device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Summed (not averaged) per-batch loss, reported at the end of the epoch.
        total_loss += loss.item()

    # Advance the learning-rate schedule once per epoch, after the optimizer
    # steps of that epoch; without this call the scheduler has no effect.
    scheduler.step()

    # ---- Validation loop ----
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # Predicted class = index of the largest logit in each row.
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss}, Validation Accuracy: {accuracy}')

    # Keep a frozen copy of the weights whenever validation accuracy improves.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model_state = _snapshot_state_dict(model)

# If no epoch ever improved on the starting accuracy there is no snapshot yet;
# fall back to the final weights rather than writing None to disk.
if best_model_state is None:
    best_model_state = _snapshot_state_dict(model)

# Save the best model
torch.save(best_model_state, 'best_bert_model.pth')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 107.78369000554085, Validation Accuracy: 0.8363171355498721
Epoch 2/5, Loss: 74.09129936993122, Validation Accuracy: 0.8439897698209718
Epoch 3/5, Loss: 55.415508760139346, Validation Accuracy: 0.8363171355498721
Epoch 4/5, Loss: 38.05685165245086, Validation Accuracy: 0.8542199488491049
Epoch 5/5, Loss: 26.402069519739598, Validation Accuracy: 0.8567774936061381


In [2]:
def classify_text(text):
    """Classify a single piece of text as "Positive" or "Negative".

    The text is tokenized with the notebook-level ``tokenizer``, moved to
    ``device`` and scored by ``model`` in evaluation mode without gradient
    tracking.

    Args:
        text: The text to classify.

    Returns:
        "Positive" when the highest-scoring class index is 1, otherwise
        "Negative".
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # Inference only: evaluation mode, no autograd bookkeeping.
    model.eval()
    with torch.no_grad():
        scores = model(ids, attention_mask=mask).logits

    # .item() expects exactly one prediction, i.e. a single input text.
    winner = scores.argmax(dim=1).item()

    if winner == 1:
        return "Positive"
    return "Negative"

# Load the best model state.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written during a GPU session still loads on a CPU-only runtime.
model.load_state_dict(torch.load('best_bert_model.pth', map_location=device))

# Example usage
input_text = "Aaayegi to modi sarkar hi"
output_sentiment = classify_text(input_text)
print("Input Text:", input_text)
print("Predicted Sentiment:", output_sentiment)


Input Text: Aaayegi to modi sarkar hi
Predicted Sentiment: Positive
