# Mounting

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


# Project Setup

In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
Coll

In [4]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset

In [5]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained mBERT model and tokenizer

In [6]:
model_name = 'bert-base-multilingual-cased'
num_classes = 3  # Update with the number of classes in your task
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
tokenizer = BertTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

# Loading dataset

In [7]:
# Load CSV data
dir = "/content/drive/MyDrive/Research/VITD/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/dataset/task datasets/original/"
train_data = pd.read_csv(f'{dir}train.csv')
test_data = pd.read_csv(f'{dir}test.csv')
val_data = pd.read_csv(f'{dir}validation.csv')

# Model setup

In [8]:
def preprocess_data(data, max_length):
    encodings = tokenizer(data['sentence1'].tolist(), truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
    labels = torch.tensor(data['label'].tolist()).to(device)
    dataset = TensorDataset(encodings.input_ids.to(device), encodings.attention_mask.to(device), labels)
    return dataset

In [9]:
max_length = 256  # Set your desired max sequence length
train_dataset = preprocess_data(train_data, max_length)
val_dataset = preprocess_data(val_data, max_length)
test_dataset = preprocess_data(test_data, max_length)

In [10]:
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [11]:
# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Training (skip this for saved model)

In [12]:
# Move the model to the same device as device
model = model.to(device)

In [13]:
# # Define early stopping parameters
# early_stopping_patience = 3
# best_val_loss = float('inf')
# epochs_since_last_improvement = 0

# # Training loop
# num_epochs = 20  # Increased the number of epochs
# for epoch in range(num_epochs):
#     model.train()
#     for batch in train_loader:
#         optimizer.zero_grad()
#         input_ids, attention_mask, labels = batch
#         input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()

#     model.eval()
#     val_loss = 0
#     correct = 0
#     total = 0
#     with torch.no_grad():
#         for batch in val_loader:
#             input_ids, attention_mask, labels = batch
#             input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
#             outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#             loss = outputs.loss
#             val_loss += loss.item()
#             predicted_labels = torch.argmax(outputs.logits, dim=1)
#             correct += (predicted_labels == labels).sum().item()
#             total += labels.size(0)

#     val_accuracy = correct / total
#     avg_val_loss = val_loss / len(val_loader)

#     print(f"Epoch [{epoch+1}/{num_epochs}]")
#     print(f"Validation Accuracy: {val_accuracy:.4f}")
#     print(f"Avg. Validation Loss: {avg_val_loss:.4f}")

#     # Early stopping check
#     if avg_val_loss < best_val_loss:
#         best_val_loss = avg_val_loss
#         epochs_since_last_improvement = 0
#     else:
#         epochs_since_last_improvement += 1
#         if epochs_since_last_improvement >= early_stopping_patience:
#             print("Early stopping triggered. Stopping training.")
#             break


# Saving tuned model

In [14]:
from transformers import AutoModel, AutoConfig

# Define the path to the saved model
model_path = "/content/drive/MyDrive/Research/VITD/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/models/mbert/mbert_trained_model.pth"

# Save the model
torch.save(model.state_dict(), model_path)

In [15]:
# Create an instance of the model
model_name = 'bert-base-multilingual-cased'
num_classes = 3  # Update with the number of classes in your task
model_path = "/content/drive/MyDrive/Research/VITD/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/models/mbert/mbert_trained_model.pth"

model2 = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

# Load the saved parameters into the model
model2.load_state_dict(torch.load(model_path))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

# Report on test set

In [16]:
import numpy as np
from sklearn.metrics import classification_report

In [17]:
# Evaluate on the test set
model2.to(device)  # Move the model to the same device as tensors
model2.eval()
test_labels_list = []
predicted_labels_list = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        outputs = model2(input_ids, attention_mask=attention_mask)
        predicted_labels = torch.argmax(outputs.logits, dim=1)

        test_labels_list.extend(labels.cpu().numpy())
        predicted_labels_list.extend(predicted_labels.cpu().numpy())

test_labels_np = np.array(test_labels_list)
predicted_labels_np = np.array(predicted_labels_list)

In [20]:
# Testing loop
from sklearn.metrics import accuracy_score
model2.to(device)  # Move the model to the same device as tensors
model2.eval()
test_preds = []
test_true_labels = []

with torch.no_grad():
    for batch in test_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
        labels = batch[2]

        outputs = model2(**inputs)
        logits = outputs.logits

        predicted_labels = torch.argmax(logits, dim=1)
        test_preds.extend(predicted_labels.cpu().numpy())
        test_true_labels.extend(labels.cpu().numpy())

test_accuracy = accuracy_score(test_true_labels, test_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Generate classification report
class_names = ["Non Violence", "Passive Violence", "Direct Violence"]  # Replace with your actual class names
report = classification_report(test_true_labels, test_preds, target_names=class_names, digits=4)
print("Classification Report:")
print(report)

Test Accuracy: 0.1280
Classification Report:
                  precision    recall  f1-score   support

    Non Violence     0.5000    0.0027    0.0054      1096
Passive Violence     0.5172    0.0834    0.1437       719
 Direct Violence     0.1030    0.9701    0.1862       201

        accuracy                         0.1280      2016
       macro avg     0.3734    0.3521    0.1118      2016
    weighted avg     0.4666    0.1280    0.0728      2016



In [21]:
# Generate classification report
from sklearn.metrics import classification_report
target_names = ["0", "1", "2"]  # Replace with your class names
report = classification_report(test_labels_np, predicted_labels_np, target_names=target_names, digits=4)

# Print the classification report
print(report)

NameError: ignored

In [None]:
# # Create a DataFrame with index and prediction columns
# result_df = pd.DataFrame({
#     'index': range(len(test_labels_np)),
#     'prediction': predicted_labels_np
# })

# # Save the DataFrame to a CSV file
# result_df.to_csv('/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/models/mbert/predictions.csv', index=False)
# print("Predictions saved to 'predictions.csv'")

# Probability ensemble

In [None]:
# import torch
# import numpy as np
# import pandas as pd

# model_name = 'bert-base-multilingual-cased'
# num_classes = 3  # Update with the number of classes in your task
# model_path = "/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/models/mbert/mbert_trained_model.pth"
# model2 = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

# # Load the saved parameters into the model
# model2.load_state_dict(torch.load(model_path))
# model2.to(device)  # Move the model to the same device as tensors
# model2.eval()

# class_probs_list = []  # List to store predicted class probabilities

# with torch.no_grad():
#     for batch in test_loader:
#         input_ids, attention_mask, _ = batch  # No need for labels in this case
#         input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
#         outputs = model2(input_ids, attention_mask=attention_mask)
#         predicted_probabilities = torch.softmax(outputs.logits, dim=1)

#         class_probs_list.extend(predicted_probabilities.cpu().numpy())

# class_probs_np = np.array(class_probs_list)

# # Now you have an array of class probabilities for each example
# # class_probs_np.shape will be (num_examples, num_classes)

# # Create a DataFrame with index and predicted class probabilities
# result_df = pd.DataFrame(class_probs_np, columns=[f'Class_{i}' for i in range(num_classes)])
# result_df['index'] = range(len(class_probs_np))

# # Reorder the columns so 'index' comes first
# result_df = result_df[['index'] + [f'Class_{i}' for i in range(num_classes)]]

# # Save the DataFrame to a CSV file
# result_df.to_csv('/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/models/mbert/probability_ensemble.csv', index=False)
# print("Probabilities saved to probability_ensemble.csv")
