# Mouting to Google Drive

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%pwd

'/content'

# Importing

In [3]:
!pip install transformers



In [4]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load pre-trained model and tokenizer


In [5]:
!pip install sentencepiece



In [6]:
!apt-get install cmake build-essential pkg-config libgoogle-perftools-dev

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9ubuntu3).
libgoogle-perftools-dev is already the newest version (2.9.1-0ubuntu3).
pkg-config is already the newest version (0.29.2-1ubuntu3).
cmake is already the newest version (3.22.1-1ubuntu1.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.


In [7]:
model_name = "xlm-roberta-base"
num_labels = 3

model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [8]:
def tokenize_data(data, tokenizer, max_length):
    input_ids = []
    attention_masks = []

    for text in data['sentence1']:
        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids.append(inputs["input_ids"])
        attention_masks.append(inputs["attention_mask"])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(data['label'])

    return input_ids, attention_masks, labels


In [10]:
datasets_dir = "/content/drive/MyDrive/Research/VITD/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/dataset/task datasets/original/"

In [11]:
!pip install pandas



In [12]:
import pandas as pd
train_df = pd.read_csv(datasets_dir+'train.csv')
val_df = pd.read_csv(datasets_dir+'validation.csv')
test_df = pd.read_csv(datasets_dir+'test.csv')

In [13]:
train_df

Unnamed: 0,sentence1,label
0,যে দেশে সন্ত্রাসরা দেশ চালায়সে দেশে শান্তি কিভ...,1
1,এই বিচার শেষ বিচার নয়।আসল বিচার হবে আল্লাহর আদ...,0
2,আরব দেশগুলোকে বলব ভারতের সাথে সব ব্যবসা বাণিজ্...,2
3,দেশটা সুস্থ নাই,0
4,আপনার কথা দুঃখ জনক আগে বিডিও থাকলে কেন ধরা হলন...,0
...,...,...
2695,"হল বন্ধ করার আগে নিউমার্কেট বন্ধ করা উচিৎ,,।",2
2696,এ স্বাধীন দেশে ভোটটা অন্ততঃ আমাদের দিতে দেন।জা...,0
2697,আল হামদুলিল্লাহ্...প্রিয় ভিউয়ার্স আপনাদের ভালব...,0
2698,এই গুলা ত বিচার করবায় ঐ তোমরা ত ইন্ডিয়ান না,0


In [14]:
max_length = 512
train_input_ids, train_attention_masks, train_labels = tokenize_data(train_df, tokenizer, max_length)
val_input_ids, val_attention_masks, val_labels = tokenize_data(val_df, tokenizer, max_length)
test_input_ids, test_attention_masks, test_labels = tokenize_data(test_df, tokenizer, max_length)


In [15]:
# Create DataLoader
batch_size = 16
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


In [16]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

# Training (skip for using saved model)

In [17]:
# Hyperparameters for early stopping
patience = 3  # Number of epochs to wait for improvement
min_delta = 0.001  # Minimum change in validation accuracy to be considered an improvement
best_val_accuracy = 0
epochs_without_improvement = 0

# Initialize the best_model_state
best_model_state = None

# Training loop
optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 20  # Adjust as needed

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    train_preds = []
    train_true_labels = []

    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
        labels = batch[2]

        optimizer.zero_grad()
        outputs = model(**inputs)
        logits = outputs.logits

        loss = torch.nn.functional.cross_entropy(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        predicted_labels = torch.argmax(logits, dim=1)
        train_preds.extend(predicted_labels.cpu().numpy())
        train_true_labels.extend(labels.cpu().numpy())

    avg_train_loss = total_loss / len(train_dataloader)
    train_accuracy = accuracy_score(train_true_labels, train_preds)

    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

    # Validation loop
    model.eval()
    val_preds = []
    val_true_labels = []

    with torch.no_grad():
        for batch in val_dataloader:
            batch = tuple(t.to(device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
            labels = batch[2]

            outputs = model(**inputs)
            logits = outputs.logits

            predicted_labels = torch.argmax(logits, dim=1)
            val_preds.extend(predicted_labels.cpu().numpy())
            val_true_labels.extend(labels.cpu().numpy())

    val_accuracy = accuracy_score(val_true_labels, val_preds)
    print(f"Epoch {epoch+1}, Validation Accuracy: {val_accuracy:.4f}")

    # Early stopping check
    if val_accuracy > best_val_accuracy + min_delta:
        best_val_accuracy = val_accuracy
        epochs_without_improvement = 0
        # Save the current best model state
        best_model_state = model.state_dict()
    else:
        epochs_without_improvement += 1

    if epochs_without_improvement >= patience:
        print(f"Early stopping after {epoch+1} epochs.")
        # Load the best model state
        model.load_state_dict(best_model_state)
        break




Epoch 1, Train Loss: 1.0040, Train Accuracy: 0.5022
Epoch 1, Validation Accuracy: 0.5617
Epoch 2, Train Loss: 0.9028, Train Accuracy: 0.5933
Epoch 2, Validation Accuracy: 0.6662
Epoch 3, Train Loss: 0.7588, Train Accuracy: 0.6752
Epoch 3, Validation Accuracy: 0.7308
Epoch 4, Train Loss: 0.6454, Train Accuracy: 0.7363
Epoch 4, Validation Accuracy: 0.7654
Epoch 5, Train Loss: 0.5307, Train Accuracy: 0.7941
Epoch 5, Validation Accuracy: 0.7429
Epoch 6, Train Loss: 0.4094, Train Accuracy: 0.8437
Epoch 6, Validation Accuracy: 0.7316
Epoch 7, Train Loss: 0.3327, Train Accuracy: 0.8774
Epoch 7, Validation Accuracy: 0.7654
Early stopping after 7 epochs.


# Saving tuned model

In [19]:
from sklearn.metrics import classification_report

In [3]:
from transformers import AutoModel, AutoConfig

# Define the path to the saved model
model_path = "/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/models/xlmroberta/xlmroberta_trained_model.pth"

# Save the model
torch.save(model.state_dict(), model_path)

ModuleNotFoundError: ignored

# Loading saved model

In [17]:
# Create an instance of the model
model2 = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model_path = "/content/drive/MyDrive/Research/VITD/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/models/xlmroberta/xlmroberta_trained_model.pth"

# Load the saved parameters into the model
model2.load_state_dict(torch.load(model_path))

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [20]:
# Testing loop
model2.to(device)  # Move the model to the same device as tensors
model2.eval()
test_preds = []
test_true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
        labels = batch[2]

        outputs = model2(**inputs)
        logits = outputs.logits

        predicted_labels = torch.argmax(logits, dim=1)
        test_preds.extend(predicted_labels.cpu().numpy())
        test_true_labels.extend(labels.cpu().numpy())

test_accuracy = accuracy_score(test_true_labels, test_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Generate classification report
class_names = ["Non Violence", "Passive Violence", "Direct Violence"]  # Replace with your actual class names
report = classification_report(test_true_labels, test_preds, target_names=class_names, digits=4)
print("Classification Report:")
print(report)

Test Accuracy: 0.7222
Classification Report:
                  precision    recall  f1-score   support

    Non Violence     0.8016    0.8075    0.8045      1096
Passive Violence     0.7419    0.5716    0.6457       719
 Direct Violence     0.4469    0.7960    0.5725       201

        accuracy                         0.7222      2016
       macro avg     0.6635    0.7250    0.6742      2016
    weighted avg     0.7450    0.7222    0.7248      2016



In [22]:
# # Create a DataFrame with index and prediction columns
# result_df = pd.DataFrame({
#     'index': range(len(test_true_labels)),
#     'prediction': test_preds
# })

# # Save the DataFrame to a CSV file
# result_df.to_csv('/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/models/xlmroberta/predictions.csv', index=False)

# Proability ensemble

In [23]:
# import torch
# import numpy as np
# import pandas as pd

# model_name = "xlm-roberta-base"
# num_labels = 3
# model_path = "/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/models/xlmroberta/xlmroberta_trained_model.pth"

# # Create an instance of the model
# model2 = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
# # Load the saved parameters into the model
# model2.load_state_dict(torch.load(model_path))

# # Testing loop
# model2.to(device)  # Move the model to the same device as tensors
# model2.eval()
# test_probs = []

# with torch.no_grad():
#     for batch in test_dataloader:
#         batch = tuple(t.to(device) for t in batch)
#         inputs = {"input_ids": batch[0], "attention_mask": batch[1]}

#         outputs = model2(**inputs)
#         logits = outputs.logits
#         probs = torch.softmax(logits, dim=1)  # Apply softmax to get class probabilities

#         test_probs.extend(probs.cpu().numpy())

# # Convert the probabilities to a numpy array
# test_probs = np.array(test_probs)

# class_names = ["Class_0", "Class_1", "Class_2"]  # Replace with your actual class names

# # Now you can create a DataFrame with index and probability columns
# result_df = pd.DataFrame(test_probs, columns=class_names)

# # Save the DataFrame to a CSV file
# result_df.to_csv('/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/models/xlmroberta/probability_ensemble.csv', index=False)
# print("Probabilities saved to probability_ensemble.csv")