<a href="https://colab.research.google.com/github/Tariquzzaman-faisal/VITD/blob/main/mbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mounting

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!nvidia-smi

Tue Aug 15 05:23:15 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Project Setup

In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m104.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.8 MB/s[0m eta [36m0:00:

In [4]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset

In [5]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained mBERT model and tokenizer

In [6]:
model_name = 'bert-base-multilingual-cased'
num_classes = 3  # Update with the number of classes in your task
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
tokenizer = BertTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

# Loading dataset

In [7]:
# Load CSV data
dir = '/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/dataset/Tariq/split/'
train_data = pd.read_csv(f'{dir}train.csv')
test_data = pd.read_csv(f'{dir}test.csv')
val_data = pd.read_csv(f'{dir}validation.csv')

# Model setup

In [8]:
def preprocess_data(data, max_length):
    encodings = tokenizer(data['sentence1'].tolist(), truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
    labels = torch.tensor(data['label'].tolist()).to(device)
    dataset = TensorDataset(encodings.input_ids.to(device), encodings.attention_mask.to(device), labels)
    return dataset

In [9]:
max_length = 512  # Set your desired max sequence length
train_dataset = preprocess_data(train_data, max_length)
val_dataset = preprocess_data(val_data, max_length)
test_dataset = preprocess_data(test_data, max_length)

In [10]:
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [11]:
# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Training

In [12]:
# Move the model to the same device as device
model = model.to(device)

In [13]:
# Define early stopping parameters
early_stopping_patience = 3
best_val_loss = float('inf')
epochs_since_last_improvement = 0

# Training loop
num_epochs = 20  # Increased the number of epochs
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()
            predicted_labels = torch.argmax(outputs.logits, dim=1)
            correct += (predicted_labels == labels).sum().item()
            total += labels.size(0)

    val_accuracy = correct / total
    avg_val_loss = val_loss / len(val_loader)

    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Avg. Validation Loss: {avg_val_loss:.4f}")

    # Early stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_since_last_improvement = 0
    else:
        epochs_since_last_improvement += 1
        if epochs_since_last_improvement >= early_stopping_patience:
            print("Early stopping triggered. Stopping training.")
            break


Epoch [1/20]
Validation Accuracy: 0.6925
Avg. Validation Loss: 0.7407
Epoch [2/20]
Validation Accuracy: 0.7023
Avg. Validation Loss: 0.6817
Epoch [3/20]
Validation Accuracy: 0.7361
Avg. Validation Loss: 0.6670
Epoch [4/20]
Validation Accuracy: 0.7391
Avg. Validation Loss: 0.8284
Epoch [5/20]
Validation Accuracy: 0.7383
Avg. Validation Loss: 0.8557
Epoch [6/20]
Validation Accuracy: 0.7271
Avg. Validation Loss: 0.9582
Early stopping triggered. Stopping training.


# Saving tuned model

In [26]:
from transformers import AutoModel, AutoConfig

# Define the path to the saved model
model_path = "/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/models/mbert/mbert_trained_model.pth"

# Save the model
torch.save(model.state_dict(), model_path)

In [27]:
# Create an instance of the model
model2 = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

# Load the saved parameters into the model
model2.load_state_dict(torch.load(model_path))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

# Report on test set

In [16]:
import numpy as np
from sklearn.metrics import classification_report

In [30]:
# Evaluate on the test set
model2.to(device)  # Move the model to the same device as tensors
model2.eval()
test_labels_list = []
predicted_labels_list = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        outputs = model2(input_ids, attention_mask=attention_mask)
        predicted_labels = torch.argmax(outputs.logits, dim=1)

        test_labels_list.extend(labels.cpu().numpy())
        predicted_labels_list.extend(predicted_labels.cpu().numpy())

test_labels_np = np.array(test_labels_list)
predicted_labels_np = np.array(predicted_labels_list)

In [31]:
# Generate classification report
from sklearn.metrics import classification_report
target_names = ["0", "1", "2"]  # Replace with your class names
report = classification_report(test_labels_np, predicted_labels_np, target_names=target_names)

# Print the classification report
print(report)

              precision    recall  f1-score   support

           0       1.00      0.63      0.77      2016
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0

    accuracy                           0.63      2016
   macro avg       0.33      0.21      0.26      2016
weighted avg       1.00      0.63      0.77      2016



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# Create a DataFrame with index and prediction columns
result_df = pd.DataFrame({
    'index': range(len(test_labels_np)),
    'prediction': predicted_labels_np
})

# Save the DataFrame to a CSV file
result_df.to_csv('/content/drive/MyDrive/Research/Shared Task/Violence Inciting Text Detection (VITD) Bangla/models/mbert/predictions.csv', index=False)
print("Predictions saved to 'predictions.csv'")

Predictions saved to 'predictions.csv'
