In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/proccess-data/Processed Data.csv
/kaggle/input/nlp-project/BSMDD_v3_textcleaned - 21K (2).xlsx


# 5.1 Bangla Bert

In [2]:
!pip install transformers




# 5.2 Bert_SGD_Optimizer (8 batchsize) 10 epoch

In [3]:
# Install required libraries
!pip install transformers

# Import necessary libraries
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Set device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# Load the dataset
dataset_path = "/kaggle/input/proccess-data/Processed Data.csv"  # Adjust path for Kaggle
df = pd.read_csv(dataset_path)

# Display first few rows of the dataset to understand its structure
print("First few rows of the dataset:")
print(df.head())

# Check if 'text' and 'label' columns exist
if 'text' not in df.columns or 'label' not in df.columns:
    raise ValueError("Dataset must contain 'text' and 'label' columns.")

# Tokenize and prepare data
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Ensure the 'text' column is correctly passed as a list of strings
X_encodings = tokenizer(list(df['text']), truncation=True, padding=True, max_length=512, return_tensors="pt")
y_tensor = torch.tensor(df['label'].values)

# Check if tokenization is successful
print(f"Tokenization successful. Encoded input shape: {X_encodings['input_ids'].shape}")

# Stratified K-Fold setup
NUM_FOLDS = 5
BATCH_SIZE = 8
LEARNING_RATE = 1e-3
MOMENTUM = 0.9
NUM_EPOCHS = 10
DROPOUT = 0.1

skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)

# Define the BERT classifier model
class BERTClassifier(nn.Module):
    def __init__(self, bert_model, hidden_dim, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert_model = bert_model
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, input_ids, attention_mask):
        output = self.bert_model(input_ids, attention_mask)
        pooled_output = output['pooler_output']
        logits = self.fc(self.dropout(pooled_output))
        return logits

# Training and validation
for fold, (train_index, val_index) in enumerate(skf.split(X_encodings['input_ids'], y_tensor), 1):
    print(f"Processing Fold: {fold}")
    
    X_train_fold = {key: val[train_index] for key, val in X_encodings.items()}
    X_val_fold = {key: val[val_index] for key, val in X_encodings.items()}
    y_train_fold = y_tensor[train_index]
    y_val_fold = y_tensor[val_index]

    train_dataset = TensorDataset(X_train_fold['input_ids'], X_train_fold['attention_mask'], y_train_fold)
    val_dataset = TensorDataset(X_val_fold['input_ids'], X_val_fold['attention_mask'], y_val_fold)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Initialize model, optimizer, and loss function
    bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')
    model = BERTClassifier(bert_model, bert_model.config.hidden_size, num_classes=2).to(DEVICE)
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
    criterion = nn.CrossEntropyLoss()

    best_accuracy = 0.0
    for epoch in range(NUM_EPOCHS):
        model.train()
        train_loss, train_correct = 0.0, 0

        for batch in train_loader:
            input_ids, attention_mask, labels = [item.to(DEVICE) for item in batch]
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * input_ids.size(0)
            train_correct += (torch.argmax(logits, dim=1) == labels).sum().item()

        train_accuracy = 100 * train_correct / len(train_dataset)

        # Validation
        model.eval()
        val_loss, val_correct = 0.0, 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = [item.to(DEVICE) for item in batch]
                logits = model(input_ids, attention_mask)
                loss = criterion(logits, labels)
                val_loss += loss.item() * input_ids.size(0)
                val_correct += (torch.argmax(logits, dim=1) == labels).sum().item()

        val_accuracy = 100 * val_correct / len(val_dataset)
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Train Acc: {train_accuracy:.2f}%, Val Acc: {val_accuracy:.2f}%")

        # Save the best model
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), f"best_model_fold{fold}.pth")

    print(f"Best Validation Accuracy for Fold {fold}: {best_accuracy:.2f}%")

# Inference and evaluation can be added here as needed


Using device: cuda
First few rows of the dataset:
   label                                               text
0      1  কারণ আমরা একলাফে চাকরি পাওয়ার সপ্ন দেখি ছোটখাট...
1      0  অরন্যদেব ছবিতে যে ছিল আমার কন্যা , সে আজ থেকে ...
2      1                ঢাকার কষ্টের আরেক নাম ব্যাচেলর জীবন
3      0  ইয়াশ রোহানতটিনী সেরা জুটিতটিনীকে আমার অনেক ভাল...
4      1  জনগণের ভোট চুরি করে ক্ষমতায় আসা ওই নিয়ত আমাদ...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Tokenization successful. Encoded input shape: torch.Size([9376, 151])
Processing Fold: 1


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Epoch 1/10, Train Acc: 63.29%, Val Acc: 74.25%
Epoch 2/10, Train Acc: 74.28%, Val Acc: 75.75%
Epoch 3/10, Train Acc: 79.51%, Val Acc: 83.00%
Epoch 4/10, Train Acc: 83.25%, Val Acc: 82.73%
Epoch 5/10, Train Acc: 85.32%, Val Acc: 82.41%
Epoch 6/10, Train Acc: 86.44%, Val Acc: 84.65%
Epoch 7/10, Train Acc: 86.93%, Val Acc: 76.33%
Epoch 8/10, Train Acc: 87.71%, Val Acc: 83.64%
Epoch 9/10, Train Acc: 88.25%, Val Acc: 75.05%
Epoch 10/10, Train Acc: 88.15%, Val Acc: 81.82%
Best Validation Accuracy for Fold 1: 84.65%
Processing Fold: 2
Epoch 1/10, Train Acc: 66.56%, Val Acc: 76.05%
Epoch 2/10, Train Acc: 75.43%, Val Acc: 76.96%
Epoch 3/10, Train Acc: 79.11%, Val Acc: 79.89%
Epoch 4/10, Train Acc: 83.72%, Val Acc: 78.40%
Epoch 5/10, Train Acc: 85.15%, Val Acc: 81.97%
Epoch 6/10, Train Acc: 87.40%, Val Acc: 80.11%
Epoch 7/10, Train Acc: 89.03%, Val Acc: 81.39%
Epoch 8/10, Train Acc: 89.93%, Val Acc: 80.48%
Epoch 9/10, Train Acc: 91.23%, Val Acc: 81.97%
Epoch 10/10, Train Acc: 92.15%, Val Acc: 84

In [4]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import torch

# Assuming `model` is your trained model, and `val_loader` is the DataLoader for validation data

# Prepare for evaluation
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = [item.to(DEVICE) for item in batch]
        logits = model(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Compute confusion matrix and classification report
conf_matrix = confusion_matrix(all_labels, all_preds)
class_report = classification_report(all_labels, all_preds, target_names=["Class 0", "Class 1"])

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=["Class 0", "Class 1"], yticklabels=["Class 0", "Class 1"])
plt.title("8 batch-size_10-epocs_confusion_matrix_Bangla-BERT-SGD")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")

# Save confusion matrix as a JPG file
conf_matrix_path = "/kaggle/working/8 batch-size_10-epocs_confusion_matrix_Bangla-BERT-SGD.jpg"
plt.savefig(conf_matrix_path)
plt.close()

# Save classification report as a text file
class_report_path = "/kaggle/working/classification_report.txt"
with open(class_report_path, "w") as f:
    f.write(class_report)

print(f"Confusion Matrix saved as {conf_matrix_path}")
print(f"Classification Report saved as {class_report_path}")


Confusion Matrix saved as /kaggle/working/8 batch-size_10-epocs_confusion_matrix_Bangla-BERT-SGD.jpg
Classification Report saved as /kaggle/working/classification_report.txt


# Bert_SGD_Optimizer (16 batchsize) 10 epoch

In [5]:
# Install required libraries
!pip install transformers

# Import necessary libraries
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Set device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# Load the dataset
dataset_path = "/kaggle/input/proccess-data/Processed Data.csv"  # Adjust path for Kaggle
df = pd.read_csv(dataset_path)

# Display first few rows of the dataset to understand its structure
print("First few rows of the dataset:")
print(df.head())

# Check if 'text' and 'label' columns exist
if 'text' not in df.columns or 'label' not in df.columns:
    raise ValueError("Dataset must contain 'text' and 'label' columns.")

# Tokenize and prepare data
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Ensure the 'text' column is correctly passed as a list of strings
X_encodings = tokenizer(list(df['text']), truncation=True, padding=True, max_length=512, return_tensors="pt")
y_tensor = torch.tensor(df['label'].values)

# Check if tokenization is successful
print(f"Tokenization successful. Encoded input shape: {X_encodings['input_ids'].shape}")

# Stratified K-Fold setup
NUM_FOLDS = 5
BATCH_SIZE = 16
LEARNING_RATE = 1e-3
MOMENTUM = 0.9
NUM_EPOCHS = 10
DROPOUT = 0.1

skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)

# Define the BERT classifier model
class BERTClassifier(nn.Module):
    def __init__(self, bert_model, hidden_dim, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert_model = bert_model
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, input_ids, attention_mask):
        output = self.bert_model(input_ids, attention_mask)
        pooled_output = output['pooler_output']
        logits = self.fc(self.dropout(pooled_output))
        return logits

# Training and validation
for fold, (train_index, val_index) in enumerate(skf.split(X_encodings['input_ids'], y_tensor), 1):
    print(f"Processing Fold: {fold}")
    
    X_train_fold = {key: val[train_index] for key, val in X_encodings.items()}
    X_val_fold = {key: val[val_index] for key, val in X_encodings.items()}
    y_train_fold = y_tensor[train_index]
    y_val_fold = y_tensor[val_index]

    train_dataset = TensorDataset(X_train_fold['input_ids'], X_train_fold['attention_mask'], y_train_fold)
    val_dataset = TensorDataset(X_val_fold['input_ids'], X_val_fold['attention_mask'], y_val_fold)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Initialize model, optimizer, and loss function
    bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')
    model = BERTClassifier(bert_model, bert_model.config.hidden_size, num_classes=2).to(DEVICE)
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
    criterion = nn.CrossEntropyLoss()

    best_accuracy = 0.0
    for epoch in range(NUM_EPOCHS):
        model.train()
        train_loss, train_correct = 0.0, 0

        for batch in train_loader:
            input_ids, attention_mask, labels = [item.to(DEVICE) for item in batch]
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * input_ids.size(0)
            train_correct += (torch.argmax(logits, dim=1) == labels).sum().item()

        train_accuracy = 100 * train_correct / len(train_dataset)

        # Validation
        model.eval()
        val_loss, val_correct = 0.0, 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = [item.to(DEVICE) for item in batch]
                logits = model(input_ids, attention_mask)
                loss = criterion(logits, labels)
                val_loss += loss.item() * input_ids.size(0)
                val_correct += (torch.argmax(logits, dim=1) == labels).sum().item()

        val_accuracy = 100 * val_correct / len(val_dataset)
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Train Acc: {train_accuracy:.2f}%, Val Acc: {val_accuracy:.2f}%")

        # Save the best model
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), f"best_model_fold{fold}.pth")

    print(f"Best Validation Accuracy for Fold {fold}: {best_accuracy:.2f}%")

# Inference and evaluation can be added here as needed


Using device: cuda
First few rows of the dataset:
   label                                               text
0      1  কারণ আমরা একলাফে চাকরি পাওয়ার সপ্ন দেখি ছোটখাট...
1      0  অরন্যদেব ছবিতে যে ছিল আমার কন্যা , সে আজ থেকে ...
2      1                ঢাকার কষ্টের আরেক নাম ব্যাচেলর জীবন
3      0  ইয়াশ রোহানতটিনী সেরা জুটিতটিনীকে আমার অনেক ভাল...
4      1  জনগণের ভোট চুরি করে ক্ষমতায় আসা ওই নিয়ত আমাদ...
Tokenization successful. Encoded input shape: torch.Size([9376, 151])
Processing Fold: 1
Epoch 1/10, Train Acc: 63.49%, Val Acc: 50.37%
Epoch 2/10, Train Acc: 67.21%, Val Acc: 50.37%
Epoch 3/10, Train Acc: 56.63%, Val Acc: 63.86%
Epoch 4/10, Train Acc: 67.21%, Val Acc: 67.80%
Epoch 5/10, Train Acc: 73.09%, Val Acc: 76.71%
Epoch 6/10, Train Acc: 77.07%, Val Acc: 76.76%
Epoch 7/10, Train Acc: 80.35%, Val Acc: 79.32%
Epoch 8/10, Train Acc: 83.19%, Val Acc: 83.26%
Epoch 9/10, Train Acc: 85.77%, Val Acc: 82.89%
Epoch 10/10, Train Acc: 87.55%, Val Acc: 83.90%
Best Validation Accuracy for F

In [6]:
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Assuming `model` is your trained model, and `val_loader` is the DataLoader for validation data
# Also assuming `DEVICE` is set to either "cuda" or "cpu"

# Set the model to evaluation mode
model.eval()

# Initialize lists to collect predictions and true labels
all_preds = []
all_labels = []

# Perform inference on the validation data
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = [item.to(DEVICE) for item in batch]
        logits = model(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Compute confusion matrix and classification report
conf_matrix = confusion_matrix(all_labels, all_preds)
class_report = classification_report(all_labels, all_preds, target_names=["Class 0", "Class 1"])

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=["Class 0", "Class 1"], yticklabels=["Class 0", "Class 1"])
plt.title("16 batch-size_10-epocs_confusion_matrix_Bangla-BERT-SGD")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")

# Save confusion matrix as a JPG file
conf_matrix_path = "/kaggle/working/16 batch-size_10-epocs_confusion_matrix_Bangla-BERT-SGD.jpg"
plt.savefig(conf_matrix_path)
plt.close()

# Save classification report as a text file
class_report_path = "/kaggle/working/classification_report.txt"
with open(class_report_path, "w") as f:
    f.write(class_report)

print(f"Confusion Matrix saved as {conf_matrix_path}")
print(f"Classification Report saved as {class_report_path}")


Confusion Matrix saved as /kaggle/working/16 batch-size_10-epocs_confusion_matrix_Bangla-BERT-SGD.jpg
Classification Report saved as /kaggle/working/classification_report.txt


# Bert_SGD_Optimizer (32 batchsize) 10 epoch

In [7]:
# Install required libraries
!pip install transformers

# Import necessary libraries
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Set device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# Load the dataset
dataset_path = "/kaggle/input/proccess-data/Processed Data.csv"  # Adjust path for Kaggle
df = pd.read_csv(dataset_path)

# Display first few rows of the dataset to understand its structure
print("First few rows of the dataset:")
print(df.head())

# Check if 'text' and 'label' columns exist
if 'text' not in df.columns or 'label' not in df.columns:
    raise ValueError("Dataset must contain 'text' and 'label' columns.")

# Tokenize and prepare data
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Ensure the 'text' column is correctly passed as a list of strings
X_encodings = tokenizer(list(df['text']), truncation=True, padding=True, max_length=512, return_tensors="pt")
y_tensor = torch.tensor(df['label'].values)

# Check if tokenization is successful
print(f"Tokenization successful. Encoded input shape: {X_encodings['input_ids'].shape}")

# Stratified K-Fold setup
NUM_FOLDS = 5
BATCH_SIZE = 32
LEARNING_RATE = 1e-3
MOMENTUM = 0.9
NUM_EPOCHS = 10
DROPOUT = 0.1

skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)

# Define the BERT classifier model
class BERTClassifier(nn.Module):
    def __init__(self, bert_model, hidden_dim, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert_model = bert_model
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, input_ids, attention_mask):
        output = self.bert_model(input_ids, attention_mask)
        pooled_output = output['pooler_output']
        logits = self.fc(self.dropout(pooled_output))
        return logits

# Training and validation
for fold, (train_index, val_index) in enumerate(skf.split(X_encodings['input_ids'], y_tensor), 1):
    print(f"Processing Fold: {fold}")
    
    X_train_fold = {key: val[train_index] for key, val in X_encodings.items()}
    X_val_fold = {key: val[val_index] for key, val in X_encodings.items()}
    y_train_fold = y_tensor[train_index]
    y_val_fold = y_tensor[val_index]

    train_dataset = TensorDataset(X_train_fold['input_ids'], X_train_fold['attention_mask'], y_train_fold)
    val_dataset = TensorDataset(X_val_fold['input_ids'], X_val_fold['attention_mask'], y_val_fold)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Initialize model, optimizer, and loss function
    bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')
    model = BERTClassifier(bert_model, bert_model.config.hidden_size, num_classes=2).to(DEVICE)
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
    criterion = nn.CrossEntropyLoss()

    best_accuracy = 0.0
    for epoch in range(NUM_EPOCHS):
        model.train()
        train_loss, train_correct = 0.0, 0

        for batch in train_loader:
            input_ids, attention_mask, labels = [item.to(DEVICE) for item in batch]
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * input_ids.size(0)
            train_correct += (torch.argmax(logits, dim=1) == labels).sum().item()

        train_accuracy = 100 * train_correct / len(train_dataset)

        # Validation
        model.eval()
        val_loss, val_correct = 0.0, 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = [item.to(DEVICE) for item in batch]
                logits = model(input_ids, attention_mask)
                loss = criterion(logits, labels)
                val_loss += loss.item() * input_ids.size(0)
                val_correct += (torch.argmax(logits, dim=1) == labels).sum().item()

        val_accuracy = 100 * val_correct / len(val_dataset)
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Train Acc: {train_accuracy:.2f}%, Val Acc: {val_accuracy:.2f}%")

        # Save the best model
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), f"best_model_fold{fold}.pth")

    print(f"Best Validation Accuracy for Fold {fold}: {best_accuracy:.2f}%")

# Inference and evaluation can be added here as needed


Using device: cuda
First few rows of the dataset:
   label                                               text
0      1  কারণ আমরা একলাফে চাকরি পাওয়ার সপ্ন দেখি ছোটখাট...
1      0  অরন্যদেব ছবিতে যে ছিল আমার কন্যা , সে আজ থেকে ...
2      1                ঢাকার কষ্টের আরেক নাম ব্যাচেলর জীবন
3      0  ইয়াশ রোহানতটিনী সেরা জুটিতটিনীকে আমার অনেক ভাল...
4      1  জনগণের ভোট চুরি করে ক্ষমতায় আসা ওই নিয়ত আমাদ...
Tokenization successful. Encoded input shape: torch.Size([9376, 151])
Processing Fold: 1
Epoch 1/10, Train Acc: 65.89%, Val Acc: 75.32%
Epoch 2/10, Train Acc: 76.23%, Val Acc: 78.89%
Epoch 3/10, Train Acc: 78.92%, Val Acc: 80.06%
Epoch 4/10, Train Acc: 82.92%, Val Acc: 80.97%
Epoch 5/10, Train Acc: 85.65%, Val Acc: 82.78%
Epoch 6/10, Train Acc: 88.40%, Val Acc: 83.58%
Epoch 7/10, Train Acc: 88.61%, Val Acc: 84.91%
Epoch 8/10, Train Acc: 90.68%, Val Acc: 84.12%
Epoch 9/10, Train Acc: 91.49%, Val Acc: 85.87%
Epoch 10/10, Train Acc: 93.71%, Val Acc: 86.03%
Best Validation Accuracy for F

In [8]:
import torch
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Load the best model from the last fold (or a specific fold as needed)
best_model_path = f"best_model_fold{NUM_FOLDS}.pth"
model.load_state_dict(torch.load(best_model_path))
model.eval()

# Prepare for evaluation
all_preds = []
all_labels = []

# Perform inference on the validation data
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = [item.to(DEVICE) for item in batch]
        logits = model(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Compute confusion matrix
conf_matrix = confusion_matrix(all_labels, all_preds)

# Generate classification report
class_report = classification_report(all_labels, all_preds, target_names=["Class 0", "Class 1"])

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=["Class 0", "Class 1"], yticklabels=["Class 0", "Class 1"])
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")

# Save confusion matrix as a JPG file
conf_matrix_path = "32 batch-size_10-epocs_confusion_matrix_Bangla-BERT-SGD.jpg"
plt.savefig(conf_matrix_path)
plt.close()

# Save the classification report to a text file
class_report_path = "32 batch-size_10-epocs_confusion_matrix_Bangla-BERT-SGD.txt"
with open(class_report_path, "w") as f:
    f.write(class_report)

print(f"Confusion Matrix saved as {conf_matrix_path}")
print(f"Classification Report saved as {class_report_path}")


  model.load_state_dict(torch.load(best_model_path))


Confusion Matrix saved as 32 batch-size_10-epocs_confusion_matrix_Bangla-BERT-SGD.jpg
Classification Report saved as 32 batch-size_10-epocs_confusion_matrix_Bangla-BERT-SGD.txt


# Bert_SGD_Optimizer (68 batchsize) 10 epoch

In [9]:
# Install required libraries
!pip install transformers

# Import necessary libraries
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Set device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# Load the dataset
dataset_path = "/kaggle/input/proccess-data/Processed Data.csv"  # Adjust path for Kaggle
df = pd.read_csv(dataset_path)

# Display first few rows of the dataset to understand its structure
print("First few rows of the dataset:")
print(df.head())

# Check if 'text' and 'label' columns exist
if 'text' not in df.columns or 'label' not in df.columns:
    raise ValueError("Dataset must contain 'text' and 'label' columns.")

# Tokenize and prepare data
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Ensure the 'text' column is correctly passed as a list of strings
X_encodings = tokenizer(list(df['text']), truncation=True, padding=True, max_length=512, return_tensors="pt")
y_tensor = torch.tensor(df['label'].values)

# Check if tokenization is successful
print(f"Tokenization successful. Encoded input shape: {X_encodings['input_ids'].shape}")

# Stratified K-Fold setup
NUM_FOLDS = 5
BATCH_SIZE = 68
LEARNING_RATE = 1e-3
MOMENTUM = 0.9
NUM_EPOCHS = 10
DROPOUT = 0.1

skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)

# Define the BERT classifier model
class BERTClassifier(nn.Module):
    def __init__(self, bert_model, hidden_dim, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert_model = bert_model
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, input_ids, attention_mask):
        output = self.bert_model(input_ids, attention_mask)
        pooled_output = output['pooler_output']
        logits = self.fc(self.dropout(pooled_output))
        return logits

# Training and validation
for fold, (train_index, val_index) in enumerate(skf.split(X_encodings['input_ids'], y_tensor), 1):
    print(f"Processing Fold: {fold}")
    
    X_train_fold = {key: val[train_index] for key, val in X_encodings.items()}
    X_val_fold = {key: val[val_index] for key, val in X_encodings.items()}
    y_train_fold = y_tensor[train_index]
    y_val_fold = y_tensor[val_index]

    train_dataset = TensorDataset(X_train_fold['input_ids'], X_train_fold['attention_mask'], y_train_fold)
    val_dataset = TensorDataset(X_val_fold['input_ids'], X_val_fold['attention_mask'], y_val_fold)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Initialize model, optimizer, and loss function
    bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')
    model = BERTClassifier(bert_model, bert_model.config.hidden_size, num_classes=2).to(DEVICE)
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
    criterion = nn.CrossEntropyLoss()

    best_accuracy = 0.0
    for epoch in range(NUM_EPOCHS):
        model.train()
        train_loss, train_correct = 0.0, 0

        for batch in train_loader:
            input_ids, attention_mask, labels = [item.to(DEVICE) for item in batch]
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * input_ids.size(0)
            train_correct += (torch.argmax(logits, dim=1) == labels).sum().item()

        train_accuracy = 100 * train_correct / len(train_dataset)

        # Validation
        model.eval()
        val_loss, val_correct = 0.0, 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = [item.to(DEVICE) for item in batch]
                logits = model(input_ids, attention_mask)
                loss = criterion(logits, labels)
                val_loss += loss.item() * input_ids.size(0)
                val_correct += (torch.argmax(logits, dim=1) == labels).sum().item()

        val_accuracy = 100 * val_correct / len(val_dataset)
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Train Acc: {train_accuracy:.2f}%, Val Acc: {val_accuracy:.2f}%")

        # Save the best model
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), f"best_model_fold{fold}.pth")

    print(f"Best Validation Accuracy for Fold {fold}: {best_accuracy:.2f}%")

# Inference and evaluation can be added here as needed


Using device: cuda
First few rows of the dataset:
   label                                               text
0      1  কারণ আমরা একলাফে চাকরি পাওয়ার সপ্ন দেখি ছোটখাট...
1      0  অরন্যদেব ছবিতে যে ছিল আমার কন্যা , সে আজ থেকে ...
2      1                ঢাকার কষ্টের আরেক নাম ব্যাচেলর জীবন
3      0  ইয়াশ রোহানতটিনী সেরা জুটিতটিনীকে আমার অনেক ভাল...
4      1  জনগণের ভোট চুরি করে ক্ষমতায় আসা ওই নিয়ত আমাদ...
Tokenization successful. Encoded input shape: torch.Size([9376, 151])
Processing Fold: 1
Epoch 1/10, Train Acc: 64.29%, Val Acc: 66.84%
Epoch 2/10, Train Acc: 72.75%, Val Acc: 76.33%
Epoch 3/10, Train Acc: 76.81%, Val Acc: 78.73%
Epoch 4/10, Train Acc: 80.24%, Val Acc: 80.81%
Epoch 5/10, Train Acc: 82.11%, Val Acc: 80.81%
Epoch 6/10, Train Acc: 83.92%, Val Acc: 82.46%
Epoch 7/10, Train Acc: 87.21%, Val Acc: 83.90%
Epoch 8/10, Train Acc: 89.12%, Val Acc: 84.12%
Epoch 9/10, Train Acc: 90.97%, Val Acc: 84.43%
Epoch 10/10, Train Acc: 91.61%, Val Acc: 85.66%
Best Validation Accuracy for F

In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Load the best model from the last fold (or any specific fold you want to evaluate)
best_model_path = f"best_model_fold{NUM_FOLDS}.pth"
model.load_state_dict(torch.load(best_model_path))
model.eval()

# Initialize lists to collect predictions and true labels
all_preds = []
all_labels = []

# Perform inference on the validation data
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = [item.to(DEVICE) for item in batch]
        logits = model(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Compute confusion matrix and classification report
conf_matrix = confusion_matrix(all_labels, all_preds)
class_report = classification_report(all_labels, all_preds, target_names=["Class 0", "Class 1"])

# Plot and save confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=["Class 0", "Class 1"], yticklabels=["Class 0", "Class 1"])
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")

# Save confusion matrix as a JPG file
conf_matrix_path = "64 batch-size_10-epocs_confusion_matrix_Bangla-BERT-SGD.jpg"
plt.savefig(conf_matrix_path)
plt.close()

# Save the classification report to a text file
class_report_path = "64 batch-size_10-epocs_confusion_matrix_Bangla-BERT-SGD.txt"
with open(class_report_path, "w") as f:
    f.write(class_report)

print(f"Confusion Matrix saved as {conf_matrix_path}")
print(f"Classification Report saved as {class_report_path}")


  model.load_state_dict(torch.load(best_model_path))


Confusion Matrix saved as 64 batch-size_10-epocs_confusion_matrix_Bangla-BERT-SGD.jpg
Classification Report saved as 64 batch-size_10-epocs_confusion_matrix_Bangla-BERT-SGD.txt
