In [2]:
import pandas as pd
import numpy as np
import re
import os
from sklearn.model_selection import train_test_split

# Paths to datasets
data_path = '/kaggle/input/train-dataset-trans-learn'
files = ['train_en_l1.csv', 'train_hi_l1.csv', 'train_ta_l1.csv']

# Function to load datasets with language info
def load_data(files, data_path):
    dfs = []
    for file in files:
        lang = file.split('_')[1] 
        df = pd.read_csv(os.path.join(data_path, file))
        df['language'] = lang
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

# Load all datasets
df = load_data(files, data_path)

# Drop rows where label is still missing
df.dropna(subset=['label'], inplace=True)

# Step 3: Enhanced Text Cleaning
def clean_text(text):
    text = text.lower()  # Lowercase the text

    # Preserve hashtags and mentions
    text = re.sub(r'@\w+', '[USER]', text)  # Replace handles with [USER]
    text = re.sub(r'http\S+|www\S+', '[URL]', text)  # Replace URLs with [URL]

    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)

    # Keep alphanumeric (English, Hindi, Tamil) + hashtags and mentions
    text = re.sub(r'[^a-zA-Z0-9#@ऀ-ॿ஀-௿\s]', ' ', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

df['cleaned_text'] = df['text'].apply(clean_text)

# Step 4: Finalize Dataset
df_final = df[['cleaned_text', 'label', 'language']]

# Save cleaned dataset
df_final.to_csv('cleaned_l1.csv', index=False)

print("Data preparation complete! Cleaned dataset saved as 'cleaned_l1.csv'.")

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

print(f"Training set: {len(train_df)} samples")
print(f"Validation set: {len(val_df)} samples")

Data preparation complete! Cleaned dataset saved as 'cleaned_l1.csv'.
Training set: 44946 samples
Validation set: 11237 samples


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch import nn
import numpy as np
from tqdm import tqdm

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Step 5: Load mBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')

# Step 6: Create a custom dataset class
class AbusiveLanguageDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        # Tokenize the text
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Step 7: Create the classifier model
class AbusiveLanguageClassifier(nn.Module):
    def __init__(self, bert_model, num_classes=2, dropout_rate=0.3):
        super(AbusiveLanguageClassifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(768, num_classes)  # 768 is the size of BERT embeddings
    
    def forward(self, input_ids, attention_mask, token_type_ids):
        # Get BERT outputs
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        # Use the CLS token representation
        pooled_output = outputs.pooler_output
        
        # Apply dropout and the classifier
        dropout_output = self.dropout(pooled_output)
        logits = self.linear(dropout_output)
        
        return logits

# Step 8: Training function
def train_model(model, train_loader, val_loader, learning_rate=2e-5, epochs=3):
    # Use GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    model = model.to(device)
    
    # Set up optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    
    # Set up loss function
    criterion = nn.CrossEntropyLoss()
    
    # Training loop
    best_val_f1 = 0
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        
        # Training phase
        model.train()
        train_loss = 0
        
        for batch in tqdm(train_loader, desc="Training"):
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)
            
            # Forward pass
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, token_type_ids)
            
            # Calculate loss
            loss = criterion(outputs, labels)
            train_loss += loss.item()
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
        
        avg_train_loss = train_loss / len(train_loader)
        print(f"Average training loss: {avg_train_loss:.4f}")
        
        # Validation phase
        model.eval()
        val_loss = 0
        val_preds = []
        val_true = []
        
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                # Move batch to device
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                token_type_ids = batch['token_type_ids'].to(device)
                labels = batch['label'].to(device)
                
                # Forward pass
                outputs = model(input_ids, attention_mask, token_type_ids)
                
                # Calculate loss
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                
                # Get predictions
                _, preds = torch.max(outputs, dim=1)
                val_preds.extend(preds.cpu().tolist())
                val_true.extend(labels.cpu().tolist())
        
        # Calculate metrics
        val_accuracy = accuracy_score(val_true, val_preds)
        val_f1 = f1_score(val_true, val_preds, average='macro')
        val_precision = precision_score(val_true, val_preds, average='macro')
        val_recall = recall_score(val_true, val_preds, average='macro')
        
        avg_val_loss = val_loss / len(val_loader)
        print(f"Validation Loss: {avg_val_loss:.4f}")
        print(f"Validation Metrics - Accuracy: {val_accuracy:.4f}, Macro F1: {val_f1:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}")
        
        # Save the best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), 'best_mbert_abusive_classifier.pt')
            print("Saved best model!")
    
    return model

# Step 9: Prepare data loaders
# Convert labels to numeric if they're strings
label_to_id = {'NOT': 0, 'HOF': 1} if isinstance(train_df['label'].iloc[0], str) else None

if label_to_id:
    train_labels = [label_to_id[label] for label in train_df['label']]
    val_labels = [label_to_id[label] for label in val_df['label']]
else:
    train_labels = train_df['label'].tolist()
    val_labels = val_df['label'].tolist()

# Create datasets
train_dataset = AbusiveLanguageDataset(
    texts=train_df['cleaned_text'].tolist(),
    labels=train_labels,
    tokenizer=tokenizer
)

val_dataset = AbusiveLanguageDataset(
    texts=val_df['cleaned_text'].tolist(),
    labels=val_labels,
    tokenizer=tokenizer
)

# Create data loaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Step 10: Initialize model
classifier = AbusiveLanguageClassifier(bert_model)

# Step 11: Train model
trained_model = train_model(
    model=classifier,
    train_loader=train_loader,
    val_loader=val_loader,
    learning_rate=2e-5,
    epochs=3
)

2025-04-15 09:11:43.199977: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744708303.410514      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744708303.463397      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Using device: cuda
Epoch 1/3


Training: 100%|██████████| 2810/2810 [09:53<00:00,  4.73it/s]


Average training loss: 0.3422


Validation: 100%|██████████| 703/703 [00:42<00:00, 16.42it/s]


Validation Loss: 0.2990
Validation Metrics - Accuracy: 0.8782, Macro F1: 0.8734, Precision: 0.8721, Recall: 0.8749
Saved best model!
Epoch 2/3


Training: 100%|██████████| 2810/2810 [09:53<00:00,  4.74it/s]


Average training loss: 0.2509


Validation: 100%|██████████| 703/703 [00:42<00:00, 16.40it/s]


Validation Loss: 0.2669
Validation Metrics - Accuracy: 0.8867, Macro F1: 0.8825, Precision: 0.8806, Recall: 0.8846
Saved best model!
Epoch 3/3


Training: 100%|██████████| 2810/2810 [09:53<00:00,  4.74it/s]


Average training loss: 0.2020


Validation: 100%|██████████| 703/703 [00:42<00:00, 16.40it/s]


Validation Loss: 0.2844
Validation Metrics - Accuracy: 0.8872, Macro F1: 0.8828, Precision: 0.8817, Recall: 0.8840
Saved best model!


Preparation of test data


In [4]:
# Paths to datasets
data_path = '/kaggle/input/test-dataset-trans-learn'
files = ['test_en_l1.csv', 'test_hi_l1.csv', 'test_ta_l1.csv']

# Function to load datasets with language info
def load_data(files, data_path):
    dfs = []
    for file in files:
        lang = file.split('_')[1] 
        df = pd.read_csv(os.path.join(data_path, file))
        df['language'] = lang
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

# Load all datasets
df = load_data(files, data_path)

# Drop rows where label is still missing
df.dropna(subset=['label'], inplace=True)

# Step 3: Enhanced Text Cleaning
def clean_text(text):
    text = text.lower()  # Lowercase the text

    # Preserve hashtags and mentions
    text = re.sub(r'@\w+', '[USER]', text)  # Replace handles with [USER]
    text = re.sub(r'http\S+|www\S+', '[URL]', text)  # Replace URLs with [URL]

    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)

    # Keep alphanumeric (English, Hindi, Tamil) + hashtags and mentions
    text = re.sub(r'[^a-zA-Z0-9#@ऀ-ॿ஀-௿\s]', ' ', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

df['cleaned_text'] = df['text'].apply(clean_text)

# Step 4: Finalize Dataset
df_final = df[['cleaned_text', 'label', 'language']]

# Save cleaned dataset
df_final.to_csv('cleaned_test.csv', index=False)

print("Data preparation complete! Cleaned dataset saved as 'cleaned_test.csv'.")



Data preparation complete! Cleaned dataset saved as 'cleaned_test.csv'.


In [5]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Load the test data
test_df = pd.read_csv('/kaggle/working/cleaned_test.csv')
print(f"Test dataset shape: {test_df.shape}")
print(test_df['label'].value_counts())

# Create DataLoader for test data
batch_size = 16

label_to_id = {'NOT': 0, 'HOF': 1} if isinstance(test_df['label'].iloc[0], str) else None

if label_to_id:
    test_labels = [label_to_id[label] for label in test_df['label']]
else:
    test_labels = test_df['label'].tolist()
    
test_dataset = AbusiveLanguageDataset(texts=test_df['cleaned_text'].tolist(),
    labels=test_labels,
    tokenizer=tokenizer
)

test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your trained model
bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')
classifier = AbusiveLanguageClassifier(bert_model)  # Using your existing class
classifier.load_state_dict(torch.load('best_mbert_abusive_classifier.pt', map_location=device))
classifier.to(device)
classifier.eval()

# Evaluate on test set
test_preds = []
test_true = []
test_probs = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['label'].to(device)
        
        outputs = classifier(input_ids, attention_mask, token_type_ids)
        probabilities = torch.softmax(outputs, dim=1)
        _, preds = torch.max(outputs, dim=1)
        
        test_preds.extend(preds.cpu().tolist())
        test_true.extend(labels.cpu().tolist())
        test_probs.extend(probabilities[:, 1].cpu().tolist())  # Probability of positive class

# Calculate metrics
test_accuracy = accuracy_score(test_true, test_preds)
test_f1_macro = f1_score(test_true, test_preds, average='macro')
test_f1_weighted = f1_score(test_true, test_preds, average='weighted')
test_precision = precision_score(test_true, test_preds, average='macro')
test_recall = recall_score(test_true, test_preds, average='macro')

# Print metrics
print("\nTest Evaluation Results:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"F1 Score (Macro): {test_f1_macro:.4f}")
print(f"F1 Score (Weighted): {test_f1_weighted:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")


Test dataset shape: (19511, 3)
label
1    11943
0     7568
Name: count, dtype: int64
Using device: cuda


  classifier.load_state_dict(torch.load('best_mbert_abusive_classifier.pt', map_location=device))
Testing: 100%|██████████| 1220/1220 [01:14<00:00, 16.43it/s]


Test Evaluation Results:
Accuracy: 0.8921
F1 Score (Macro): 0.8865
F1 Score (Weighted): 0.8921
Precision: 0.8861
Recall: 0.8869





Fine Tuning

In [6]:
tuned_train_df = pd.read_csv("/kaggle/input/transfer-learn-test/cleaned_dataset_l1.csv")

text_column = 'cleaned_text'
label_column = 'label'

tuned_train_df, tuned_val_df = train_test_split(
    tuned_train_df,
    test_size=0.2,
    random_state=42,
    stratify=tuned_train_df[label_column]
)

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Check if labels are strings or integers
if isinstance(train_df[label_column].iloc[0], str):
    # Adjust this mapping based on your dataset
    label_to_id = {'NOT': 0, 'HOF': 1}
    train_labels = [label_to_id[label] for label in train_df[label_column]]
    val_labels = [label_to_id[label] for label in val_df[label_column]]
else:
    train_labels = train_df[label_column].tolist()
    val_labels = val_df[label_column].tolist()

# Create datasets
train_dataset = AbusiveLanguageDataset(
    texts=train_df[text_column].tolist(),
    labels=train_labels,
    tokenizer=tokenizer
)

val_dataset = AbusiveLanguageDataset(
    texts=val_df[text_column].tolist(),
    labels=val_labels,
    tokenizer=tokenizer
)

# Create data loaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Load pre-trained model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load base BERT model
bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')
bert_model.to(device)

# Initialize classifier
classifier = AbusiveLanguageClassifier(bert_model)

# Load your previously trained model weights
classifier.load_state_dict(torch.load('best_mbert_abusive_classifier.pt', map_location=device))
classifier.to(device)

# _________________________________________
#   FINE TUNING FUNCTION
# ________________________________

def fine_tune_model(model, train_loader, val_loader, learning_rate=2e-5, epochs=3):
    model.train()
    
    # We'll use a smaller learning rate for fine-tuning
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = torch.nn.CrossEntropyLoss()
    
    best_val_f1 = 0
    
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        
        # Training phase
        model.train()
        train_loss = 0
        
        for batch in tqdm(train_loader, desc="Training"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)
            
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, token_type_ids)
            
            loss = criterion(outputs, labels)
            train_loss += loss.item()
            
            loss.backward()
            optimizer.step()
        
        avg_train_loss = train_loss / len(train_loader)
        print(f"Average training loss: {avg_train_loss:.4f}")
        
        # Validation phase
        model.eval()
        val_loss = 0
        val_preds = []
        val_true = []
        
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                token_type_ids = batch['token_type_ids'].to(device)
                labels = batch['label'].to(device)
                
                outputs = model(input_ids, attention_mask, token_type_ids)
                
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                
                _, preds = torch.max(outputs, dim=1)
                val_preds.extend(preds.cpu().tolist())
                val_true.extend(labels.cpu().tolist())
        
        # Calculate metrics
        val_accuracy = accuracy_score(val_true, val_preds)
        val_f1 = f1_score(val_true, val_preds, average='macro')
        val_precision = precision_score(val_true, val_preds, average='macro')
        val_recall = recall_score(val_true, val_preds, average='macro')
        
        avg_val_loss = val_loss / len(val_loader)
        print(f"Validation Loss: {avg_val_loss:.4f}")
        print(f"Validation Metrics - Accuracy: {val_accuracy:.4f}, F1: {val_f1:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}")
        
        # Save the best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), 'best_fine_tuned_classifier.pt')
            print("Saved best fine-tuned model!")
    
    return model

#  FINE TUNE THE MODEL

fine_tuned_model = fine_tune_model(
    model=classifier,
    train_loader=train_loader,
    val_loader=val_loader,
    learning_rate=5e-6,  # Lower learning rate for fine-tuning
    epochs=3
)

# Load best fine-tuned model for evaluation
classifier.load_state_dict(torch.load('best_fine_tuned_classifier.pt'))
classifier.eval()

# Evaluate on validation set
val_preds = []
val_true = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Final Evaluation"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['label'].to(device)
        
        outputs = classifier(input_ids, attention_mask, token_type_ids)
        _, preds = torch.max(outputs, dim=1)
        
        val_preds.extend(preds.cpu().tolist())
        val_true.extend(labels.cpu().tolist())

# Calculate final metrics
final_accuracy = accuracy_score(val_true, val_preds)
final_f1 = f1_score(val_true, val_preds, average='macro')
final_precision = precision_score(val_true, val_preds, average='macro')
final_recall = recall_score(val_true, val_preds, average='macro')

print("\nFinal Evaluation Results:")
print(f"Accuracy: {final_accuracy:.4f}")
print(f"F1 Score: {final_f1:.4f}")
print(f"Precision: {final_precision:.4f}")
print(f"Recall: {final_recall:.4f}")

Using device: cuda


  classifier.load_state_dict(torch.load('best_mbert_abusive_classifier.pt', map_location=device))


Epoch 1/3


Training: 100%|██████████| 2810/2810 [09:53<00:00,  4.73it/s]


Average training loss: 0.1215


Validation: 100%|██████████| 703/703 [00:42<00:00, 16.40it/s]


Validation Loss: 0.3231
Validation Metrics - Accuracy: 0.8883, F1: 0.8842, Precision: 0.8821, Recall: 0.8868
Saved best fine-tuned model!
Epoch 2/3


Training: 100%|██████████| 2810/2810 [09:53<00:00,  4.73it/s]


Average training loss: 0.0881


Validation: 100%|██████████| 703/703 [00:42<00:00, 16.36it/s]


Validation Loss: 0.3365
Validation Metrics - Accuracy: 0.8858, F1: 0.8819, Precision: 0.8793, Recall: 0.8853
Epoch 3/3


Training: 100%|██████████| 2810/2810 [09:53<00:00,  4.73it/s]


Average training loss: 0.0674


Validation: 100%|██████████| 703/703 [00:42<00:00, 16.40it/s]
  classifier.load_state_dict(torch.load('best_fine_tuned_classifier.pt'))


Validation Loss: 0.4093
Validation Metrics - Accuracy: 0.8841, F1: 0.8796, Precision: 0.8782, Recall: 0.8812


Final Evaluation: 100%|██████████| 703/703 [00:42<00:00, 16.43it/s]


Final Evaluation Results:
Accuracy: 0.8883
F1 Score: 0.8842
Precision: 0.8821
Recall: 0.8868





In [7]:
# Paths to datasets
data_path = '/kaggle/input/fine-tuned-test-dataset'
files = ['test_en_l1.csv', 'test_hi_l1.csv','test_ta_l1.csv']

# Function to load datasets with language info
def load_data(files, data_path):
    dfs = []
    for file in files:
        lang = file.split('_')[1]
        file_path = os.path.join(data_path, file)
        try:
            # Try with 'c' engine (faster)
            # "engine": "python", "on_bad_lines": "warn"
            df = pd.read_csv(file_path, engine='python', encoding='utf-8', on_bad_lines='warn')
        except Exception as e:
            print(f"Failed with 'c' engine for {file}: {e}, trying 'python' engine...")
            try:
                df = pd.read_csv(file_path, engine='python', encoding='utf-8', error_bad_lines=False)
            except Exception as e2:
                print(f"Skipping {file} due to read error: {e2}")
                continue

        df['language'] = lang
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

# Load all datasets
df = load_data(files, data_path)

# Step 1: Handle Missing Values
def clean_missing_values(df):
    # Replace 'NL' with np.nan for easier handling
    df.replace('NL', np.nan, inplace=True)

    # Drop rows where all annotations are missing
    annotator_cols = [col for col in df.columns if re.match(r".*a[1-6]", col)]
    df.dropna(subset=annotator_cols, how='all', inplace=True)

    return df, annotator_cols

df, annotator_cols = clean_missing_values(df)

# Step 2: Create Final Label (Majority Vote)
def majority_vote(row):
    votes = row[annotator_cols].dropna().values.astype(float)
    if len(votes) == 0:
        return np.nan
    return 1.0 if votes.mean() >= 0.5 else 0.0

df['label'] = df.apply(majority_vote, axis=1)

# Drop rows where label is still missing
df.dropna(subset=['label'], inplace=True)

# Step 3: Enhanced Text Cleaning
def clean_text(text):
    text = text.lower()  # Lowercase the text

    # Preserve hashtags and mentions
    text = re.sub(r'@\w+', '[USER]', text)  # Replace handles with [USER]
    text = re.sub(r'http\S+|www\S+', '[URL]', text)  # Replace URLs with [URL]

    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)

    # Keep alphanumeric (English, Hindi, Tamil) + hashtags and mentions
    text = re.sub(r'[^a-zA-Z0-9#@ऀ-ॿ஀-௿\s]', ' ', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

df['cleaned_text'] = df['text'].apply(clean_text)

# Step 4: Finalize Dataset
df_final = df[['cleaned_text', 'label', 'language']]

# Save cleaned dataset
df_final.to_csv('fine_tuned_test.csv', index=False)

print("Data preparation complete! Cleaned dataset saved as 'fine_tuned_test.csv'.")

Data preparation complete! Cleaned dataset saved as 'fine_tuned_test.csv'.


In [9]:
fine_tuned_test_df = pd.read_csv('fine_tuned_test.csv')

text_column = 'cleaned_text'
label_column = 'label'

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

if isinstance(fine_tuned_test_df[label_column].iloc[0], str):
    label_to_id = {'NOT': 0, 'HOF': 1}  # Adjust based on your labels
    test_labels = [label_to_id[label] for label in fine_tuned_test_df[label_column]]
else:
    test_labels = fine_tuned_test_df[label_column].tolist()

# Create test dataset
test_dataset = AbusiveLanguageDataset(
    texts=fine_tuned_test_df[text_column].tolist(),
    labels=test_labels,
    tokenizer=tokenizer
)

# Create test dataloader
batch_size = 16
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load BERT model
bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')
bert_model.to(device)

# Initialize classifier with the fine-tuned model
classifier = AbusiveLanguageClassifier(bert_model)
classifier.load_state_dict(torch.load('best_fine_tuned_classifier.pt', map_location=device))
classifier.to(device)
classifier.eval()

# Evaluate on test set
test_preds = []
test_true = []
test_probs = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing Fine-tuned Model"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['label'].to(device)
        
        outputs = classifier(input_ids, attention_mask, token_type_ids)
        probabilities = torch.softmax(outputs, dim=1)
        _, preds = torch.max(outputs, dim=1)
        
        test_preds.extend(preds.cpu().tolist())
        test_true.extend(labels.cpu().tolist())
        test_probs.extend(probabilities[:, 1].cpu().tolist())  # Probability of positive class

# Calculate metrics
test_accuracy = accuracy_score(test_true, test_preds)
test_f1_macro = f1_score(test_true, test_preds, average='macro')
test_f1_weighted = f1_score(test_true, test_preds, average='weighted')
test_precision = precision_score(test_true, test_preds, average='macro')
test_recall = recall_score(test_true, test_preds, average='macro')

# Print metrics
print("\nFine-tuned Model Test Results:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"F1 Score (Macro): {test_f1_macro:.4f}")
print(f"F1 Score (Weighted): {test_f1_weighted:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")

# Generate detailed classification report
id_to_label = {0: 'NOT', 1: 'HOF'}  # Adjust based on your labels
target_names = [id_to_label[i] for i in sorted(id_to_label.keys())]
print("\nClassification Report:")
print(classification_report(test_true, test_preds, target_names=target_names))

# Generate confusion matrix
cm = confusion_matrix(test_true, test_preds)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Fine-tuned Model Confusion Matrix')
plt.savefig('fine_tuned_confusion_matrix.png')
plt.close()

Using device: cuda


  classifier.load_state_dict(torch.load('best_fine_tuned_classifier.pt', map_location=device))
Testing Fine-tuned Model: 100%|██████████| 235/235 [00:14<00:00, 15.93it/s]



Fine-tuned Model Test Results:
Accuracy: 0.6743
F1 Score (Macro): 0.6560
F1 Score (Weighted): 0.6878
Precision: 0.6628
Recall: 0.6931

Classification Report:
              precision    recall  f1-score   support

         NOT       0.85      0.65      0.74      2631
         HOF       0.47      0.74      0.58      1127

    accuracy                           0.67      3758
   macro avg       0.66      0.69      0.66      3758
weighted avg       0.74      0.67      0.69      3758

