In [1]:
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, classification_report
from sklearn.ensemble import RandomForestClassifier
from nltk.stem import WordNetLemmatizer
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')


In [2]:
#IMPORT DATA
train = pd.read_csv('/kaggle/input/dataset/train.csv')

In [3]:
# CHECK FOR NULL VALUES
print('Train dataset null values: ')
train.isnull().sum()

Train dataset null values: 


id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [4]:
#Preprocess the comment
COMMENT = 'comment_text'
train[COMMENT].fillna("unknown", inplace=True)
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
#split for training data
train = train.drop(columns=['id'], axis=1)
X = train['comment_text']
y = train.drop(columns=['comment_text'], axis=1)

In [6]:
# Split the data into training (70%) and test (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
from sklearn.metrics import accuracy_score, classification_report, hamming_loss, f1_score
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR

# Step 1: Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Step 2: Tokenize the input data
def tokenize_data(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')

# Tokenize the training and test data
X_train_tokens = tokenize_data(X_train.tolist())  # Convert X_train to a list of strings
X_test_tokens = tokenize_data(X_test.tolist())    # Convert X_test to a list of strings

# Step 3: Convert labels to tensor (for multilabel classification)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)  # Use float32 for multi-label classification
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Step 4: Create DataLoader
train_data = TensorDataset(X_train_tokens['input_ids'], X_train_tokens['attention_mask'], y_train_tensor)
test_data = TensorDataset(X_test_tokens['input_ids'], X_test_tokens['attention_mask'], y_test_tensor)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)

# Step 5: Initialize the BERT model for sequence classification (multi-label)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=y_train.shape[1])

# Step 6: Set up the optimizer and device
optimizer = AdamW(model.parameters(), lr=1e-5)  # Adjusted learning rate
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Step 7: Training the model
def train_model():
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        
        # Sigmoid activation for multi-label classification
        loss = F.binary_cross_entropy_with_logits(logits, labels)  # Binary Cross-Entropy loss
        loss.backward()
        optimizer.step()
        
    return loss.item()  # Return loss for logging

# Step 8: Evaluate the model
def evaluate_model():
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            
            # Sigmoid activation to get probabilities
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).float()  # Apply a threshold of 0.5 to decide if a label is predicted
            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)

    # Print classification report for multi-label classification
    print("Accuracy Score: ", accuracy_score(all_labels, all_preds))
    print("Hamming Loss: ", hamming_loss(all_labels, all_preds)) 
    print("F1 Score (micro): ", f1_score(all_labels, all_preds, average='micro'))
    print("Classification Report:\n", classification_report(all_labels, all_preds))

# Step 9: Train and evaluate the model over multiple epochs
def train_and_evaluate(num_epochs=3):
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")
        train_loss = train_model()  # Get training loss
        print(f"Training Loss: {train_loss:.4f}")
        evaluate_model()

# Train and evaluate the model
train_and_evaluate(num_epochs=4)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4
Training Loss: 0.0182
Accuracy Score:  0.9262199197860963
Hamming Loss:  0.015778186274509803
F1 Score (micro):  0.7589874494788343
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.77      0.83      4582
           1       0.00      0.00      0.00       486
           2       0.90      0.76      0.82      2556
           3       0.00      0.00      0.00       136
           4       0.80      0.68      0.74      2389
           5       0.83      0.06      0.10       432

   micro avg       0.87      0.67      0.76     10581
   macro avg       0.57      0.38      0.41     10581
weighted avg       0.82      0.67      0.73     10581
 samples avg       0.07      0.06      0.06     10581

Epoch 2/4
Training Loss: 0.0400
Accuracy Score:  0.9262199197860963
Hamming Loss:  0.01541262811942959
F1 Score (micro):  0.7889593364160746
Classification Report:
               precision    recall  f1-score   support

           0     