In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Authenticate with Kaggle before any data is downloaded; the header above
# notes that some of the data sources are private.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the competition files through kagglehub and keep whatever the call returns.
# NOTE(review): the loading cells below read from a hard-coded '/kaggle/input/...'
# path and never use this variable — confirm which location is intended.
data_255_toxic_comment_in_class_competition_path = kagglehub.competition_download('data-255-toxic-comment-in-class-competition')

print('Data source import complete.')


Import libraries

In [None]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import pickle


# Load the data
# Inspect the data


In [None]:
# Read the training CSV and preview its first rows.
data = pd.read_csv('/kaggle/input/data-255-toxic-comment-in-class-competition/train.csv')
print(data.head())

# Missing comment bodies become empty strings so later string handling never sees NaN.
data = data.fillna({'text': ''})

   id                                               text  toxicity  \
0   0  This is so cool. It's like, 'would you want yo...  0.000000   
1   1  Thank you!! This would make my life a lot less...  0.000000   
2   2  This is such an urgent design problem; kudos t...  0.000000   
3   3  Is this something I'll be able to install on m...  0.000000   
4   4               haha you guys are a bunch of losers.  0.893617   

   severe_toxicity  obscene  threat   insult  identity_attack  sexual_explicit  
0         0.000000      0.0     0.0  0.00000         0.000000              0.0  
1         0.000000      0.0     0.0  0.00000         0.000000              0.0  
2         0.000000      0.0     0.0  0.00000         0.000000              0.0  
3         0.000000      0.0     0.0  0.00000         0.000000              0.0  
4         0.021277      0.0     0.0  0.87234         0.021277              0.0  


# Data Cleaning

In [None]:
def clean_text(text):
    """Normalise a raw comment into lowercase, punctuation-free, single-spaced text.

    The steps run in a fixed order: strip URLs, strip @mentions and #hashtags,
    strip digit runs, strip ASCII punctuation, lowercase, trim, and finally
    collapse every whitespace run into one space.

    Args:
        text: The raw comment. Any non-string value (e.g. NaN or None) is
            treated as an empty comment.

    Returns:
        The cleaned string; it may be empty.
    """
    if not isinstance(text, str):
        return ''
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove mentions and hashtags (must happen before punctuation removal,
    # otherwise the '@' / '#' markers would already be gone)
    text = re.sub(r'@\w+|#\w+', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove punctuation (string.punctuation covers ASCII punctuation only)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Lowercase
    text = text.lower()
    # Remove extra whitespace. The pattern is a raw string: '\s' inside a
    # normal string literal is an invalid escape sequence.
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    return text


# Clean every raw comment once and store the result next to the original text.
data['clean_text'] = data['text'].map(clean_text)


In [None]:
# Preview the first rows, which now include the clean_text column.
data.head()

Unnamed: 0,id,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit,clean_text
0,0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,this is so cool its like would you want your m...
1,1,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,thank you this would make my life a lot less a...
2,2,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,this is such an urgent design problem kudos to...
3,3,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,is this something ill be able to install on my...
4,4,haha you guys are a bunch of losers.,0.893617,0.021277,0.0,0.0,0.87234,0.021277,0.0,haha you guys are a bunch of losers


# Tokenize

In [None]:
from collections import Counter

def tokenize(text):
    """Split text on whitespace and return the resulting list of tokens."""
    tokens = text.split()
    return tokens

# Turn every cleaned comment into its token list
data['tokens'] = data['clean_text'].apply(tokenize)

# Drop comments that cleaned down to zero tokens and report how many were removed
initial_count = len(data)
has_tokens = data['tokens'].apply(len) > 0
data = data[has_tokens].reset_index(drop=True)
filtered_count = len(data)
print(f'Removed {initial_count - filtered_count} empty samples.')

# Nothing can be trained on an empty frame
if not filtered_count:
    raise ValueError("All samples have empty sequences after preprocessing.")


# Vocabulary: flat token list -> frequency table -> words ordered most frequent first
all_tokens = [token for tokens in data['tokens'] for token in tokens]
counter = Counter(all_tokens)
vocab = [word for word, _count in counter.most_common()]

# Ids 0 and 1 are reserved for <PAD> and <UNK>, so real words start at 2
vocab_size = len(vocab) + 2
print(vocab_size,"vocab size")
word2idx = dict(zip(vocab, range(2, vocab_size)))
word2idx['<PAD>'] = 0
word2idx['<UNK>'] = 1

# Persist the mapping for later reuse
with open('word2idx.pkl', 'wb') as f:
    pickle.dump(word2idx, f)


Removed 2598 empty samples.
573750 vocab size


In [None]:
def tokens_to_indices(tokens, word2idx):
    """Map each token to its id in word2idx; unknown tokens get the '<UNK>' id."""
    indices = []
    for token in tokens:
        indices.append(word2idx.get(token, word2idx['<UNK>']))
    return indices

# Encode every token list as a list of vocabulary ids.
data['sequence'] = data['tokens'].apply(tokens_to_indices, word2idx=word2idx)


In [None]:
# Target columns predicted by the model.
label_cols = [
    'toxicity',
    'severe_toxicity',
    'obscene',
    'threat',
    'insult',
    'identity_attack',
    'sexual_explicit',
]
# Pack each row's scores into a single list stored in one column.
data['labels'] = data[label_cols].to_numpy().tolist()


In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)


# Verify no empty sequences


In [None]:
# Count zero-length token lists in each split.
empty_sequences = train_data['tokens'].map(len).eq(0)
print(f'Empty sequences in train set: {empty_sequences.sum()}')

empty_sequences = val_data['tokens'].map(len).eq(0)
print(f'Empty sequences in validation set: {empty_sequences.sum()}')


Empty sequences in train set: 0
Empty sequences in validation set: 0


# Prepare Dataset

In [None]:
class CommentDataset(Dataset):
    """Labelled comments served as fixed-length id tensors.

    Each item is a tuple ``(sequence, labels, length)``:
      * ``sequence`` – long tensor of ``max_len`` token ids, truncated or
        right-padded with the ``'<PAD>'`` id,
      * ``labels``   – float32 tensor built from the row's ``'labels'`` entry,
      * ``length``   – scalar long tensor holding the number of real
        (unpadded) tokens, capped at ``max_len``.

    Args:
        data: DataFrame with a ``'sequence'`` column (lists of token ids) and
            a ``'labels'`` column (lists of label scores).
        word2idx: Token-to-id mapping; only ``'<PAD>'`` is looked up here.
        max_len: Length every returned sequence is truncated or padded to.
    """

    def __init__(self, data, word2idx, max_len=100):
        self.data = data
        self.word2idx = word2idx
        self.max_len = max_len
        # Extract the two columns once. Indexing `data.iloc[idx]` per item
        # builds a whole row Series on every access, which is expensive on the
        # DataLoader hot path; plain list indexing is positional as well.
        self._sequences = data['sequence'].tolist()
        self._labels = data['labels'].tolist()

    def __len__(self):
        return len(self._sequences)

    def __getitem__(self, idx):
        # Slicing copies, so the stored list is never mutated by padding below.
        seq = self._sequences[idx][:self.max_len]
        label = self._labels[idx]
        length = len(seq)
        if length < self.max_len:
            seq = seq + [self.word2idx['<PAD>']] * (self.max_len - length)
        return (
            torch.tensor(seq, dtype=torch.long),
            torch.tensor(label, dtype=torch.float32),
            torch.tensor(length, dtype=torch.long),
        )

# Batch size and padded sequence length shared by every loader built from here on.
batch_size = 256
max_len = 128

train_dataset = CommentDataset(train_data, word2idx, max_len=max_len)
val_dataset = CommentDataset(val_data, word2idx, max_len=max_len)

# Only the training loader shuffles; validation keeps row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# Display the loader object to confirm it was constructed.
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7c4906fde8c0>

# Define Model

In [None]:
class LSTMClassifier(nn.Module):
    """Embedding -> (bi)LSTM -> linear -> sigmoid multi-label classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of labels scored per sample.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout applied between stacked LSTM layers. It is ignored
            when ``n_layers == 1``, because ``nn.LSTM`` only applies dropout
            between layers and warns if asked for it with a single layer.
        pad_idx: Token id used for padding; its embedding is kept at zero.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        lstm_out_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                            bidirectional=bidirectional, batch_first=True,
                            dropout=dropout if n_layers > 1 else 0.0)
        self.fc = nn.Linear(lstm_out_dim, output_dim)
        # NOTE(review): fc2 and fc_out are never used by forward(). They are
        # kept (in the original creation order) so state_dict keys and weight
        # initialisation stay compatible with already-saved checkpoints.
        self.fc2 = nn.Linear(lstm_out_dim, 128)
        self.fc_out = nn.Linear(128, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        """Score a padded batch.

        Args:
            text: Long tensor of token ids, ``[batch size, seq len]``.
            lengths: Number of real tokens per row. Moved to CPU here because
                ``pack_padded_sequence`` requires CPU lengths.

        Returns:
            Tensor ``[batch size, output_dim]`` of sigmoid probabilities.
        """
        # pack_padded_sequence rejects zero lengths; treat an empty row as a
        # single <PAD> step instead of crashing the whole batch.
        lengths = lengths.cpu().clamp(min=1)
        embedded = self.embedding(text)  # [batch size, seq len, embedding dim]
        packed_embedded = pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed_embedded)
        if self.lstm.bidirectional:
            # The last two slices of h_n are the top layer's forward and
            # backward final states.
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]
        return self.sigmoid(self.fc(hidden))


In [None]:
from sklearn.utils.class_weight import compute_class_weight  # left in place; no longer called in this cell

# Model hyperparameters
embedding_dim = 128
hidden_dim = 256
output_dim = len(label_cols)  # 7
n_layers = 2
bidirectional = True
dropout = 0.3
pad_idx = word2idx['<PAD>']

# Choose the device first: the model and the loss below are both moved onto it.
# (It used to be defined only after its first use, which fails on a fresh kernel.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx)
model = model.to(device)

# Unweighted binary cross-entropy over the seven sigmoid outputs.
# The earlier compute_class_weight(...) call was removed: that helper expects a
# 1-D vector of discrete class labels, whereas train_data['labels'] holds a
# list of seven continuous scores per row, and one weight per distinct row
# could not be broadcast against [batch, 7] predictions anyway.
# NOTE(review): to counter label imbalance, consider returning logits from the
# model and using nn.BCEWithLogitsLoss(pos_weight=...) with one weight per label.
criterion = nn.BCELoss().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


# Define train and evaluate function

In [None]:
def train(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over dataloader and return the mean batch loss.

    Every batch is a (sequences, labels, lengths) triple. Sequences and labels
    are moved to device; lengths are passed to the model as they are.
    """
    model.train()
    batch_losses = []
    for batch in dataloader:
        token_ids, targets, seq_lengths = batch
        token_ids, targets = token_ids.to(device), targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(token_ids, seq_lengths), targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    """Score dataloader without gradients.

    Returns a triple: the mean batch loss, the predictions thresholded at 0.5,
    and the labels thresholded at 0.5 (both as integer numpy arrays stacked
    over all batches).
    """
    model.eval()
    running_loss = 0
    pred_chunks, label_chunks = [], []
    with torch.no_grad():
        for token_ids, targets, seq_lengths in dataloader:
            token_ids = token_ids.to(device)
            targets = targets.to(device)
            # seq_lengths intentionally stays where it is (not moved to device)
            probs = model(token_ids, seq_lengths)
            running_loss += criterion(probs, targets).item()
            pred_chunks.append(probs.cpu().numpy())
            label_chunks.append(targets.cpu().numpy())

    stacked_preds = np.vstack(pred_chunks)
    stacked_labels = np.vstack(label_chunks)

    # Both sides are binarised at 0.5 because the labels are scores too
    mean_loss = running_loss / len(dataloader)
    return mean_loss, (stacked_preds >= 0.5).astype(int), (stacked_labels >= 0.5).astype(int)


# Training Loop

In [None]:
num_epochs = 50
# Stop once this many consecutive epochs fail to beat the best validation loss,
# so the remaining epochs are not spent on a model that is only overfitting.
patience = 5
best_val_loss = float('inf')
epochs_without_improvement = 0

for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    val_loss, val_preds_binary, val_labels_binary = evaluate(model, val_loader, criterion, device)

    # Calculate F1 Score and Accuracy
    f1 = f1_score(val_labels_binary, val_preds_binary, average='macro')
    accuracy = accuracy_score(val_labels_binary, val_preds_binary)

    print(f'Epoch {epoch+1}/{num_epochs}')
    print(f'Train Loss: {train_loss:.4f}')
    print(f'Val Loss: {val_loss:.4f} | Val F1: {f1:.4f} | Val Accuracy: {accuracy:.4f}')

    # Checkpoint only when validation loss reaches a new minimum
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), 'best_model.pt')
        print('Model saved!')
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print(f'Early stopping: no validation improvement for {patience} epochs.')
            break


Epoch 1/50
Train Loss: 0.1032
Val Loss: 0.0950 | Val F1: 0.3195 | Val Accuracy: 0.9396
Model saved!
Epoch 2/50
Train Loss: 0.0932
Val Loss: 0.0934 | Val F1: 0.3556 | Val Accuracy: 0.9400
Model saved!
Epoch 3/50
Train Loss: 0.0911
Val Loss: 0.0931 | Val F1: 0.3484 | Val Accuracy: 0.9406
Model saved!
Epoch 4/50
Train Loss: 0.0893
Val Loss: 0.0933 | Val F1: 0.3751 | Val Accuracy: 0.9413
Epoch 5/50
Train Loss: 0.0876
Val Loss: 0.0941 | Val F1: 0.3782 | Val Accuracy: 0.9397
Epoch 6/50
Train Loss: 0.0858
Val Loss: 0.0949 | Val F1: 0.3885 | Val Accuracy: 0.9399
Epoch 7/50
Train Loss: 0.0840
Val Loss: 0.0961 | Val F1: 0.4017 | Val Accuracy: 0.9390
Epoch 8/50
Train Loss: 0.0823
Val Loss: 0.0973 | Val F1: 0.3866 | Val Accuracy: 0.9388
Epoch 9/50
Train Loss: 0.0807
Val Loss: 0.0986 | Val F1: 0.4013 | Val Accuracy: 0.9379
Epoch 10/50
Train Loss: 0.0792
Val Loss: 0.1008 | Val F1: 0.4011 | Val Accuracy: 0.9368
Epoch 11/50
Train Loss: 0.0780
Val Loss: 0.1024 | Val F1: 0.4044 | Val Accuracy: 0.9358
Ep

# Load the best model


In [None]:
# Restore the checkpoint written by the training loop.
# map_location keeps the load working when the checkpoint was saved on a
# different device; weights_only=True restricts unpickling to tensor data,
# which is all a state_dict needs.
best_state = torch.load('best_model.pt', map_location=device, weights_only=True)
model.load_state_dict(best_state)

val_loss, val_preds_binary, val_labels_binary = evaluate(model, val_loader, criterion, device)

# Calculate final metrics
f1 = f1_score(val_labels_binary, val_preds_binary, average='macro')
accuracy = accuracy_score(val_labels_binary, val_preds_binary)

print(f'Final Val Loss: {val_loss:.4f}')
print(f'Final Val F1 Score: {f1:.4f}')
print(f'Final Val Accuracy: {accuracy:.4f}')


  model.load_state_dict(torch.load('best_mode.pt'))


Final Val Loss: 0.0935
Final Val F1 Score: 0.3602
Final Val Accuracy: 0.9402


Train best model again

In [None]:
num_epochs = 5

# Use the current model's own validation loss as the bar to beat, so an epoch
# that is worse than the starting weights can never be recorded as "best".
best_val_loss = evaluate(model, val_loader, criterion, device)[0]
# In-memory copy of the best weights seen so far (the starting weights to begin with).
best_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}

for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    val_loss, val_preds_binary, val_labels_binary = evaluate(model, val_loader, criterion, device)

    # Calculate F1 Score and Accuracy
    f1 = f1_score(val_labels_binary, val_preds_binary, average='macro')
    accuracy = accuracy_score(val_labels_binary, val_preds_binary)

    print(f'Epoch {epoch+1}/{num_epochs}')
    print(f'Train Loss: {train_loss:.4f}')
    print(f'Val Loss: {val_loss:.4f} | Val F1: {f1:.4f} | Val Accuracy: {accuracy:.4f}')

    # Save the model if validation loss decreases. The file name now matches
    # the checkpoint used everywhere else ('best_model.pt', not 'best_mode.pt').
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}
        torch.save(model.state_dict(), 'best_model.pt')
        print('Model saved!')

# Leave the best weights in `model`, not the last epoch's, so later cells
# predict with the lowest-validation-loss model.
model.load_state_dict(best_state)


Epoch 1/5
Train Loss: 0.0890
Val Loss: 0.0935 | Val F1: 0.3602 | Val Accuracy: 0.9402
Model saved!
Epoch 2/5
Train Loss: 0.0878
Val Loss: 0.0940 | Val F1: 0.3943 | Val Accuracy: 0.9394
Epoch 3/5
Train Loss: 0.0861
Val Loss: 0.0949 | Val F1: 0.3824 | Val Accuracy: 0.9404
Epoch 4/5
Train Loss: 0.0844
Val Loss: 0.0957 | Val F1: 0.3889 | Val Accuracy: 0.9394
Epoch 5/5
Train Loss: 0.0827
Val Loss: 0.0981 | Val F1: 0.4050 | Val Accuracy: 0.9382


# Load test data


In [None]:
test_data = pd.read_csv('/kaggle/input/data-255-toxic-comment-in-class-competition/test.csv')

# Same cleaning and tokenisation functions as for the training data
test_data['clean_text'] = test_data['text'].map(clean_text)
test_data['tokens'] = test_data['clean_text'].map(tokenize)

# Drop rows that cleaned down to zero tokens and report how many went
initial_test_count = len(test_data)
non_empty_mask = test_data['tokens'].map(len) > 0
test_data = test_data[non_empty_mask].reset_index(drop=True)
filtered_test_count = len(test_data)
print(f'Removed {initial_test_count - filtered_test_count} empty test samples.')

if not filtered_test_count:
    raise ValueError("All test samples have empty sequences after preprocessing.")

# Encode tokens with the training vocabulary
test_data['sequence'] = test_data['tokens'].apply(tokens_to_indices, word2idx=word2idx)


Removed 137 empty test samples.


In [None]:
class TestDataset(Dataset):
    """Unlabelled comments served as fixed-length id tensors.

    Each item is a pair ``(sequence, length)``: a long tensor of ``max_len``
    token ids (truncated or right-padded with the ``'<PAD>'`` id) and a scalar
    long tensor with the number of real tokens, capped at ``max_len``.

    Args:
        data: DataFrame with a ``'sequence'`` column of token-id lists.
        word2idx: Token-to-id mapping; only ``'<PAD>'`` is looked up here.
        max_len: Length every returned sequence is truncated or padded to.
    """

    def __init__(self, data, word2idx, max_len=100):
        self.data = data
        self.word2idx = word2idx
        self.max_len = max_len
        # Extract the column once; `data.iloc[idx]` would build a full row
        # Series on every item access.
        self._sequences = data['sequence'].tolist()

    def __len__(self):
        return len(self._sequences)

    def __getitem__(self, idx):
        # Slicing copies, so padding never mutates the stored list.
        seq = self._sequences[idx][:self.max_len]
        length = len(seq)
        if length < self.max_len:
            seq = seq + [self.word2idx['<PAD>']] * (self.max_len - length)
        return torch.tensor(seq, dtype=torch.long), torch.tensor(length, dtype=torch.long)

# Test loader keeps row order (no shuffling), so predictions line up with test_data rows.
test_dataset = TestDataset(test_data, word2idx, max_len=max_len)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


# Run predictions

In [None]:
def predict(model, dataloader, device):
    """Run model over (sequences, lengths) batches and stack the outputs into one numpy array."""
    model.eval()
    with torch.no_grad():
        batch_outputs = [
            model(token_ids.to(device), seq_lengths).cpu().numpy()
            for token_ids, seq_lengths in dataloader
        ]
    return np.vstack(batch_outputs)

# Score the test set, then threshold each probability at 0.5 to get 0/1 labels.
test_preds = predict(model, test_loader, device)
test_preds_binary = np.where(test_preds >= 0.5, 1, 0)


In [None]:

# Re-read the full id column: test_data has already lost the rows whose text cleaned down to nothing.
original_test_ids = pd.read_csv('/kaggle/input/data-255-toxic-comment-in-class-competition/test.csv')['id']
filtered_test_ids = test_data['id']
removed_test_ids = set(original_test_ids).difference(filtered_test_ids)

# One prediction column per label, taken from the matching column of the binary matrix
predictions_df = pd.DataFrame({
    'id': filtered_test_ids,
    **{label: test_preds_binary[:, column] for column, label in enumerate(label_cols)},
})

# Rows dropped during preprocessing were never scored; they get 0 for every label
submission = predictions_df
if removed_test_ids:
    removed_df = pd.DataFrame({
        'id': list(removed_test_ids),
        **{label: 0 for label in label_cols},
    })
    submission = pd.concat([predictions_df, removed_df], ignore_index=True)

# Put the rows back into the original test-set order
submission = submission.set_index('id').reindex(original_test_ids).reset_index()

submission.to_csv('predictions.csv', index=False)

print('Predictions saved to predictions.csv')



Predictions saved to predictions.csv
