In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, f1_score

# Paths to saved files
data_path = "Data_Part2/"
glove_path = "glove.6B.300d.txt"

# Load preprocessed data
train_df = pd.read_csv(f"{data_path}train_split_new.csv")
valid_df = pd.read_csv(f"{data_path}valid_split_new.csv")
test_df = pd.read_csv(f"{data_path}test_cleaned.csv")

# Load tokenized and padded data
train_padded = torch.load(f"{data_path}train_tokenized_0_6_1950.pt")
valid_padded = torch.load(f"{data_path}valid_tokenized_0_6_1950.pt")
test_padded = torch.load(f"{data_path}test_tokenized_0_6_1950.pt")


  train_padded = torch.load(f"{data_path}train_tokenized_0_6_1950.pt")
  valid_padded = torch.load(f"{data_path}valid_tokenized_0_6_1950.pt")
  test_padded = torch.load(f"{data_path}test_tokenized_0_6_1950.pt")


In [12]:
# Load GloVe embeddings
def load_glove_embeddings(glove_path, vocab, embedding_dim=300):
    embeddings = np.random.uniform(-0.25, 0.25, (len(vocab), embedding_dim))
    with open(glove_path, 'r') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            if word in vocab:
                embeddings[vocab[word]] = vector
    return torch.tensor(embeddings, dtype=torch.float32)

# Load vocabulary
vocab = {word: idx for idx, word in enumerate(train_df['text'].str.split().explode().unique())}

# Load embeddings
embedding_dim = 300
glove_embeddings = load_glove_embeddings(glove_path, vocab, embedding_dim)

In [13]:
# Load FastText embeddings
def load_fasttext_embeddings(fasttext_path, vocab, embedding_dim=300):
    embeddings = np.random.uniform(-0.25, 0.25, (len(vocab), embedding_dim))
    with open(fasttext_path, 'r') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            if word in vocab:
                embeddings[vocab[word]] = vector
    return torch.tensor(embeddings, dtype=torch.float32)

# Load both embeddings
fasttext_path = "crawl-300d-2M.vec"
#glove_embeddings = load_glove_embeddings(glove_path, vocab, embedding_dim)
fasttext_embeddings = load_fasttext_embeddings(fasttext_path, vocab, embedding_dim)


In [None]:
# Target columns
target_columns = ['toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']

# Convert labels to tensors
train_labels = torch.tensor(train_df[target_columns].values, dtype=torch.float32)
valid_labels = torch.tensor(valid_df[target_columns].values, dtype=torch.float32)

# Ensure alignment
assert train_padded.size(0) == len(train_df), "Mismatch between train tokenized data and labels!"
assert valid_padded.size(0) == len(valid_df), "Mismatch between valid tokenized data and labels!"


In [55]:
# Target columns
target_columns = ['toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']

# Convert labels to tensors
train_labels = torch.tensor(train_df[target_columns].values, dtype=torch.float32)
valid_labels = torch.tensor(valid_df[target_columns].values, dtype=torch.float32)

# Ensure alignment
assert train_padded.size(0) == len(train_df), "Mismatch between train tokenized data and labels!"
assert valid_padded.size(0) == len(valid_df), "Mismatch between valid tokenized data and labels!"

# DataLoaders
batch_size = 64
train_dataset = TensorDataset(train_padded, train_labels)
valid_dataset = TensorDataset(valid_padded, valid_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

In [65]:
# Define the LSTM model
class LSTMOnly(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout, fasttext_embeddings):
        super(LSTMOnly, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight.data.copy_(fasttext_embeddings)
        self.embedding.weight.requires_grad = False  # Allow embeddings to be fine-tuned

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Final fully connected layer

    def forward(self, x):
        embedded = self.embedding(x)  # Shape: (batch_size, seq_len, embedding_dim)
        lstm_out, _ = self.lstm(embedded)  # Shape: (batch_size, seq_len, hidden_dim * 2)
        lstm_output = lstm_out[:, -1, :]  # Take the last hidden state for each sequence
        lstm_output = self.dropout(lstm_output)
        return torch.sigmoid(self.fc(lstm_output))


# Initialize model
hidden_dim = 256
output_dim = len(target_columns)
dropout = 0.3
model = LSTMAttention(len(vocab), embedding_dim, hidden_dim, output_dim, dropout, fasttext_embeddings)

# Define loss, optimizer, and scheduler
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=5e-5)  # Adding L2 regularization
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.7, patience=4)


In [66]:
print(f"Vocabulary size: {len(vocab)}")
print(f"Max index in train_padded: {train_padded.max()}")
print(f"Max index in valid_padded: {valid_padded.max()}")
print(f"Max index in test_padded: {test_padded.max()}")


Vocabulary size: 462690
Max index in train_padded: 462689
Max index in valid_padded: 462668
Max index in test_padded: 462664


In [67]:
vocab_size = len(vocab)

train_padded[train_padded >= vocab_size] = 0
valid_padded[valid_padded >= vocab_size] = 0
test_padded[test_padded >= vocab_size] = 0


In [68]:
print(f"Vocabulary size: {len(vocab)}")
print(f"Max index in train_padded: {train_padded.max()}")
print(f"Max index in valid_padded: {valid_padded.max()}")
print(f"Max index in test_padded: {test_padded.max()}")


Vocabulary size: 462690
Max index in train_padded: 462689
Max index in valid_padded: 462668
Max index in test_padded: 462664


In [72]:
import torch

# Set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Move the model to the device
model = model.to(device)


Using device: cuda


In [74]:
class LSTMAttention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout):
        super(LSTMAttention, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention_layer = nn.Linear(hidden_dim * 2, 1)  # BiLSTM doubles the hidden dimension
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Final fully connected layer

    def attention(self, lstm_outputs):
        # Attention mechanism
        weights = torch.tanh(self.attention_layer(lstm_outputs))  # Shape: (batch_size, seq_len, 1)
        weights = torch.softmax(weights, dim=1)  # Shape: (batch_size, seq_len, 1)
        weighted_output = torch.sum(weights * lstm_outputs, dim=1)  # Shape: (batch_size, hidden_dim * 2)
        return weighted_output

    def forward(self, x):
        embedded = self.embedding(x)  # Shape: (batch_size, seq_len, embedding_dim)
        lstm_out, _ = self.lstm(embedded)  # Shape: (batch_size, seq_len, hidden_dim * 2)
        attended_output = self.attention(lstm_out)  # Apply Attention
        attended_output = self.dropout(attended_output)
        return torch.sigmoid(self.fc(attended_output))

In [None]:
import itertools

# Define hyperparameter search space
patience_values = [3, 4]
learning_rates = [1e-3, 5e-4, 1e-4]
batch_sizes = [32, 64]
dropout_values = [0.4, 0.5, 0.6]
hidden_dims = [128, 256]
factors = [0.4, 0.5, 0.6]

# Iterate over all combinations of hyperparameters
for patience, lr, batch_size, dropout, hidden_dim, factor in itertools.product(
    patience_values, learning_rates, batch_sizes, dropout_values, hidden_dims, factors
):
    print(f"Running configuration: Patience={patience}, LR={lr}, Batch Size={batch_size}, Dropout={dropout}, Hidden Dim={hidden_dim}, Factor={factor}")

    # Update DataLoader with new batch size
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

    model = LSTMAttention(
    vocab_size=len(vocab),
    embedding_dim=300,
    hidden_dim=hidden_dim,
    output_dim=len(target_columns),
        dropout=dropout
    ).to(device)
    
    # Load GloVe embeddings into the model
    model.embedding.weight.data.copy_(glove_embeddings)
    model.embedding.weight.requires_grad = False  # Freeze embeddings if necessary


    # Define optimizer, scheduler, and loss function
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)  # Include L2 regularization
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=factor, patience=patience)
    criterion = nn.BCELoss()

    # Training loop
    best_valid_loss = float('inf')
    results = []
    for epoch in range(5):  # Run for 5 epochs
        print(f"Epoch {epoch + 1} Training Begins")
        model.train()
        train_loss = 0.0
        for inputs, targets in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for inputs, targets in tqdm(valid_loader, desc=f"Validating Epoch {epoch + 1}"):
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                valid_loss += loss.item()

        # Log losses
        train_loss_avg = train_loss / len(train_loader)
        valid_loss_avg = valid_loss / len(valid_loader)
        print(f"Epoch {epoch + 1}: Train Loss = {train_loss_avg:.4f}, Valid Loss = {valid_loss_avg:.4f}")

        # Save best model
        if valid_loss_avg < best_valid_loss:
            best_valid_loss = valid_loss_avg
            model_save_path = f"best_model_p{patience}_lr{lr}_bs{batch_size}_do{dropout}_hd{hidden_dim}_f{factor}.pth"
            torch.save(model.state_dict(), model_save_path)
            print(f"Model weights saved: {model_save_path}")

        # Record epoch results
        results.append({
            "epoch": epoch + 1,
            "train_loss": train_loss_avg,
            "valid_loss": valid_loss_avg,
            "best_valid_loss": best_valid_loss
        })

        # Step scheduler
        scheduler.step(valid_loss_avg)

    # Save results for the current combination
    results_df = pd.DataFrame(results)
    results_filename = f"results_p{patience}_lr{lr}_bs{batch_size}_do{dropout}_hd{hidden_dim}_f{factor}.csv"
    results_df.to_csv(results_filename, index=False)
    print(f"Results saved to {results_filename}")

    # Test loop and submission generation
    test_dataset = TensorDataset(torch.tensor(test_padded, dtype=torch.long))
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    test_predictions = []
    model.eval()
    with torch.no_grad():
        for inputs in tqdm(test_loader, desc="Making Predictions"):
            inputs = inputs[0].to(device)  # Extract Tensors from DataLoader and move to device
            outputs = model(inputs)
            test_predictions.append(outputs.cpu().numpy())

    test_predictions = np.vstack(test_predictions)

    # Threshold the predictions for multi-label classification
    binary_predictions = (test_predictions > 0.6).astype(int)

    # Create a DataFrame with the correct format
    submission_filename = f"submission_p{patience}_lr{lr}_bs{batch_size}_do{dropout}_hd{hidden_dim}_f{factor}.csv"
    submission = pd.DataFrame(binary_predictions, columns=target_columns)
    submission.insert(0, 'id', test_df['id'])  # Ensure the 'id' column is included
    submission.to_csv(submission_filename, index=False)
    print(f"Submission file created: {submission_filename}")


Running configuration: Patience=3, LR=0.001, Batch Size=32, Dropout=0.4, Hidden Dim=128, Factor=0.4
Epoch 1 Training Begins


Training Epoch 1:  11%|█         | 4339/39482 [00:18<02:26, 240.28it/s]

In [70]:
# Training loop with explicit embedding control
best_valid_loss = float('inf')

for epoch in range(10):  # Adjust the number of epochs as needed
    print(f"Epoch {epoch + 1} Training Begins")
    
    # Set embeddings for the training phase
    model.embedding.weight.data.copy_(glove_embeddings )  # Use FastText for training
    model.embedding.weight.requires_grad = False  # Freeze embeddings during training

    # Training Phase
    model.train()  # Set the model to training mode
    train_loss = 0.0
    for inputs, targets in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
        inputs, targets = inputs.to(device), targets.to(device)  # Move inputs and targets to the correct device
        optimizer.zero_grad()
        outputs = model(inputs)  # Forward pass
        loss = criterion(outputs, targets)  # Calculate loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights
        train_loss += loss.item()
    
    # Set embeddings for the validation phase
    model.embedding.weight.data.copy_(glove_embeddings )  # Use FastText for validation
    model.embedding.weight.requires_grad = False  # Freeze embeddings during validation

    # Validation Phase
    model.eval()  # Set the model to evaluation mode
    valid_loss = 0.0
    with torch.no_grad():  # Disable gradient calculation for validation
        for inputs, targets in tqdm(valid_loader, desc=f"Validating Epoch {epoch + 1}"):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            valid_loss += loss.item()
    
    # Calculate and Log Losses
    train_loss_avg = train_loss / len(train_loader)
    valid_loss_avg = valid_loss / len(valid_loader)
    
    print(f"Epoch {epoch + 1}: Train Loss = {train_loss_avg:.4f}, Valid Loss = {valid_loss_avg:.4f}")
    
    # Save Best Model
    if valid_loss_avg < best_valid_loss:
        best_valid_loss = valid_loss_avg
        torch.save(model.state_dict(), f"{data_path}best_model_epoch_{epoch + 1}.pth")
        print(f"Model weights saved for epoch {epoch + 1}.")


Epoch 1 Training Begins


Training Epoch 1: 100%|██████████| 9871/9871 [05:13<00:00, 31.53it/s]
Validating Epoch 1: 100%|██████████| 4231/4231 [01:05<00:00, 64.16it/s]


Epoch 1: Train Loss = 0.1102, Valid Loss = 0.1007
Model weights saved for epoch 1.
Epoch 2 Training Begins


Training Epoch 2: 100%|██████████| 9871/9871 [05:28<00:00, 30.04it/s]
Validating Epoch 2: 100%|██████████| 4231/4231 [01:04<00:00, 65.65it/s]


Epoch 2: Train Loss = 0.0996, Valid Loss = 0.0989
Model weights saved for epoch 2.
Epoch 3 Training Begins


Training Epoch 3: 100%|██████████| 9871/9871 [05:28<00:00, 30.06it/s]
Validating Epoch 3: 100%|██████████| 4231/4231 [01:03<00:00, 66.26it/s]


Epoch 3: Train Loss = 0.0981, Valid Loss = 0.0990
Epoch 4 Training Begins


Training Epoch 4: 100%|██████████| 9871/9871 [05:29<00:00, 29.92it/s]
Validating Epoch 4: 100%|██████████| 4231/4231 [01:06<00:00, 63.78it/s]


Epoch 4: Train Loss = 0.0982, Valid Loss = 0.0992
Epoch 5 Training Begins


Training Epoch 5: 100%|██████████| 9871/9871 [05:35<00:00, 29.42it/s]
Validating Epoch 5: 100%|██████████| 4231/4231 [01:04<00:00, 65.31it/s]


Epoch 5: Train Loss = 0.0981, Valid Loss = 0.0990
Epoch 6 Training Begins


Training Epoch 6: 100%|██████████| 9871/9871 [05:31<00:00, 29.80it/s]
Validating Epoch 6: 100%|██████████| 4231/4231 [01:04<00:00, 65.18it/s]


Epoch 6: Train Loss = 0.0987, Valid Loss = 0.1006
Epoch 7 Training Begins


Training Epoch 7:   5%|▍         | 445/9871 [00:15<05:24, 29.08it/s]


KeyboardInterrupt: 

In [39]:
# Test loop and submission generation
test_dataset = TensorDataset(torch.tensor(test_padded, dtype=torch.long))
test_loader = DataLoader(test_dataset, batch_size=batch_size)

test_predictions = []
model.eval()
with torch.no_grad():
    for inputs in tqdm(test_loader, desc="Making Predictions"):
        inputs = inputs[0].to(device)  # Extract Tensors from DataLoader and move to device
        outputs = model(inputs)
        test_predictions.append(outputs.cpu().numpy())

test_predictions = np.vstack(test_predictions)

# Threshold the predictions for multi-label classification
binary_predictions = (test_predictions > 0.6).astype(int)

# Create a DataFrame with the correct format
submission = pd.DataFrame(binary_predictions, columns=target_columns)
submission.insert(0, 'id', test_df['id'])  # Ensure the 'id' column is included
submission.to_csv(f"{data_path}submission_LSTM.csv", index=False)

print("Binary submission file created.")


  test_dataset = TensorDataset(torch.tensor(test_padded, dtype=torch.long))
Making Predictions: 100%|██████████| 761/761 [00:09<00:00, 76.14it/s]


Binary submission file created.


In [46]:
# Load the pre-trained model weights
model_path = f"{data_path}best_model_epoch_5.pth"
model.load_state_dict(torch.load(model_path, map_location=device))
model = model.to(device)  # Ensure the model is on the correct device
model.eval()  # Set the model to evaluation mode

# Test loop and submission generation
test_dataset = TensorDataset(torch.tensor(test_padded, dtype=torch.long))
test_loader = DataLoader(test_dataset, batch_size=batch_size)

test_predictions = []
with torch.no_grad():
    for inputs in tqdm(test_loader, desc="Making Predictions"):
        inputs = inputs[0].to(device)  # Extract Tensors from DataLoader and move to device
        outputs = model(inputs)
        test_predictions.append(outputs.cpu().numpy())

test_predictions = np.vstack(test_predictions)

# Threshold the predictions for multi-label classification
binary_predictions = (test_predictions > 0.6).astype(int)

# Create a DataFrame with the correct format
submission = pd.DataFrame(binary_predictions, columns=target_columns)
submission.insert(0, 'id', test_df['id'])  # Ensure the 'id' column is included
submission.to_csv(f"{data_path}submission_LSTM.csv", index=False)

print("Binary submission file created.")


  model.load_state_dict(torch.load(model_path, map_location=device))
  test_dataset = TensorDataset(torch.tensor(test_padded, dtype=torch.long))
Making Predictions: 100%|██████████| 1521/1521 [00:12<00:00, 121.41it/s]


Binary submission file created.
