In [None]:
pip install transformers datasets

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import torch.nn as nn
from tqdm import tqdm
from sklearn.metrics import log_loss

# Define the SiameseLSTM class
class SiameseLSTM(nn.Module):
    """Siamese LSTM that scores a pair of token-id sequences over 3 classes.

    Both sequences go through the same embedding + LSTM encoder. The
    absolute difference of the two final hidden states is passed to a linear
    layer that produces 3 logits (model A wins, model B wins, tie).

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Embedding dimension and LSTM hidden dimension.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 3)  # Output size 3 for 3 classes: model A wins, model B wins, tie

    def forward_one(self, x, attention_mask=None):
        """Encode one batch of sequences into final hidden states.

        Args:
            x: Token ids, batch first (the LSTM is built with batch_first=True).
            attention_mask: Optional tensor shaped like ``x`` with 1 for real
                tokens and 0 for padding. When given, the LSTM stops at the
                last real token of each sequence instead of running over the
                padding. Assumes padding is on the right of each sequence.

        Returns:
            The last layer's final hidden state for each sequence.
        """
        x = self.embedding(x)
        if attention_mask is not None:
            # pack_padded_sequence needs CPU int64 lengths and rejects
            # zero-length sequences, hence the clamp to at least 1.
            lengths = attention_mask.sum(dim=1).clamp(min=1).to(device='cpu', dtype=torch.int64)
            x = nn.utils.rnn.pack_padded_sequence(
                x, lengths, batch_first=True, enforce_sorted=False
            )
        _, (h, _) = self.lstm(x)
        return h[-1]

    def forward(self, x1, x2, mask1=None, mask2=None):
        """Return 3-class logits for each (x1, x2) pair.

        Args:
            x1: Token ids of the first sequence of each pair.
            x2: Token ids of the second sequence of each pair.
            mask1: Optional attention mask for ``x1`` (see ``forward_one``).
            mask2: Optional attention mask for ``x2`` (see ``forward_one``).
        """
        h1 = self.forward_one(x1, mask1)
        h2 = self.forward_one(x2, mask2)
        return self.fc(torch.abs(h1 - h2))

# Step 1: Read the training CSV
train_csv_path = '/kaggle/input/datasetcomp/train.csv'
df_train = pd.read_csv(train_csv_path)

# Pull out the two response texts (row positions 0-1) and the three winner
# flag columns (row positions 2-4) as an array of rows
train_columns = ['response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie']
data = df_train[train_columns].values

def determine_label(row):
    """Turn a row's winner flags into a class index.

    Positions 2, 3 and 4 of ``row`` are checked in that order; the first one
    equal to 1 decides the label: 0 (model A wins), 1 (model B wins) or
    2 (tie). Returns -1 when none of them equals 1 (invalid or unclear case).
    """
    for class_index, column in enumerate((2, 3, 4)):
        if row[column] == 1:
            return class_index
    return -1

# Label each row exactly once, then keep only rows with a valid label (-1 is
# determine_label's "invalid or unclear" marker). `labels` and `data` stay
# aligned because both are filtered from the same labelled sequence.
labelled_rows = [(row, determine_label(row)) for row in data]
labels = [label for _, label in labelled_rows if label != -1]
data = [row[:2] for row, label in labelled_rows if label != -1]

# Step 2: Define a custom Dataset class
class SiameseDataset(Dataset):
    """Labelled dataset of (response_a, response_b) text pairs.

    Each item is tokenized on access, with both texts padded/truncated to
    ``max_length`` tokens.
    """

    def __init__(self, data, labels, tokenizer, max_length):
        self.data, self.labels = data, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenize one text to exactly ``max_length`` tokens."""
        return self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )

    def __getitem__(self, idx):
        """Return the token ids, attention masks and label of pair ``idx``."""
        pair = self.data[idx]
        text_a, text_b = pair[0], pair[1]
        target = self.labels[idx]

        encoded = {'a': self._encode(text_a), 'b': self._encode(text_b)}

        sample = {}
        for side in ('a', 'b'):
            for field in ('input_ids', 'attention_mask'):
                sample[f'{field}_{side}'] = torch.tensor(encoded[side][field])
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# Step 3: Load the BERT tokenizer from the local input directory
tokenizer = BertTokenizer.from_pretrained('/kaggle/input/bert-base-uncased/')
max_length = 128  # Token length every response is padded/truncated to

# Step 4: Wrap the training pairs in a Dataset and a shuffling DataLoader
train_dataset = SiameseDataset(
    data=data,
    labels=labels,
    tokenizer=tokenizer,
    max_length=max_length,
)
batch_size = 32
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Step 5: Build the Siamese network; the embedding table has one row per
# tokenizer entry
input_size = len(tokenizer)
hidden_size = 300
num_layers = 1
model = SiameseLSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
)

# Step 6: Cross-entropy over the 3 classes, optimized with Adam
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

# Step 7: Training loop
num_epochs = 5  # Adjust as needed
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
        # Only the token ids are moved to the device: the model call below
        # does not consume the attention masks.
        input_ids_a = batch['input_ids_a'].to(device)
        input_ids_b = batch['input_ids_b'].to(device)
        # Deliberately not named `labels`, so the module-level `labels` list
        # built during data preparation is not overwritten by a batch tensor.
        batch_labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids_a, input_ids_b)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean loss per batch; max(..., 1) avoids a ZeroDivisionError when the
    # loader yields no batches.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / max(len(train_loader), 1)}')

# Read the test CSV whose rows are scored for the submission
test_csv_path = '/kaggle/input/datasetcomp/test.csv'
df_test = pd.read_csv(test_csv_path)

# Define a custom Dataset class for testing
class SiameseTestDataset(Dataset):
    """Unlabelled dataset of (response_a, response_b) text pairs.

    Each item is tokenized on access, with both texts padded/truncated to
    ``max_length`` tokens.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenize one text to exactly ``max_length`` tokens."""
        return self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )

    def __getitem__(self, idx):
        """Return the token ids and attention masks of pair ``idx``."""
        pair = self.data[idx]
        text_a, text_b = pair[0], pair[1]

        encoded = {'a': self._encode(text_a), 'b': self._encode(text_b)}

        sample = {}
        for side in ('a', 'b'):
            for field in ('input_ids', 'attention_mask'):
                sample[f'{field}_{side}'] = torch.tensor(encoded[side][field])
        return sample

# Collect the test response pairs as a list of [response_a, response_b] lists
response_columns = ['response_a', 'response_b']
test_data = df_test[response_columns].values.tolist()

# Wrap the pairs with the same tokenizer and sequence length used for training
test_dataset = SiameseTestDataset(
    data=test_data,
    tokenizer=tokenizer,
    max_length=max_length,
)

# No shuffling: batches are read in dataset order
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Load the saved model
# NOTE(review): this overwrites whatever weights `model` currently holds
# (including the ones produced by the training loop in this file) with the
# checkpoint from the input directory -- confirm that is intended.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
# map_location=device lets a checkpoint saved on a GPU be loaded when this
# session only has a CPU.
model.load_state_dict(
    torch.load('/kaggle/input/datasetcomp/siamese_model.pth', map_location=device)
)
model.eval()  # Set model to evaluation mode

# Perform inference on the test data and generate predictions
all_preds = []  # one [p_model_a, p_model_b, p_tie] list per test row, in loader order
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing'):
        # Only the token ids are moved to the device: the model call below
        # does not consume the attention masks.
        input_ids_a = batch['input_ids_a'].to(device)
        input_ids_b = batch['input_ids_b'].to(device)

        outputs = model(input_ids_a, input_ids_b)
        # Convert the 3 logits of each row into class probabilities
        probabilities = torch.softmax(outputs, dim=1)
        all_preds.extend(probabilities.cpu().tolist())

# Build the submission frame: one probability column per class, in the same
# order as the model's 3 outputs
probability_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
pred_df = pd.DataFrame(all_preds, columns=probability_columns)
pred_df['id'] = df_test['id']

# Put 'id' first to match the required format
pred_df = pred_df[['id'] + probability_columns]

# Write the submission file and show the first rows
pred_df.to_csv('submission.csv', index=False)
print(pred_df.head())
