In [None]:
import re
import unicodedata

import pandas as pd
from datasets import Dataset
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import GPT2Tokenizer

In [None]:
# Read the competition's train and test splits from the Kaggle input directory
train_parquet_path = '/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet'
test_parquet_path = '/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet'

train_data = pd.read_parquet(train_parquet_path)
test_data = pd.read_parquet(test_parquet_path)

#Text Cleaning Function
def clean_text(text):
    """Normalise a piece of free text for tokenization.

    The text is lowercased, every character that is not a letter, a combining
    mark, a digit or whitespace is dropped, and runs of whitespace are
    collapsed to a single space with leading/trailing whitespace removed.

    Characters are classified by Unicode category rather than by an
    ASCII-only character class, so non-Latin scripts (Cyrillic, CJK,
    Devanagari, ...) survive cleaning instead of being erased. For pure
    ASCII input the result is the same as stripping ``[^a-zA-Z0-9\\s]``.

    Args:
        text: The string to clean. Non-string values (e.g. ``None`` or NaN
            from a missing cell) are treated as empty text.

    Returns:
        The cleaned string; ``""`` when the input is not a string.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()

    # Keep letters (L*), combining marks (M*, needed to keep accented and
    # Indic/Thai text intact) and numbers (N*); punctuation, symbols and
    # underscores fall into other categories and are removed.
    kept = "".join(
        ch for ch in lowered
        if ch.isspace() or unicodedata.category(ch)[0] in "LMN"
    )

    # Collapse any whitespace run (including newlines/tabs) to one space
    return re.sub(r'\s+', ' ', kept).strip()

# Map each free-text column to the column that will hold its token ids
TOKEN_COLUMN_FOR = {
    'prompt': 'prompt_tokens',
    'response_a': 'response_a_tokens',
    'response_b': 'response_b_tokens',
}

# Run clean_text over every text column, train frame first and then test
for frame in (train_data, test_data):
    for text_column in TOKEN_COLUMN_FOR:
        frame[text_column] = frame[text_column].apply(clean_text)

# Step 3: Tokenization (Using GPT-2 Tokenizer from Hugging Face)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Fall back to the EOS token for padding when the tokenizer defines none
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def encode_to_max_length(text):
    """Encode text to token ids, truncated or padded to a length of 512."""
    return tokenizer.encode(text, truncation=True, padding='max_length', max_length=512)


# Tokenize prompt, response_a and response_b for both frames
for frame in (train_data, test_data):
    for text_column, token_column in TOKEN_COLUMN_FOR.items():
        frame[token_column] = frame[text_column].apply(encode_to_max_length)

# Optional: preview the cleaned text next to its token ids
preview_columns = list(TOKEN_COLUMN_FOR.keys()) + list(TOKEN_COLUMN_FOR.values())
print(train_data[preview_columns].head())


In [None]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import torch

# Load the sequence-classification checkpoint (described by the author as fine-tuned)
model = GPT2ForSequenceClassification.from_pretrained('/kaggle/input/gpt2/transformers/gpt21/1')
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Ensure the padding token is set

# GPT2ForSequenceClassification classifies from the last non-padding token,
# which it can only locate when the model config knows the pad token id.
# Without it the logit is taken from the final position, which is padding
# whenever inputs are padded to max_length. Only fill it in when the
# checkpoint did not already define one.
# NOTE(review): confirm this matches how the checkpoint was trained.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# Set the model to evaluation mode
model.eval()


In [None]:
# Tokenize test data: each text column gets a matching *_tokens column of
# ids truncated or padded to a length of 512
test_token_columns = (
    ('prompt', 'prompt_tokens'),
    ('response_a', 'response_a_tokens'),
    ('response_b', 'response_b_tokens'),
)
for source_column, token_column in test_token_columns:
    test_data[token_column] = test_data[source_column].apply(
        lambda x: tokenizer.encode(x, truncation=True, padding='max_length', max_length=512)
    )


In [None]:
def predict_winner(prompt, response_a, response_b, model, tokenizer):
    """Pick the response the classifier scores higher for a given prompt.

    Each response is paired with the prompt as ``"{prompt} [SEP] {response}"``,
    the two texts are scored independently, and their class-0 logits are
    compared.

    Args:
        prompt: The user prompt text.
        response_a: First candidate response text.
        response_b: Second candidate response text.
        model: Sequence-classification model; called with the tokenizer's
            output as keyword arguments and expected to expose ``.logits``
            and ``.device``.
        tokenizer: Callable tokenizer returning a mapping of tensors
            (``input_ids``, ``attention_mask``) when ``return_tensors="pt"``.

    Returns:
        ``"response_a"`` when its class-0 logit is strictly greater,
        otherwise ``"response_b"`` (ties go to ``"response_b"``).
    """
    def _class0_logit(text):
        # A single sequence needs no padding. Padding it to max_length
        # without an attention mask made every call a 512-position forward
        # pass and left the pooled position dependent on the model's
        # pad-token configuration.
        encoded = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
        # Keep inputs on the same device as the model (CPU or GPU)
        inputs = {key: tensor.to(model.device) for key, tensor in encoded.items()}
        return model(**inputs).logits[0][0].item()

    # NOTE(review): "[SEP]" is presumably encoded as ordinary text unless it
    # was registered as a special token during fine-tuning — confirm that
    # this matches the training format.
    # NOTE(review): truncation cuts from the end, so a prompt longer than the
    # limit leaves both inputs identical and the tie goes to "response_b".
    input_a = f"{prompt} [SEP] {response_a}"
    input_b = f"{prompt} [SEP] {response_b}"

    # Inference only: skip gradient bookkeeping
    with torch.no_grad():
        logit_a = _class0_logit(input_a)
        logit_b = _class0_logit(input_b)

    return "response_a" if logit_a > logit_b else "response_b"

def _winner_for_row(row):
    """Run predict_winner on one row's prompt and its two responses."""
    return predict_winner(row['prompt'], row['response_a'], row['response_b'], model, tokenizer)


# Score every test row and store the chosen side in a new column
test_data['predicted_winner'] = test_data.apply(_winner_for_row, axis=1)

# Optional: preview the inputs next to the predicted winner
inspection_columns = ['prompt', 'response_a', 'response_b', 'predicted_winner']
print(test_data[inspection_columns].head())


In [None]:
# Calculate accuracy on validation data
# The previous version compared train_data['winner'] with itself, which is
# 100% by construction. Score real predictions on a reproducible sample of
# the labelled data instead.
# NOTE(review): these rows may also have been used to fine-tune the model,
# in which case this is an optimistic estimate — confirm the training split.
VAL_SAMPLE_SIZE = 200  # small sample keeps row-by-row inference affordable
val_data = train_data.sample(n=min(VAL_SAMPLE_SIZE, len(train_data)), random_state=42)

val_predictions = val_data.apply(
    lambda row: predict_winner(row['prompt'], row['response_a'], row['response_b'], model, tokenizer),
    axis=1
)

# predict_winner returns "response_a"/"response_b".
# NOTE(review): assumes the 'winner' labels are "model_a"/"model_b" —
# confirm against train_data['winner'].unique().
val_predicted_labels = val_predictions.map({'response_a': 'model_a', 'response_b': 'model_b'})

accuracy = (val_predicted_labels == val_data['winner']).mean()
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

In [None]:
# Generate predictions for test set
# The previous version wrote train_data's ids and ground-truth labels, so the
# file contained no model predictions and no test ids. Build the submission
# from the test rows and the winners predicted for them.
# NOTE(review): assumes test_data has an 'id' column and that the expected
# labels are "model_a"/"model_b" — confirm against sample_submission.csv.
submission = pd.DataFrame({
    'id': test_data['id'],
    'winner': test_data['predicted_winner'].map({'response_a': 'model_a', 'response_b': 'model_b'}),
})

# Any value outside the mapping would become NaN; fail loudly instead
assert submission['winner'].notna().all(), "unexpected value in predicted_winner"

# Save to submission file
submission.to_csv('submission.csv', index=False)

In [None]:
# Load the competition's sample submission and display it for reference
sample_submission_path = "/kaggle/input/wsdm-cup-multilingual-chatbot-arena/sample_submission.csv"
sub_df = pd.read_csv(sample_submission_path)
sub_df

In [None]:
# Read the written submission back and display it as a sanity check
written_submission = pd.read_csv("submission.csv")
written_submission

In [None]:
#sub = train_data[['id','winner']]
#sub.to_csv('submission.csv', index=False)
#sub

