In [None]:
# Library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import log_loss
import lightgbm as lgb
import optuna

In [None]:
# Load the data
train_data = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv')
test_data = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')

# Quick look at the training frame: first rows, column summary, numeric statistics
print(train_data.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
train_data.info()
print(train_data.describe())


In [None]:
# Preprocessing data
# For both frames (modified in place): blank out missing values, then build one
# space-separated string per row from the prompt and the two responses.
for frame in (train_data, test_data):
    frame.fillna('', inplace=True)
    frame['combined_text'] = frame['prompt'] + ' ' + frame['response_a'] + ' ' + frame['response_b']

# TF-IDF features: vocabulary is learned on the training text only and then
# reused to transform the test text.
vectorizer = TfidfVectorizer(max_features=10000)
train_text = train_data['combined_text']
test_text = test_data['combined_text']
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

# Target encoding: 0 when model A won, 1 when model B won, 2 for every other row
a_wins = train_data['winner_model_a'] == 1
b_wins = train_data['winner_model_b'] == 1
train_data['winner'] = np.where(a_wins, 0, np.where(b_wins, 1, 2))
y_train = train_data['winner']


In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Wrap both parts as LightGBM datasets; the validation set is tied to the
# training set through `reference`
train_data_lgb = lgb.Dataset(data=X_train_split, label=y_train_split)
val_data_lgb = lgb.Dataset(data=X_val, label=y_val, reference=train_data_lgb)

# Optuna objective function for tuning
def objective(trial):
    """Train one LightGBM model with trial-suggested hyperparameters and score it.

    Uses the notebook-level `train_data_lgb` / `val_data_lgb` datasets and the
    `X_val` / `y_val` validation split.

    Args:
        trial: Optuna trial used to sample the tunable hyperparameters.

    Returns:
        Multiclass log loss on the validation split at the best iteration
        (lower is better).
    """
    params = {
        # The same Dataset object is reused by every trial while
        # `min_data_in_leaf` varies, so feature pre-filtering is turned off.
        'feature_pre_filter': False,
        'objective': 'multiclass',
        'num_class': 3,
        'metric': 'multi_logloss',
        'boosting': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.25),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 30, 100),
    }

    model = lgb.train(
        params,
        train_data_lgb,
        valid_sets=[val_data_lgb],
        callbacks=[lgb.early_stopping(stopping_rounds=10), lgb.log_evaluation(10)],
    )

    y_val_pred_proba = model.predict(X_val, num_iteration=model.best_iteration)
    # Pass the label set explicitly: otherwise log_loss infers it from y_val and
    # raises if a class is missing there while the prediction matrix still has
    # one column per class.
    class_labels = list(range(params['num_class']))
    loss = log_loss(y_val, y_val_pred_proba, labels=class_labels)
    return loss

# Run Optuna for hyperparameter tuning
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable from run to run (same seed value as the train/validation split).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# Retrieve the best hyperparameters and add back the fixed settings that the
# objective used but that Optuna does not record (only suggested values are stored)
best_params = study.best_trial.params
best_params.update({
    'feature_pre_filter': False,  # same Dataset setting as during tuning
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'boosting': 'gbdt',
})

# Train the final model with the best hyperparameters, early-stopped on the validation set
final_model = lgb.train(
    best_params,
    train_data_lgb,
    valid_sets=[val_data_lgb],
    callbacks=[lgb.early_stopping(stopping_rounds=10), lgb.log_evaluation(10)],
)


In [None]:
# Class probabilities for the test rows, taken at the final model's best iteration
test_pred_proba = final_model.predict(X_test, num_iteration=final_model.best_iteration)

# Build the submission: probability columns in class order 0, 1, 2
# (model A, model B, everything else), with the test ids moved to the front
proba_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
submission = pd.DataFrame(test_pred_proba, columns=proba_columns)
submission['id'] = test_data['id']
submission = submission.loc[:, ['id'] + proba_columns]
submission.to_csv('submission.csv', index=False)
