In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # os.walk yields bare file names; join them with their directory to get a usable path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, log_loss, confusion_matrix, roc_curve, auc
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Confirmation message; it is only reached when every import above has succeeded.
print('All libraries has been imported')

In [None]:
# Data Collection
# Locations of the two competition CSV files under the Kaggle input mount.
train_file = '/kaggle/input/lmsys-chatbot-arena/train.csv'
test_file = '/kaggle/input/lmsys-chatbot-arena/test.csv'

# Read both files (training first, then test) and unpack into the two frames
# that every later cell works on.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))


In [None]:
# Data Understanding
# DataFrame.info() writes its summary directly to stdout and returns None, so it
# is called on its own; wrapping it in print() would add a stray "None" line.
print("Training Dataset Info:")
train_data.info()
print("\nFirst few rows:")
print(train_data.head())

print("\nTest Dataset Info:")
test_data.info()
print("\nFirst few rows:")
print(test_data.head())

# Visualize class distribution before resampling
# Counts the values of winner_model_a, the column later used as the target.
plt.figure(figsize=(12, 6))
sns.countplot(x=train_data['winner_model_a'])
plt.title("Class Distribution Before Resampling")
plt.show()

In [None]:
# Data Preparation
def clean_text(text, stop_words=None):
    """Normalise raw text into lowercase, letters-only, space-separated tokens.

    Processing order:
      1. remove square-bracketed spans that open and close on the same line,
      2. remove URLs starting with ``http`` or ``www.``,
      3. remove angle-bracketed (HTML-like) tags,
      4. remove every character that is not an ASCII letter or whitespace,
      5. lowercase,
      6. drop tokens found in ``stop_words`` and re-join with single spaces.

    Parameters
    ----------
    text : str
        Text to clean. Any non-string value (``None``, NaN, numbers) is
        treated as missing and yields an empty string instead of raising.
    stop_words : collection of str, optional
        Lowercase tokens to discard. ``None`` (the default) or an empty
        collection keeps every token.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    # Missing values in a DataFrame column arrive as NaN/None; re.sub would raise on them.
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = ()

    # NOTE(review): a value that is itself one bracketed list on a single line,
    # e.g. '["hi"]', is removed entirely by this step - confirm the raw column
    # format before relying on the cleaned text.
    text = re.sub(r'\[.*?\]', '', text)
    # The dot after "www" is escaped so that only real "www." prefixes match;
    # unescaped, it also deleted ordinary words such as "wwwxy".
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Digits, punctuation and non-ASCII characters are all discarded here.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return ' '.join(word for word in text.split() if word not in stop_words)

# The stop-word set is empty, so the stop-word filter in clean_text keeps every token.
stop_words = set()

# Clean text data
# The same three free-text columns are overwritten with their cleaned form,
# training frame first and test frame second.
for frame in (train_data, test_data):
    for text_column in ('prompt', 'response_a', 'response_b'):
        frame[text_column] = frame[text_column].apply(lambda x: clean_text(x, stop_words))


In [None]:
# Vectorize text data
vectorizer = TfidfVectorizer(max_features=1000)

# One document per row: the prompt and both responses joined by single spaces.
train_text = train_data['prompt'] + ' ' + train_data['response_a'] + ' ' + train_data['response_b']
test_text = test_data['prompt'] + ' ' + test_data['response_a'] + ' ' + test_data['response_b']

# The vectorizer is fitted on the training documents only and then reused,
# unchanged, to transform the test documents.
X_train_text = vectorizer.fit_transform(train_text)
X_test_text = vectorizer.transform(test_text)

for frame in (train_data, test_data):
    # Verbosity Bias - Add response lengths and length differences
    # Lengths are character counts of the (already cleaned) response columns.
    frame['response_a_length'] = frame['response_a'].apply(len)
    frame['response_b_length'] = frame['response_b'].apply(len)
    frame['length_diff'] = frame['response_a_length'] - frame['response_b_length']

    # Position Bias - Add position bias feature
    # Both columns hold the same constant on every row.
    frame['position_bias_a'] = 0  # Assuming response_a is always the first
    frame['position_bias_b'] = 1  # Assuming response_b is always the second

# Self-Enhancement Bias - Add self-enhancement detection feature
# Whole-word matcher for the self-praise keywords. The \b anchors stop a keyword
# from firing inside a longer word (plain substring search flagged "asbestos"
# because it contains "best").
_SELF_ENHANCEMENT_PATTERN = re.compile(r'\b(?:best|better|excellent|superior|number one)\b')


def detect_self_enhancement(text):
    """Flag text that contains a self-praise keyword as a whole word or phrase.

    Keywords: 'best', 'better', 'excellent', 'superior', 'number one'.
    Matching is case-sensitive, as in the original substring check.

    Parameters
    ----------
    text : str
        Text to scan. Non-string values (``None``, NaN) count as "no match".

    Returns
    -------
    int
        1 if any keyword occurs, otherwise 0.
    """
    if not isinstance(text, str):
        return 0
    return 1 if _SELF_ENHANCEMENT_PATTERN.search(text) else 0

# Add one 0/1 keyword flag per response column, on the training frame first
# and then on the test frame.
for frame in (train_data, test_data):
    frame['self_enhancement_a'] = frame['response_a'].apply(detect_self_enhancement)
    frame['self_enhancement_b'] = frame['response_b'].apply(detect_self_enhancement)


In [None]:
# Encoding Categorical Features
categorical_columns = ['model_a', 'model_b']
for column in categorical_columns:
    # Give the test frame a placeholder when a model column is absent so that
    # get_dummies can be called with the same column list on both frames.
    if column not in test_data.columns:
        test_data[column] = 'missing'
train_data_encoded = pd.get_dummies(train_data, columns=categorical_columns)
test_data_encoded = pd.get_dummies(test_data, columns=categorical_columns)
# join='left' keeps exactly the training columns (in training order); columns the
# test frame lacks are created and filled with 0, test-only columns are discarded.
train_data_encoded, test_data_encoded = train_data_encoded.align(test_data_encoded, join='left', axis=1, fill_value=0)

# Handle Missing Columns
# The alignment above copies the target columns into the test frame; remove them.
target_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
test_data_encoded = test_data_encoded.drop(columns=target_columns, errors='ignore')

# Remove non-numeric columns
# NOTE(review): whether the one-hot columns survive this filter depends on the
# dtype get_dummies produces in the installed pandas version (boolean dummies
# are not np.number) - confirm this is the intended behaviour.
non_numeric_columns = train_data_encoded.select_dtypes(exclude=[np.number]).columns
train_data_encoded = train_data_encoded.drop(columns=non_numeric_columns)
test_data_encoded = test_data_encoded.drop(columns=non_numeric_columns)

# Combine all features into training and testing sets
# 'id' is a row identifier, not a predictor: it is numeric, so it would pass the
# filter above and be fed to the models as a feature. Exclude it from both
# matrices (errors='ignore' in case the column is absent or already dropped).
train_features = train_data_encoded.drop(columns=target_columns).drop(columns=['id'], errors='ignore')
test_features = test_data_encoded.drop(columns=['id'], errors='ignore')
# np.hstack pairs columns purely by position, so both frames must agree exactly.
assert list(train_features.columns) == list(test_features.columns), "train/test feature columns differ"

X_train_combined = np.hstack((X_train_text.toarray(), train_features.values))
X_test_combined = np.hstack((X_test_text.toarray(), test_features.values))
X = X_train_combined
# Binary target: the winner_model_a column only.
y = train_data_encoded['winner_model_a']


In [None]:
# Modeling
# Resample Data
# SMOTE is fitted on the complete X / y built in the feature-assembly cell.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Visualize class distribution after resampling
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x=y_resampled, ax=ax)
ax.set_title("Class Distribution After Resampling")
plt.show()


In [None]:
# Split the original (un-resampled) data into training and validation sets, then
# oversample the training portion only. SMOTE creates synthetic rows by
# interpolating between existing ones; splitting after resampling would place
# points derived from training rows in the validation set (data leakage) and
# make validation scores optimistic. The validation set keeps the real class mix.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)


In [None]:
# Hyperparameter tuning and model selection
# Each entry pairs an unfitted estimator ('model') with the parameter grid
# ('params') that is searched for it.
models = {
    'Logistic Regression': dict(
        model=LogisticRegression(random_state=42, max_iter=1000),
        params=dict(C=[0.01, 0.1, 1, 10, 100]),
    ),
    'Random Forest': dict(
        model=RandomForestClassifier(random_state=42),
        params=dict(n_estimators=[50, 100, 200], max_depth=[10, 20, 30]),
    ),
    'Gradient Boosting': dict(
        model=GradientBoostingClassifier(random_state=42),
        params=dict(learning_rate=[0.01, 0.1, 0.2], n_estimators=[100, 200]),
    ),
}

# Filled by the loop below: display name -> best estimator found by the search.
best_models = {}

for model_name, config in models.items():
    # 5-fold search scored by negative log loss, using all available cores.
    grid_search = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        cv=5,
        scoring='neg_log_loss',
        verbose=2,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best {model_name} Model: {grid_search.best_params_}")


In [None]:
# Evaluation
# For every tuned model: print the classification report and log loss on the
# validation set, then draw a confusion matrix and a ROC curve.
for model_name, model in best_models.items():
    # Probabilities feed log loss and the ROC curve; hard labels feed the
    # report and the confusion matrix.
    y_val_pred_proba = model.predict_proba(X_val)
    y_val_pred = model.predict(X_val)

    print(f"Classification Report on Validation Set ({model_name}):")
    print(classification_report(y_val, y_val_pred, zero_division=1))
    print(f"Log Loss ({model_name}): {log_loss(y_val, y_val_pred_proba)}")

    # Confusion Matrix
    cm = confusion_matrix(y_val, y_val_pred)
    heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    heatmap_ax.set_title(f'Confusion Matrix ({model_name})')
    heatmap_ax.set_xlabel('Predicted')
    heatmap_ax.set_ylabel('True')
    plt.show()

    # ROC Curve
    # Column 1 of predict_proba is used as the score for the positive class.
    fpr, tpr, _ = roc_curve(y_val, y_val_pred_proba[:, 1])
    roc_auc = auc(fpr, tpr)
    roc_fig, roc_ax = plt.subplots()
    roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1).
    roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    roc_ax.set_xlim([0.0, 1.0])
    roc_ax.set_ylim([0.0, 1.05])
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title(f'Receiver Operating Characteristic ({model_name})')
    roc_ax.legend(loc="lower right")
    plt.show()


In [None]:
# Deployment with the best model (example: Logistic Regression)
best_lr_model = best_models['Logistic Regression']
test_predictions_proba = best_lr_model.predict_proba(X_test_combined)

# predict_proba columns follow best_lr_model.classes_. The model was trained on
# winner_model_a, so the column for class 1 is P(model A wins) and the column
# for class 0 is P(model A does not win). The previous code wrote column 0 into
# winner_model_a, which inverted the submitted probabilities.
class_labels = list(best_lr_model.classes_)
prob_a_wins = test_predictions_proba[:, class_labels.index(1)]
prob_a_not_win = test_predictions_proba[:, class_labels.index(0)]

# NOTE(review): the binary model cannot separate "B wins" from "tie", so all of
# the remaining probability is assigned to winner_model_b and winner_tie stays
# at 0.0 - confirm this is acceptable for the competition metric.
submission_df = pd.DataFrame({
    'id': test_data['id'],
    'winner_model_a': prob_a_wins,
    'winner_model_b': prob_a_not_win,
    'winner_tie': 0.0  # Assuming binary classification
})
submission_df.to_csv('submission.csv', index=False)
print(submission_df.head())
print("Submission file saved successfully.")
