In [None]:
!pip install ../input/textstat/Pyphen-0.10.0-py3-none-any.whl
!pip install ../input/textstat/textstat-0.7.0-py3-none-any.whl
import textstat

In [None]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
import nltk
import textstat
from textblob import TextBlob
from collections import Counter

import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Pandas display: two-decimal floats, and no truncation of rows or columns.
pd.set_option('display.float_format', '{:.2f}'.format)
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [None]:
# Load the competition's train and test tables plus the sample submission
# from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv')
test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')
sample_sub = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/sample_submission.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Report the (rows, columns) shape of both frames.
# Wording fixed: the original message read "... train data: <shape> is and the test data ...".
print(f"The size of the train data is: {train.shape} and the test data is: {test.shape}")

In [None]:
# Show the tally of values in each outcome indicator column.
for outcome_column in ('winner_model_a', 'winner_model_b', 'winner_tie'):
    print(train[outcome_column].value_counts())

In [None]:
# One stacked panel per outcome indicator column
fig, axes = plt.subplots(3, 1, figsize=(7, 6))

# Indicator columns, in panel order (top to bottom)
columns = ['winner_model_a', 'winner_model_b', 'winner_tie']

# Bar colour keyed by the indicator value
colors = {0: 'steelblue', 1: 'salmon'}

for ax, column in zip(axes, columns):
    # Counts of 0 and 1, ordered by indicator value
    value_counts = train[column].value_counts().sort_index()
    bar_colors = [colors[idx] for idx in value_counts.index]
    legend_labels = value_counts.index.map({0: 'Lose (0)', 1: 'Win (1)'})

    bars = ax.bar(
        value_counts.index.astype(str),
        value_counts,
        color=bar_colors,
        label=legend_labels,
    )

    # Write each count just above its bar
    for bar in bars:
        height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            f'{height}',
            xy=(bar_centre, height),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

    ax.set(
        xlabel='Winner',
        ylabel='Count',
        title=f'Model {column.split("_")[-1].capitalize()} Counts',
    )
    ax.legend(title='Outcome', loc='upper right')

# Figure-level title; the rect leaves headroom so it does not overlap the panels
fig.suptitle('Distribution of Winners Across Models', fontsize=16)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])

# Render the figure
plt.show()

In [None]:
def process(input_str):
    """Flatten a JSON-encoded list of conversation turns into one string.

    The input is expected to look like ``'["turn 1","turn 2"]'``. It is
    decoded with ``json.loads`` so escape sequences (``\\n``, ``\\"``,
    ``\\uXXXX``) become the characters they stand for, instead of being
    left in the text as literal backslash sequences.

    Args:
        input_str: Raw cell value, normally a JSON array of strings
            (elements may be ``null``).

    Returns:
        The turns joined by single spaces. ``null`` turns are dropped.
        Characters that cannot be encoded as UTF-8 (lone surrogates) are
        replaced by ``?``. If the input is not a JSON array, the original
        best-effort bracket/quote stripping is applied instead.
    """
    import json  # local import keeps this cell self-contained

    try:
        turns = json.loads(input_str)
    except (TypeError, ValueError):
        # Not valid JSON (or not a string at all): use the fallback below.
        turns = None

    if isinstance(turns, list):
        text = ' '.join(str(turn) for turn in turns if turn is not None)
        # A lone "\udXXX" escape decodes to a surrogate that cannot be written
        # or displayed as UTF-8; swap any such character for "?".
        return text.encode('utf-8', 'replace').decode('utf-8')

    # Fallback: strip the outer brackets and the quotes around each element.
    stripped_str = input_str.strip('[]')
    sentences = [s.strip('"') for s in stripped_str.split('","')]
    return ' '.join(sentences)

# Flatten the raw text columns of both frames in place: test first, then train.
for frame in (test, train):
    for text_column in ('prompt', 'response_a', 'response_b'):
        frame.loc[:, text_column] = frame[text_column].apply(process)

In [None]:
# Show the first three training rows after the text columns were flattened.
train.head(3)

In [None]:
# Show the first three test rows after the text columns were flattened.
test.head(3)

In [None]:
%%time

# Function to compute word count
def word_count(text):
    """Return how many tokens ``nltk.word_tokenize`` yields for ``text``."""
    tokens = nltk.word_tokenize(text)
    return len(tokens)

# Function to compute character count
def char_count(text):
    """Return the length of ``text`` in characters."""
    n_chars = len(text)
    return n_chars

# Function to compute sentence count
def sentence_count(text):
    """Return how many sentences ``nltk.sent_tokenize`` finds in ``text``."""
    sentences = nltk.sent_tokenize(text)
    return len(sentences)

# Function to compute average word length
def avg_word_length(text):
    """Return the mean token length of ``text``, or 0 when it has no tokens."""
    tokens = nltk.word_tokenize(text)
    if not tokens:
        return 0
    total_chars = 0
    for token in tokens:
        total_chars += len(token)
    return total_chars / len(tokens)

# Function to compute average sentence length
def avg_sentence_length(text):
    """Return tokens per sentence for ``text``, or 0 when no sentence is found."""
    n_tokens = len(nltk.word_tokenize(text))
    n_sentences = len(nltk.sent_tokenize(text))
    return n_tokens / n_sentences if n_sentences else 0

# Function to compute type-token ratio
def ttr(text):
    """Return the type-token ratio (distinct tokens / all tokens); 0 if no tokens."""
    tokens = nltk.word_tokenize(text)
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)

# Function to compute word frequency
def word_freq(text):
    """Return a ``Counter`` mapping each token of ``text`` to its number of occurrences."""
    frequencies = Counter()
    frequencies.update(nltk.word_tokenize(text))
    return frequencies

# Function to compute bigram frequency
def bigram_freq(text):
    """Return a ``Counter`` over the consecutive token pairs (bigrams) of ``text``."""
    tokens = nltk.word_tokenize(text)
    token_pairs = nltk.bigrams(tokens)
    return Counter(token_pairs)

# Function to compute readability scores
def readability_scores(text):
    """Return four ``textstat`` readability measures for ``text`` as a dict.

    Keys: ``flesch_kincaid_score``, ``gunning_fog_index``, ``smog_index``
    and ``ari``.
    """
    scorers = (
        ("flesch_kincaid_score", textstat.flesch_kincaid_grade),
        ("gunning_fog_index", textstat.gunning_fog),
        ("smog_index", textstat.smog_index),
        ("ari", textstat.automated_readability_index),
    )
    return {label: scorer(text) for label, scorer in scorers}

# Add the text-statistics features to the training frame, one group per text column.
# The tuple order below fixes the order in which the new columns are appended.
for column in ["prompt", "response_a", "response_b"]:
    texts = train[column]
    for suffix, extractor in (
        ("word_count", word_count),
        ("char_count", char_count),
        ("sentence_count", sentence_count),
        ("avg_word_length", avg_word_length),
        ("avg_sentence_length", avg_sentence_length),
    ):
        train[f"{column}_{suffix}"] = texts.apply(extractor)
    # Type-token ratio and readability features are currently disabled:
#     train[f"{column}_ttr"] = train[column].apply(ttr)
#     readability = train[column].apply(readability_scores)
#     train[f"{column}_flesch_kincaid_score"] = readability.apply(lambda x: x["flesch_kincaid_score"])
#     train[f"{column}_gunning_fog_index"] = readability.apply(lambda x: x["gunning_fog_index"])
#     train[f"{column}_smog_index"] = readability.apply(lambda x: x["smog_index"])
#     train[f"{column}_ari"] = readability.apply(lambda x: x["ari"])

train.head()

In [None]:
%%time

import time
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import log_loss
from scipy.stats import uniform, randint

# `clone` makes unfitted copies, so the cross-validation refits below cannot
# overwrite the estimator that RandomizedSearchCV already fitted on X_train.
from sklearn.base import clone

# Convert the target into a single column with categorical labels
# (1 = model A wins, 2 = model B wins, 3 = tie).
train['winner'] = (train['winner_model_a'] * 1 + train['winner_model_b'] * 2 + train['winner_tie'] * 3).astype(int)

# Define features and target: every column that is not an identifier,
# raw text or a target becomes a feature.
columns_to_remove = {'id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 
                     'winner_model_a', 'winner_model_b', 'winner_tie', 'winner'}

features = [col for col in train.columns if col not in columns_to_remove]

X = train[features]
y = train['winner'] - 1  # shift to 0-based classes: 0 = A, 1 = B, 2 = tie

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define the models
models = {
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'XGBClassifier': XGBClassifier()
}

# Define the parameter distributions for random search
param_distributions = {
    'GradientBoostingClassifier': {
        'n_estimators': [100,200,350,300],
        'max_depth': [2,3,4,5,7,9]
    },
    'XGBClassifier': {
        'n_estimators': [100,200,350,300],
        'max_depth': [2,3,4,5,7,9]
    }
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)

best_models = {}  # Best estimator per model type, fitted on X_train only
val_losses = {}   # Hold-out log loss per model type

# Iterate over each model
for model_name, model in models.items():
    print(f"Model training for {model_name}")
    
    # Perform RandomizedSearchCV; with the default refit=True the best
    # estimator is refitted on the whole of X_train.
    random_search = RandomizedSearchCV(model, param_distributions[model_name], n_iter=10, scoring='neg_log_loss', 
                                       n_jobs=-1, cv=skf, random_state=42)
    random_search.fit(X_train, y_train)
    
    best_model = random_search.best_estimator_
    best_models[model_name] = best_model  # Store the best model for the current type
    
    logloss_scores = []
    start_time = time.time()
    
    # Cross-validate the chosen hyper-parameters on the full data set.
    # Each fold trains a fresh clone: fitting `best_model` itself here would
    # retrain it on rows of X_val, which would make the validation loss below
    # (and the model choice based on it) optimistic.
    for count, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

        fold_model = clone(best_model)
        fold_model.fit(X_train_fold, y_train_fold)
        y_test_pred_proba = fold_model.predict_proba(X_test_fold)

        logloss = log_loss(y_test_fold, y_test_pred_proba)
        logloss_scores.append(logloss)
        print(f"The log loss score for fold {count}: {logloss}")

    average_logloss = sum(logloss_scores) / len(logloss_scores)
    print(f"The average log loss score for {model_name} across all folds: {average_logloss}")
    
    elapsed_time = time.time() - start_time
    print(f"Time taken for {model_name}: {elapsed_time:.2f} seconds")
    
    # Predict probabilities on the validation set (never seen by best_model)
    y_val_prob = best_model.predict_proba(X_val)
    # Calculate log loss on the validation set
    val_loss = log_loss(y_val, y_val_prob)
    val_losses[model_name] = val_loss
    print(f'Log Loss using {model_name} on validation set: {val_loss}')

# Identify the best model based on validation set performance
best_model_name = min(val_losses, key=val_losses.get)
best_average_logloss = val_losses[best_model_name]

# The reported figure is the hold-out validation loss, not a fold average.
print(f"The best model is {best_model_name} with a validation log loss of {best_average_logloss}")

In [None]:
# Use the estimator stored under the name selected by validation log loss.
model_to_use = best_models[best_model_name]
# Bare expression: shows the chosen estimator as the cell output.
model_to_use

In [None]:
# Add the same text-statistics features to the test frame, one group per text column.
# The tuple order below fixes the order in which the new columns are appended.
for column in ["prompt", "response_a", "response_b"]:
    texts = test[column]
    for suffix, extractor in (
        ("word_count", word_count),
        ("char_count", char_count),
        ("sentence_count", sentence_count),
        ("avg_word_length", avg_word_length),
        ("avg_sentence_length", avg_sentence_length),
    ):
        test[f"{column}_{suffix}"] = texts.apply(extractor)

test.head()

In [None]:
# Select the feature columns listed in `features`, then predict
# per-class probabilities with the chosen model.
test_features = test[features]
test_predictions = model_to_use.predict_proba(test_features)

In [None]:
# Display the predicted probability array.
test_predictions

In [None]:
# Assemble the submission: the test ids followed by one probability column per
# outcome, taken from prediction columns 0, 1 and 2 in that order.
submission = pd.DataFrame({
    'id': test['id'],
    **{
        label: test_predictions[:, position]
        for position, label in enumerate(('winner_model_a', 'winner_model_b', 'winner_tie'))
    },
})

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Write the submission to the Kaggle working directory, without the index column.
submission.to_csv('/kaggle/working/submission.csv', index= False)