In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory, one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sys
sys.path.append('/kaggle/input/textstat-pypi/Pyphen-0.9.3-py2.py3-none-any.whl')
!pip install '/kaggle/input/textstat-pypi/Pyphen-0.9.3-py2.py3-none-any.whl'

In [None]:
sys.path.append('/kaggle/input/textstat-pypi/textstat-0.7.0-py3-none-any.whl')
!pip install '/kaggle/input/textstat-pypi/textstat-0.7.0-py3-none-any.whl'

In [None]:
# Intentionally left without code: this cell used to repeat the textstat wheel
# install from the cell above verbatim, which only re-ran pip and added a
# duplicate entry to sys.path.

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import textstat
from textblob import TextBlob
import spacy
import concurrent.futures
import optuna
from sklearn.metrics import log_loss
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# nltk.download('punkt')

In [None]:

def calculate_readability_scores(text):
    """Score ``text`` with five textstat readability formulas.

    Returns a dict whose keys are the feature names used downstream and whose
    values are whatever the matching textstat function returns for ``text``.
    """
    metric_functions = {
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade,
        "gunning_fog": textstat.gunning_fog,
        "smog_index": textstat.smog_index,
        "ari": textstat.automated_readability_index,
        "coleman_liau_index": textstat.coleman_liau_index,
    }
    return {name: score(text) for name, score in metric_functions.items()}

def count_noun_phrases(text):
    """Return how many noun phrases TextBlob extracts from ``text``."""
    return len(TextBlob(text).noun_phrases)

def analyze_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

def count_passive_voice(text):
    """Count the tokens of ``text`` whose dependency label is ``auxpass``.

    Relies on a module-level ``nlp`` callable being defined before this
    function is called; it is not created here.
    """
    return sum(token.dep_ == 'auxpass' for token in nlp(text))

def pos_tag_frequencies(text):
    """Return a plain dict mapping each POS tag found in ``text`` to its count.

    The text is tokenised with ``word_tokenize`` and tagged with
    ``nltk.pos_tag``; only the tags are counted, the words are discarded.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    tag_counts = nltk.FreqDist(tag for _, tag in tagged_tokens)
    # Hand back an ordinary dict rather than the FreqDist object.
    return dict(tag_counts)

def text_statistics(text):
    """Build the full feature dictionary for one piece of text.

    The result starts from the readability scores and adds token, character
    and sentence counts, three ratio features, the noun-phrase count, the
    sentiment polarity, the passive-voice count and one ``pos_tag_<TAG>``
    entry per POS tag present in the text.

    Args:
        text: The string to analyse.

    Returns:
        dict: feature name -> numeric value. The set of ``pos_tag_*`` keys
        varies from text to text.

    Text that yields no tokens or no sentences (for example an empty string)
    gets 0.0 for the affected ratio features instead of raising
    ZeroDivisionError.
    """
    # Tokenise once and reuse; the ratios below all share these two lists.
    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    word_total = len(words)
    sentence_total = len(sentences)

    stats = calculate_readability_scores(text)
    stats.update({
        "word_count": word_total,
        "char_count": len(text),
        "sentence_count": sentence_total,
        "avg_word_length": sum(map(len, words)) / word_total if word_total else 0.0,
        "avg_sentence_length": sum(map(len, sentences)) / sentence_total if sentence_total else 0.0,
        "lexical_diversity": len(set(words)) / word_total if word_total else 0.0,
        "noun_phrases_count": count_noun_phrases(text),
        "sentiment": analyze_sentiment(text),
        "passive_voice_count": count_passive_voice(text),
    })
    # Flatten the POS tag counts into the same dictionary under a common prefix.
    for tag, count in pos_tag_frequencies(text).items():
        stats[f'pos_tag_{tag}'] = count
    return stats

def parallel_apply(df, column):
    """Compute ``text_statistics`` for every non-null value of ``df[column]``.

    Args:
        df: DataFrame holding the text column.
        column: Name of the column to analyse.

    Returns:
        DataFrame with one row per non-null text and one column per feature.
        Rows carry the index labels of the source rows, so the result can be
        joined back onto ``df`` even when some texts were NaN and dropped.
        POS tag columns missing for a given text are filled with 0 and stored
        as integers; all other features keep the dtype they were computed in.
    """
    # NaN values would break the text-processing functions, so skip them.
    texts = df[column].dropna()

    # Fan the per-text work out over worker processes.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = list(executor.map(text_statistics, texts))

    # Keep the source labels: a fresh 0..n-1 index would shift every row after
    # a dropped NaN when the frame is later joined on index.
    results_df = pd.DataFrame(results, index=texts.index)

    # A POS tag absent from a text shows up as NaN here; treat it as a zero count.
    results_df = results_df.fillna(0)

    # Only the POS tag counts were turned into floats by the NaN padding, so
    # only they are cast back to int. Casting every float column would truncate
    # genuinely fractional features such as sentiment or readability scores.
    pos_tag_dtypes = {
        col: int for col in results_df.columns if str(col).startswith('pos_tag_')
    }
    results_df = results_df.astype(pos_tag_dtypes)

    return results_df


In [None]:
# Load the three competition files in one pass: training data, test data and the sample submission.
train, test, sample_sub = map(pd.read_csv, (
    '/kaggle/input/lmsys-chatbot-arena/train.csv',
    '/kaggle/input/lmsys-chatbot-arena/test.csv',
    '/kaggle/input/lmsys-chatbot-arena/sample_submission.csv',
))
print('Data has been imported')

In [None]:
# (rows, columns) of the training frame as read from train.csv.
train.shape

In [None]:
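# Superseded draft of parallel_apply, kept for reference only; the active
# definition lives in the feature-function cell earlier in the notebook.
# def parallel_apply(df, column):
#     with concurrent.futures.ProcessPoolExecutor() as executor:
#         results = list(executor.map(text_statistics, df[column].dropna()))  # Use dropna to handle NaNs gracefully
#     return pd.DataFrame(results)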

In [None]:
# Load the spaCy pipeline first: count_passive_voice reads this module-level `nlp`.
nlp = spacy.load('en_core_web_sm')

# One feature table per text field of the training set, computed in field order.
prompt_stats_df, response_a_stats_df, response_b_stats_df = (
    parallel_apply(train, text_column)
    for text_column in ('prompt', 'response_a', 'response_b')
)

In [None]:
# Attach each feature table to the training frame; the suffix records which text field it came from.
for stats_frame, suffix in (
    (prompt_stats_df, '_prompt'),
    (response_a_stats_df, '_response_a'),
    (response_b_stats_df, '_response_b'),
):
    train = train.join(stats_frame.add_suffix(suffix))

In [None]:
# (rows, columns) of the training frame after the feature tables were joined on.
train.shape

In [None]:
# Preview the first rows of the training frame, including the joined feature columns.
train.head()

In [None]:
%%time

import time
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import log_loss
from scipy.stats import uniform, randint

# Collapse the three one-hot outcome columns into a single label:
# 1 = model A wins, 2 = model B wins, 3 = tie.
outcome_codes = {'winner_model_a': 1, 'winner_model_b': 2, 'winner_tie': 3}
train['winner'] = sum(train[col] * code for col, code in outcome_codes.items()).astype(int)

# Identifier, raw-text and target columns are not model inputs.
columns_to_remove = {
    'id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b',
    'winner_model_a', 'winner_model_b', 'winner_tie', 'winner',
}

# Everything else, in the frame's column order, is a feature.
features = train.columns[~train.columns.isin(columns_to_remove)].tolist()

X = train.loc[:, features]
# Shift the labels to 0, 1, 2.
y = train['winner'].sub(1)

In [None]:
# Define the Optimization Function
def objective(trial):
    """Optuna objective: fit one candidate model and return its validation log loss.

    Reads the module-level ``X`` and ``y``. Each call makes the same stratified
    80/20 split (fixed ``random_state``), so trials are scored on identical
    validation rows.

    Args:
        trial: The Optuna trial used to sample the model family,
            ``n_estimators``, ``max_depth`` and ``learning_rate``.

    Returns:
        The log loss of the fitted model's predicted probabilities on the
        validation split (lower is better).

    Raises:
        ValueError: If the sampled model type is not one of the three
            supported classifiers.
    """
    # Data splitting inside the trial
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Define model and hyperparameters to optimize
    model_type = trial.suggest_categorical('model_type', ['XGBClassifier', 'LGBMClassifier', 'CatBoostClassifier'])
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 2, 10)
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform;
    # the sampled range and log scale are the same.
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 0.1, log=True)

    if model_type == 'XGBClassifier':
        model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, use_label_encoder=False, eval_metric='logloss', random_state=42)
    elif model_type == 'LGBMClassifier':
        model = LGBMClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, random_state=42)
    elif model_type == 'CatBoostClassifier':
        model = CatBoostClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, verbose=0, random_state=42)
    else:
        # Fail with a clear message instead of an unbound `model` further down.
        raise ValueError(f'Unsupported model_type: {model_type!r}')

    # Training and evaluating the model
    model.fit(X_train, y_train)
    y_val_pred = model.predict_proba(X_val)
    return log_loss(y_val, y_val_pred)

# Search: five Optuna trials, minimising the value returned by `objective`.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)

best_params = study.best_trial.params
print('Best trial:', best_params)

# The winning model family is stored alongside its hyperparameters; split it off
# so the remaining entries can be passed straight to the constructor.
model_type = best_params.pop('model_type')
model_factories = {
    'XGBClassifier': lambda: XGBClassifier(**best_params, use_label_encoder=False, eval_metric='logloss', random_state=42),
    'LGBMClassifier': lambda: LGBMClassifier(**best_params, random_state=42),
    'CatBoostClassifier': lambda: CatBoostClassifier(**best_params, verbose=0, random_state=42),
}
final_model = model_factories[model_type]()

# Refit the chosen configuration on every training row.
final_model.fit(X, y)

In [None]:
# Display the estimator that was fitted on the full training data.
final_model

In [None]:
# One feature table per text field of the test set, computed in field order.
# The module-level `nlp` pipeline must already be loaded when this runs.
prompt_stats_df_test, response_a_stats_df_test, response_b_stats_df_test = (
    parallel_apply(test, text_column)
    for text_column in ('prompt', 'response_a', 'response_b')
)

In [None]:
# Attach each feature table to the test frame, using the same suffixes as for the training frame.
for stats_frame, suffix in (
    (prompt_stats_df_test, '_prompt'),
    (response_a_stats_df_test, '_response_a'),
    (response_b_stats_df_test, '_response_b'),
):
    test = test.join(stats_frame.add_suffix(suffix))

In [None]:
# Give the test matrix exactly the training feature columns, in the same order.
# The pos_tag_* columns depend on which tags occur in the data, so a tag seen in
# train may be absent from test: plain `test[features]` would raise KeyError.
# reindex creates any missing column filled with 0 (the tag never occurred)
# and drops columns that are not model features.
test = test.reindex(columns=features, fill_value=0)
test.head()

In [None]:
# Display the first rows of the training frame again; this cell changes no state.
train.head()

In [None]:

# Class-probability matrix for the test rows. Its columns 0, 1 and 2 are written
# out further down as winner_model_a, winner_model_b and winner_tie respectively.
test_predictions = final_model.predict_proba(test)

In [None]:
# Inspect the predicted probabilities.
test_predictions

In [None]:
# Read only the id column from the test CSV: by this point `test` has been cut down
# to the model's feature columns, which do not include `id`.
test_raw = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv', usecols=['id'])


In [None]:
# Assemble the submission: the row ids followed by one probability column per outcome,
# taken from prediction columns 0, 1 and 2 in that order.
probability_columns = ('winner_model_a', 'winner_model_b', 'winner_tie')
submission = pd.DataFrame({
    'id': test_raw['id'],
    **{name: test_predictions[:, position] for position, name in enumerate(probability_columns)},
})

submission.head()

In [None]:
# Write the submission to the notebook's working directory, leaving out the DataFrame index.
submission.to_csv('/kaggle/working/submission.csv', index= False)
