# 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 2. Load train data

In [None]:
# Load train.parquet from the Kaggle input directory into a DataFrame.
train_path = '/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet'
df_train = pd.read_parquet(train_path)

# Leave the frame as the cell's last expression so the notebook renders it.
df_train

# 3. Visualizations

Here are some visualization ideas to explore the data:

## a) Distribution of winners
This will show how often each model (A or B) was selected as the winner.

In [None]:
# Bar chart counting the rows that fall under each value of `winner`.
_, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df_train, x='winner', ax=ax)
ax.set_title('Distribution of Winners (Model A vs. Model B)')
plt.show()

## b) Prompt length distribution
This will give an idea of the typical prompt lengths and their distribution.

In [None]:
# Store len() of every prompt as a column; the scatter plot in section f)
# reads it again.
df_train['prompt_length'] = df_train['prompt'].map(len)

# 50-bin histogram of the prompt lengths.
_, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df_train, x='prompt_length', bins=50, ax=ax)
ax.set(
    title='Distribution of Prompt Lengths',
    xlabel='Prompt Length',
    ylabel='Frequency',
)
plt.show()

## c) Response length distribution for each model
This compares the response lengths of models A and B.

In [None]:
# Store len() of each response as columns; the scatter plot in section f)
# reads them again.
df_train['response_a_length'] = df_train['response_a'].apply(len)
df_train['response_b_length'] = df_train['response_b'].apply(len)

# Derive ONE set of 50 bin edges from both columns together. Passing bins=50
# to each histplot call separately lets every series choose its own edges, so
# the overlaid bars would not line up and could not be compared bin by bin.
shared_bin_edges = np.histogram_bin_edges(
    np.concatenate([
        df_train['response_a_length'].to_numpy(),
        df_train['response_b_length'].to_numpy(),
    ]),
    bins=50,
)

# Overlay the two semi-transparent histograms on the same axes.
plt.figure(figsize=(8, 6))
sns.histplot(df_train['response_a_length'], bins=shared_bin_edges, label='Model A', color='blue', alpha=0.5)
sns.histplot(df_train['response_b_length'], bins=shared_bin_edges, label='Model B', color='red', alpha=0.5)
plt.title('Distribution of Response Lengths')
plt.xlabel('Response Length')
plt.ylabel('Frequency')
plt.legend()
plt.grid()
plt.show()

## d) Language distribution
This shows the frequency of different languages in the prompts.

In [None]:
# Horizontal bar per language; the tall canvas keeps the labels legible.
_, ax = plt.subplots(figsize=(8, 20))
sns.countplot(data=df_train, y='language', ax=ax)
ax.set_title('Distribution of Languages')
ax.set_xlabel('Frequency')
ax.set_ylabel('Language')
ax.grid()
plt.show()

## e) Winner distribution per language
This reveals any potential language-based bias in model performance.

In [None]:
# Same per-language counts as above, with each bar split by `winner`.
_, ax = plt.subplots(figsize=(8, 20))
sns.countplot(data=df_train, y='language', hue='winner', ax=ax)
ax.set(
    title='Winner Distribution per Language',
    xlabel='Frequency',
    ylabel='Language',
)
ax.grid()
plt.show()

## f) Prompt and Response Length Correlation
- **Visualization:** Scatter plot with prompt length on the X-axis and response length (for both models A and B) on the Y-axis. Use different colors or markers to represent models A and B.
- **Purpose:** To examine the relationship between prompt length and the length of the generated responses. It can help you understand if longer prompts tend to elicit longer responses, and if this relationship differs between the two models.

In [None]:
# Scatter prompt length against response length, one point cloud per model,
# drawn on the same axes in the order A then B.
_, ax = plt.subplots(figsize=(8, 6))
for length_column, series_label in (
    ('response_a_length', 'Model A'),
    ('response_b_length', 'Model B'),
):
    sns.scatterplot(
        data=df_train,
        x='prompt_length',
        y=length_column,
        label=series_label,
        alpha=0.5,
        ax=ax,
    )
ax.set_title('Prompt Length vs. Response Length')
ax.set_xlabel('Prompt Length')
ax.set_ylabel('Response Length')
ax.legend()
plt.show()

## g) Winner Distribution by Model Identity
- **Visualization:** A bar chart showing the win rate for each distinct model (model_a and model_b).
- **Purpose:** To compare the overall performance of the different models used in the competition. It can identify which models tend to perform better overall.

In [None]:
# Win rate of each model per seat: among the rows where the model occupies the
# model_a (resp. model_b) column, the fraction whose `winner` is that seat.
# The previous value_counts(normalize=True) over the winning rows produced each
# model's share of all wins (every column summed to 1), which mostly tracks how
# often a model appears rather than how often it wins.
model_a_win_rate = (df_train['winner'] == 'model_a').groupby(df_train['model_a']).mean()
model_b_win_rate = (df_train['winner'] == 'model_b').groupby(df_train['model_b']).mean()

# Align both series on the union of model names; a model that never appears in
# one seat gets NaN in that column.
win_rates = pd.DataFrame({
    'Model A Win Rate': model_a_win_rate,
    'Model B Win Rate': model_b_win_rate,
})

# One pair of horizontal bars per model.
win_rates.plot(kind='barh', figsize=(8, 15))
plt.title('Winner Distribution by Model Identity')
plt.ylabel('Model')
plt.xlabel('Win Rate')
plt.grid()
plt.show()

## h) Cross-Comparison Win Rate Heatmap (Between models)
Each cell of the heatmap shows the win rate (in %) of the row model (Model A) against the column model (Model B).

In [None]:
# Head-to-head league table: rows are the model in the model_a seat, columns
# the model in the model_b seat, and each cell is the percentage of the matches
# between that exact pair that the row model won.
#
# The previous version combined model1's total A-seat wins (against anyone)
# with the B-seat win count of rows where model2 sat in the A seat, so a cell
# did not describe the model1-vs-model2 pairing at all.
model_a_won = (df_train['winner'] == 'model_a').astype(float)
head_to_head = pd.crosstab(
    df_train['model_a'],
    df_train['model_b'],
    values=model_a_won,
    aggfunc='mean',
) * 100

# Use every model seen in either seat so the table is square; pairs that never
# met stay NaN and are left blank by the heatmap instead of showing a fake 0%.
models = (
    pd.Index(df_train['model_a'].dropna().unique())
    .union(df_train['model_b'].dropna().unique())
    .tolist()
)
league_table_numeric = head_to_head.reindex(index=models, columns=models)

# Blank out the diagonal (a model against itself), as the table did before.
off_diagonal = ~np.eye(len(models), dtype=bool)
league_table_numeric = league_table_numeric.where(off_diagonal)

# Display the league table as a heatmap with a colorbar and every tick label.
plt.figure(figsize=(18, 12))
heatmap = sns.heatmap(league_table_numeric, cmap='jet', cbar=True, xticklabels=True, yticklabels=True)
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.title('LLM Model Win Rate League Table')
plt.xlabel('Model B')
plt.ylabel('Model A')
plt.grid()
plt.show()