<a href="https://colab.research.google.com/github/varunkk732/LLM-Comparison/blob/main/LLM_Comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the raw LLM comparison data (one or more rows per model).
df = pd.read_csv("llm_comparison_dataset.csv")


def most_frequent(values):
    """Return the most frequent non-null value in ``values``.

    ``Series.mode()`` is empty when every entry of a group is missing, in
    which case indexing it with ``[0]`` raises. Such groups yield ``pd.NA``
    instead, so one bad group does not abort the whole aggregation.
    When several values tie, the first entry returned by ``mode()`` is used.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else pd.NA


# Define aggregation rules: categorical columns keep their most frequent
# value, every numeric column is averaged over the rows of each model.
agg_rules = {
    "Provider": most_frequent,  # Most frequent provider
    "Context Window": "mean",
    "Speed (tokens/sec)": "mean",
    "Latency (sec)": "mean",
    "Benchmark (MMLU)": "mean",
    "Benchmark (Chatbot Arena)": "mean",
    "Open-Source": most_frequent,  # Most frequent value
    "Price / Million Tokens": "mean",
    "Training Dataset Size": "mean",
    "Compute Power": "mean",
    "Energy Efficiency": "mean",
    "Quality Rating": "mean",
    "Speed Rating": "mean",
    "Price Rating": "mean"
}

# Aggregate data by model name (one output row per distinct "Model")
df_agg = df.groupby("Model").agg(agg_rules).reset_index()

# Display the new dataset structure. A bare `df_agg.head()` in the middle of
# a cell is not rendered, so print it explicitly.
print(df_agg.head())

# Persist the aggregated frame; later cells re-read it from this file.
df_agg.to_csv("llm_comparison_aggregated.csv", index=False)

print("File saved as llm_comparison_aggregated.csv")

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Re-read the per-model aggregate written by the aggregation cell
df_agg = pd.read_csv("llm_comparison_aggregated.csv")

# Columns to rescale: everything stored as int64/float64, minus the
# "Open-Source" flag, which is treated as categorical
numeric_frame = df_agg.select_dtypes(include=["int64", "float64"])
num_cols = numeric_frame.columns.difference(["Open-Source"])

# Min-Max scaling: learn each column's range, then rescale in place
scaler = MinMaxScaler()
scaler.fit(df_agg[num_cols])
df_agg[num_cols] = scaler.transform(df_agg[num_cols])

# Write the scaled copy next to the aggregated file
df_agg.to_csv("llm_comparison_scaled.csv", index=False)

print("Min-Max Scaling applied and saved as llm_comparison_scaled.csv")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the per-model aggregated data
data = pd.read_csv('llm_comparison_aggregated.csv')

# Display the first few rows before standardization
print("Original Data (First 5 rows):")
print(data.head())

# Columns to standardize (exclude categorical columns)
numeric_columns = [
    'Context Window', 'Speed (tokens/sec)', 'Latency (sec)',
    'Benchmark (MMLU)', 'Benchmark (Chatbot Arena)',
    'Price / Million Tokens', 'Training Dataset Size',
    'Compute Power', 'Energy Efficiency',
    'Quality Rating', 'Speed Rating', 'Price Rating'
]

# Create a copy for the standardized data so `data` keeps the raw values
data_standardized = data.copy()

# Apply z-score standardization to numeric columns.
# `Series.std()` is the sample standard deviation; it is 0 for a constant
# column and NaN for a single-row frame. Dividing by it would fill the column
# with NaN/inf, so in those cases the column is only centred (all zeros).
for column in numeric_columns:
    mean = data[column].mean()
    std = data[column].std()
    centered = data[column] - mean
    if pd.isna(std) or std == 0:
        data_standardized[column] = centered
    else:
        data_standardized[column] = centered / std

# Display the first few rows after standardization
print("\nStandardized Data (First 5 rows):")
print(data_standardized.head())

# Summary statistics for standardized columns (mean ~0, std ~1 expected)
print("\nSummary Statistics for Standardized Columns:")
print(data_standardized[numeric_columns].describe().T[['mean', 'std']])

# Save the standardized data to a new CSV file
data_standardized.to_csv('llm_comparison_standardized.csv', index=False)
print("\nStandardized data saved to 'llm_comparison_standardized.csv'")

In [None]:
import pandas as pd
import numpy as np

def add_model_encoding(frame, provider_codes):
    """Return a copy of ``frame`` with numeric model/provider codes added.

    Three columns are appended, in this order:
      * ``Model_number``  - the digits after the first '-' in ``Model``
        (e.g. "GPT-2" -> 2),
      * ``Provider_code`` - ``Provider`` looked up in ``provider_codes``,
      * ``Model_encoded`` - ``Provider_code * 100 + Model_number``.

    Raises:
        ValueError: if a model name contains no '-<digits>' part, because no
            model number can be derived for it.
    """
    encoded = frame.copy()
    # expand=False returns a Series (not a one-column DataFrame) for the
    # single capture group.
    model_number = encoded['Model'].str.extract(r'-(\d+)', expand=False)
    unparsed = encoded.loc[model_number.isna(), 'Model'].unique().tolist()
    if unparsed:
        raise ValueError(f"No '-<number>' found in model name(s): {unparsed}")
    encoded['Model_number'] = model_number.astype(int)
    encoded['Provider_code'] = encoded['Provider'].map(provider_codes)
    # Formula: provider_code * 100 + model_number
    # NOTE(review): codes are only collision-free while Model_number < 100 -
    # confirm this holds for the dataset.
    encoded['Model_encoded'] = encoded['Provider_code'] * 100 + encoded['Model_number']
    return encoded


# Load the data to encode
data = pd.read_csv('llm_comparison_standardized.csv')

# MORE EFFICIENT ENCODING for Model using Provider information
print("\nEFFICIENT ENCODING FOR MODEL")
print("-" * 40)

# Number the providers in order of first appearance
unique_providers = data['Provider'].unique()
provider_code_map = {provider: i for i, provider in enumerate(unique_providers)}

# Encode the standardized data
encoded_data = add_model_encoding(data, provider_code_map)

# Print provider mapping
print("Provider Code Mapping:")
for provider, code in provider_code_map.items():
    print(f"  {provider} -> {code}")

# Create and print the model -> code mapping
unique_models = encoded_data[['Model', 'Model_encoded']].drop_duplicates()
model_mapping = dict(zip(unique_models['Model'], unique_models['Model_encoded']))

print("\nEfficient Model Encoding Mapping:")
for original, encoded in model_mapping.items():
    print(f"  {original} -> {encoded}")

# Show a subset of the encoded data
print("\nFirst 5 rows of encoded data:")
cols_to_show = ['Model', 'Model_encoded', 'Provider', 'Provider_code', 'Model_number']
print(encoded_data[cols_to_show].head())

# Save the encoded dataset
encoded_data.to_csv('llm_comparison_efficient_encoded.csv', index=False)
print("\nEncoded data saved to 'llm_comparison_efficient_encoded.csv'")

# The prediction and clustering cells read 'llm_comparison_standardized_encoded.csv'
# and 'llm_comparison_scaled_encoded.csv', which no other cell writes. Produce
# both here so the notebook runs top to bottom on a fresh kernel. The scaled
# file shares the provider codes of the standardized one.
encoded_data.to_csv('llm_comparison_standardized_encoded.csv', index=False)
scaled_encoded_data = add_model_encoding(
    pd.read_csv('llm_comparison_scaled.csv'), provider_code_map
)
scaled_encoded_data.to_csv('llm_comparison_scaled_encoded.csv', index=False)
print("Encoded data also saved to 'llm_comparison_standardized_encoded.csv' "
      "and 'llm_comparison_scaled_encoded.csv'")

In [None]:
# LLM Performance Prediction Project
# This script builds models to predict LLM benchmark performance based on various features

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Seed NumPy's global RNG so repeated runs behave the same
np.random.seed(42)

# Read both encoded variants of the dataset
df_scaled = pd.read_csv('llm_comparison_scaled_encoded.csv')
df_standardized = pd.read_csv('llm_comparison_standardized_encoded.csv')

# The rest of this cell works on a private copy of the standardized variant
df = df_standardized.copy()

# Quick look at the frame: size, column names, first rows
print("Dataset Shape:", df.shape)
print("\nColumns:", list(df.columns))
print("\nSample Data:")
print(df.head())

# Per-column count of missing entries
print("\nMissing Values:")
print(df.isna().sum())

# Provider code -> provider name lookup, pairing the distinct values of the
# two columns in order of first appearance
providers = dict(zip(df['Provider_code'].unique(), df['Provider'].unique()))

# Data Exploration
# ------------------------------------------------------
# Each figure is drawn on explicitly created axes. The earlier
# `plt.subplot(1, 2, 1)` / `plt.subplot(2, 1, 1)` calls reserved a grid slot
# for a second chart that is never drawn, leaving half of the saved image
# blank; single-chart figures now use the whole canvas.

# Distribution of benchmark scores
fig, ax = plt.subplots(figsize=(14, 6))
sns.histplot(df['Benchmark (MMLU)'], kde=True, ax=ax)
ax.set_title('Distribution of MMLU Benchmark Scores')
ax.set_xlabel('Standardized Score')
fig.tight_layout()
fig.savefig('benchmark_distributions.png')
plt.close(fig)

# Correlation matrix for key features
feature_cols = ['Context Window', 'Speed (tokens/sec)', 'Latency (sec)',
                'Price / Million Tokens', 'Training Dataset Size',
                'Compute Power', 'Energy Efficiency', 'Open-Source']

# `correlation` is reused further down when the insights report is filled in.
correlation = df[feature_cols + ['Benchmark (MMLU)']].corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
fig.savefig('correlation_matrix.png')
plt.close(fig)

# Box plots for benchmark scores by provider
fig, ax = plt.subplots(figsize=(14, 10))
sns.boxplot(x='Provider', y='Benchmark (MMLU)', data=df, ax=ax)
ax.set_title('MMLU Benchmark Scores by Provider')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
fig.savefig('benchmark_by_provider.png')
plt.close(fig)

# Scatter plots to explore relationships between key features and benchmarks
# (first four features only, on a 2x2 grid)
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

for i, feature in enumerate(feature_cols[:4]):
    ax = axes.flat[i]
    sns.scatterplot(x=feature, y='Benchmark (MMLU)', data=df, hue='Provider', ax=ax)
    ax.set_title(f'{feature} vs MMLU Benchmark')
    # Park the legend outside the axes so it does not cover the points.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

fig.tight_layout()
fig.savefig('feature_vs_mmlu.png')
plt.close(fig)

# Feature Engineering
# ------------------------------------------------------

# Efficiency ratio: speed divided by (energy efficiency + 3).
# NOTE(review): the inputs are z-scores, so the +3 offset only keeps the
# denominator positive while 'Energy Efficiency' stays above -3 - confirm.
energy_offset = df['Energy Efficiency'] + 3
df['Efficiency_Ratio'] = df['Speed (tokens/sec)'].div(energy_offset)

# Three equal-frequency price buckets
price_labels = ['Low', 'Medium', 'High']
df['Price_Tier'] = pd.qcut(df['Price / Million Tokens'], q=3, labels=price_labels)

# Model generation indicator (a straight copy of 'Model_number').
# NOTE(review): both columns remain in X below, so the same feature is
# present twice - confirm this is intended.
df['Generation'] = df['Model_number']

# One-hot encode the categorical columns, dropping the first level of each
categorical_cols = ['Provider', 'Price_Tier']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Target is the MMLU benchmark; features are everything except the target
# and the identifier/code columns
non_feature_cols = ['Model', 'Benchmark (MMLU)', 'Model_encoded', 'Provider_code']
y_mmlu = df_encoded['Benchmark (MMLU)']
X = df_encoded.drop(columns=non_feature_cols)

# Hold out 25% of the rows for testing
X_train, X_test, y_mmlu_train, y_mmlu_test = train_test_split(
    X, y_mmlu, test_size=0.25, random_state=42
)

# Model Building - For MMLU Benchmark
# ------------------------------------------------------

# Candidate regressors, keyed by the display name used in the printed results
models_mmlu = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}

# Function to evaluate models
def evaluate_models(models, X_train, X_test, y_train, y_test):
    """Fit every candidate model and score it on the held-out split.

    Args:
        models: mapping of display name -> estimator exposing ``fit`` and
            ``predict``. Estimators are fitted in place.
        X_train, y_train: data each estimator is fitted on.
        X_test, y_test: data the fitted estimator is scored on.

    Returns:
        dict keyed by model name, in the iteration order of ``models``. Each
        value holds the fitted ``'model'`` and its test-set ``'mse'``,
        ``'rmse'``, ``'mae'`` and ``'r2'``.
    """
    def _fit_and_score(label, estimator):
        # Train, predict on the test split, then compute the four metrics.
        print(f"Training {label}...")
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)

        squared_error = mean_squared_error(y_test, predictions)
        scores = {
            'model': estimator,
            'mse': squared_error,
            'rmse': np.sqrt(squared_error),
            'mae': mean_absolute_error(y_test, predictions),
            'r2': r2_score(y_test, predictions),
        }

        print(f"{label} - RMSE: {scores['rmse']:.4f}, MAE: {scores['mae']:.4f}, R²: {scores['r2']:.4f}")
        return scores

    return {label: _fit_and_score(label, estimator) for label, estimator in models.items()}

# Fit and score every candidate on the MMLU target
print("\nEvaluating models for MMLU benchmark prediction:")
mmlu_results = evaluate_models(models_mmlu, X_train, X_test, y_mmlu_train, y_mmlu_test)

# Pick the candidate with the highest test-set R² (first one wins on ties);
# `best_mmlu_model` is a (name, metrics-dict) pair
_best_name = max(mmlu_results, key=lambda name: mmlu_results[name]['r2'])
best_mmlu_model = (_best_name, mmlu_results[_best_name])

print(f"\nBest model for MMLU prediction: {best_mmlu_model[0]} with R² = {best_mmlu_model[1]['r2']:.4f}")

# Feature Importance Analysis
# ------------------------------------------------------

# For the best models, extract feature importance if available
def plot_feature_importance(model, model_name, X_columns, target_name):
    """Plot and return the 15 largest feature importances of ``model``.

    Args:
        model: fitted estimator; only its ``feature_importances_`` attribute
            is read.
        model_name: label used in the plot title and in the fallback message.
        X_columns: feature names, aligned with ``model.feature_importances_``.
        target_name: label used in the title; lower-cased with spaces turned
            into underscores it also forms the PNG file name.

    Returns:
        DataFrame with ``Feature`` and ``Importance`` columns sorted by
        descending importance (at most 15 rows), or ``None`` when the model
        has no ``feature_importances_`` attribute.
    """
    # Guard clause: models without importances get a message and no plot.
    if not hasattr(model, 'feature_importances_'):
        print(f"Model {model_name} doesn't have feature_importances_ attribute")
        return None

    ranked = (
        pd.DataFrame({'Feature': X_columns, 'Importance': model.feature_importances_})
        .sort_values('Importance', ascending=False)
        .head(15)
    )

    # Horizontal bar chart, saved to disk and closed so it is not kept open.
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=ranked)
    plt.title(f'Feature Importance for {target_name} using {model_name}')
    plt.tight_layout()
    file_stem = target_name.lower().replace(" ", "_")
    plt.savefig(f'feature_importance_{file_stem}.png')
    plt.close()

    return ranked

# Unpack the winning (name, metrics) pair
best_mmlu_model_name, _best_metrics = best_mmlu_model
best_mmlu_model_obj = _best_metrics['model']

# Rank the winner's features; this is None for models without importances
mmlu_importance = plot_feature_importance(
    best_mmlu_model_obj,
    best_mmlu_model_name,
    X.columns,
    'MMLU Benchmark',
)

if mmlu_importance is not None:
    print("\nTop 5 features for MMLU prediction:")
    print(mmlu_importance.head())

# Model Tuning
# ------------------------------------------------------

# Function to perform grid search for hyperparameter tuning
def tune_model(model, param_grid, X_train, y_train):
    """Grid-search ``param_grid`` for ``model`` and return the best estimator.

    The search uses 5-fold cross-validation scored by R² on all available
    cores (``n_jobs=-1``). The winning parameters and their cross-validation
    score are printed before ``best_estimator_`` is returned.
    """
    search = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1)
    search.fit(X_train, y_train)

    best_params, best_cv_r2 = search.best_params_, search.best_score_
    print(f"Best parameters: {best_params}")
    print(f"Best cross-validation R²: {best_cv_r2:.4f}")

    return search.best_estimator_

# Define parameter grids for the best models.
# Only the two tree ensembles have a grid here. Any other winner (Linear or
# Ridge regression) falls through to the `else` branch and is reused as is,
# so `tuned_mmlu_model` is always defined for the evaluation below.
if best_mmlu_model_name == 'Random Forest':
    param_grid_mmlu = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    print("\nTuning Random Forest for MMLU prediction:")
    tuned_mmlu_model = tune_model(RandomForestRegressor(random_state=42), param_grid_mmlu, X_train, y_mmlu_train)
elif best_mmlu_model_name == 'Gradient Boosting':
    param_grid_mmlu = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0]
    }
    print("\nTuning Gradient Boosting for MMLU prediction:")
    tuned_mmlu_model = tune_model(GradientBoostingRegressor(random_state=42), param_grid_mmlu, X_train, y_mmlu_train)
else:
    # Without this branch the name was never bound and the prediction below
    # raised NameError whenever a linear model scored best.
    print(f"\nNo tuning grid defined for {best_mmlu_model_name}; using the untuned model.")
    tuned_mmlu_model = best_mmlu_model_obj

# Evaluate the tuned model on the held-out test split
y_mmlu_pred_tuned = tuned_mmlu_model.predict(X_test)
tuned_mse = mean_squared_error(y_mmlu_test, y_mmlu_pred_tuned)
tuned_rmse = np.sqrt(tuned_mse)
tuned_r2 = r2_score(y_mmlu_test, y_mmlu_pred_tuned)

print(f"\nTuned model performance for MMLU prediction:")
print(f"RMSE: {tuned_rmse:.4f}, R²: {tuned_r2:.4f}")

# Predicted vs Actual Plot: points on the dashed red diagonal are perfect
# predictions
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_mmlu_test, y_mmlu_pred_tuned, alpha=0.7)
actual_low, actual_high = y_mmlu_test.min(), y_mmlu_test.max()
ax.plot([actual_low, actual_high], [actual_low, actual_high], 'r--')
ax.set_xlabel('Actual MMLU Benchmark Score')
ax.set_ylabel('Predicted MMLU Benchmark Score')
ax.set_title('Predicted vs Actual MMLU Benchmark Scores (Tuned Model)')
fig.tight_layout()
fig.savefig('predicted_vs_actual_mmlu.png')
plt.close(fig)

# Model for future LLM prediction
# ------------------------------------------------------

# Refit on every row. `final_model` is the same object as `tuned_mmlu_model`,
# so the tuned model itself is refitted in place.
final_model = tuned_mmlu_model
final_model.fit(X, y_mmlu)

# Save insights and findings
# Markdown template; the {placeholders} are filled by `str.format` below.
insights = """
# LLM Performance Prediction - Key Insights

## Model Performance
- The best model for predicting MMLU benchmark scores is {best_model} with an R² of {r2:.4f}.
- After tuning, the model achieved an R² of {tuned_r2:.4f}.

## Key Factors Influencing LLM Performance
Based on feature importance analysis, the top factors that influence MMLU benchmark scores are:
{top_features}

## Provider Comparison
- The providers with the highest average MMLU scores are: {top_providers_mmlu}
- The providers with the highest average Chatbot Arena scores are: {top_providers_arena}

## Interesting Findings
- There is a {correlation_compute_mmlu:.2f} correlation between compute power and MMLU scores.
- Open-source models show {open_source_diff:.2f} difference in average performance compared to closed-source models.
- Models with larger context windows tend to {context_window_impact} in benchmark scores.

## Recommendations
1. Focus on {recommendation1} to maximize benchmark performance.
2. The trade-off between {recommendation2} should be carefully considered when developing new models.
3. For cost-efficient performance, prioritize {recommendation3}.
"""

# Fill in the insights template with actual values

# Providers ranked by their mean score on each benchmark (best first)
provider_mmlu_avg = df.groupby('Provider')['Benchmark (MMLU)'].mean().sort_values(ascending=False)
provider_arena_avg = df.groupby('Provider')['Benchmark (Chatbot Arena)'].mean().sort_values(ascending=False)

top_providers_mmlu = ", ".join(provider_mmlu_avg.index[:3])
top_providers_arena = ", ".join(provider_arena_avg.index[:3])

# `correlation` is the feature/benchmark correlation matrix computed during
# data exploration
correlation_compute_mmlu = correlation.loc['Compute Power', 'Benchmark (MMLU)']
# Mean MMLU of rows flagged Open-Source == 1 minus the mean of rows flagged 0
open_source_diff = df[df['Open-Source'] == 1]['Benchmark (MMLU)'].mean() - df[df['Open-Source'] == 0]['Benchmark (MMLU)'].mean()

if correlation.loc['Context Window', 'Benchmark (MMLU)'] > 0:
    context_window_impact = "show improvement"
else:
    context_window_impact = "do not necessarily show improvement"

if mmlu_importance is not None:
    top_features = "\n".join([f"- {row['Feature']}: {row['Importance']:.4f}" for _, row in mmlu_importance.head(5).iterrows()])
else:
    top_features = "Feature importance analysis not available for the selected model."

# Determine recommendations based on analysis: the first recommendation
# depends on which feature appears among the five most important ones
if mmlu_importance is not None and 'Training Dataset Size' in mmlu_importance['Feature'].values[:5]:
    recommendation1 = "increasing training dataset size"
elif mmlu_importance is not None and 'Compute Power' in mmlu_importance['Feature'].values[:5]:
    recommendation1 = "allocating more compute resources"
else:
    recommendation1 = "optimizing model architecture"

recommendation2 = "inference speed and model accuracy"
recommendation3 = "models with balanced compute efficiency and training dataset size"

filled_insights = insights.format(
    best_model=best_mmlu_model_name,
    r2=best_mmlu_model[1]['r2'],
    tuned_r2=tuned_r2,
    top_features=top_features,
    top_providers_mmlu=top_providers_mmlu,
    top_providers_arena=top_providers_arena,
    correlation_compute_mmlu=correlation_compute_mmlu,
    open_source_diff=open_source_diff,
    context_window_impact=context_window_impact,
    recommendation1=recommendation1,
    recommendation2=recommendation2,
    recommendation3=recommendation3
)

# The report contains non-ASCII text ("R²"), so pin the encoding instead of
# relying on the platform default.
with open('llm_performance_insights.md', 'w', encoding='utf-8') as f:
    f.write(filled_insights)

print("\nProject completed successfully! Results and insights saved to files.")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Read the two encoded variants of the dataset: standardized and scaled
df_encoded = pd.read_csv('llm_comparison_standardized_encoded.csv')
df_scaled = pd.read_csv('llm_comparison_scaled_encoded.csv')

# Give both frames a readable "<Model> (<Provider>)" identifier column
for _frame in (df_encoded, df_scaled):
    _frame['Model_ID'] = _frame['Model'] + ' (' + _frame['Provider'] + ')'

# Report the size of each dataset
print(f"Encoded dataset shape: {df_encoded.shape}")
print(f"Scaled dataset shape: {df_scaled.shape}")

# Descriptive statistics of the standardized frame
encoded_summary = df_encoded.describe()

# Histograms (with KDE) of the key metrics, one per panel on a 3x3 grid
features = ['Context Window', 'Speed (tokens/sec)', 'Latency (sec)',
            'Benchmark (MMLU)', 'Benchmark (Chatbot Arena)',
            'Price / Million Tokens', 'Training Dataset Size',
            'Compute Power', 'Energy Efficiency']

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(18, 15))
axes = axes.flatten()

for i, feature in enumerate(features):
    panel = axes[i]
    sns.histplot(df_encoded[feature], ax=panel, kde=True)
    panel.set(title=f'Distribution of {feature}', xlabel=feature)

fig.tight_layout()
fig.savefig('feature_distributions.png')

# Pairwise correlations between the same metrics.
# NOTE(review): the prediction cell also saves 'correlation_matrix.png'; this
# file replaces it - confirm that is intended.
correlation_matrix = df_encoded[features].corr()
corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=corr_ax)
corr_ax.set_title('Correlation Matrix of LLM Features')
corr_fig.tight_layout()
corr_fig.savefig('correlation_matrix.png')

# MMLU score spread per provider
box_fig, box_ax = plt.subplots(figsize=(14, 8))
sns.boxplot(x='Provider', y='Benchmark (MMLU)', data=df_encoded, ax=box_ax)
box_ax.set_title('MMLU Benchmark Scores by Provider')
plt.xticks(rotation=45)
box_fig.tight_layout()
box_fig.savefig('mmlu_by_provider.png')

# Quality vs Speed rating; colour encodes provider, marker size the MMLU score
qs_fig, qs_ax = plt.subplots(figsize=(14, 10))
sns.scatterplot(data=df_encoded, x='Quality Rating', y='Speed Rating',
                hue='Provider', size='Benchmark (MMLU)', sizes=(50, 200),
                ax=qs_ax)
qs_ax.set_title('Quality vs Speed Rating by Provider')
qs_fig.tight_layout()
qs_fig.savefig('quality_vs_speed.png')

# Select features for clustering
clustering_features = [
    'Context Window', 'Speed (tokens/sec)', 'Latency (sec)',
    'Benchmark (MMLU)', 'Benchmark (Chatbot Arena)',
    'Price / Million Tokens', 'Training Dataset Size',
    'Compute Power', 'Energy Efficiency',
    'Quality Rating', 'Speed Rating', 'Price Rating'
]

# Extract features for clustering (read from the standardized encoded file)
X = df_encoded[clustering_features].values

# Handle missing values if any (NaN becomes 0)
X = np.nan_to_num(X)

# Elbow method to determine optimal number of clusters.
# KMeans needs n_clusters <= n_samples and silhouette_score needs at most
# n_samples - 1 distinct labels, so cap the search range for small datasets
# (the range is 2..9 whenever there are at least 10 rows).
inertia = []
silhouette_scores = []
max_k = min(9, len(X) - 1)
k_range = range(2, max_k + 1)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)

    # Every k in the range is >= 2, so the silhouette score is always defined.
    labels = kmeans.labels_
    silhouette_scores.append(silhouette_score(X, labels))

# Report the silhouette result, which is otherwise computed but never shown.
if silhouette_scores:
    best_silhouette_k = k_range[int(np.argmax(silhouette_scores))]
    print(f"Highest silhouette score: {max(silhouette_scores):.4f} at k={best_silhouette_k}")

# Plot elbow curve
plt.figure(figsize=(10, 6))
plt.plot(k_range, inertia, 'bo-')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia (Sum of squared distances)')
plt.title('Elbow Method for Optimal K')
plt.grid(True)
plt.savefig('elbow_method.png')

# Hierarchical clustering dendrogram (Ward linkage), leaves labelled by model
plt.figure(figsize=(16, 10))
linked = linkage(X, method='ward')
dendrogram(linked, labels=df_encoded['Model_ID'].values, orientation='top',
           leaf_font_size=8, distance_sort='descending')
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Model')
plt.ylabel('Distance')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('hierarchical_dendrogram.png')

# Number of clusters used by both algorithms (fixed at 4)
optimal_k = 4

# K-means partition
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X)

# Agglomerative (hierarchical) partition with the same number of clusters
hierarchical = AgglomerativeClustering(n_clusters=optimal_k)
hierarchical_labels = hierarchical.fit_predict(X)

# Attach both label sets to the standardized and the scaled frame.
# NOTE(review): assigning to df_scaled assumes its rows are in the same order
# as df_encoded - confirm.
for _frame in (df_encoded, df_scaled):
    _frame['KMeans_Cluster'] = kmeans_labels
    _frame['Hierarchical_Cluster'] = hierarchical_labels

# Project the clustering features onto three principal components
pca = PCA(n_components=3)
pca_result = pca.fit_transform(X)

# Component scores together with cluster labels and identifiers
pca_df = pd.DataFrame(pca_result, columns=['PC1', 'PC2', 'PC3']).assign(
    KMeans_Cluster=kmeans_labels,
    Hierarchical_Cluster=hierarchical_labels,
    Model=df_encoded['Model_ID'],
    Provider=df_encoded['Provider'],
)

# Share of variance captured by each component, and in total
explained_variance = pca.explained_variance_ratio_
print(f"PCA explained variance ratio: {explained_variance}")
print(f"Total variance explained: {sum(explained_variance):.2f}")

# First two principal components, coloured by K-means cluster
pca_fig, pca_ax = plt.subplots(figsize=(12, 10))
sns.scatterplot(x='PC1', y='PC2', hue='KMeans_Cluster',
                palette='viridis', s=100, data=pca_df, ax=pca_ax)

# Label every point with its model identifier
for model_label, pc1, pc2 in zip(pca_df['Model'], pca_df['PC1'], pca_df['PC2']):
    pca_ax.annotate(model_label, (pc1, pc2), fontsize=8, alpha=0.7)

pca_ax.set_title('PCA of LLMs with K-means Clusters')
pca_fig.tight_layout()
pca_fig.savefig('pca_kmeans_clusters.png')

# Interactive 3D view of all three components, written to an HTML file
fig = px.scatter_3d(
    pca_df,
    x='PC1', y='PC2', z='PC3',
    color='KMeans_Cluster',
    symbol='Provider',
    hover_name='Model',
    title='3D PCA with K-means Clusters',
)
axis_titles = dict(xaxis_title='PC1', yaxis_title='PC2', zaxis_title='PC3')
fig.update_layout(scene=axis_titles)
fig.write_html('3d_pca_clusters.html')

# Loadings: how strongly each clustering feature contributes to each component
pca_components = pd.DataFrame(
    pca.components_.T,
    index=clustering_features,
    columns=['PC1', 'PC2', 'PC3'],
)

load_fig, load_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(pca_components, annot=True, cmap='coolwarm', ax=load_ax)
load_ax.set_title('Feature Importance in Principal Components')
load_fig.tight_layout()
load_fig.savefig('pca_feature_importance.png')

# Calculate cluster profiles: mean of every clustering feature per K-means
# cluster (one row per cluster, ordered by cluster label)
cluster_profiles = df_encoded.groupby('KMeans_Cluster')[clustering_features].mean()

# Radar chart for cluster profiles.
# The grid is derived from `optimal_k` (two charts per row) instead of being
# fixed at 2x2, so changing the number of clusters does not break the layout.
categories = clustering_features
radar_cols = 2
radar_rows = -(-optimal_k // radar_cols)  # ceiling division
fig = make_subplots(rows=radar_rows, cols=radar_cols,
                    specs=[[{'type': 'polar'}] * radar_cols for _ in range(radar_rows)],
                    subplot_titles=[f'Cluster {i}' for i in range(optimal_k)])

for i in range(optimal_k):
    row, col = divmod(i, radar_cols)
    values = cluster_profiles.iloc[i].values.tolist()
    values.append(values[0])  # Close the loop

    cats = categories + [categories[0]]  # Close the loop

    fig.add_trace(
        go.Scatterpolar(
            r=values,
            theta=cats,
            fill='toself',
            name=f'Cluster {i}'
        ),
        row=row+1, col=col+1
    )

fig.update_layout(
    height=400 * radar_rows,  # 800 for the default two rows
    width=1000,
    title_text='Cluster Profiles'
)
fig.write_html('cluster_profiles_radar.html')

# Bar chart comparing key features across clusters.
# `DataFrame.plot` opens its own figure when no `ax` is passed, so no
# `plt.figure(...)` call precedes it (that only left an empty extra figure).
cluster_profiles[['Quality Rating', 'Speed Rating', 'Price Rating',
                  'Benchmark (MMLU)', 'Benchmark (Chatbot Arena)']].plot(
    kind='bar', figsize=(15, 8))
plt.title('Key Performance Metrics by Cluster')
plt.xlabel('Cluster')
plt.ylabel('Average Value (Standardized)')
plt.xticks(rotation=0)
plt.tight_layout()
plt.savefig('cluster_key_metrics.png')

# Count models per provider in each cluster
provider_cluster_counts = pd.crosstab(df_encoded['Provider'], df_encoded['KMeans_Cluster'])
provider_cluster_counts.plot(kind='bar', stacked=True, figsize=(12, 8))
plt.title('Distribution of Providers Across Clusters')
plt.xlabel('Provider')
plt.ylabel('Number of Models')
plt.tight_layout()
plt.savefig('provider_cluster_distribution.png')

# Per-cluster report: member models followed by descriptive statistics
for cluster in range(optimal_k):
    members = df_encoded[df_encoded['KMeans_Cluster'] == cluster]

    print(f"\n=== Cluster {cluster} ===")
    cluster_models = members['Model_ID'].values
    print(f"Models in this cluster: {cluster_models}")

    print("\nCluster Statistics:")
    cluster_stats = members[clustering_features].describe()
    print(cluster_stats)

# For each criterion, the row holding the highest value within every cluster
_by_cluster = df_encoded.groupby('KMeans_Cluster')
best_mmlu = df_encoded.loc[_by_cluster['Benchmark (MMLU)'].idxmax()]
best_arena = df_encoded.loc[_by_cluster['Benchmark (Chatbot Arena)'].idxmax()]
best_quality = df_encoded.loc[_by_cluster['Quality Rating'].idxmax()]
best_efficiency = df_encoded.loc[_by_cluster['Energy Efficiency'].idxmax()]

print("\n=== Best-in-class Models by Cluster ===")

# (heading, winning rows, metric column, label printed before the value)
best_in_class = (
    ("\nBest MMLU Score:", best_mmlu, 'Benchmark (MMLU)', 'Score'),
    ("\nBest Chatbot Arena Score:", best_arena, 'Benchmark (Chatbot Arena)', 'Score'),
    ("\nBest Quality Rating:", best_quality, 'Quality Rating', 'Rating'),
    ("\nBest Energy Efficiency:", best_efficiency, 'Energy Efficiency', 'Efficiency'),
)

for heading, winners, metric, value_label in best_in_class:
    print(heading)
    for _, row in winners.iterrows():
        print(f"Cluster {row['KMeans_Cluster']}: {row['Model_ID']} ({value_label}: {row[metric]})")

