In [1]:
# setup
import os
import re
import sys
import typing

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from scipy import stats

In [2]:
import glob
import json
import os
from tqdm import tqdm

def fix_model_name_formatting(folder_path):
    """
    Fix the formatting in RF model JSON files by adding a comma between min_samples_leaf and bootstrap.

    Every ``*RF*.json`` file below ``folder_path`` (searched recursively) is loaded. When its
    ``model_name`` mentions ``RF``, ``min_samples_leaf`` and ``bootstrap``, and ``bootstrap=`` is
    separated from the previous parameter by whitespace only, the missing comma is inserted and
    the file is rewritten in place with ``indent=4``. Names that already contain the comma are
    left untouched, so running the function more than once is safe. A summary is printed at the
    end.

    Args:
        folder_path (str): Path to the folder containing JSON files

    Returns:
        None. Files that cannot be read, parsed or written are reported and counted as errors.
    """
    # Find all RF JSON files
    json_files = glob.glob(os.path.join(folder_path, "**/*RF*.json"), recursive=True)

    # Whitespace in front of "bootstrap=" that does not follow a comma marks the missing
    # separator; ", bootstrap=" and ",bootstrap=" never match, which avoids a double comma.
    missing_comma = re.compile(r'(?<![,\s])\s+bootstrap=')

    fixed_count = 0
    error_count = 0

    for file_path in tqdm(json_files, desc="Processing files"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            model_name = data.get('model_name') if isinstance(data, dict) else None
            if not isinstance(model_name, str):
                continue
            if not all(token in model_name for token in ('RF', 'min_samples_leaf', 'bootstrap')):
                continue

            fixed_name = missing_comma.sub(', bootstrap=', model_name)
            if fixed_name == model_name:
                # Already well-formed: do not touch the file.
                continue

            data['model_name'] = fixed_name

            # Serialise before opening for writing so a failure cannot leave a truncated file.
            serialized = json.dumps(data, indent=4)
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(serialized)

            fixed_count += 1

        except (OSError, ValueError) as e:
            # Best effort per file: report and keep going (JSON and decoding errors are ValueErrors).
            print(f"\nError processing {file_path}: {str(e)}")
            error_count += 1

    # Print summary
    print(f"\nProcessing complete:")
    print(f"Total files processed: {len(json_files)}")
    print(f"Files fixed: {fixed_count}")
    print(f"Errors encountered: {error_count}")


In [None]:

# Usage: repair the RF model_name strings under the results folder.
# Note that fix_model_name_formatting rewrites the matching JSON files on disk.
folder_path = "test_output_results"  # Replace with your folder path
fix_model_name_formatting(folder_path)

In [None]:
import glob
import json
import os
import pandas as pd

# Example of the frame returned by read_json_files_to_df (JSON keys plus the added 'dataset' column):
#    accuracy  precision_weighted  recall_weighted  f1_weighted  accuracy_std  precision_weighted_std  recall_weighted_std  f1_weighted_std  model_name               dataset
# 0  0.900880         0.938316        0.900880     0.908401      0.021403             0.007927           0.021403         0.018926        RF (n_estimators=20...)  amazon_reviews
# 1  0.915234         0.945123        0.915234     0.925678      0.019876             0.008234           0.019876         0.017654        RF (n_estimators=30...)  amazon_reviews

def read_json_files_to_df(folder_path, dataset_name, model_prefix):
    """
    Load every ``<model_prefix>*.json`` file in ``folder_path`` into one DataFrame.

    Each file is expected to hold a single JSON object; its keys become columns and a
    ``dataset`` column holding ``dataset_name`` is added to every row. Files are read in sorted
    path order so the row order is reproducible between runs (``glob`` alone returns files in
    arbitrary order).

    Args:
        folder_path (str): Folder that directly contains the JSON files (not searched recursively).
        dataset_name (str): Label stored in the ``dataset`` column.
        model_prefix (str): File-name prefix selecting the model, e.g. ``'RF'``.

    Returns:
        pd.DataFrame: One row per file; empty when no file matches.
    """
    pattern = os.path.join(folder_path, f'{model_prefix}*.json')

    records = []
    for path in sorted(glob.glob(pattern)):
        with open(path, 'r', encoding='utf-8') as f:
            record = json.load(f)
        record['dataset'] = dataset_name
        records.append(record)

    # Build the frame once from the list of dicts instead of growing it row by row.
    return pd.DataFrame(records)
# (sub-folder of the results directory, label stored in the 'dataset' column).
# The traffic results live in 'traffic_prediction' but are labelled 'traffic_situation'.
DATASET_SOURCES = [
    ('amazon_reviews', 'amazon_reviews'),
    ('congressional_voting', 'congressional_voting'),
    ('traffic_prediction', 'traffic_situation'),
    ('wine_reviews', 'wine_reviews'),
]


def load_model_results(model_prefix, results_root='test_output_results'):
    """
    Read the result files of one model family for every dataset in DATASET_SOURCES.

    Args:
        model_prefix (str): File-name prefix of the model family ('SVM', 'RF' or 'KNN').
        results_root (str): Folder that contains one sub-folder per dataset.

    Returns:
        pd.DataFrame: All rows stacked with a fresh 0..n-1 index. A unique index matters because
        the parameter columns are attached column-wise later on.
    """
    frames = [
        read_json_files_to_df(os.path.join(results_root, folder), label, model_prefix)
        for folder, label in DATASET_SOURCES
    ]
    return pd.concat(frames, ignore_index=True)


svm_data = load_model_results('SVM')
rf_data = load_model_results('RF')
knn_data = load_model_results('KNN')

# Combined frame in the same order as before: RF, then KNN, then SVM.
all_model_data = pd.concat([rf_data, knn_data, svm_data], ignore_index=True)


In [None]:
# {
#     "accuracy": 0.9008809429812192,
#     "precision_weighted": 0.9383161061842419,
#     "recall_weighted": 0.9008809429812192,
#     "f1_weighted": 0.9084013514080975,
#     "accuracy_std": 0.021402960283051446,
#     "precision_weighted_std": 0.007926814601418997,
#     "recall_weighted_std": 0.021402960283051446,
#     "f1_weighted_std": 0.01892636657376924,
#     "model_name": "RF (n_estimators=20, max_depth=5, min_samples_split=2, min_samples_leaf=2 bootstrap=False,max_features=sqrt,criterions=entropy,class_weights=balanced_subsample)"
# }
def parse_model_parameters(df):
    """
    Parse model parameters from the 'model_name' column and create new columns for each parameter.

    A model name looks like ``"RF (n_estimators=20, max_depth=5, ...)"``. The text in front of the
    opening parenthesis gives ``model_type``; every ``key=value`` pair inside the parentheses
    becomes a column. Pairs may be separated by commas or, as in the malformed RF names where the
    comma before ``bootstrap=`` is missing, by whitespace alone.

    Values are converted to ``bool`` (``true``/``false``, case-insensitive), ``int`` or ``float``
    (including negative and scientific notation); anything else stays a string.

    The function is safe to call on a frame that was already parsed: columns it produces replace
    existing columns of the same name instead of being duplicated. The input frame is not
    modified and its index (even a non-unique one) is preserved.

    Args:
        df (pd.DataFrame): DataFrame containing 'model_name' column

    Returns:
        pd.DataFrame: DataFrame with additional columns for each parameter
    """
    int_pattern = re.compile(r'[+-]?\d+')
    float_pattern = re.compile(r'[+-]?(?:\d+\.\d*|\.\d+|\d+)(?:[eE][+-]?\d+)?')
    # A separator is a comma, or whitespace that is directly followed by the next "key=".
    separator = re.compile(r',|\s+(?=\w+\s*=)')

    def coerce(value):
        """Convert a raw parameter string to bool / int / float when it clearly is one."""
        lowered = value.lower()
        if lowered == 'true':
            return True
        if lowered == 'false':
            return False
        if int_pattern.fullmatch(value):
            return int(value)
        if float_pattern.fullmatch(value):
            return float(value)
        return value

    def extract_params(model_name):
        """Return {'model_type': ..., <param>: <value>, ...} for one model name."""
        if not isinstance(model_name, str):
            return {'model_type': None}

        open_idx = model_name.find('(')
        if open_idx == -1:
            head, params_str = model_name, ''
        else:
            close_idx = model_name.rfind(')')
            end = close_idx if close_idx > open_idx else len(model_name)
            head, params_str = model_name[:open_idx], model_name[open_idx + 1:end]

        head_tokens = head.split()
        params_dict = {'model_type': head_tokens[0] if head_tokens else None}

        for chunk in separator.split(params_str):
            chunk = chunk.strip()
            if '=' not in chunk:
                continue
            # Split on the first '=' only so a value may itself contain '='.
            key, value = chunk.split('=', 1)
            params_dict[key.strip()] = coerce(value.strip())

        return params_dict

    params_df = pd.DataFrame([extract_params(name) for name in df['model_name']])

    # Drop stale copies of the parameter columns so repeated calls do not create duplicate labels.
    base = df.drop(columns=[col for col in params_df.columns if col in df.columns])

    # Combine by position, then restore the caller's index: aligning on labels would fail or
    # mis-pair rows when the index contains duplicates (e.g. after pd.concat of several frames).
    result_df = pd.concat([base.reset_index(drop=True), params_df], axis=1)
    result_df.index = df.index

    return result_df
# Expand the 'model_name' parameters into columns for each per-model frame and for the combined
# frame. Each name is rebound to the parsed result, so from here on these frames carry the
# parameter columns.
svm_data = parse_model_parameters(svm_data)
rf_data = parse_model_parameters(rf_data)
knn_data = parse_model_parameters(knn_data)
all_model_data = parse_model_parameters(all_model_data)
# Usage example:
# df_with_params = parse_model_parameters(your_dataframe)

# Example output columns for RF models:
# Original columns + [
#     'model_type',
#     'n_estimators',
#     'max_depth',
#     'min_samples_split',
#     'min_samples_leaf',
#     'bootstrap',
#     'max_features',
#     'criterions',
#     'class_weights'
# ]


In [None]:
# The example queries below compare model types, so they need the combined frame rather than the
# SVM-only one. all_model_data normally already carries the parsed parameter columns; parsing it
# a second time would only duplicate them, so parse only when they are missing.
if 'model_type' in all_model_data.columns:
    parsed_df = all_model_data.copy()
else:
    parsed_df = parse_model_parameters(all_model_data)

# View the new columns
print(parsed_df.columns)

# Example queries:
# Get all RF models with max_depth=5
rf_depth_5 = parsed_df[
    (parsed_df['model_type'] == 'RF') &
    (parsed_df['max_depth'] == 5)
]

# Get average accuracy grouped by model_type
avg_by_model = parsed_df.groupby('model_type')['accuracy'].mean()

# Get best performing configuration for each model type.
# Keep the whole top-accuracy row per model type: groupby(...).first() would instead take the
# first non-null value of every column separately and could mix values from different rows.
best_configs = (
    parsed_df
    .sort_values('accuracy', ascending=False)
    .drop_duplicates(subset='model_type')
    .set_index('model_type')
    .sort_index()
)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Hyper-parameters analysed for each model type, keyed by the 'model_type' value.
# Read as a module-level name by print_best_configurations, analyze_parameter_interactions
# and analyze_parameter_significance.
parameter_maps = {
    'RF': ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf'],
    'SVM': ['C', 'kernel', 'gamma'],
    'KNN': ['n_neighbors', 'weights', 'metric']
}
def analyze_parameter_impact(df, metric='accuracy', figsize=(15, 10), focus_params=None, plot_type='box'):
    """
    Analyze how a specific metric changes with parameter variations for each model type and dataset.

    For every (model type, dataset) pair one figure is drawn with one panel per parameter, and a
    mean/std/count table per parameter value is printed.

    Args:
        df (pd.DataFrame): DataFrame containing the model results and parameters; must have
            'model_type' and 'dataset' columns.
        metric (str): Metric to analyze (e.g., 'accuracy', 'f1_weighted', etc.)
        figsize (tuple): Figure size for plots
        focus_params (list[str] | None): Parameters to analyze. When None, the entries of the
            module-level ``parameter_maps`` for the model type are used.
        plot_type (str): 'box' draws box plots with the mean overlaid; 'line' draws the mean per
            parameter value with standard-deviation error bars.

    Raises:
        ValueError: If ``plot_type`` is neither 'box' nor 'line'.
    """
    if plot_type not in ('box', 'line'):
        raise ValueError(f"plot_type must be 'box' or 'line', got {plot_type!r}")

    # Get unique model types and datasets
    model_types = df['model_type'].unique()
    datasets = df['dataset'].unique()

    for model in model_types:
        print(f"\n{'='*50}")
        print(f"Analysis for {model} models")
        print(f"{'='*50}")

        model_df = df[df['model_type'] == model]
        candidates = parameter_maps.get(model, []) if focus_params is None else list(focus_params)
        parameters = [param for param in candidates if param in model_df.columns]

        for dataset in datasets:
            print(f"\nDataset: {dataset}")
            dataset_df = model_df[model_df['dataset'] == dataset]

            if dataset_df.empty:
                print("No data available for this combination")
                continue
            if not parameters:
                # Without this guard plt.subplots would be asked for zero rows and raise.
                print("No parameters available to analyze")
                continue

            # Create subplots for each parameter
            fig, axes = plt.subplots(nrows=(len(parameters) + 1) // 2,
                                     ncols=2,
                                     figsize=figsize,
                                     squeeze=False)
            fig.suptitle(f'{model} - {dataset}: Parameter Impact on {metric}')
            axes = axes.flatten()

            for ax, param in zip(axes, parameters):
                summary = dataset_df.groupby(param)[metric].agg(['mean', 'std', 'count'])
                if summary.empty:
                    # The parameter has no values for this dataset: nothing to draw.
                    fig.delaxes(ax)
                    continue

                # Use the groupby order for the categories too, so that the mean line sits on
                # top of the matching boxes (seaborn's default order can differ for strings).
                order = list(summary.index)
                positions = list(range(len(order)))

                if plot_type == 'box':
                    sns.boxplot(data=dataset_df, x=param, y=metric, order=order, ax=ax)
                    ax.plot(positions, summary['mean'].values, 'r-', label='Mean')
                else:
                    ax.errorbar(positions,
                                summary['mean'].values,
                                yerr=summary['std'].fillna(0).values,
                                fmt='ro-',
                                capsize=3,
                                label='Mean')
                    ax.set_xticks(positions)
                    ax.set_xticklabels([str(value) for value in order])

                ax.set_title(f'Impact of {param}')
                ax.set_xlabel(param)
                ax.set_ylabel(metric)
                ax.legend()

                # Print statistical summary
                print(f"\nParameter: {param}")
                print(summary)

            # Remove empty subplots
            for ax in axes[len(parameters):]:
                fig.delaxes(ax)

            plt.tight_layout()
            plt.show()

def print_best_configurations(df, metric='accuracy'):
    """
    Print the best configurations for each model type and dataset combination.

    For every model type and every dataset that model was run on, the row with the largest
    ``metric`` is selected and its metric value plus the parameters listed for that model type
    in the module-level ``parameter_maps`` are printed. Model types without an entry in
    ``parameter_maps`` are still reported, just without a parameter list.

    Args:
        df (pd.DataFrame): Results with 'model_type', 'dataset', ``metric`` and parameter columns.
        metric (str): Column used to rank configurations.
    """
    for model in df['model_type'].unique():
        print(f"\n{'='*50}")
        print(f"Best configurations for {model}")
        print(f"{'='*50}")

        model_df = df[df['model_type'] == model]
        # .get keeps unknown model types from raising KeyError, matching analyze_parameter_impact.
        parameters = parameter_maps.get(model, [])

        for dataset in model_df['dataset'].unique():
            print(f"\nDataset: {dataset}")
            dataset_df = model_df[model_df['dataset'] == dataset]

            # Get best configuration
            top_rows = dataset_df.nlargest(1, metric)
            if top_rows.empty:
                print(f"No rows with a valid {metric}")
                continue

            best_config = top_rows.iloc[0]
            print(f"Best {metric}: {best_config[metric]:.4f}")
            print("Parameters:")
            for param in parameters:
                if param in best_config:
                    print(f"- {param}: {best_config[param]}")

# Usage example
# Draw the per-parameter plots and print the best configuration per model type and dataset
# for the frame held in 'parsed_df' (expected to contain the parsed parameter columns).
analyze_parameter_impact(parsed_df, metric='accuracy')
print_best_configurations(parsed_df, metric='accuracy')

# You can also analyze other metrics
# analyze_parameter_impact(parsed_df, metric='f1_weighted')

In [None]:
def analyze_parameter_correlations(df, metric='accuracy', params=None):
    """
    Analyze correlations between parameters and metrics.

    One correlation heatmap is drawn per (model type, dataset) pair.

    Args:
        df (pd.DataFrame): Results with 'model_type', 'dataset', ``metric`` and parameter columns.
        metric (str): Metric column to correlate the parameters with.
        params (list[str] | None): Columns to include. When None, every numeric column of the
            model's rows is used. Values that cannot be read as numbers are treated as missing.
    """
    for model in df['model_type'].unique():
        model_df = df[df['model_type'] == model]

        if params is None:
            candidates = list(model_df.select_dtypes(include='number').columns)
        else:
            candidates = [col for col in params if col in model_df.columns]

        # The metric is itself numeric; list it exactly once (last) instead of selecting it twice.
        columns = [col for col in dict.fromkeys(candidates) if col != metric] + [metric]

        for dataset in model_df['dataset'].unique():
            dataset_df = model_df[model_df['dataset'] == dataset]

            # Coerce to numbers and drop columns that hold no values for this model/dataset
            # (e.g. parameters that belong to another model type).
            numeric_df = (
                dataset_df[columns]
                .apply(pd.to_numeric, errors='coerce')
                .dropna(axis=1, how='all')
            )
            if metric not in numeric_df.columns or numeric_df.shape[1] < 2:
                print(f"Skipping {model} - {dataset}: no numeric parameters to correlate with {metric}")
                continue

            # Create correlation matrix
            corr_matrix = numeric_df.corr()

            # Plot correlation heatmap
            plt.figure(figsize=(10, 8))
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
            plt.title(f'Parameter Correlations - {model} - {dataset}')
            plt.tight_layout()
            plt.show()
def analyze_parameter_interactions(df, metric='accuracy', param1=None, param2=None):
    """
    Analyze interactions between two parameters.

    One scatter plot is drawn per (model type, dataset) pair with ``param1`` on the x axis,
    ``param2`` on the y axis and ``metric`` encoded as colour and marker size.

    Args:
        df (pd.DataFrame): Results with 'model_type', 'dataset', ``metric`` and parameter columns.
        metric (str): Metric column shown as hue/size.
        param1 (str | None): x-axis parameter.
        param2 (str | None): y-axis parameter. When either parameter is None, the first two
            entries of ``parameter_maps`` for each model type are used instead.
    """
    for model in df['model_type'].unique():
        model_df = df[df['model_type'] == model]

        # Resolve the pair per model into local names. Assigning the defaults back to
        # param1/param2 would leak the first model's parameters into every later model.
        if param1 is None or param2 is None:
            defaults = parameter_maps.get(model, [])
            if len(defaults) < 2:
                print(f"Skipping {model}: fewer than two default parameters are defined")
                continue
            x_param, y_param = defaults[0], defaults[1]
        else:
            x_param, y_param = param1, param2

        missing = [param for param in (x_param, y_param) if param not in model_df.columns]
        if missing:
            print(f"Skipping {model}: missing columns {missing}")
            continue

        for dataset in model_df['dataset'].unique():
            dataset_df = model_df[model_df['dataset'] == dataset]

            # Create interaction plot
            plt.figure(figsize=(12, 8))
            sns.scatterplot(data=dataset_df, x=x_param, y=y_param, hue=metric, size=metric)
            plt.title(f'Parameter Interaction - {model} - {dataset}')
            plt.tight_layout()
            plt.show()

In [None]:
from scipy import stats

def analyze_parameter_significance(df, metric='accuracy', params=None):
    """
    Perform statistical tests to determine parameter significance.

    For every (model type, dataset) pair and every parameter, a one-way ANOVA is run on the
    ``metric`` values grouped by parameter value, and the F statistic, p-value and a verdict at
    the 0.05 level are printed.

    Args:
        df (pd.DataFrame): Results with 'model_type', 'dataset', ``metric`` and parameter columns.
        metric (str): Metric column to test.
        params (list[str] | None): Parameters to test. When None, the entries of the module-level
            ``parameter_maps`` for each model type are used.
    """
    for model in df['model_type'].unique():
        print(f"\nStatistical Analysis for {model}")
        model_df = df[df['model_type'] == model]
        parameters = parameter_maps.get(model, []) if params is None else list(params)

        for dataset in model_df['dataset'].unique():
            print(f"\nDataset: {dataset}")
            dataset_df = model_df[model_df['dataset'] == dataset]

            for param in parameters:
                if param not in dataset_df.columns:
                    continue

                print(f"\nParameter: {param}")

                groups = [group[metric].dropna().values for _, group in dataset_df.groupby(param)]
                groups = [values for values in groups if len(values) > 0]
                if len(groups) < 2:
                    # ANOVA needs at least two groups; f_oneway raises otherwise.
                    print("Skipped: fewer than two parameter values with data")
                    continue

                # Perform one-way ANOVA
                f_stat, p_val = stats.f_oneway(*groups)

                print(f"F-statistic: {f_stat:.4f}")
                print(f"p-value: {p_val:.4f}")
                print("Significant" if p_val < 0.05 else "Not significant")

# Usage: run the per-parameter ANOVA on the frame held in 'parsed_df' (default metric: accuracy).
analyze_parameter_significance(parsed_df)

# ANALYSIS

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import friedmanchisquare
import warnings
warnings.filterwarnings('ignore')

def analyze_metric_comparison(df, metrics, test='wilcoxon'):
    """
    Compare different distance metrics for kNN.

    Draws a box plot of accuracy per distance metric and prints the requested test results.
    Both supported tests are paired tests: the i-th accuracy of one metric is compared with the
    i-th accuracy of the other, in row order. Comparisons whose groups differ in size (or are
    empty) cannot be paired and are reported as skipped instead of raising.

    Args:
        df: DataFrame containing kNN results (needs 'metric' and 'accuracy' columns)
        metrics: List of distance metrics to compare
        test: Statistical test to use ('wilcoxon' or 'friedman')
    """
    plt.figure(figsize=(12, 6))

    # Create box plots for each metric
    sns.boxplot(data=df[df['metric'].isin(metrics)],
                x='metric',
                y='accuracy',
                palette='Set3')

    plt.title('Comparison of Distance Metrics in kNN')
    plt.xlabel('Distance Metric')
    plt.ylabel('Accuracy')

    # Statistical tests
    print("\nStatistical Analysis of Distance Metrics")
    print("="*50)

    # Prepare data for statistical testing
    metric_groups = [df[df['metric'] == m]['accuracy'].values for m in metrics]

    if test == 'wilcoxon':
        # Pairwise Wilcoxon tests
        for i in range(len(metrics)):
            for j in range(i+1, len(metrics)):
                print(f"\n{metrics[i]} vs {metrics[j]}:")
                first, second = metric_groups[i], metric_groups[j]
                if len(first) == 0 or len(first) != len(second):
                    print(f"Skipped: paired test needs equally sized, non-empty groups "
                          f"(got {len(first)} and {len(second)})")
                    continue
                try:
                    stat, p_val = stats.wilcoxon(first, second)
                except ValueError as exc:
                    # e.g. every paired difference is zero
                    print(f"Skipped: {exc}")
                    continue
                print(f"Wilcoxon statistic: {stat:.4f}")
                print(f"p-value: {p_val:.4f}")

    elif test == 'friedman':
        # Friedman test for all metrics
        print("\nFriedman Test Results:")
        sizes = {len(values) for values in metric_groups}
        if len(metric_groups) < 3 or len(sizes) != 1 or 0 in sizes:
            print("Skipped: the Friedman test needs at least three equally sized, non-empty groups")
        else:
            stat, p_val = friedmanchisquare(*metric_groups)
            print(f"Statistic: {stat:.4f}")
            print(f"p-value: {p_val:.4f}")

    plt.show()

def analyze_kernel_performance(df, kernels, metric='accuracy'):
    """
    Plot and summarise how the chosen SVM kernels score on a metric.

    A two-panel figure (box plot left, violin plot right) is shown first; afterwards the
    per-kernel mean/std/count table and a one-way ANOVA across the kernels are printed.

    Args:
        df: DataFrame containing SVM results
        kernels: List of kernels to compare
        metric: Performance metric to analyze
    """
    selected = df[df['kernel'].isin(kernels)]

    fig, (box_ax, violin_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: box plots for kernel comparison
    sns.boxplot(data=selected, x='kernel', y=metric, palette='viridis', ax=box_ax)
    box_ax.set_title('Kernel Performance Comparison')

    # Right panel: violin plots for distribution visualization
    sns.violinplot(data=selected, x='kernel', y=metric, palette='viridis', ax=violin_ax)
    violin_ax.set_title('Kernel Performance Distribution')

    plt.tight_layout()
    plt.show()

    print("\nKernel Performance Statistics")
    print("="*50)

    per_kernel = selected.groupby('kernel')[metric].agg(['mean', 'std', 'count'])
    print("\nSummary Statistics:")
    print(per_kernel)

    # One group of metric values per requested kernel, in the order given by the caller
    samples = [df.loc[df['kernel'] == kernel, metric].values for kernel in kernels]
    f_stat, p_val = stats.f_oneway(*samples)
    print("\nANOVA Test Results:")
    print(f"F-statistic: {f_stat:.4f}")
    print(f"p-value: {p_val:.4f}")

def analyze_parameter_scale(df, param, scale='log', metric='accuracy'):
    """
    Analyze parameter performance across different scales.

    Shows a scatter plot with a linear trend line of ``metric`` against the parameter, annotated
    with their correlation, followed by a histogram of the parameter values. With
    ``scale='log'`` the parameter is replaced by ``log10(param)`` in the trend line, the
    correlation and the histogram, so all three describe the same quantity.

    Values that are not numeric are ignored; with ``scale='log'`` non-positive values are
    ignored as well because they have no logarithm.

    Args:
        df: DataFrame containing model results
        param: Parameter to analyze
        scale: Scale type ('log' or 'linear')
        metric: Performance metric to analyze
    """
    data = pd.DataFrame({
        'x': pd.to_numeric(df[param], errors='coerce'),
        'y': pd.to_numeric(df[metric], errors='coerce'),
    }).dropna()

    # Transform parameter values based on scale
    if scale == 'log':
        data = data[data['x'] > 0]
        data = data.assign(x=np.log10(data['x']))
        axis_label = f'log10({param})'
    else:
        axis_label = f'{param} value ({scale} scale)'

    if data.empty:
        print(f"No numeric values of {param} available for a {scale}-scale analysis")
        return

    plt.figure(figsize=(12, 6))

    # Scatter plot with trend line
    sns.regplot(x=data['x'],
                y=data['y'],
                scatter_kws={'alpha': 0.5},
                line_kws={'color': 'red'})

    plt.title(f'{param} vs {metric} ({scale} scale)')
    plt.xlabel(axis_label)
    plt.ylabel(metric)

    # Add correlation analysis
    correlation = data['x'].corr(data['y'])
    plt.text(0.05, 0.95, f'Correlation: {correlation:.4f}',
             transform=plt.gca().transAxes)

    plt.show()

    # Parameter value distribution
    plt.figure(figsize=(8, 4))
    sns.histplot(data['x'], bins=30)
    plt.title(f'Distribution of {param} values')
    plt.xlabel(axis_label)
    plt.show()

def analyze_model_comparison(df, models, metrics, statistical_test='friedman'):
    """
    Compare performance across different models.

    Draws one violin plot per metric and prints, per metric, the mean/std/count per model type
    and (for ``statistical_test='friedman'``) a Friedman test across the models.

    The Friedman test needs matched observations. When every model has the same number of rows,
    the rows are matched by position. Otherwise, if the frame has a 'dataset' column, the
    per-dataset mean of each model is used, one block per dataset. If neither is possible the
    test is skipped with a message instead of raising.

    Args:
        df: DataFrame containing all model results
        models: List of models to compare
        metrics: List of metrics to analyze
        statistical_test: Type of statistical test to perform
    """
    subset = df[df['model_type'].isin(models)]

    n_metrics = len(metrics)
    plt.figure(figsize=(15, 5*n_metrics))

    for i, metric in enumerate(metrics):
        plt.subplot(n_metrics, 1, i+1)

        # Create violin plots for model comparison
        sns.violinplot(data=subset,
                       x='model_type',
                       y=metric,
                       palette='deep')

        plt.title(f'Model Comparison - {metric}')
        plt.xlabel('Model Type')
        plt.ylabel(metric)

    plt.tight_layout()
    plt.show()

    # Statistical analysis
    print("\nModel Comparison Statistics")
    print("="*50)

    for metric in metrics:
        print(f"\nMetric: {metric}")

        # Summary statistics
        summary = subset.groupby('model_type')[metric].agg(['mean', 'std', 'count'])
        print("\nSummary Statistics:")
        print(summary)

        # Statistical test
        if statistical_test == 'friedman':
            print("\nFriedman Test Results:")

            if len(models) < 3:
                print("Skipped: the Friedman test needs at least three models")
                continue

            model_groups = [df[df['model_type'] == m][metric].values for m in models]
            sizes = {len(values) for values in model_groups}

            if len(sizes) != 1 or 0 in sizes:
                if 'dataset' not in df.columns:
                    print("Skipped: models have different numbers of results and there is no "
                          "'dataset' column to block on")
                    continue
                # Unequal group sizes: compare the per-dataset mean of each model instead.
                blocks = (
                    subset
                    .pivot_table(index='dataset', columns='model_type', values=metric, aggfunc='mean')
                    .reindex(columns=models)
                    .dropna()
                )
                if len(blocks) < 2:
                    print("Skipped: fewer than two datasets have results for every model")
                    continue
                print(f"(per-dataset means, {len(blocks)} datasets as blocks)")
                model_groups = [blocks[m].values for m in models]

            stat, p_val = friedmanchisquare(*model_groups)
            print(f"Statistic: {stat:.4f}")
            print(f"p-value: {p_val:.4f}")

def analyze_dataset_impact(df, characteristics, models):
    """
    Analyze how dataset characteristics affect model performance.

    For every characteristic one panel is drawn with, per model, a scatter of accuracy against
    the characteristic plus a regression trend line; afterwards the correlation between each
    characteristic and accuracy is printed per model. Characteristics that are not columns of
    ``df`` are reported and skipped instead of raising.

    Args:
        df: DataFrame containing all results
        characteristics: List of dataset characteristics to analyze
        models: List of models to compare
    """
    available = [char for char in characteristics if char in df.columns]
    missing = [char for char in characteristics if char not in df.columns]
    if missing:
        print(f"Skipping characteristics that are not columns of the data: {missing}")
    if not available:
        print("No dataset characteristics available to analyze")
        return

    plt.figure(figsize=(15, 5*len(available)))

    for i, char in enumerate(available):
        plt.subplot(len(available), 1, i+1)

        # Create scatter plots for each model
        for model in models:
            model_data = df[df['model_type'] == model]
            sns.scatterplot(data=model_data,
                            x=char,
                            y='accuracy',
                            label=model,
                            alpha=0.6)

            # Add trend line
            sns.regplot(data=model_data,
                        x=char,
                        y='accuracy',
                        scatter=False,
                        label=f'{model} trend')

        plt.title(f'Impact of {char} on Model Performance')
        plt.xlabel(char)
        plt.ylabel('Accuracy')
        plt.legend()

    plt.tight_layout()
    plt.show()

    # Correlation analysis
    print("\nCorrelation Analysis")
    print("="*50)

    for model in models:
        print(f"\nModel: {model}")
        model_data = df[df['model_type'] == model]

        for char in available:
            correlation = model_data[[char, 'accuracy']].corr().iloc[0,1]
            print(f"Correlation with {char}: {correlation:.4f}")

# Usage examples:
# Every call below works on all_model_data, filtered to one model type where the analysis is
# model specific.

# Compare kNN distance metrics
analyze_metric_comparison(
    all_model_data[all_model_data['model_type'] == 'KNN'],
    metrics=['euclidean', 'manhattan', 'minkowski'],
    test='wilcoxon'
)

# Compare SVM kernels
analyze_kernel_performance(
    all_model_data[all_model_data['model_type'] == 'SVM'],
    kernels=['linear', 'rbf', 'poly'],
    metric='accuracy'
)

# Analyze SVM C parameter
analyze_parameter_scale(
    all_model_data[all_model_data['model_type'] == 'SVM'],
    param='C',
    scale='log',
    metric='accuracy'
)

# Compare all models
analyze_model_comparison(
    all_model_data,
    models=['RF', 'KNN', 'SVM'],
    metrics=['accuracy', 'f1_weighted'],
    statistical_test='friedman'
)

# Analyze dataset impact
# NOTE(review): 'n_samples', 'n_features' and 'class_balance' are not created by the loading or
# parsing cells of this notebook - confirm these columns exist in all_model_data.
analyze_dataset_impact(
    all_model_data,
    characteristics=['n_samples', 'n_features', 'class_balance'],
    models=['RF', 'KNN', 'SVM']
)

# RF

In [None]:
# Key parameters to analyze for RF:
rf_params = ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf']

# Correlation Analysis
# NOTE(review): this call passes a `params` keyword - confirm that the loaded definition of
# analyze_parameter_correlations accepts it.
analyze_parameter_correlations(
    rf_data,  # DataFrame filtered for RF models
    metric='accuracy',
    params=rf_params  # Focus on these specific parameters
)

# Significance Tests for RF
# 1. ANOVA test for numerical parameters
# NOTE(review): this call passes a `params` keyword - confirm that the loaded definition of
# analyze_parameter_significance accepts it.
analyze_parameter_significance(
    rf_data, 
    params=['n_estimators', 'max_depth'],
    metric='accuracy'
)

# The two items below are planned analyses; no code in this cell implements them.
# 2. Chi-square test for categorical parameters (like criterion)
# 3. Friedman test for comparing different parameter combinations

# KNN

In [None]:
# Key parameters for kNN:
# (knn_params is defined for reference; the calls in this cell do not use it.)
knn_params = ['n_neighbors', 'weights', 'metric']

# Parameter Impact Analysis
# NOTE(review): this call passes `focus_params` and `plot_type` keywords - confirm that the
# loaded definition of analyze_parameter_impact accepts them.
analyze_parameter_impact(
    knn_data,
    metric='accuracy',
    focus_params=['n_neighbors'],  # Most important parameter
    plot_type='line'  # Better for showing k-value progression
)

# Non-parametric tests for different distance metrics
analyze_metric_comparison(
    knn_data,
    metrics=['euclidean', 'manhattan', 'minkowski'],
    test='wilcoxon'  # Paired comparison of distance metrics
)

# SVM

In [None]:
# Key parameters for SVM:
# (svm_params is defined for reference; the calls in this cell do not use it.)
svm_params = ['C', 'kernel', 'gamma']

# Kernel Performance Comparison
analyze_kernel_performance(
    svm_data,
    kernels=['rbf', 'linear', 'poly'],
    metric='accuracy'
)

# Parameter Scale Analysis
analyze_parameter_scale(
    svm_data,
    param='C',
    scale='log',  # C and gamma often need log scale analysis
    metric='accuracy'
)

# ALL MODELS

In [None]:
# Compare all models across datasets
# (These two calls repeat, with the same arguments, the last two calls of the ANALYSIS cell.)
analyze_model_comparison(
    all_model_data,
    models=['RF', 'KNN', 'SVM'],
    metrics=['accuracy', 'f1_weighted'],
    statistical_test='friedman'  # Non-parametric test for multiple models
)

# Dataset characteristic impact
# NOTE(review): 'n_samples', 'n_features' and 'class_balance' are not created by the loading or
# parsing cells of this notebook - confirm these columns exist in all_model_data.
analyze_dataset_impact(
    all_model_data,
    characteristics=['n_samples', 'n_features', 'class_balance'],
    models=['RF', 'KNN', 'SVM']
)

In [3]:
def create_advanced_parameter_plot(df, model_type, param, metric='accuracy'):
    """
    Create an advanced visualization for parameter analysis

    Draws, for one model type, a box plot of ``metric`` per parameter value, a linear trend line
    and a mean +/- one standard deviation band. All three layers are drawn against the position
    of the parameter value among its sorted distinct values (0, 1, 2, ...), which is the
    coordinate system the box plot uses, so they line up for numeric and categorical parameters
    alike.

    Args:
        df: DataFrame with 'model_type', ``param`` and ``metric`` columns
        model_type: Value of 'model_type' to plot
        param: Parameter column shown on the x axis
        metric: Metric column shown on the y axis
    """
    model_df = df[df['model_type'] == model_type]

    param_stats = model_df.groupby(param)[metric].agg(['mean', 'std'])
    if param_stats.empty:
        print(f"No {param} values available for {model_type}")
        return

    order = list(param_stats.index)
    positions = np.arange(len(order))

    fig, ax = plt.subplots(figsize=(12, 6))

    # Main parameter vs metric plot
    sns.boxplot(data=model_df,
                x=param,
                y=metric,
                order=order,
                color='lightblue',
                ax=ax)

    # Add trend line (regression of the metric on the box position); needs two or more positions
    if len(order) > 1:
        position_of = {value: position for position, value in enumerate(order)}
        sns.regplot(x=model_df[param].map(position_of),
                    y=model_df[metric],
                    scatter=False,
                    color='red',
                    ax=ax)

    # Add standard deviation bands (std is undefined for single-row groups: show no spread there)
    spread = param_stats['std'].fillna(0)
    ax.fill_between(positions,
                    (param_stats['mean'] - spread).values,
                    (param_stats['mean'] + spread).values,
                    alpha=0.2)

    ax.set_xlabel(param)
    ax.set_ylabel(metric)
    ax.set_title(f'Impact of {param} on {metric} for {model_type}')
    plt.tight_layout()
    plt.show()

# Statistical Summary Function
def print_statistical_summary(df, model_type, param, metric='accuracy'):
    """
    Print comprehensive statistical summary

    Prints the mean/std/count of ``metric`` per value of ``param`` for one model type, followed
    by a one-way ANOVA when the parameter takes more than two values, or an independent
    two-sample t-test when it takes exactly two. With fewer than two values no test is run.

    Args:
        df: DataFrame with 'model_type', ``param`` and ``metric`` columns
        model_type: Value of 'model_type' to summarise
        param: Parameter column to group by
        metric: Metric column to analyse
    """
    model_data = df[df['model_type'] == model_type]

    print(f"Statistical Summary for {model_type} - {param}")
    print("="*50)

    # Basic statistics. The table must not be called `stats`: that would shadow scipy.stats
    # inside this function and break the test calls below.
    summary = model_data.groupby(param)[metric].agg(['mean', 'std', 'count'])
    print("\nParameter-wise statistics:")
    print(summary)

    groups = [group[metric].values for _, group in model_data.groupby(param)]

    # Statistical tests
    if len(groups) > 2:
        # ANOVA for more than 2 groups
        f_stat, p_val = stats.f_oneway(*groups)
        print("\nANOVA Test:")
        print(f"F-statistic: {f_stat:.4f}")
        print(f"p-value: {p_val:.4f}")
    elif len(groups) == 2:
        # T-test for 2 groups
        group1, group2 = groups
        t_stat, p_val = stats.ttest_ind(group1, group2)
        print("\nT-test:")
        print(f"t-statistic: {t_stat:.4f}")
        print(f"p-value: {p_val:.4f}")
    else:
        print("\nNo statistical test: fewer than two parameter values with data")