In [1]:
import pandas as pd
import numpy as np

# Sports that are dropped from the analysis
excluded_sports = ['climbing', 'Fitness', 'Headis']

# Column-name prefixes to extract for every sport, including 'label'
parameters = ['drug', 'equity', 'popularity', 'normalizedcountry', 'CV', 'label']

# Read the dataset, give the sport column its short name, and keep only the
# rows whose sport is not in the exclusion list
df = (
    pd.read_csv("not_final3.csv")
    .rename(columns={'Sport_-1': 'Sport'})
    .loc[lambda frame: ~frame['Sport'].isin(excluded_sports)]
)

# Processed per-sport tables are collected here, keyed by sport name
sport_data_dict = {}

# The "<parameter>_<year>" column groups do not depend on the sport, so they are
# resolved once. Parameters without any matching column are skipped entirely.
parameter_columns = {}
for parameter in parameters:
    matching_columns = [col for col in df.columns if col.startswith(f"{parameter}_")]
    if matching_columns:
        parameter_columns[parameter] = matching_columns

for sport_name in df['Sport'].unique():
    sport_rows = df[df['Sport'] == sport_name]

    # One Series per parameter, indexed by the integer year parsed from the column name.
    # NOTE(review): assumes each sport occupies a single row, so years are unique
    # within a parameter — TODO confirm against the CSV.
    series_by_parameter = {}
    for parameter, param_columns in parameter_columns.items():
        melted_data = sport_rows[['Sport'] + param_columns].melt(
            id_vars='Sport', var_name='Year', value_name=f'{parameter}_Value'
        )
        melted_data['Year'] = (
            melted_data['Year'].str.extract(r'(\d+)', expand=False).astype(int)
        )
        series_by_parameter[parameter] = (
            melted_data.set_index('Year')[f'{parameter}_Value'].sort_index()
        )

    if series_by_parameter:
        # Outer-join all parameters on Year in one step. Aligning them one at a
        # time against a growing year list leaves earlier parameters shorter than
        # later ones whenever their year sets differ, which makes the DataFrame
        # construction fail with mismatched array lengths.
        sport_frame = pd.concat(series_by_parameter, axis=1).sort_index()
    else:
        # No parameter columns at all: keep an empty table that still has 'Year'
        sport_frame = pd.DataFrame(index=pd.Index([], dtype=int))

    # Missing-data rules, applied after alignment so that years a parameter
    # does not cover are treated the same way as explicit gaps:
    #   - drug: a missing value means 0
    #   - popularity / CV: a 0 means "missing" and becomes NaN
    #   - every other parameter keeps its NaN values untouched
    if 'drug' in sport_frame.columns:
        sport_frame['drug'] = sport_frame['drug'].fillna(0)
    for zero_means_missing in ('popularity', 'CV'):
        if zero_means_missing in sport_frame.columns:
            sport_frame[zero_means_missing] = sport_frame[zero_means_missing].replace(0, np.nan)

    # 'Year' is stored as the first regular column, followed by the parameters
    sport_data_dict[sport_name] = sport_frame.rename_axis('Year').reset_index()

# Preview the first rows of every sport's table, indexed by Year.
# The index is only set when 'Year' is still a regular column, so running this
# loop again on already-indexed tables does not raise a KeyError.
for sport in list(sport_data_dict):
    data = sport_data_dict[sport]
    if 'Year' in data.columns:
        data = data.set_index('Year')
        sport_data_dict[sport] = data  # keep the Year-indexed table for later cells

    print(f"\nData for {sport}:")
    print(data.head())



Data for Alpine Skiing:
      drug  equity  popularity  normalizedcountry  CV  label
Year                                                        
1896   0.0     NaN         NaN                NaN NaN      0
1900   0.0     NaN         NaN                NaN NaN      0
1904   0.0     NaN         NaN                NaN NaN      0
1906   0.0     NaN         NaN                NaN NaN      0
1908   0.0     NaN         NaN                NaN NaN      0

Data for Alpinism:
      drug  equity  popularity  normalizedcountry  CV  label
Year                                                        
1896   0.0     NaN         NaN                NaN NaN      0
1900   0.0     NaN         NaN                NaN NaN      0
1904   0.0     NaN         NaN                NaN NaN      0
1906   0.0     NaN         NaN                NaN NaN      0
1908   0.0     NaN         NaN                NaN NaN      0

Data for Archery:
      drug    equity  popularity  normalizedcountry        CV  label
Year         

In [2]:
import os
import pandas as pd
from prophet import Prophet
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
import warnings

# Keep the cell output readable by silencing these two warning categories
for _ignored_category in (UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Output folders for the comparison plots and the metrics file
for _output_dir in ("model_plots_comparison", "model_metrics"):
    os.makedirs(_output_dir, exist_ok=True)

# One record per (sport, parameter) pair is appended here during evaluation
metrics_data = []

def train_and_evaluate_prophet(train_df, test_df, is_first_iteration=False):
    """
    Produce a one-step-ahead prediction and pair it with the observed value.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training history with Prophet's 'ds' (timestamp) and 'y' (value) columns.
    test_df : pandas.DataFrame
        The held-out observation; only its first row is used. It must contain
        'y', and should contain 'ds' so the forecast is made for that exact date.
    is_first_iteration : bool, default False
        When True, no model is fitted and the first training value is returned
        as a constant prediction.

    Returns
    -------
    tuple
        (prediction, actual, forecast), where forecast is the DataFrame returned
        by Prophet's predict, or None on the first iteration.
    """
    actual = test_df['y'].values[0]

    if is_first_iteration:
        # Constant prediction: repeat the first training value
        return train_df['y'].iloc[0], actual, None

    # Initialize and fit the Prophet model
    model = Prophet()
    model.fit(train_df)

    if 'ds' in test_df.columns:
        # Forecast at the held-out observation's own timestamp. A single
        # freq='Y' step lands on the year-end following the last training date,
        # which is not the date of the next observation when observations are
        # more than a year apart, so prediction and actual would not refer to
        # the same point in time.
        future = pd.concat([train_df[['ds']], test_df[['ds']].iloc[:1]], ignore_index=True)
    else:
        # No timestamp supplied for the test point: fall back to one year-end step
        future = model.make_future_dataframe(periods=1, freq='Y')
    forecast = model.predict(future)

    # The last forecast row is the step after the training history
    prediction = forecast.iloc[-1]['yhat']

    return prediction, actual, forecast

# Walk-forward evaluation for every sport and parameter: each observation is
# predicted from all observations before it, then metrics and a plot are saved.
for sport, sport_frame in sport_data_dict.items():
    # The per-sport tables may carry 'Year' as their index; expose it as a column.
    # A dedicated loop variable is used so the notebook-level `df` is not overwritten.
    if 'Year' not in sport_frame.columns:
        sport_frame = sport_frame.reset_index()

    for parameter in ['drug', 'equity', 'popularity', 'normalizedcountry', 'CV']:
        if parameter not in sport_frame.columns:
            continue

        # Prepare the data in Prophet's ds / y layout, dropping missing values
        parameter_df = (
            sport_frame[['Year', parameter]]
            .dropna()
            .rename(columns={'Year': 'ds', parameter: 'y'})
        )
        parameter_df['ds'] = pd.to_datetime(parameter_df['ds'], format='%Y')

        if len(parameter_df) < 3:
            continue  # Skip if not enough data

        predictions = []
        actuals = []
        predicted_years = []

        for i in range(len(parameter_df) - 1):
            train_subset = parameter_df.iloc[:i+1]
            test_point = parameter_df.iloc[i+1:i+2]  # Next data point for validation

            try:
                pred, actual, _ = train_and_evaluate_prophet(
                    train_subset, test_point, is_first_iteration=(i == 0)
                )
            except ValueError as e:
                print(f"Skipping iteration {i+1} for {sport} - {parameter}: {str(e)}")
                continue

            predictions.append(pred)
            actuals.append(actual)
            # Remember which date this prediction belongs to. Rebuilding the
            # dates afterwards by position would shift every later point
            # whenever an iteration above is skipped.
            predicted_years.append(test_point['ds'].iloc[0])

        # Evaluate metrics (left empty when fewer than two predictions exist)
        r2 = r2_score(actuals, predictions) if len(predictions) > 1 else None
        mse = mean_squared_error(actuals, predictions) if len(predictions) > 1 else None

        # Store metrics
        metrics_data.append({
            'Sport': sport,
            'Parameter': parameter,
            'R2': r2,
            'MSE': mse
        })

        # Plot actual vs. predicted, each prediction at its own test date
        plt.figure(figsize=(12, 8))
        plt.plot(parameter_df['ds'], parameter_df['y'], label='Actual', color='black', marker='o')
        plt.scatter(predicted_years, predictions, label='Predicted', color='blue', marker='o')
        plt.title(f'{sport} - {parameter} Prediction')
        plt.xlabel('Year')
        plt.ylabel('Value')
        plt.grid(True)
        plt.legend()

        # Save the figure and release it so figures do not accumulate in memory
        plt.savefig(f"model_plots_comparison/{sport}_{parameter}_comparison.png")
        plt.close()

# Save metrics
metrics_df = pd.DataFrame(metrics_data)
metrics_df.to_csv('model_metrics/prophet_metrics.csv', index=False)

print("Training and evaluation completed. Metrics saved.")


  from .autonotebook import tqdm as notebook_tqdm
10:27:29 - cmdstanpy - INFO - Chain [1] start processing
10:27:29 - cmdstanpy - INFO - Chain [1] done processing
10:27:29 - cmdstanpy - INFO - Chain [1] start processing
10:27:29 - cmdstanpy - INFO - Chain [1] done processing
10:27:29 - cmdstanpy - INFO - Chain [1] start processing
10:27:29 - cmdstanpy - INFO - Chain [1] done processing
10:27:29 - cmdstanpy - INFO - Chain [1] start processing
10:27:29 - cmdstanpy - INFO - Chain [1] done processing
10:27:29 - cmdstanpy - INFO - Chain [1] start processing
10:27:29 - cmdstanpy - INFO - Chain [1] done processing
10:27:29 - cmdstanpy - INFO - Chain [1] start processing
10:27:29 - cmdstanpy - INFO - Chain [1] done processing
10:27:29 - cmdstanpy - INFO - Chain [1] start processing
10:27:29 - cmdstanpy - INFO - Chain [1] done processing
10:27:29 - cmdstanpy - INFO - Chain [1] start processing
10:27:29 - cmdstanpy - INFO - Chain [1] done processing
10:27:29 - cmdstanpy - INFO - Chain [1] start 

Training and evaluation completed. Metrics saved.


In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

def _save_metric_heatmap(pivot_table, color_map, title, output_path):
    """Draw one annotated Sport-by-Parameter heatmap and write it to output_path."""
    plt.figure(figsize=(50, 30))
    sns.heatmap(pivot_table, annot=True, cmap=color_map, fmt=".2f", linewidths=.5)
    plt.title(title)
    plt.xlabel('Parameter')
    plt.ylabel('Sport')
    plt.xticks(rotation=45)
    plt.savefig(output_path)
    plt.close()


# Read the metrics file back from disk
metrics_file = 'model_metrics/prophet_metrics.csv'
metrics_df = pd.read_csv(metrics_file)

# Make sure the heatmap output folder exists
os.makedirs("model_metrics_heatmaps", exist_ok=True)

# Reshape to one row per sport and one column per parameter
r2_pivot = metrics_df.pivot(index="Sport", columns="Parameter", values="R2")
mse_pivot = metrics_df.pivot(index="Sport", columns="Parameter", values="MSE")

# R² first, then MSE, each with its own colour map
_save_metric_heatmap(r2_pivot, "YlGnBu", 'R² Score Heatmap', 'model_metrics_heatmaps/R2_heatmap.png')
_save_metric_heatmap(mse_pivot, "YlOrRd", 'MSE Score Heatmap', 'model_metrics_heatmaps/MSE_heatmap.png')

print("Heatmaps saved to the 'model_metrics_heatmaps' folder.")


Heatmaps saved to the 'model_metrics_heatmaps' folder.
