In [None]:
import pandas as pd
import numpy as np

# Load datasets: the three CSV tables are read in the order their target names
# are listed, followed by the Allianz workbook.
resort_meta, weather, season_holiday = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/mnt/data/australia_ski_resorts.csv",
        "/mnt/data/Weather Training Data.csv",
        "/mnt/data/ski_season_vs_holiday.csv",
    )
)
# sheet_name=None makes read_excel return a dict of {sheet name: DataFrame}.
allianz_xlsx = pd.read_excel(
    "/mnt/data/2025 Allianz Datathon Dataset.xlsx", sheet_name=None
)

# ---- Missing Value Summary Function ----
def missing_summary(df, name):
    """Print the share of missing values for each column that has any.

    Columns are listed from the highest missing fraction to the lowest;
    columns with no missing values are left out of the printout.

    Args:
        df: DataFrame to inspect.
        name: Label shown in the printed header.
    """
    null_share = df.isnull().mean()
    ranked = null_share.sort_values(ascending=False)
    has_gaps = ranked > 0
    print(f"\nMissing values in {name}:\n", ranked[has_gaps])

# ---- Resort Meta: replace missing values with each column's overall mean ----
# The fill mapping is {column name: mean of that column}, computed over the
# whole table before any value is filled; only the listed columns are touched.
resort_meta.fillna(
    {
        col: resort_meta[col].mean()
        for col in (
            'skiable_acres',
            'base_elevation_ft',
            'top_elevation_ft',
            'vertical_ft',
            'trails',
            'historical_average_snowfall_inches',
            'historical_average_snowfall_days',
            '2021-2022_length_days',
            '2022-2023_length_days',
            '2023-2024_length_days',
        )
    },
    inplace=True,
)

# ---- Weather Data Cleaning ----
# Drop evaporation and sunshine (too many missing)
weather.drop(columns=["Evaporation", "Sunshine"], inplace=True)

# Impute every numeric column with that column's own median.
num_cols = weather.select_dtypes(include=[np.number]).columns
weather[num_cols] = weather[num_cols].fillna(weather[num_cols].median())

# Impute wind direction by carrying the previous row's value forward.
# DataFrame.ffill() is used instead of fillna(method='ffill'): the `method`
# keyword of fillna is deprecated (FutureWarning since pandas 2.1).
# NOTE(review): the fill follows the current row order, and a gap in the very
# first row has nothing to carry forward, so it stays NaN - confirm the rows
# are ordered in a way that makes the previous row a sensible donor.
wind_cols = ["WindGustDir", "WindDir9am", "WindDir3pm"]
weather[wind_cols] = weather[wind_cols].ffill()

# ---- Holiday Data ----
# Missing entries in public_holidays_in_week are replaced with the literal
# string "None".
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on the selected column is a chained in-place
# update, which warns in pandas 2.x and, under Copy-on-Write, leaves
# season_holiday unchanged.
season_holiday["public_holidays_in_week"] = (
    season_holiday["public_holidays_in_week"].fillna("None")
)

# ---- Allianz Climate Data ----
# climate_df is the same DataFrame object stored under "Climate Data" in
# allianz_xlsx, so the in-place fill below is visible through both names.
climate_df = allianz_xlsx["Climate Data"]
# Carry the last seen value forward into missing cells, column by column.
# ffill() replaces fillna(method="ffill"), whose `method` keyword is
# deprecated (FutureWarning since pandas 2.1).
climate_df.ffill(inplace=True)

# ---- EDA: Missing Value Summary ----
# Report the remaining missing-value fractions for each cleaned table, in the
# order the tables were processed above.
for frame, label in (
    (resort_meta, "resort_meta"),
    (weather, "weather"),
    (season_holiday, "season_holiday"),
    (climate_df, "climate_df"),
):
    missing_summary(frame, label)
