In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures

# Kaggle boilerplate: print the full path of every file found under the input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
FILE = "Data/Annual_Resort.csv"

# Load the CSV and keep only the four columns used for visitor modelling;
# every other column in the file is discarded here.
df = (
    pd.read_csv(FILE)[["Year", "SkiWeek", "Resort", "Visitors"]]
    # Clean types
    .astype({"Year": int, "SkiWeek": int, "Resort": "category"})
    # Unparseable visitor counts become NaN so they can be dropped below.
    .assign(Visitors=lambda d: pd.to_numeric(d["Visitors"], errors="coerce"))
    # Drop bad rows: missing/unparseable counts and non-positive counts.
    .dropna(subset=["Visitors"])
    .loc[lambda d: d["Visitors"] > 0]
    .sort_values(["Resort", "Year", "SkiWeek"])
    .reset_index(drop=True)
)

print(df.dtypes)
print("Years span:", df["Year"].min(), "→", df["Year"].max())
print("Resorts:", df["Resort"].unique())
df.head()


Year           int32
SkiWeek        int32
Resort      category
Visitors       int64
dtype: object
Years span: 2014 → 2024
Resorts: ['Charlotte Pass', 'Falls Creek', 'Mt. Baw Baw', 'Mt. Buller', 'Mt. Hotham', 'Perisher', 'Selwyn', 'Thredbo']
Categories (8, object): ['Charlotte Pass', 'Falls Creek', 'Mt. Baw Baw', 'Mt. Buller', 'Mt. Hotham', 'Perisher', 'Selwyn', 'Thredbo']


Unnamed: 0,Year,SkiWeek,Resort,Visitors
0,2014,1,Charlotte Pass,408
1,2014,2,Charlotte Pass,151
2,2014,3,Charlotte Pass,230
3,2014,4,Charlotte Pass,1134
4,2014,5,Charlotte Pass,3403


In [None]:
# Feature engineering + split

# Anchors for the continuous time index:
#   year0 - earliest year present in the data
#   W     - largest ski-week number present in the data (used as the season length)
year0 = int(df["Year"].min())
W = int(df["SkiWeek"].max())

def add_time_features(
    frame: pd.DataFrame,
    weeks_per_season: "int | None" = None,
    base_year: "int | None" = None,
) -> pd.DataFrame:
    """Return a copy of ``frame`` with trend and seasonality columns added.

    Parameters
    ----------
    frame : DataFrame with numeric ``Year`` and ``SkiWeek`` columns.
    weeks_per_season : number of weeks per season; when omitted, the
        module-level ``W`` is used.
    base_year : year that anchors the time index; when omitted, the
        module-level ``year0`` is used.

    Returns
    -------
    A new DataFrame (the input is not modified) with three extra columns:
    ``TimeIndex`` = (Year - base_year) * weeks_per_season + SkiWeek,
    ``sin_w`` / ``cos_w`` = sin / cos of 2*pi * SkiWeek / weeks_per_season.
    """
    # Fall back to the notebook-level constants only when no override is given,
    # so existing single-argument calls behave exactly as before.
    n_weeks = W if weeks_per_season is None else weeks_per_season
    first_year = year0 if base_year is None else base_year

    out = frame.copy()
    out["TimeIndex"] = (out["Year"] - first_year) * n_weeks + out["SkiWeek"]

    # Position of the week within the season, expressed as an angle.
    angle = 2 * np.pi * out["SkiWeek"] / n_weeks
    out["sin_w"] = np.sin(angle)
    out["cos_w"] = np.cos(angle)
    return out

df_f = add_time_features(df)

# Train = up to 2022 and Test = 2023–24
in_train = df_f["Year"] <= 2022
train = df_f.loc[in_train].copy()
test  = df_f.loc[~in_train].copy()

# Report the year range covered by each partition.
for label, part in (("Train years:", train), ("Test years :", test)):
    print(label, part["Year"].min(), "→", part["Year"].max())

train.head()


Train years: 2014 → 2022
Test years : 2023 → 2024


Unnamed: 0,Year,SkiWeek,Resort,Visitors,TimeIndex,sin_w,cos_w
0,2014,1,Charlotte Pass,408,1,0.406737,0.913545
1,2014,2,Charlotte Pass,151,2,0.743145,0.669131
2,2014,3,Charlotte Pass,230,3,0.951057,0.309017
3,2014,4,Charlotte Pass,1134,4,0.994522,-0.104528
4,2014,5,Charlotte Pass,3403,5,0.866025,-0.5


In [None]:
# Model training (only visitors)

# Column roles: numeric columns get a degree-2 polynomial expansion,
# the Resort column is one-hot encoded.
numeric_cols = ["Year","SkiWeek","TimeIndex","sin_w","cos_w"]
cat_cols     = ["Resort"]
feature_cols = numeric_cols + cat_cols
target_col = "Visitors"

# Design matrix / target for each partition
X_train, y_train = train[feature_cols], train[target_col]
X_test, y_test = test[feature_cols], test[target_col]

# NOTE(review): the numeric inputs reach the polynomial expansion and the Ridge
# penalty unscaled (no scaler in this pipeline) — consider whether
# standardising them first is wanted; confirm before changing reported metrics.
pre = ColumnTransformer(
    transformers=[
        ("poly", PolynomialFeatures(degree=2, include_bias=False), numeric_cols),
        ("ohe", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_cols),
    ]
)

# Regularised linear model on top of the preprocessed features
ridge = Ridge(alpha=3.0, random_state=42)

model = Pipeline(steps=[("pre", pre), ("ridge", ridge)])
model.fit(X_train, y_train)

# Score on the held-out test partition (2023–24)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Visitors | RMSE={rmse:.2f} | R2={r2:.3f}")


Visitors | RMSE=11928.79 | R2=0.596


In [None]:
# Forecast Visitors 2026

resorts = df["Resort"].unique()
weeks   = sorted(df["SkiWeek"].unique())

# One row per (resort, ski week) combination for the 2026 season.
future_2026 = pd.DataFrame(
    [{"Year": 2026, "SkiWeek": w, "Resort": r} for r in resorts for w in weeks]
)

future_2026 = add_time_features(future_2026)

# predict
X_future = future_2026[feature_cols]
raw_pred = model.predict(X_future)

# The regression output is unbounded below, but a visitor count cannot be
# negative: floor predictions at zero before rounding to whole visitors.
future_2026["Pred_Visitors"] = np.rint(np.clip(raw_pred, 0, None)).astype(int)

# save to CSV
future_2026.to_csv("predicted_visitors_2026.csv", index=False)
print(" 2026 forecast saved as predicted_visitors_2026.csv")

future_2026.head(12)

In [None]:
# Plot 2026 visitor predictions
sns.set(style="whitegrid", palette="muted", font_scale=1.1)

# Explicit figure/axes handles; one line per resort across the ski weeks.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=future_2026,
    x="SkiWeek",
    y="Pred_Visitors",
    hue="Resort",
    marker="o",
    ax=ax,
)
ax.set_title("Predicted Visitors per Resort — 2026 Ski Season")
ax.set_ylabel("Predicted Visitors")
ax.set_xlabel("Ski Week")
# Legend sits outside the plotting area, to the right of the axes.
ax.legend(title="Resort", bbox_to_anchor=(1.05, 1), loc="upper left")
fig.tight_layout()
plt.show()


In [None]:
# Hold-out evaluation on the test partition: R², RMSE and MAE.
# (numpy and the sklearn metric functions come from the import cell at the top.)

# Predictions
y_pred = model.predict(X_test)

# Metrics
r2   = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae  = mean_absolute_error(y_test, y_pred)

print("=== Visitor Prediction Validity Metrics (Test: 2023–24) ===")
print(f"R²   : {r2:.3f}")
print(f"RMSE : {rmse:.2f} visitors")
print(f"MAE  : {mae:.2f} visitors")

## Method  
- Cleaned data (removed rows with missing or non-numeric visitor counts, and rows with zero or negative visitors).  
- Added **TimeIndex** (trend) + **seasonal features** (sin/cos of week).  
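- Model: **Ridge regression** on degree-2 polynomial time features plus one-hot-encoded resort (regularization is intended to keep the extrapolation to future seasons stable).  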
- Train on ≤2022, test on 2023–24.  

---

## Model Performance (2023–24 test)  
- **R² = 0.60**  
- **RMSE ≈ 11.9k visitors**  
- **MAE ≈ 9.4k visitors**  

The model captures the seasonal shape well, but is less accurate for low-visitor weeks at the start and end of the season.  
