# Phase 3: Predictive Modeling

**Goal**: Identify the best model to predict behavioral clusters.
**Data**: `final_dataset_18parks.csv` (18 samples, 20 features, 5 classes)
**Method**: Leave-One-Out Cross-Validation (LOOCV)
**Models**: Logistic Regression, SVM, Random Forest, XGBoost
**Note**: Singleton clusters (< 2 samples) are filtered out to ensure LOOCV validity.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneOut, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, make_scorer
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8')
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

## 1. Load Data & Filter Singletons

In [2]:
df = pd.read_csv('../interim/final_dataset_18parks.csv')
print(f"Original dataset: {df.shape}")

# Check Class Counts and Filter Singletons
counts = df['cluster'].value_counts()
print("Original Class Counts:\n", counts)

# Keep only classes with >= 2 confirmed samples
to_keep = counts[counts > 1].index
df_filtered = df[df['cluster'].isin(to_keep)].copy()

print(f"\nFiltered dataset (removed singletons): {df_filtered.shape}")
removed = set(df['cluster'].unique()) - set(df_filtered['cluster'].unique())
print(f"Removed Clusters: {removed}")

# Define X and y
X = df_filtered.drop(columns=['park_name', 'cluster'])
y = df_filtered['cluster']

# Label Encoding (Critical for XGBoost to have [0,1,2,3...])
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y = pd.Series(y_encoded, index=y.index)
print(f"\nEncoded Classes Mapping:")
for i, cls in enumerate(le.classes_):
    print(f"  Original {cls} -> New {i}")

feature_names = X.columns.tolist()
print(f"Features ({len(feature_names)}): {feature_names}")

Original dataset: (18, 22)
Original Class Counts:
 cluster
2    6
0    5
1    4
4    2
3    1
Name: count, dtype: int64

Filtered dataset (removed singletons): (17, 22)
Removed Clusters: {np.int64(3)}

Encoded Classes Mapping:
  Original 0 -> New 0
  Original 1 -> New 1
  Original 2 -> New 2
  Original 4 -> New 3
Features (20): ['area_official_m2', 'visitor_density', 'management_intensity', 'infrastructure_index', 'recreation_index', 'restrooms_count', 'playgrounds_count', 'exercise_equipment_count', 'facility_density', 'workers_count', 'topo_node_degree', 'topo_local_efficiency', 'topo_clustering_coefficient', 'topo_eccentricity', 'centrality_score', 'subway_station_count', 'bus_station_count', 'total_transit_ridership', 'transit_accessibility_index', 'distance_to_center']


## 2. Preprocessing

In [3]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=feature_names)

## 3. Model Definition

In [4]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42, n_estimators=50, max_depth=3)
}

## 4. LOOCV Evaluation

In [5]:
results = []

loo = LeaveOneOut()

for name, model in models.items():
    y_true = []
    y_pred = []
    
    for train_index, test_index in loo.split(X_scaled):
        X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        try:
            model.fit(X_train, y_train)
            pred = model.predict(X_test)
            y_true.append(y_test.values[0])
            y_pred.append(pred[0])
        except ValueError as e:
             # Should be rare now with filtering + LabelEncoding
             print(f"Error in {name}: {e}")
             y_true.append(y_test.values[0])
             y_pred.append(-1)

    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')
    
    results.append({'Model': name, 'Accuracy': acc, 'F1 Score (Macro)': f1})
    print(f"{name}: Accuracy={acc:.3f}, F1={f1:.3f}")

# Create Summary Table
results_df = pd.DataFrame(results).sort_values(by='F1 Score (Macro)', ascending=False)
print("\n=== Model Performance (LOOCV) ===")
display(results_df)

Logistic Regression: Accuracy=0.412, F1=0.278
SVM: Accuracy=0.471, F1=0.293


Random Forest: Accuracy=0.412, F1=0.279


XGBoost: Accuracy=0.471, F1=0.325

=== Model Performance (LOOCV) ===


Unnamed: 0,Model,Accuracy,F1 Score (Macro)
3,XGBoost,0.470588,0.325
1,SVM,0.470588,0.292614
2,Random Forest,0.411765,0.278571
0,Logistic Regression,0.411765,0.277778


## 5. Save Results

In [6]:
results_df.to_csv('../results/phase3_model_comparison_18parks.csv', index=False)
print("Results saved to results/phase3_model_comparison_18parks.csv")

Results saved to results/phase3_model_comparison_18parks.csv
