### First, identify the good performers

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# Import all the models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

# --- 1. SETUP ---
# Load the modeling table; the label column is 'gentrified'.
df = pd.read_csv('../data/buurt_data_for_modeling_STRICT.csv')
target = 'gentrified'

# Predictors: every column whose name ends in '_2020', minus the two columns
# named in exclude_cols, followed by 'buurt_area_m2'.
exclude_cols = ['nearest_green_name_2020', 'nearest_green_type_2020']
features = [
    col for col in df.columns
    if col.endswith('_2020') and col not in exclude_cols
]
features.append('buurt_area_m2')

X, y = df[features], df[target]

# --- 2. CREATE A SINGLE, CONSISTENT TRAIN/TEST SPLIT ---
# 80/20 split, stratified on the label, with a fixed seed so that every
# model below is fitted and scored on the same partition.
split_parts = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# --- 3. DEFINE MODELS TO COMPARE ---
def _scaled(estimator):
    """Return a pipeline that standardizes the features, then fits *estimator*."""
    return Pipeline([('scaler', StandardScaler()), ('classifier', estimator)])


# Models that require feature scaling are wrapped in a scaling pipeline.
pipe_lr = _scaled(LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000))
pipe_knn = _scaled(KNeighborsClassifier(n_neighbors=5))  # default of 5 neighbors
pipe_svc = _scaled(SVC(random_state=42, class_weight='balanced'))

# Random Forest uses the best parameters found in an earlier tuning run.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    max_features=0.8,
    min_samples_leaf=4,
    min_samples_split=2,
    class_weight='balanced',
    random_state=42,
)
lgbm = lgb.LGBMClassifier(random_state=42, class_weight='balanced')

# Insertion order is the order in which the models are trained and reported.
models_to_compare = {
    'Logistic Regression': pipe_lr,
    'K-Nearest Neighbors': pipe_knn,
    'Support Vector Machine': pipe_svc,
    'Random Forest (Tuned)': rf,
    'LightGBM': lgbm,
}

# --- 4. TRAIN, PREDICT, AND EVALUATE EACH MODEL ---
# One row per test sample: the true label first, then one column per model.
predictions_df = pd.DataFrame({'ACTUAL': y_test}, index=X_test.index)

section_rule = '=' * 80
class_labels = ['Non-Gentrified (0)', 'Gentrified (1)']

for model_name, model in models_to_compare.items():
    print(f"\n{section_rule}")
    print(f"# PERFORMANCE: {model_name}")
    print(section_rule)

    # Fit on the training split, then predict the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    predictions_df[model_name] = y_pred

    # Per-class precision / recall / F1.
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=class_labels))

    # Confusion matrix: rows are the true class, columns the predicted class.
    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    (tn, fp), (fn, tp) = cm
    print(f"[[TN: {tn}  FP: {fp}]")
    print(f" [FN: {fn}  TP: {tp}]]\n")

# --- 5. SHOW SIDE-BY-SIDE PREDICTIONS ---
hash_rule = "#" * 80
print("\n" + hash_rule)
print("# SIDE-BY-SIDE PREDICTION COMPARISON")
print(hash_rule)
print("\nThis table shows which neighborhoods each model flagged as 'At Risk' (1).")
print("ACTUAL=1 means the neighborhood truly gentrified.")

# For readability, keep only the rows that at least one model flagged as
# positive; the true-label column is left out of the count.
flag_counts = predictions_df.drop(columns='ACTUAL').sum(axis=1)
interesting_cases = predictions_df.loc[flag_counts > 0]
print(interesting_cases.to_string())


# PERFORMANCE: Logistic Regression

Classification Report:
                    precision    recall  f1-score   support

Non-Gentrified (0)       0.94      0.81      0.87        78
    Gentrified (1)       0.29      0.60      0.39        10

          accuracy                           0.78        88
         macro avg       0.61      0.70      0.63        88
      weighted avg       0.87      0.78      0.81        88

Confusion Matrix:
[[TN: 63  FP: 15]
 [FN: 4  TP: 6]]


# PERFORMANCE: K-Nearest Neighbors

Classification Report:
                    precision    recall  f1-score   support

Non-Gentrified (0)       0.94      0.96      0.95        78
    Gentrified (1)       0.62      0.50      0.56        10

          accuracy                           0.91        88
         macro avg       0.78      0.73      0.75        88
      weighted avg       0.90      0.91      0.90        88

Confusion Matrix:
[[TN: 75  FP: 3]
 [FN: 5  TP: 5]]


# PERFORMANCE: Support Vector Machine

Classif

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import log_evaluation # For suppressing verbose output

from sklearn.neighbors import KNeighborsClassifier

# --- SETUP ---
# Reload the modeling table and rebuild the same label / predictor selection
# as the comparison cell above.
df = pd.read_csv('../data/buurt_data_for_modeling_STRICT.csv')
target = 'gentrified'
y = df[target]

# All '_2020' columns except those in exclude_cols, then 'buurt_area_m2'.
exclude_cols = ['nearest_green_name_2020', 'nearest_green_type_2020']
features = [
    name for name in df.columns
    if name.endswith('_2020') and name not in exclude_cols
] + ['buurt_area_m2']
X = df[features]

# --- DEFINE MODELS AND EXTENSIVE PARAMETER GRIDS ---
def _with_scaler(estimator):
    """Return a pipeline that standardizes the features, then fits *estimator*."""
    return Pipeline([('scaler', StandardScaler()), ('classifier', estimator)])


# Models that need scaling are wrapped in a scaling pipeline.
pipe_knn = _with_scaler(KNeighborsClassifier())
pipe_svc = _with_scaler(SVC(random_state=42, class_weight='balanced'))

# LightGBM is used without a scaler.
lgbm = lgb.LGBMClassifier(random_state=42, class_weight='balanced')

# Search spaces. Pipeline parameters carry the 'classifier__' step prefix;
# the bare LightGBM estimator takes plain parameter names.
param_grid_knn = dict(
    classifier__n_neighbors=[3, 5, 7, 9, 11, 15],
    classifier__weights=['uniform', 'distance'],
    classifier__p=[1, 2],  # 1 is Manhattan distance, 2 is Euclidean
)

param_grid_svc = dict(
    classifier__C=[0.1, 1, 10, 100],
    classifier__gamma=['scale', 'auto'],
    classifier__kernel=['rbf', 'poly'],
)

param_grid_lgbm = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    num_leaves=[20, 31, 40],
    max_depth=[5, 10, -1],  # -1 means no limit
)

# --- RUN THE DEEP DIVE ---
# Each entry maps a display name to (estimator, parameter grid).
champions_to_tune = {
    'K-Nearest Neighbors': (pipe_knn, param_grid_knn),
    'Support Vector Machine': (pipe_svc, param_grid_svc),
    'LightGBM': (lgbm, param_grid_lgbm)
}

# Tune on the training portion only. The evaluation cells score models on a
# 20% hold-out produced by train_test_split(test_size=0.20, random_state=42,
# stratify=y); searching over the full X, y would let those hold-out rows
# influence the chosen hyperparameters and inflate the final scores.
# Reproducing the identical split here keeps the hold-out unseen.
# NOTE(review): any hard-coded "best parameters" copied from an earlier
# full-data run should be refreshed from this cell's output.
X_tune, _X_holdout, y_tune, _y_holdout = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

results = []
# Stratified 5-fold CV with a fixed seed, shared by every search.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for model_name, (model, params) in champions_to_tune.items():
    print(f"\n{'='*40}\nDeep Dive Tuning for {model_name}...\n{'='*40}")

    if model_name == 'LightGBM':
        # Silence LightGBM's [Info] training log through the estimator's own
        # verbosity setting. The previous log_evaluation callback did not
        # suppress those lines (they still appeared in the recorded output).
        model.set_params(verbosity=-1)

    # Use F1-score (of the positive class) as our main metric
    grid_search = GridSearchCV(model, params, cv=cv_splitter, scoring='f1', n_jobs=-1, verbose=2)
    grid_search.fit(X_tune, y_tune)

    # best_score_ is the mean cross-validated F1 of the best candidate.
    results.append({
        'Model': model_name,
        'Best F1-Score': grid_search.best_score_,
        'Best Parameters': grid_search.best_params_
    })

# --- DISPLAY FINAL RESULTS ---
# Rank the tuned models from best to worst cross-validated F1.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Best F1-Score', ascending=False)

banner = "#" * 80
print("\n" + banner)
print("# DEEP DIVE RESULTS (FINAL CHAMPIONS, RANKED BY F1-SCORE)")
print(banner)
print(results_df.to_string(index=False))


Deep Dive Tuning for K-Nearest Neighbors...
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Deep Dive Tuning for Support Vector Machine...
Fitting 5 folds for each of 16 candidates, totalling 80 fits

Deep Dive Tuning for LightGBM...
Fitting 5 folds for each of 81 candidates, totalling 405 fits
[LightGBM] [Info] Number of positive: 49, number of negative: 390
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000407 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3485
[LightGBM] [Info] Number of data points in the train set: 439, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000

################################################################################
# DEEP DIVE RESULTS (FINAL CHAMPIONS, RANKED BY F1-SCORE)
###############################################################

In [None]:
# --- 1. SETUP (Consistent with previous steps) ---
df = pd.read_csv('../data/buurt_data_for_modeling_STRICT.csv')
target = 'gentrified'

# Same predictor selection as before: '_2020' columns minus exclude_cols,
# then 'buurt_area_m2' appended last.
exclude_cols = ['nearest_green_name_2020', 'nearest_green_type_2020']
features = [
    col for col in df.columns
    if col.endswith('_2020') and col not in exclude_cols
]
features.append('buurt_area_m2')

y = df[target]
X = df[features]

# --- 2. CREATE A SINGLE, CONSISTENT TRAIN/TEST SPLIT ---
# Fixed seed + stratification: every champion is compared on exactly the
# same train and test rows.
split_parts = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# --- 3. DEFINE THE FINAL, TUNED CHAMPION MODELS ---
# The hyperparameters below are hard-coded copies of the 'Best Parameters'
# printed by the GridSearchCV run.

# Champion 1: LightGBM
lgbm_params = dict(
    n_estimators=200,
    learning_rate=0.01,
    num_leaves=20,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
)
lgbm_champion = lgb.LGBMClassifier(**lgbm_params)

# Champion 2: Support Vector Machine (scaled inside a pipeline)
svc_classifier = SVC(
    kernel='poly',
    C=1,
    gamma='scale',
    class_weight='balanced',
    random_state=42,
)
svc_champion = Pipeline(steps=[('scaler', StandardScaler()), ('classifier', svc_classifier)])

# Champion 3: K-Nearest Neighbors (scaled inside a pipeline)
knn_classifier = KNeighborsClassifier(n_neighbors=3, weights='uniform', p=2)
knn_champion = Pipeline(steps=[('scaler', StandardScaler()), ('classifier', knn_classifier)])

# Insertion order is the order in which the models are trained and reported.
models_to_evaluate = {
    'LightGBM (Champion)': lgbm_champion,
    'Support Vector Machine (Tuned)': svc_champion,
    'K-Nearest Neighbors (Tuned)': knn_champion,
}

# --- 4. TRAIN, PREDICT, AND PRODUCE DETAILED EVALUATION ---
def _ratio_or_zero(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


# One row per test sample: the true label first, then one column per model.
predictions_df = pd.DataFrame({'ACTUAL': y_test}, index=X_test.index)

section_rule = '=' * 80
class_labels = ['Non-Gentrified (0)', 'Gentrified (1)']

for model_name, model in models_to_evaluate.items():
    print(f"\n{section_rule}")
    print(f"# FINAL PERFORMANCE: {model_name}")
    print(section_rule)

    # Fit on the training split, then predict the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    predictions_df[model_name] = y_pred

    # --- Detailed Report ---
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=class_labels))

    # Rows are the true class, columns the predicted class.
    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    (tn, fp), (fn, tp) = cm
    print(f"[[TN: {tn}  FP: {fp}]")
    print(f" [FN: {fn}  TP: {tp}]]\n")

    # --- Plain-Language Metrics for the 'Gentrified' Class ---
    # Each ratio falls back to 0 when its denominator is empty.
    precision = _ratio_or_zero(tp, tp + fp)
    recall = _ratio_or_zero(tp, tp + fn)
    specificity = _ratio_or_zero(tn, tn + fp)

    print("Key Metrics for 'Gentrified' Class (1):")
    print(f"  - Recall (Sensitivity): {recall:.2%}")
    print("    (Of all the neighborhoods that truly gentrified, this model found this many.)")
    print(f"  - Precision: {precision:.2%}")
    print("    (When this model predicted 'Gentrified', it was correct this often.)")
    print(f"  - Specificity: {specificity:.2%}")
    print("    (Of all non-gentrified neighborhoods, it correctly identified this many, indicating a low false alarm rate for that class.)")


# --- 5. SHOW FINAL SIDE-BY-SIDE PREDICTIONS ---
hash_rule = "#" * 80
print("\n" + hash_rule)
print("# FINAL PREDICTION COMPARISON OF CHAMPIONS")
print(hash_rule)
print("\nThis table shows which neighborhoods each tuned model flagged as 'At Risk' (1).")

# Keep rows that truly gentrified or that at least one model flagged.
truly_gentrified = predictions_df['ACTUAL'] == 1
flagged_by_any = predictions_df.drop(columns='ACTUAL').sum(axis=1) > 0
final_comparison = predictions_df[truly_gentrified | flagged_by_any]

# Attach the neighborhood name by matching on the row index.
final_comparison = final_comparison.join(df[['buurt_name']], how='inner')

column_order = ['buurt_name', 'ACTUAL', *models_to_evaluate]
print(final_comparison[column_order].to_string())


# FINAL PERFORMANCE: LightGBM (Champion)
[LightGBM] [Info] Number of positive: 39, number of negative: 312
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000131 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2782
[LightGBM] [Info] Number of data points in the train set: 351, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Classification Report:
                    precision    recall  f1-score   support

Non-Gentrified (0)       0.97      0.86      0.91        78
    Gentrified (1)       0.42      0.80      0.55        10

          accuracy                           0.85        88
         macro avg       0.70      0.83      0.73        88
      weighted avg       0.91      0.85      0.87        88

Confusion Matrix:
[[TN: 67  FP: 11]
 [FN: 2  TP: 8]]

Key Metrics for 'Gentrified' Class (1):
  - Recall (Sensitivity): 80.00%
    (Of all 