### Now train the best-performing (so to speak) model on 2020 features with 2024 outcomes, then apply it to 2024 features for a future (~2028) prediction

In [None]:
from pathlib import Path

import lightgbm as lgb
import pandas as pd

# ----------------------------------------------------------------------------
# PART 1: ASSEMBLE THE HISTORICAL TRAINING SET
# ----------------------------------------------------------------------------

# Full modelling table; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/buurt_data_for_modeling_STRICT.csv')

# Label column the model learns to predict (described in this notebook as the
# known 2024 outcome).
target = 'gentrified'
y_historical = df[target]

# Predictors: every column carrying the `_2020` suffix except the two
# nearest-green name/type columns, followed by the year-independent area column.
exclude_cols = ['nearest_green_name_2020', 'nearest_green_type_2020']
features_2020 = [
    col
    for col in df.columns
    if col.endswith('_2020') and col not in exclude_cols
]
features_2020.append('buurt_area_m2')

X_historical = df[features_2020]

section_rule = "=" * 80
print(section_rule)
print("PREPARING FOR FINAL PREDICTION")
print(section_rule)
print(f"\nTraining the final model on {len(X_historical)} neighborhoods using 2020 data...")


# ----------------------------------------------------------------------------
# PART 2: BUILD AND FIT THE FINAL MODEL
# ----------------------------------------------------------------------------

# Hyper-parameters for the final classifier. The notebook describes these as
# the best grid-search configuration; the search itself is not part of this
# cell. `class_weight='balanced'` is used because the label is imbalanced
# (the training log reports 49 positives vs 390 negatives).
champion_params = dict(
    learning_rate=0.01,
    max_depth=10,
    n_estimators=200,
    num_leaves=20,
    random_state=42,
    class_weight='balanced',
)
final_model = lgb.LGBMClassifier(**champion_params)

# Fit on every available row: there is deliberately no hold-out split here.
print("\nTraining the final champion model on 100% of historical data...")
final_model.fit(X_historical, y_historical)
print("...Final model training complete.")


# ============================================================================
# PART 3: PREPARE 2024 DATA TO PREDICT THE FUTURE (~2028)
# ============================================================================

# Every 2024 measurement plus the year-independent area column.
features_2024 = [col for col in df.columns if col.endswith('_2024')]
features_2024.append('buurt_area_m2')

# The model only knows the `_2020` names it was fitted on, so present the 2024
# values under those names. Only the trailing year suffix is swapped; a
# `str.replace` would also rewrite any `_2024` occurring earlier in a name.
rename_dict = {
    col: col[:-len('_2024')] + '_2020'
    for col in features_2024
    if col.endswith('_2024')
}
X_future = df[features_2024].rename(columns=rename_dict)

# Fail early and name the offending columns. Without this check a training
# feature that has no 2024 counterpart only shows up as an opaque KeyError in
# the reindexing step below, and the final assertion can never fire.
missing_features = [col for col in X_historical.columns if col not in X_future.columns]
if missing_features:
    raise KeyError(
        f"No 2024 counterpart found for training feature(s): {missing_features}"
    )

# Keep exactly the training columns, in exactly the training order.
X_future = X_future[X_historical.columns]

print(f"\nPrepared 2024 data for {len(X_future)} neighborhoods to predict future risk.")
assert list(X_future.columns) == list(X_historical.columns), "Error: Column names do not match!"


# ============================================================================
# PART 4: GENERATE AND SAVE THE 2028 RISK PREDICTIONS
# ============================================================================

print("\nGenerating future risk predictions for ~2028...")

# Hard class label per neighborhood, as returned by the classifier.
future_predictions = final_model.predict(X_future)

# Probability of the second class in `final_model.classes_`
# (the positive class, assuming the label is coded 0/1 - TODO confirm).
future_probabilities = final_model.predict_proba(X_future)[:, 1]

# Identifier columns plus the two prediction columns. Predictions are attached
# positionally, which is valid because X_future was sliced from `df` without
# filtering or reordering rows.
predictions_df = df[['buurtcode_2022', 'buurt_name', 'stadsdeel_name']].copy()
predictions_df['risk_prediction_2028'] = future_predictions
predictions_df['risk_probability_2028'] = future_probabilities

# `to_csv` does not create missing directories, so make sure the results
# folder exists; otherwise a fresh checkout fails here, after training.
output_file = '../results/amsterdam_gentrification_risk_2028.csv'
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
predictions_df.to_csv(output_file, index=False)
print(f"\n✓ Final risk predictions saved to: {output_file}")


# ----------------------------------------------------------------------------
# PART 5: SUMMARY OF THE FUTURE FORECAST
# ----------------------------------------------------------------------------

hash_rule = "#" * 80
print("\n" + hash_rule)
print("# SUMMARY OF ~2028 GENTRIFICATION RISK FORECAST")
print(hash_rule)

# Summing the predicted labels counts the neighborhoods flagged as 1.
num_at_risk = predictions_df['risk_prediction_2028'].sum()
print(f"\nNumber of neighborhoods predicted to be 'At Risk' by 2028: {num_at_risk}")

# Show the 30 rows with the highest predicted probability, whatever their label.
print("\nNeighborhoods with the Predicted Risk:")
ranked_by_probability = predictions_df.sort_values(by='risk_probability_2028', ascending=False)
top_rows = ranked_by_probability.head(30)
print(top_rows.to_string(index=False))

PREPARING FOR FINAL PREDICTION

Training the final model on 439 neighborhoods using 2020 data...

Training the final champion model on 100% of historical data...
[LightGBM] [Info] Number of positive: 49, number of negative: 390
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000272 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3485
[LightGBM] [Info] Number of data points in the train set: 439, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
...Final model training complete.

Prepared 2024 data for 439 neighborhoods to predict future risk.

Generating future risk predictions for ~2028...

✓ Final risk predictions saved to: amsterdam_gentrification_risk_2028.csv

################################################################################
# SUMMARY OF ~2028 GENTRIFICATION RISK FORECAST

In [5]:
# List the neighborhoods the model flags as at risk that do not already carry
# the `gentrified == 1` label in the historical data.
already_gentrified = df.loc[df['gentrified'] == 1, 'buurtcode_2022'].tolist()

is_flagged = predictions_df['risk_prediction_2028'] == 1
was_gentrified = predictions_df['buurtcode_2022'].isin(already_gentrified)
newly_at_risk = predictions_df[is_flagged & ~was_gentrified]

print(f"\nNeighborhoods newly at risk (not gentrified by 2024):")
newly_at_risk_ranked = newly_at_risk.sort_values(by='risk_probability_2028', ascending=False)
print(newly_at_risk_ranked.to_string(index=False))



Neighborhoods newly at risk (not gentrified by 2024):
buurtcode_2022           buurt_name stadsdeel_name  risk_prediction_2028  risk_probability_2028
          TM04               Gein 4       Zuidoost                     1               0.746402
          TD03    Hakfort/Huigenbos       Zuidoost                     1               0.740111
          NH02     De Kleine Wereld          Noord                     1               0.714109
          TL02         Reigersbos 2       Zuidoost                     1               0.691333
          TL03         Reigersbos 3       Zuidoost                     1               0.657703
          NJ05     Werengouw-Midden          Noord                     1               0.639338
          TL01         Reigersbos 1       Zuidoost                     1               0.631997
          TB01    Venserpolder-West       Zuidoost                     1               0.612816
          NH01 Buikslotermeer-Noord          Noord                     1         