<a href="https://colab.research.google.com/github/victor-radermecker/Capstone_JPMorgan/blob/conv-lstm/Notebooks_toMerge/Baseline%26XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baseline and XGBoost

Authors: Andrea Zanon, Victor Radermecker

# Package Imports

In [1]:
# Mount Google Drive inside the Colab VM so that the datasets and helper
# modules referenced below under /content/drive/MyDrive are reachable.
# force_remount is passed so the mount is redone even if one already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [2]:
# Import packages
import sys
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import datetime
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Set preferences
# Seeds TensorFlow's global RNG only; no other library is seeded in this cell.
tf.random.set_seed(42)
# Seaborn theme applied to every plot drawn afterwards.
sns.set_style('darkgrid')
# NOTE(review): this silences *every* warning category, including deprecation
# notices from pandas / scikit-learn — consider narrowing the filter.
warnings.filterwarnings('ignore')

# update local libraries automatically
%load_ext autoreload
%autoreload 2

# Training XGBoost
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Put the project's code folder on the import path; this has to happen before
# the two project-local imports that follow.
sys.path.append('/content/drive/MyDrive/Code/')
import SequenceDataLoader
from get_data_loader import get_data_loader

# NOTE(review): this adds the 'Google Earth Exports' folder to the *import*
# path; nothing visible in this notebook imports from it — confirm it is needed.
sys.path.append('/content/drive/MyDrive/Google Earth Exports/')

In [22]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In this Jupyter Notebook, we predict urbanization (or urbanization rate) using XGBoost. The training data consists of Dynamic World labels, extracted using the Fishnet Class, for different years.

In [3]:
# Root folder on the mounted Drive that holds the three dataset splits.
# Relocating the data only requires changing this single constant.
DATASETS_DIR = '/content/drive/MyDrive/Code/Datasets'

# One CSV per split; the resulting strings are identical to the previous
# hard-coded absolute paths.
TRAIN_PATH = f'{DATASETS_DIR}/Train/urbanization_train.csv'
VAL_PATH = f'{DATASETS_DIR}/Valid/urbanization_valid.csv'
TEST_PATH = f'{DATASETS_DIR}/Test/urbanization_test.csv'

In [15]:
# Read the three splits in (train, validation, test) order.
# Per the original notebook notes: train = Texas region, validation = Georgia
# region, test = Ohio region.
# NOTE(review): the X_test preview shown later in this notebook lists
# Lat ~36.87 / Lon ~-80.17, which lies south of Ohio — confirm which region the
# test CSV actually covers.
train_data, val_data, test_data = map(
    pd.read_csv, (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

In [38]:
# generate dataframe based on input data
def generate_tab_data(data, target_variable, target_year):
  """Build a one-row-per-tile feature table and target vector.

  The long-format input (one row per tile and year) is pivoted so that each
  year becomes a column, then the tile coordinates are attached.

  Args:
    data: DataFrame with columns 'tile_id', 'year', 'Lat', 'Lon' and the
      column named by ``target_variable``.
    target_variable: 'urbanization' or 'urbanization_rate'.
    target_year: year to predict; the value observed in ``target_year - 1``
      is used as the only temporal feature.

  Returns:
    (X, y) where X has the columns [target_year - 1, 'Lat', 'Lon'] and y is
    the ``target_year`` column, both indexed 0..n-1 in tile_id order.

  Raises:
    ValueError: if ``target_variable`` is not one of the two allowed names.
    KeyError: if ``target_year`` or ``target_year - 1`` is absent from the data.
  """
  if target_variable not in ['urbanization', 'urbanization_rate']:
    raise ValueError('target_variable must be urbanization or urbanization_rate')

  # Wide layout: index = tile_id, one column per year.
  data_wide = data.pivot_table(index='tile_id', columns='year', values=target_variable, aggfunc='first')

  # Fail early with an explicit message instead of an opaque column lookup error.
  missing_years = [year for year in (target_year - 1, target_year) if year not in data_wide.columns]
  if missing_years:
    raise KeyError(f'year(s) {missing_years} not found in data; available years: {list(data_wide.columns)}')

  # De-duplicate the coordinates BEFORE merging. The long table repeats each
  # tile's Lat/Lon once per year, so merging against it directly would build an
  # intermediate frame with one row per (tile, year) only to drop them again.
  tile_coords = data[['tile_id', 'Lat', 'Lon']].drop_duplicates()
  data_wide = data_wide.reset_index().merge(right=tile_coords, on='tile_id')

  X = data_wide[[target_year - 1, 'Lat', 'Lon']]
  y = data_wide[target_year]

  return X, y

In [39]:
# Prediction target: either 'urbanization' or 'urbanization_rate'.
target_variable = 'urbanization'
# Year whose value is predicted; the previous year supplies the feature.
target_year = 2022

# Build (features, target) for each split with the same settings, in
# train / validation / test order.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    generate_tab_data(split_frame, target_variable, target_year)
    for split_frame in (train_data, val_data, test_data)
)

In [40]:
# Model selection uses K-fold cross-validation, so the separate validation
# split is folded into the training set.
# pd.concat replaces DataFrame.append / Series.append, which were deprecated in
# pandas 1.4 and removed in pandas 2.0.
# Caveat: this cell rebinds X_train / y_train, so running it a second time
# appends the validation rows again — re-run the split-generation cell first.
X_train = pd.concat([X_train, X_val])
y_train = pd.concat([y_train, y_val])

In [41]:
# Preview the first rows of the test features (previous-year value, Lat, Lon).
X_test.head()

Unnamed: 0,2021,Lat,Lon
0,0.0,36.869991,-80.170884
1,0.0,36.869991,-80.166452
2,0.0,36.869991,-80.162021
3,0.0,36.869991,-80.157589
4,0.0,36.869991,-80.153157


# Baseline

As a baseline, we predict that urbanization in _year_ is the same as urbanization in _year-1_.

In [42]:
# Persistence baseline: the forecast is simply the previous year's value.
y_pred = X_test[target_year - 1]

# Score the forecast against the test targets with all three metrics in one go.
r2, mae, mse = (
    metric(y_test, y_pred)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)

print(f"R2: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")

R2: 0.9943
MAE: 0.0028
MSE: 0.0002


# XGBoost Training on Dynamic World Dataset

In [None]:
# Hyper-parameter search for the XGBoost regressor via cross-validated grid search
from sklearn.model_selection import GridSearchCV, KFold
from xgboost import XGBRegressor

# Candidate values; 2 x 1 x 2 x 1 = 4 combinations in total
param_grid = dict(
    n_estimators=[100, 250],
    learning_rate=[0.1],
    max_depth=[4, 5],
    subsample=[0.6],
)

# Base estimator (library defaults) and a shuffled, seeded 5-fold splitter
xgb_reg = XGBRegressor()
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Candidates are ranked by negated MAE (higher is better); verbose=10 logs
# the start and end of every individual fit
grid_search = GridSearchCV(
    xgb_reg,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=kfold,
    verbose=10,
)

# Run the search on the combined train + validation data
grid_search.fit(X_train, y_train)

# Keep the winning configuration and the estimator fitted with it
xgb_params, xgb_model = grid_search.best_params_, grid_search.best_estimator_

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.6
[CV 1/5; 1/4] END learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.6;, score=-0.005 total time=  34.4s
[CV 2/5; 1/4] START learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.6
[CV 2/5; 1/4] END learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.6;, score=-0.005 total time=  35.7s
[CV 3/5; 1/4] START learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.6
[CV 3/5; 1/4] END learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.6;, score=-0.005 total time=  36.1s
[CV 4/5; 1/4] START learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.6
[CV 4/5; 1/4] END learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.6;, score=-0.005 total time=  33.8s
[CV 5/5; 1/4] START learning_rate=0.1, max_depth=4, n_estimators=100, subsample=0.6


In [30]:
# Exploratory lookup: list the scorer names scikit-learn recognises (used to
# pick the `scoring=` string for the grid search).
# `sklearn.metrics` resolves here because names were already imported from
# that submodule earlier in the notebook.
import sklearn
sklearn.metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',

In [None]:
# Predict the test targets with the best estimator found by the grid search
y_pred = xgb_model.predict(X_test)

# Compute R2 / MAE / MSE against the test targets in a single pass
r2, mae, mse = (
    metric(y_test, y_pred)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)

print(f"R2: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")