In [None]:
import json
import os
import pickle
import sys

# Make the repository root importable before loading non-stdlib packages.
sys.path.append("../../../")

import fasttreeshap
import numpy as np
import pandas as pd
import shap
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold, cross_val_score

from povertymapping import settings
%reload_ext autoreload
%autoreload 2

# Load Training Data

In [None]:
# Country code; used below in the rollout directory name and in the saved
# model's file name.
COUNTRY_CODE = "tl"
# Date prefix of the training-data and model files. When left as None, a later
# cell derives it from the working directory path.
# NOTE(review): presumably overridable as a papermill parameter -- confirm.
ROLLOUT_DATE = None

In [None]:
# Resolve the project data directory, then make sure the per-country rollout
# folder exists (missing parents are created; an existing folder is fine).
# Assumes settings.DATA_DIR is a pathlib.Path -- TODO confirm.
DATA_DIR = settings.DATA_DIR.resolve()
ROLLOUT_DIR = DATA_DIR.joinpath(f"rollout/{COUNTRY_CODE}")
ROLLOUT_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
# Derive the rollout date from the name of the working directory's parent when
# it was not set explicitly: the first three "-"-separated tokens of that name
# are kept (a hypothetical "2023-02-21-some-run" would give "2023-02-21").
# os.path is used rather than splitting on "/" so the lookup does not depend
# on the platform's path separator.
if ROLLOUT_DATE is None:
    parent_dir_name = os.path.basename(os.path.dirname(os.getcwd()))
    ROLLOUT_DATE = "-".join(parent_dir_name.split("-")[:3])

In [None]:
# The CSV holds every column; the companion JSON metadata file names which
# columns are the features and which is the label.
training_data_path = f"{ROLLOUT_DIR}/{ROLLOUT_DATE}-training-data.csv"
column_metadata_path = f"{ROLLOUT_DIR}/{ROLLOUT_DATE}-training-data-columns.json"

data = pd.read_csv(training_data_path)
with open(column_metadata_path, "r") as metadata_file:
    column_metadata = json.load(metadata_file)

# Split the full table into model inputs and the prediction target.
feature_columns = column_metadata["features"]
features = data[feature_columns]
label_column = column_metadata["label"]
labels = data[label_column]

# Cross-Validation

In [None]:
# Set parameters
CV_K_FOLDS = 5
CV_NUM_REPEATS = 5
RANDOM_SEED = 42

In [None]:
print(f"Performing {CV_K_FOLDS}-fold CV...")
# Repeated k-fold splitter: CV_K_FOLDS folds, repeated CV_NUM_REPEATS times,
# seeded so the same splits are produced on every run.
cv = RepeatedKFold(
    n_splits=CV_K_FOLDS,
    n_repeats=CV_NUM_REPEATS,
    random_state=RANDOM_SEED,
)

# cv.split() returns a lazy generator, so printing it would only show an object
# repr; report the total number of train/test splits instead.
print(f"Total train/test splits: {cv.get_n_splits()}")

In [None]:
#papermill_description="Compute cross validation scores"
# Unfitted estimator handed to cross_val_score, which evaluates it once per
# split produced by `cv`.
model = RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED, verbose=0)

# No `scoring` argument is given, so the estimator's default scorer is used
# (R^2 for scikit-learn regressors). The scores come back as a NumPy array,
# one entry per split, so no extra array conversion is needed.
R_cv = cross_val_score(model, features.values, labels.values.ravel(), cv=cv)
cv_mean = round(R_cv.mean(), 2)
cv_std = round(R_cv.std(), 2)

print("Cross validation scores are: ", R_cv)
print(f"Cross validation R^2 mean: {cv_mean}")
print(f"Cross validation R^2 std: {cv_std}")

# Train the final model

The final model is trained on all of the available data; the cross-validation above is used only to estimate its performance.

In [None]:
#papermill_description="Train final model"
# Fresh forest with the same hyperparameters as the cross-validated one,
# fitted on every row of the training data.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=RANDOM_SEED,
    verbose=0,
)
model.fit(X=features.values, y=labels.values.ravel())

In [None]:
#papermill_description="Save final model"
# Pickle the fitted model into the rollout directory, alongside the training
# data it was fitted on.
model_save_path = f"{ROLLOUT_DIR}/{ROLLOUT_DATE}-{COUNTRY_CODE}-single-country-model.pkl"
with open(model_save_path, mode="wb") as model_file:
    pickle.dump(model, model_file)

# SHAP

In [None]:
# Explain the fitted forest on the full feature table and keep only the raw
# SHAP value array from the returned explanation object.
# NOTE(review): n_jobs=-1 presumably means "use all available cores" -- confirm
# against the fasttreeshap documentation.
explainer = fasttreeshap.TreeExplainer(model, algorithm="auto", n_jobs=-1)
explanation = explainer(features)
shap_values = explanation.values

In [None]:
# Bar-type SHAP summary plot, labelled with the feature column names.
shap.summary_plot(
    shap_values,
    features,
    plot_type="bar",
    feature_names=features.columns,
)

In [None]:
# Default SHAP summary plot of the same values; the feature matrix is passed as
# a plain NumPy array, with column names supplied separately.
shap.summary_plot(
    shap_values,
    features.values,
    feature_names=features.columns,
)