> This notebook trains a tabular model to estimate the wealth index across the Philippines.

In [25]:
import os
import pickle

import geopandas as gp
import numpy as np
import pandas as pd
import yaml
from adapt.feature_based import CORAL, FA, SA, fMMD
from adapt.instance_based import (
    KLIEP,
    KMM,
    LDM,
    WANN,
    NearestNeighborsWeighting,
    TrAdaBoostR2,
    TwoStageTrAdaBoostR2,
)
from lightgbm import LGBMRegressor
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import subprocess
from pathlib import Path

## Set Config

In [26]:
# Run configuration for this notebook. Entries that are commented out are
# options that are currently disabled.
config = {
    # local output directory, data repo and bucket URIs
    "save_path": "../data/outputs/ph/model_ph/",
    "repo_path": "../data/SVII_PH_KH_MM_TL",
    "download_gcs_uri": "gs://poverty-mapping/outputs/",
    "output_gcs_uri": "gs://poverty-mapping/outputs/",
    "data_dir": "ph",
    "country": "ph",
    # dataset folder names
    "ookla_folder": "ookla_ph",
    "hdx_folder": "hdx_ph",
    "dhs_folder": "dhs_ph",
    "osm_folder": "osm_ph",
    "dhs_geo_zip_folder": "PHGE71FL",
    "dhs_zip_folder": "PHHR71DT",
    "viirs_folder": "viirs_ph",
    "traintest_folder": "traintest_ph",
    # "crs": "4683",
    # "ookla_feature": "avg_d_mbps",
    # "boundary_file": "phl_adminboundaries_candidate_adm3",
    "year": "2020",
    "quarter": "2",
    # sampling
    "sample": False,
    "random_sample": False,
    "no_samples": 60,
    "random_seed": 42,
    # "clust_rad": 2000,
    # "plot_ookla_features": True,
    # "adm_level": 3,
    # "use_pcode": True,
    # "shape_label": "ADM3_PCODE",
    # "bins": 6,
    # "show_legend": False,
    # feature source toggles
    "use_ookla": True,
    "use_viirs": True,
    "use_osm": True,
    # model abbreviation passed to get_model
    "model_name": "RF",
    # train test seed for reproducibility
    "use_seed": True,
    "train_test_seed": 100,
    "fix_random_state": True,
    # "dc_run_hash": <dc_run_hash>
    "use_filt_clt": False,
    # "filt_filename": '<filt_filename>'
    "pop_thresh": 10,
    "no_folds": 3,
    # should coincide with number of folds, so automate
    "train_test_split_prop": 0.7,
    # domain adaptation
    "run_da": False,
    "da_class": "KMM",
}

## Download training data from cloud bucket

In [27]:
# Create the output directory (and any missing parents).
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create gap of a separate isdir() test followed by makedirs().
save_path = config["save_path"]
os.makedirs(save_path, exist_ok=True)


# dataset_folder_keys = ["traintest_folder"]

# for key in dataset_folder_keys:
#     gcs_download_folder = config['download_gcs_uri'] +  config[key] 
#     save_path = config['save_path']
#     subprocess.call([f'gsutil -m cp -r {gcs_download_folder} {save_path}'], shell=True)

## Download DHS shapefile

In [28]:
# ge71fl_gcs_uri = 'gs://poverty-mapping/SVII_PH_KH_MM_TL/ph/dhs_ph/PHGE71FL/'
# subprocess.call([f'gsutil -m cp -n -r {ge71fl_gcs_uri} {save_path}'], shell=True)

## Define helper functions

In [29]:
def get_model(model_name, fix_random_state=True):
    """Return a new, unfitted regressor for the given model abbreviation
    Args:
        model_name (str): model abbreviation; one of "RF", "XGB", "LGBM", "LR"
        fix_random_state (bool, optional): only used for "RF". When True the
            forest is created with random_state=42 and verbose=50; when False
            it is created with random_state=None. Defaults to True.
    Returns:
        sklearn.estimator: sklearn-style model
    Raises:
        ValueError: if model_name is not one of the supported abbreviations
    """
    if model_name == "RF":
        if fix_random_state:
            model = RandomForestRegressor(
                n_estimators=100, random_state=42, verbose=50
            )
        else:
            model = RandomForestRegressor(n_estimators=100, random_state=None)
    elif model_name == "XGB":
        model = XGBRegressor(n_estimators=100)
    elif model_name == "LGBM":
        model = LGBMRegressor(n_estimators=100)
    elif model_name == "LR":
        model = LinearRegression()
    else:
        # Previously an unknown name fell through to `return model` and
        # surfaced as an opaque UnboundLocalError.
        raise ValueError(
            f"Unknown model_name {model_name!r}; "
            "expected one of 'RF', 'XGB', 'LGBM', 'LR'"
        )

    return model

In [30]:
def path_map(x):
    """Return the pickle path for artifact name ``x`` inside ``save_path``.

    Reads the notebook-level ``save_path`` and ``dhs_geo_zip_folder``
    variables; the file name is "<dhs_geo_zip_folder>_<x>.pkl".
    """
    pickle_filename = f"{dhs_geo_zip_folder}_{x}.pkl"
    return os.path.join(save_path, pickle_filename)

In [31]:
# Smoke check: display the estimator that get_model builds for the "LR"
# abbreviation. The result is not assigned to anything.
get_model('LR')

In [32]:
def get_preds_and_r2(features, labels, train_index, test_index, model=None):
    """Fit a model on one fold and return its test predictions and r2 score
    Args:
        features (pd.DataFrame): dataframe of features
        labels (pd.Series): series of labels
        train_index (list): list of positional train indices
        test_index (list): list of positional test indices
        model (sklearn.estimator, optional): estimator to fit. It is fitted in
            place. Defaults to None, in which case the notebook-level ``model``
            variable is used (the previous, implicit behaviour).
    Returns:
        tuple: (array, float) corresp. to model predictions and r2 score
    Raises:
        NameError: if ``model`` is not passed and no notebook-level ``model``
            variable exists
    """
    if model is None:
        # Backward-compatible fallback for callers that rely on the global.
        try:
            model = globals()["model"]
        except KeyError:
            raise NameError(
                "name 'model' is not defined; pass model=... explicitly"
            ) from None

    train_features = features.iloc[train_index, :]
    test_features = features.iloc[test_index, :]
    train_labels = labels.iloc[train_index]
    test_labels = labels.iloc[test_index]

    model.fit(train_features, train_labels)

    predictions = model.predict(test_features)
    r2 = r2_score(test_labels, predictions)

    return predictions, r2

def get_gt_and_preds(
    features,
    labels,
    train_index,
    test_index,
    model,
    dom_adapt=False,
    adapt_type="KMM",
    n_features_to_select=10,
):
    """Fit on one fold after RFE feature selection and return test labels and predictions
    Args:
        features (pd.DataFrame): dataframe of features
        labels (pd.Series): series of labels
        train_index (list): list of positional train indices
        test_index (list): list of positional test indices
        model (sklearn.estimator): sklearn model; used as the RFE estimator and
            as the (base) estimator that produces the predictions
        dom_adapt (bool, optional): whether to use domain adaptation. Defaults to False.
        adapt_type (str, optional): type of domain adaptation; one of "fMMD",
            "CORAL", "SA", "FA", "NN", "KMM", "KLIEP", "LDM", "TrAdaBoostR2",
            "TwoStageTrAdaBoostR2", "WANN". Only used when dom_adapt is True.
            Defaults to "KMM".
        n_features_to_select (int, optional): number of features kept by RFE.
            Defaults to 10.
    Returns:
        tuple: (array, array) corresp. test labels and model predictions
    Raises:
        ValueError: if dom_adapt is True and adapt_type is not supported
    """
    supported_adapt_types = (
        "fMMD",
        "CORAL",
        "SA",
        "FA",
        "NN",
        "KMM",
        "KLIEP",
        "LDM",
        "TrAdaBoostR2",
        "TwoStageTrAdaBoostR2",
        "WANN",
    )
    # Fail fast: previously an unknown adapt_type only surfaced after the RFE
    # fit, as an UnboundLocalError on `da_model`.
    if dom_adapt and adapt_type not in supported_adapt_types:
        raise ValueError(
            f"Unknown adapt_type {adapt_type!r}; "
            f"expected one of {', '.join(supported_adapt_types)}"
        )

    train_features, test_features = (
        features.iloc[train_index, :],
        features.iloc[test_index, :],
    )
    train_labels, test_labels = labels.iloc[train_index], labels.iloc[test_index]

    # eliminate features: the selector is fitted on the train fold only and
    # then applied to both folds
    selector = RFE(model, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(train_features, train_labels)
    train_features = selector.transform(train_features)
    test_features = selector.transform(test_features)

    if not dom_adapt:
        model.fit(train_features, train_labels)
        predictions = model.predict(test_features)
    else:
        # NOTE: the supervised variants below (FA, TrAdaBoostR2,
        # TwoStageTrAdaBoostR2, WANN) are given the first 10 test rows and
        # their labels as target data; those rows are still part of the
        # predictions returned at the end.

        # feature-based da
        if adapt_type == "fMMD":
            da_model = fMMD(
                model, Xt=test_features, kernel="linear", random_state=0, verbose=0
            )
        elif adapt_type == "CORAL":
            da_model = CORAL(model, Xt=test_features, random_state=0)
        elif adapt_type == "SA":
            da_model = SA(model, Xt=test_features, random_state=0)
        elif adapt_type == "FA":  # supervised method
            da_model = FA(
                model, Xt=test_features[:10], yt=test_labels[:10], random_state=0
            )
        # instance-based da
        elif adapt_type == "NN":
            da_model = NearestNeighborsWeighting(
                model, n_neighbors=5, Xt=test_features, random_state=0
            )
        elif adapt_type == "KMM":
            da_model = KMM(
                model,
                Xt=test_features,
                kernel="rbf",
                gamma=1.0,
                verbose=0,
                random_state=0,
            )
        elif adapt_type == "KLIEP":  # TODO: Troubleshoot nan issue
            da_model = KLIEP(
                model,
                Xt=test_features,
                kernel="rbf",
                gamma=[0.1, 1.0],
                random_state=0,
            )
        elif adapt_type == "LDM":
            da_model = LDM(model, Xt=test_features, random_state=0)
        elif adapt_type == "TrAdaBoostR2":
            da_model = TrAdaBoostR2(
                model,
                n_estimators=10,
                Xt=test_features[:10],
                yt=test_labels[:10],
                random_state=0,
            )
        elif (
            adapt_type == "TwoStageTrAdaBoostR2"
        ):  # TODO: since method signature might differ from TrAdaBoostR2
            da_model = TwoStageTrAdaBoostR2(
                model,
                n_estimators=10,
                Xt=test_features[:10],
                yt=test_labels[:10],
                random_state=0,
            )
        elif adapt_type == "WANN":
            # WANN is constructed without `model`, unlike the other branches
            da_model = WANN(
                Xt=test_features[:10], yt=test_labels[:10], random_state=0
            )

        da_model.fit(train_features, train_labels)
        predictions = da_model.predict(test_features)

    return test_labels, predictions


In [33]:
def map_fold_pairs(idx):
    """Return the distinct group labels on each side of spatial CV fold ``idx``.

    Reads the notebook-level ``adm1_col`` series and the ``train_indices`` /
    ``test_indices`` lists of positional indices.

    Args:
        idx (int): fold offset
    Returns:
        dict: {"train": [...], "test": [...]} with the unique ``adm1_col``
            values found at the fold's train and test positions
    """
    fold_sides = (("train", train_indices), ("test", test_indices))
    return {
        side: list(adm1_col.iloc[fold_indices[idx]].unique())
        for side, fold_indices in fold_sides
    }

## Set filepaths

In [34]:
# Directories that hold the inputs read by the training cell
model_dir = '../data/outputs/ph/model_ph'
traintest_dir = f'{model_dir}/traintest_ph'
dhs_dir = '../data/outputs/ph/dhs_ph'

# pickled train/test inputs
features_path = f'{traintest_dir}/PHGE71FL_features.pkl'
labels_path = f'{traintest_dir}/PHGE71FL_labels.pkl'
data_final_path = f'{traintest_dir}/data_final.pkl'
# DHS csv files
dhs_geo_path = f'{dhs_dir}/PHHR71DT_PHGE71FL_by_cluster.csv'
dhs_base_path = f'{dhs_dir}/PHHR71DT_base.csv'
# DHS shapefile
ge71fl_path = f'{model_dir}/PHGE71FL/PHGE71FL.shp'

In [35]:

dhs_geo_zip_folder = config['dhs_geo_zip_folder']
# extract model params
model_name = config["model_name"]  # abbreviation understood by get_model
fix_random_state = config["fix_random_state"]

print(f"Model chosen is: {model_name}")

# Instantiate through the shared helper instead of repeating its if/elif
# chain here, so the two cannot drift apart.
model = get_model(model_name, fix_random_state=fix_random_state)

print("Loading pickled files...")
features = pd.read_pickle(features_path)
labels = pd.read_pickle(labels_path)

# Hold out 33% of the rows; the split is seeded only when config["use_seed"]
# is set.
# NOTE(review): test_size is hard-coded here and is independent of
# config["train_test_split_prop"] - confirm which one is intended.
split_kwargs = {"test_size": 0.33}
if config["use_seed"]:
    train_test_seed = config["train_test_seed"]
    split_kwargs["random_state"] = train_test_seed
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, **split_kwargs
)

# we save the test features and labels for the evaluation step
# (path_map builds the same file names, as plain strings)
test_artifact_dir = Path(save_path)
test_features_path = test_artifact_dir / f'{dhs_geo_zip_folder}_test_features.pkl'
test_labels_path = test_artifact_dir / f'{dhs_geo_zip_folder}_test_labels.pkl'

# pickle
test_features.to_pickle(test_features_path)
test_labels.to_pickle(test_labels_path)

# CROSS VALIDATION
cv_n_splits = config["no_folds"]
print(f"Performing {cv_n_splits}-fold CV...")
cv = model_selection.KFold(n_splits=cv_n_splits, shuffle=True, random_state=42)

# Score each fold by hand (instead of cross_val_score); only the r2 part of
# the (predictions, r2) pair is kept.
r2_scores = []
for train_index, test_index in cv.split(features.values):
    fold_r2 = get_preds_and_r2(features, labels, train_index, test_index)[1]
    r2_scores.append(fold_r2)
print("loop results: ", r2_scores)

# Same folds scored through sklearn's helper
Rcross = model_selection.cross_val_score(model, features, labels, cv=cv)
print("Cross validation scores are: ", Rcross)
cross_scores = np.asarray(Rcross)
cross_mean = round(cross_scores.mean(), 2)
cross_std = round(cross_scores.std(), 2)
print(f"cv mean: {cross_mean}")
print(f"cv std: {cross_std}")

# SPATIAL CROSS VALIDATION
# Build group-wise folds keyed on the ADM1NAME column of the DHS shapefile,
# then pickle which ADM1 names land in the train/test side of every fold.
data_final = pd.read_pickle(data_final_path)

# also extract geo group labels (i.e. adm level labels)
# NOTE(review): dhs_geo and base are loaded here but not used anywhere else
# in this cell - confirm whether these reads are still needed.
dhs_geo = pd.read_csv(
   dhs_geo_path
)
base = pd.read_csv(dhs_base_path)

# group label
adm_label = "ADM1NAME"
ge71fl = gp.read_file(ge71fl_path)
# perform inner merge between data_final and the ge71fl shapefile attributes;
# no `on=` is given and 'DHSID' is the only column the two frames share
# NOTE(review): the folds below assume the merged rows line up one-to-one with
# the rows of `features`/`labels` - confirm that the inner merge drops nothing.
dhsid_adm1 = data_final[["DHSID"]].merge(ge71fl[["DHSID", adm_label]], how="inner")
group_col_values = dhsid_adm1[adm_label].values

# NOTE(review): the spatial fold count is hard-coded, separately from
# config["no_folds"].
n_sp_splits = 3
group_kfold = model_selection.GroupKFold(n_splits=n_sp_splits)
# Generator for the train/test indices
city_kfold = group_kfold.split(features, labels, group_col_values)
# Create a nested list of train and test indices for each fold
# (train_indices[i] / test_indices[i] belong to fold i)
train_indices, test_indices = [list(traintest) for traintest in zip(*city_kfold)]

# adm1 values corresp. to train_indices, test_indices pairs
# series object; read as a global by map_fold_pairs
adm1_col = dhsid_adm1[adm_label]
# map_fold_pairs (defined in an earlier cell) extracts the spatial cv
# test/train label list pairs



# idx: offset (0 to n_sp_splits - 1)
sp_folds_labels = {f"fold_{idx}": map_fold_pairs(idx) for idx in range(n_sp_splits)}
# pickle dictionary
sp_folds_labels_save_path = os.path.join(save_path, "sp_folds_labels.pkl")
with open(sp_folds_labels_save_path, "wb") as file:
    pickle.dump(sp_folds_labels, file)

# Reuse the precomputed spatial folds as an explicit list of
# (train indices, test indices) pairs.
city_cv = list(zip(train_indices, test_indices))
# TODO: find out of preds correspond to a single model
predictions = model_selection.cross_val_predict(
    estimator=model, X=features, y=labels, cv=city_cv
)
r2_scores_sp = model_selection.cross_val_score(
    estimator=model, X=features, y=labels, cv=city_cv
)
print("sp predictions for all data")
# r2 of the cross-validated predictions against all labels
spatial_cv_r2 = r2_score(y_true=labels, y_pred=predictions)
print("spatial r2: ", spatial_cv_r2)
print("sp r2 scores are: ", r2_scores_sp)


# Start from a fresh estimator. The configured fix_random_state is passed on
# explicitly; previously this call always used get_model's default (True),
# silently ignoring the config.
model = get_model(model_name, fix_random_state=config["fix_random_state"])
# save gt, preds pairs for all folds
print("Pickling spatial cv predictions...")
gt_pred_by_fold = {}
for i, (train_index, test_index) in enumerate(zip(train_indices, test_indices)):
    gt, pred = get_gt_and_preds(features, labels, train_index, test_index, model)
    gt_pred_by_fold[i] = {"ground_truth": gt, "predictions": pred}


spatial_cv_save_path = Path(save_path) / "gt_pred_by_fold.pkl"

with open(spatial_cv_save_path, "wb") as file:
    pickle.dump(gt_pred_by_fold, file)

# if config["run_da"]:

#     # domain adaptation fold pairs
#     print("Performing domain adaptation...")
#     print("This might take a few minutes")
#     print()

#     model = get_model(model_name)
#     print("Pickling spatial cv predictions...")
#     da_gt_pred_by_fold = {}
#     for i, _ in enumerate(train_indices):
#         train_index, test_index = train_indices[i], test_indices[i]
#         gt, pred = get_gt_and_preds(
#             features,
#             labels,
#             train_index,
#             test_index,
#             model,
#             dom_adapt=True,
#             adapt_type=config["da_class"],
#         )
#         da_gt_pred_by_fold[i] = {"ground_truth": gt, "predictions": pred}

#     da_spatial_cv_save_path = os.path.join(save_path, "da_gt_pred_by_fold.pkl")

#     with open(da_spatial_cv_save_path, "wb") as file:
#         pickle.dump(da_gt_pred_by_fold, file)

# TODO: either plot prediction vs gt here in train
#       or save fold number of models for evaluation step...

# TODO: record what the model is and what data is being used to train model
print("Training model on split...")
model.fit(train_features, train_labels)

# let's save the model
save_path = config["save_path"]

# file stem per model abbreviation; anything else is saved as "model.pkl"
pickle_names = {
    "RF": "random_forest",
    "XGB": "xgboost",
    "LGBM": "light",
    "LR": "lr",
}
pickle_name = pickle_names.get(model_name, "model")

model_save_path = os.path.join(save_path, f"{pickle_name}.pkl")

with open(model_save_path, "wb") as file:
    pickle.dump(model, file)




Model chosen is: RF
Loading pickled files...
Performing 3-fold CV...
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
building tree 1 of 100
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
building tree 2 of 100
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
building tree 3 of 100
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
building tree 4 of 100
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
building tree 5 of 100
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
building tree 6 of 100
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
building tree 7 of 100
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.1s remaining:    0.0s
building tree 8 of 100
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.1s remaining:    0.0s
building tree 9 of 100
[Pa

Copying gs://poverty-mapping/outputs/traintest_ph/PHGE71FL_cluster_coords.csv...
Copying gs://poverty-mapping/outputs/traintest_ph/PHGE71FL_cluster_coords_osm_agg.csv...
Copying gs://poverty-mapping/outputs/traintest_ph/PHGE71FL_features.pkl...      
Copying gs://poverty-mapping/outputs/traintest_ph/PHGE71FL_labels.pkl...        
Copying gs://poverty-mapping/outputs/traintest_ph/data_final.pkl...             
Copying gs://poverty-mapping/outputs/traintest_ph/data_labels.csv...            
Copying gs://poverty-mapping/outputs/traintest_ph/ph_2017_viirs_avg_rad_zonal_stats.csv...
- [7/7 files][  1.9 MiB/  1.9 MiB] 100% Done                                    
Operation completed over 7 objects/1.9 MiB.                                      


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
building tree 1 of 100
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
building tree 2 of 100
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
building tree 3 of 100
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
building tree 4 of 100
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
building tree 5 of 100
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
building tree 6 of 100
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
building tree 7 of 100
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
building tree 8 of 100
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
building tree 9 of 100
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining: 

In [36]:
# Display the estimator get_model builds for the configured model_name.
# This is a new, unfitted instance that is not assigned to anything, so the
# trained `model` is left untouched.
get_model(model_name)

## Upload to cloud storage

In [37]:
# Upload every pickle in save_path to "<output_gcs_uri>model_ph/".
# - The destination is read from config["output_gcs_uri"]; the previous code
#   used the download URI for the upload.
# - Files are expanded in Python and passed as an argument list, so no
#   interpolated string goes through a shell.
# - The gsutil exit status is reported instead of being silently discarded.
output_gcs_uri = config['output_gcs_uri']
pkl_files = sorted(str(pkl_path) for pkl_path in Path(save_path).glob("*.pkl"))

if pkl_files:
    upload_status = subprocess.call(
        ["gsutil", "-m", "cp", *pkl_files, f"{output_gcs_uri}model_ph/"]
    )
    if upload_status != 0:
        print(f"gsutil upload failed with exit code {upload_status}")
else:
    print(f"No .pkl files found in {save_path}; nothing to upload.")