# Does the model travel well?

## Preamble

In [None]:
# Preamble
import pandas as pd
import numpy as np
pd.set_option("mode.chained_assignment", None)
import random
random.seed(1509)
import matplotlib.pyplot as plt
import lightgbm as lgb
import pyarrow.feather as feather
from os import chdir, getcwd
import statsmodels.api as sm
from pprint import pprint
from nested_cv import NestedCV

# sci-kit
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score
from sklearn import tree

In [None]:
# Absolute paths of the input-data and results folders.
# Both end with a slash because file names are appended to them by plain
# string concatenation (e.g. results_dir + 'X.feather').
data_dir, results_dir = (
    '/home/jovyan/work/Data/',
    '/home/jovyan/work/Results/',
)

In [None]:
# Identifier columns: kept to label rows, never passed to a model.
ids = ['reporter.ISO', 'partner.ISO', 'year']

# Outcome columns: the first is used for the `*_out` labels, the second for
# the `*_in` labels.
targets = ['ln.Tot_IFF_t', 'ln.In_Tot_IFF_t']

# Predictor columns used to build every X matrix.
features = [
    'ln.gdp_o', 'ln.gdp_d', 'ln.pop_o', 'ln.pop_d',
    'dist', 'contig',
    'comlang', 'comcol', 'col45',
    'ihs.entry_cost_o', 'ihs.entry_cost_d', 'rta',
    'rCorrCont', 'pCorrCont',
    'rRegQual', 'pRegQual',
    'rRuleLaw', 'pRuleLaw',
    'pSecrecyScore',
    'pFSI.rank',
    'pKFSI13',
    'pKFSI17',
    'pKFSI20',
    'rFATF', 'pFATF',
    'ihs.tariff',
    'kai_o', 'kai_d', 'kao_o', 'kao_d',
    'cc_o', 'cc_d', 'cci_o', 'cci_d', 'cco_o', 'cco_d',
    'di_o', 'di_d', 'dii_o', 'dii_d', 'dio_o', 'dio_d',
]

# Every column a row must have non-missing to enter a sample: identifiers,
# both outcomes, then the predictors. Derived from the lists above so the
# predictor list is written only once and the two cannot drift apart.
select_features = ids + targets + features

In [None]:
def create_smp(data, features):
    """
    Return the complete-case sample of `data` restricted to `features`.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. It is not modified.
    features : list of str
        Labels of the columns to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the `features` columns and only the rows
        that have no missing value in any of those columns. The original
        row index is preserved (it is not reset).
    """
    # Use the non-in-place dropna: calling dropna(inplace=True) on a column
    # subset raises SettingWithCopyWarning in pandas < 3.0 unless that warning
    # is silenced globally, and the result is the same either way.
    return data[features].dropna(axis=0, how='any')

## Samples

### Import full sample

In [None]:
data = feather.read_feather(results_dir + 'Africa_agg.feather')

In [None]:
data_smp = create_smp(data, select_features)

In [None]:
idx = data_smp[ids]
X = data_smp[features]
Y_out = data_smp[['ln.Tot_IFF_t']]
Y_in = data_smp[['ln.In_Tot_IFF_t']]

In [None]:
feather.write_feather(idx, results_dir + 'idx.feather')
feather.write_feather(X, results_dir + 'X.feather')
feather.write_feather(Y_out, results_dir + 'Y_out.feather')
feather.write_feather(Y_in, results_dir + 'Y_in.feather')

In [None]:
print('X: ', X.shape, '\nY_out: ',  Y_out.shape)

### Import training and test sets

In [None]:
# Load the LMIC and HIC files from the results folder.
LMIC_agg = feather.read_feather(results_dir + 'LMIC_agg.feather')
HIC_agg = feather.read_feather(results_dir + 'HIC_agg.feather')

In [None]:
# Restrict each to the identifier, outcome and predictor columns and drop
# every row that has a missing value in any of them.
LMIC_agg_smp = create_smp(LMIC_agg, select_features)
HIC_agg_smp = create_smp(HIC_agg, select_features)

In [None]:
# Load the training and test splits from their feather files.
train_agg = feather.read_feather(results_dir + 'train_agg.feather')
test_agg = feather.read_feather(results_dir + 'test_agg.feather')

In [None]:
# Same complete-case filtering for the training and test splits.
train_agg_smp = create_smp(train_agg, select_features)
test_agg_smp = create_smp(test_agg, select_features)

In [None]:
# Save the filtered splits to the results folder.
feather.write_feather(train_agg_smp, results_dir + 'train_agg_smp.feather')
feather.write_feather(test_agg_smp, results_dir + 'test_agg_smp.feather')

In [None]:
# Report (rows, columns) of the filtered samples.
# NOTE(review): HIC_agg_smp is built above but its shape is not printed here.
print('Training set: ', train_agg_smp.shape, '\nTest set: ',  test_agg_smp.shape, '\nLMIC set: ',  LMIC_agg_smp.shape)

### Create feature set and vector of outcome labels

In [None]:
# Training set
Y_train_out = train_agg_smp[['ln.Tot_IFF_t']]
Y_train_in = train_agg_smp[['ln.In_Tot_IFF_t']]
X_train = train_agg_smp[features]

# Test set
Y_test_out = test_agg_smp[['ln.Tot_IFF_t']]
Y_test_in = test_agg_smp[['ln.In_Tot_IFF_t']]
X_test = test_agg_smp[features]

# LMIC test set
Y_LMIC = LMIC_agg_smp[['ln.Tot_IFF_t']]
X_LMIC = LMIC_agg_smp[features]

# HIC test set
Y_HIC = HIC_agg_smp[['ln.Tot_IFF_t']]
X_HIC = HIC_agg_smp[features]

In [None]:
idx = train_agg_smp[ids]

### Tune Random Forest model

In [None]:
# The hyperparameters were found using a cross-validation randomized search strategy on the training sample X_train
RF_out_tuned = RandomForestRegressor(random_state=1509,
                                     n_estimators=1278,
                                     max_depth=195,
                                     min_samples_split=12,
                                     min_samples_leaf=1,
                                     # 1.0 = consider every feature at each split. This is what
                                     # 'auto' meant for regressors; 'auto' was deprecated in
                                     # scikit-learn 1.1 and removed in 1.3.
                                     max_features=1.0,
                                     bootstrap=True)

### Results on real samples

In [None]:
# Fit tuned model on real training data
# (.values.ravel() flattens the one-column label frame into the 1-D array
# that fit() expects for a single target)
RF_out_tuned.fit(X_train, Y_train_out.values.ravel())

In [None]:
# Training R-squared: the fitted model scored on the rows it was fit on
RF_out_tuned.score(X_train, Y_train_out)

In [None]:
# Test R-squared: the same fitted model scored on the test sample
RF_out_tuned.score(X_test, Y_test_out)

In [None]:
# Reset the estimator for cross_val_score
RF_out_tuned_placebo = RandomForestRegressor(random_state = 1509,
                                             n_estimators = 1278,
                                             max_depth = 195,
                                             min_samples_split = 12,
                                             min_samples_leaf = 1,
                                             max_features = 'auto',
                                             bootstrap = True)

In [None]:
# Cross-validated R-squared on the training sample using 5 folds
# This is the best score reported by RandomizedSearchCV
# This is the one to report because CV doesn't need train/test split
# X_train here is the one used in hyperparameter tuning
# Error here is from held-out test sets so model was not fit on X_train
CV_train_scores = cross_val_score(RF_out_tuned, X_train, Y_train_out.values.ravel())
print('R-squared from each fold (training):', CV_train_scores)

In [None]:
# Mean cross-validated score
CV_train_scores.mean()

### Results on LMIC set

In [None]:
# Fit tuned model on real training data
# (same estimator, data and labels as the fit in the previous section;
# presumably repeated so this section can be run on its own)
RF_out_tuned.fit(X_train, Y_train_out.values.ravel())

In [None]:
# LMIC R-squared: the model fit on the training sample, scored on the LMIC sample
RF_out_tuned.score(X_LMIC, Y_LMIC)

In [None]:
# Cross-validated R-squared on the LMIC sample using 5 folds
# NOTE: cross_val_score fits clones of the estimator on folds of the LMIC
# sample itself, so these scores do not use the training-sample fit above.
CV_scores_LMIC = cross_val_score(RF_out_tuned, X_LMIC, Y_LMIC.values.ravel())
print('R-squared from each fold :', CV_scores_LMIC)

In [None]:
# Mean cross-validated score
CV_scores_LMIC.mean()

### Results on HIC set

In [None]:
# HIC R-squared: RF_out_tuned as last fitted (on X_train), scored on the HIC sample
RF_out_tuned.score(X_HIC, Y_HIC)

In [None]:
# Cross-validated R-squared on the HIC sample using 5 folds
# NOTE: cross_val_score fits clones of the estimator on folds of the HIC
# sample itself, so these scores do not use the training-sample fit.
CV_scores_HIC = cross_val_score(RF_out_tuned, X_HIC, Y_HIC.values.ravel())
print('R-squared from each fold :', CV_scores_HIC)

In [None]:
# Mean cross-validated score
CV_scores_HIC.mean()

### Results on individual countries

#### South Africa

In [None]:
# Import without ZAF
Africa_noZAF = feather.read_feather(results_dir + 'Africa_noZAF.feather')
Africa_noZAF_smp = create_smp(Africa_noZAF, select_features)

# Import ZAF
ZAF = feather.read_feather(results_dir + 'ZAF.feather')
ZAF_smp = create_smp(ZAF, select_features)

# Training and test sets for Africa with no ZAF
Y_Africa_noZAF = Africa_noZAF_smp[['ln.Tot_IFF_t']]
X_Africa_noZAF = Africa_noZAF_smp[features]

# Training and test sets for ZAF
Y_ZAF = ZAF_smp[['ln.Tot_IFF_t']]
X_ZAF = ZAF_smp[features]

In [None]:
# The hyperparameters were found using a cross-validation randomized search strategy on the training sample X_train
RF_out_tuned = RandomForestRegressor(random_state = 1509,
                                     n_estimators = 1278,
                                     max_depth = 195,
                                     min_samples_split = 12,
                                     min_samples_leaf = 1,
                                     max_features = 'auto',
                                     bootstrap = True)

In [None]:
# Fit tuned model on training data without ZAF
RF_out_tuned.fit(X_Africa_noZAF, Y_Africa_noZAF.values.ravel())

In [None]:
# ZAF R-squared
RF_out_tuned.score(X_ZAF, Y_ZAF)

In [None]:
# Cross-validated R-squared for ZAF
CV_scores_ZAF = cross_val_score(RF_out_tuned, X_ZAF, Y_ZAF.values.ravel())
print('R-squared from each fold :', CV_scores_ZAF)

In [None]:
CV_scores_ZAF.mean()

#### Egypt

In [None]:
# Import without EGY
Africa_noEGY = feather.read_feather(results_dir + 'Africa_noEGY.feather')
Africa_noEGY_smp = create_smp(Africa_noEGY, select_features)

# Import EGY
EGY = feather.read_feather(results_dir + 'EGY.feather')
EGY_smp = create_smp(EGY, select_features)

# Training and test sets for Africa with no EGY
Y_Africa_noEGY = Africa_noEGY_smp[['ln.Tot_IFF_t']]
X_Africa_noEGY = Africa_noEGY_smp[features]

# Training and test sets for EGY
Y_EGY = EGY_smp[['ln.Tot_IFF_t']]
X_EGY = EGY_smp[features]

In [None]:
# The hyperparameters were found using a cross-validation randomized search strategy on the training sample X_train
RF_out_tuned = RandomForestRegressor(random_state = 1509,
                                     n_estimators = 1278,
                                     max_depth = 195,
                                     min_samples_split = 12,
                                     min_samples_leaf = 1,
                                     max_features = 'auto',
                                     bootstrap = True)

In [None]:
# Fit tuned model on training data without ZAF
RF_out_tuned.fit(X_Africa_noEGY, Y_Africa_noEGY.values.ravel())

In [None]:
# ZAF R-squared
RF_out_tuned.score(X_EGY, Y_EGY)

In [None]:
# Cross-validated R-squared for EGY
CV_scores_EGY = cross_val_score(RF_out_tuned, X_EGY, Y_EGY.values.ravel())
print('R-squared from each fold :', CV_scores_EGY)

In [None]:
CV_scores_EGY.mean()