# Does the model travel well?

## Preamble

In [1]:
# Preamble
import pandas as pd
import numpy as np
pd.set_option("mode.chained_assignment", None)
import random
random.seed(1509)
import matplotlib.pyplot as plt
import lightgbm as lgb
import pyarrow.feather as feather
from os import chdir, getcwd
import statsmodels.api as sm
from pprint import pprint
from nested_cv import NestedCV

# sci-kit
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score
from sklearn import tree

In [2]:
# Base directories for input data and for intermediate/result files.
# File names are appended with `+` further down, so the trailing slash is required.
data_dir = '/home/jovyan/work/Data/'
results_dir = '/home/jovyan/work/Results/'

In [3]:
# Identifier columns of an observation.
ids = ['reporter.ISO', 'partner.ISO', 'year']

# Outcome columns: 'ln.Tot_IFF_t' is used as the outflow outcome and
# 'ln.In_Tot_IFF_t' as the inflow outcome in the cells below.
targets = ['ln.Tot_IFF_t', 'ln.In_Tot_IFF_t']

# Predictors passed to the models.
features = ['ln.gdp_o', 'ln.gdp_d', 'ln.pop_o', 'ln.pop_d',
            'dist', 'contig',
            'comlang', 'comcol', 'col45',
            'ihs.entry_cost_o', 'ihs.entry_cost_d', 'rta',
            'rCorrCont', 'pCorrCont',
            'rRegQual', 'pRegQual',
            'rRuleLaw', 'pRuleLaw',
            'pSecrecyScore',
            'pFSI.rank',
            'pKFSI13',
            'pKFSI17',
            'pKFSI20',
            'rFATF', 'pFATF',
            'ihs.tariff',
            'kai_o', 'kai_d', 'kao_o', 'kao_d',
            'cc_o', 'cc_d', 'cci_o', 'cci_d', 'cco_o', 'cco_d',
            'di_o', 'di_d', 'dii_o', 'dii_d', 'dio_o', 'dio_d']

# Every column that must be non-missing for a row to enter a sample.
# Derived from the lists above (same content and order as before) so that the
# predictor list is written down only once and cannot drift out of sync.
select_features = ids + targets + features

In [4]:
def create_smp(data, features):
    """
    Create a complete-case sample.

    Parameters
    ----------
    data : pandas.DataFrame
        Source data; it is not modified.
    features : list of str
        Columns to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the columns in `features` and only the rows
        that have no missing value in any of them. The original row index
        labels are preserved.
    """
    # dropna() returns a new frame, so nothing is mutated in place on a slice
    # of `data` (the original relied on the chained-assignment warning being
    # switched off).
    return data[features].dropna(axis=0, how='any')

## Samples

### Import full sample

In [5]:
# Load the full sample from the results directory
data = feather.read_feather(results_dir + 'Africa_agg.feather')

In [6]:
# Keep the ids, outcomes and predictors, and drop every row with a missing value
data_smp = create_smp(data, select_features)

In [7]:
# Identifier columns of the complete-case full sample
idx = data_smp[ids]
# Feature matrix
X = data_smp[features]
# Outcomes, kept as one-column DataFrames: outflows (Y_out) and inflows (Y_in)
Y_out = data_smp[['ln.Tot_IFF_t']]
Y_in = data_smp[['ln.In_Tot_IFF_t']]

In [8]:
# Persist the ids, features and both outcomes of the full sample to the results directory
feather.write_feather(idx, results_dir + 'idx.feather')
feather.write_feather(X, results_dir + 'X.feather')
feather.write_feather(Y_out, results_dir + 'Y_out.feather')
feather.write_feather(Y_in, results_dir + 'Y_in.feather')

In [9]:
print('X: ', X.shape, '\nY_out: ',  Y_out.shape)

X:  (5333, 42) 
Y_out:  (5333, 1)


### Import training and test sets

In [10]:
# Load the LMIC and HIC samples from the results directory
LMIC_agg = feather.read_feather(results_dir + 'LMIC_agg.feather')
HIC_agg = feather.read_feather(results_dir + 'HIC_agg.feather')

In [11]:
# Restrict both samples to the selected columns and to complete rows
LMIC_agg_smp = create_smp(LMIC_agg, select_features)
HIC_agg_smp = create_smp(HIC_agg, select_features)

In [12]:
# Load the previously saved training and test sets
train_agg = feather.read_feather(results_dir + 'train_agg.feather')
test_agg = feather.read_feather(results_dir + 'test_agg.feather')

In [13]:
# Restrict the training and test sets to the selected columns and to complete rows
train_agg_smp = create_smp(train_agg, select_features)
test_agg_smp = create_smp(test_agg, select_features)

In [14]:
# Persist the complete-case training and test samples to the results directory
feather.write_feather(train_agg_smp, results_dir + 'train_agg_smp.feather')
feather.write_feather(test_agg_smp, results_dir + 'test_agg_smp.feather')

In [15]:
print('Training set: ', train_agg_smp.shape, '\nTest set: ',  test_agg_smp.shape, '\nLMIC set: ',  LMIC_agg_smp.shape)

Training set:  (4256, 47) 
Test set:  (1077, 47) 
LMIC set:  (9668, 47)


### Create feature set and vector of outcome labels

In [85]:
def _outcomes_and_features(smp):
    """Return (outflow outcome, inflow outcome, feature matrix) of a sample.

    Both outcomes are returned as one-column DataFrames.
    """
    return smp[['ln.Tot_IFF_t']], smp[['ln.In_Tot_IFF_t']], smp[features]


# Training set
Y_train_out, Y_train_in, X_train = _outcomes_and_features(train_agg_smp)

# Test set
Y_test_out, Y_test_in, X_test = _outcomes_and_features(test_agg_smp)

# LMIC sample
Y_out_LMIC, Y_in_LMIC, X_LMIC = _outcomes_and_features(LMIC_agg_smp)

# HIC sample
Y_out_HIC, Y_in_HIC, X_HIC = _outcomes_and_features(HIC_agg_smp)

In [17]:
# Identifier columns of the training sample.
# Note: this rebinds `idx`, which earlier held the ids of the full sample.
idx = train_agg_smp[ids]

### Tune Random Forest model

In [18]:
# The hyperparameters were found using a cross-validation randomized search strategy on the training sample X_train
RF_out_tuned = RandomForestRegressor(
    random_state=1509,
    n_estimators=1278,
    max_depth=195,
    min_samples_split=12,
    min_samples_leaf=1,
    # 1.0 = consider every feature at each split. This is what 'auto' meant for
    # regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
    max_features=1.0,
    bootstrap=True,
)

### Results on real samples

In [19]:
# Fit tuned model on real training data
RF_out_tuned.fit(X_train, Y_train_out.values.ravel())

RandomForestRegressor(max_depth=195, min_samples_split=12, n_estimators=1278,
                      random_state=1509)

In [20]:
# Training R-squared
RF_out_tuned.score(X_train, Y_train_out)

0.8918552097958046

In [21]:
# Test R-squared
RF_out_tuned.score(X_test, Y_test_out)

0.7081047138987878

In [22]:
# Reset the estimator for cross_val_score
RF_out_tuned = RandomForestRegressor(
    random_state=1509,
    n_estimators=1278,
    max_depth=195,
    min_samples_split=12,
    min_samples_leaf=1,
    # 1.0 = consider every feature at each split. This is what 'auto' meant for
    # regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
    max_features=1.0,
    bootstrap=True,
)

In [23]:
# Cross-validated R-squared on the training sample using 5 folds
# This is the best score reported by RandomizedSearchCV
# This is the one to report because CV doesn't need train/test split
# X_train here is the one used in hyperparameter tuning
# Error here is from held-out test sets so model was not fit on X_train
CV_train_scores = cross_val_score(RF_out_tuned, X_train, Y_train_out.values.ravel())
print('R-squared from each fold (training):', CV_train_scores)

R-squared from each fold (training): [0.64624182 0.67827449 0.67677421 0.6886621  0.69110983]


In [24]:
# Mean cross-validated score
CV_train_scores.mean()

0.6762124915340071

### Results on LMIC set (outflows)

In [25]:
# Fit tuned model on real training data
RF_out_tuned.fit(X_train, Y_train_out.values.ravel())

RandomForestRegressor(max_depth=195, min_samples_split=12, n_estimators=1278,
                      random_state=1509)

In [26]:
# LMIC R-squared
RF_out_tuned.score(X_LMIC, Y_out_LMIC)

0.6018906625061435

In [27]:
# Cross-validated R-squared on the LMIC sample using 5 folds
CV_scores_out_LMIC = cross_val_score(RF_out_tuned, X_LMIC, Y_out_LMIC.values.ravel())
print('R-squared from each fold :', CV_scores_out_LMIC)

R-squared from each fold : [0.40623784 0.19606454 0.50883527 0.32490603 0.47968467]


In [28]:
# Mean cross-validated score
CV_scores_out_LMIC.mean()

0.3831456695816001

### Results on LMIC set (inflows)

In [76]:
# Fresh estimator for the inflow outcome, with the same tuned hyperparameters
RF_in_tuned = RandomForestRegressor(
    random_state=1509,
    n_estimators=1278,
    max_depth=195,
    min_samples_split=12,
    min_samples_leaf=1,
    # 1.0 = consider every feature at each split. This is what 'auto' meant for
    # regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
    max_features=1.0,
    bootstrap=True,
)

In [77]:
# Fit tuned model on real training data
RF_in_tuned.fit(X_train, Y_train_in.values.ravel())

RandomForestRegressor(max_depth=195, min_samples_split=12, n_estimators=1278,
                      random_state=1509)

In [86]:
# LMIC R-squared
RF_in_tuned.score(X_LMIC, Y_in_LMIC)

0.556747215488813

In [87]:
# Cross-validated R-squared on the LMIC sample using 5 folds
CV_scores_in_LMIC = cross_val_score(RF_in_tuned, X_LMIC, Y_in_LMIC.values.ravel())
print('R-squared from each fold :', CV_scores_in_LMIC)

R-squared from each fold : [0.4834182  0.50600178 0.5637943  0.47190186 0.55095169]


In [80]:
# Mean cross-validated score
# NOTE(review): the stored output of this cell is identical to the LMIC outflow
# mean and does not match the inflow fold scores printed by the previous cell
# (those average to roughly 0.515). The cell was executed before the folds were
# recomputed and should be re-run.
CV_scores_in_LMIC.mean()

0.3831456695816001

### Results on HIC set (outflows)

In [22]:
# Reset the estimator
RF_out_tuned = RandomForestRegressor(
    random_state=1509,
    n_estimators=1278,
    max_depth=195,
    min_samples_split=12,
    min_samples_leaf=1,
    # 1.0 = consider every feature at each split. This is what 'auto' meant for
    # regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
    max_features=1.0,
    bootstrap=True,
)

In [29]:
# The estimator was re-created just above and is therefore unfitted; calling
# .score() on it raises NotFittedError when the notebook is run top to bottom.
# Fit it on the real training data first, as is done for the LMIC evaluation.
RF_out_tuned.fit(X_train, Y_train_out.values.ravel())

# HIC R-squared
RF_out_tuned.score(X_HIC, Y_out_HIC)

0.5442467500220617

In [30]:
# Cross-validated R-squared on the HIC sample using 5 folds
CV_scores_out_HIC = cross_val_score(RF_out_tuned, X_HIC, Y_out_HIC.values.ravel())
print('R-squared from each fold :', CV_scores_out_HIC)

R-squared from each fold : [0.65855662 0.66300172 0.54843668 0.51804504 0.64355374]


In [31]:
# Mean cross-validated score
CV_scores_out_HIC.mean()

0.6063187597435487

### Results on HIC set (inflows)

In [88]:
# Reset the estimator
RF_in_tuned = RandomForestRegressor(
    random_state=1509,
    n_estimators=1278,
    max_depth=195,
    min_samples_split=12,
    min_samples_leaf=1,
    # 1.0 = consider every feature at each split. This is what 'auto' meant for
    # regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
    max_features=1.0,
    bootstrap=True,
)

In [89]:
# Fit tuned model on real training data
RF_in_tuned.fit(X_train, Y_train_in.values.ravel())

RandomForestRegressor(max_depth=195, min_samples_split=12, n_estimators=1278,
                      random_state=1509)

In [None]:
# HIC R-squared computed explicitly from the predictions; it should equal the
# value returned by RF_in_tuned.score(X_HIC, Y_in_HIC).
# Fixes: the estimator is named RF_in_tuned (RF_in_tune does not exist), and
# r2_score expects (y_true, y_pred) in that order; R-squared is not symmetric.
test = RF_in_tuned.predict(X_HIC)
r2_score(Y_in_HIC, test)

In [90]:
# HIC R-squared
RF_in_tuned.score(X_HIC, Y_in_HIC)

0.41827836617468195

In [91]:
# Cross-validated R-squared on the HIC sample using 5 folds
CV_scores_in_HIC = cross_val_score(RF_in_tuned, X_HIC, Y_in_HIC.values.ravel())
print('R-squared from each fold :', CV_scores_in_HIC)

R-squared from each fold : [0.63386736 0.61708207 0.61974949 0.52931547 0.54927167]


In [92]:
# Mean cross-validated score
CV_scores_in_HIC.mean()

0.5898572126667652

### Results on individual countries

#### South Africa outflows

In [32]:
# Import the sample without ZAF and keep complete rows only
Africa_noZAF = feather.read_feather(results_dir + 'Africa_noZAF.feather')
Africa_noZAF_smp = create_smp(Africa_noZAF, select_features)

# Import the ZAF sample and keep complete rows only
ZAF = feather.read_feather(results_dir + 'ZAF.feather')
ZAF_smp = create_smp(ZAF, select_features)

# Outflow outcome and feature matrix for Africa without ZAF (the model is fitted on these)
Y_Africa_noZAF = Africa_noZAF_smp[['ln.Tot_IFF_t']]
X_Africa_noZAF = Africa_noZAF_smp[features]

# Outflow outcome and feature matrix for ZAF (the model is evaluated on these)
Y_ZAF = ZAF_smp[['ln.Tot_IFF_t']]
X_ZAF = ZAF_smp[features]

In [33]:
# The hyperparameters were found using a cross-validation randomized search strategy on the training sample X_train
RF_out_tuned = RandomForestRegressor(
    random_state=1509,
    n_estimators=1278,
    max_depth=195,
    min_samples_split=12,
    min_samples_leaf=1,
    # 1.0 = consider every feature at each split. This is what 'auto' meant for
    # regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
    max_features=1.0,
    bootstrap=True,
)

In [34]:
# Fit tuned model on training data without ZAF
RF_out_tuned.fit(X_Africa_noZAF, Y_Africa_noZAF.values.ravel())

RandomForestRegressor(max_depth=195, min_samples_split=12, n_estimators=1278,
                      random_state=1509)

In [35]:
# ZAF R-squared
RF_out_tuned.score(X_ZAF, Y_ZAF)

-1.625300992707332

In [36]:
# Cross-validated R-squared for ZAF
CV_scores_ZAF = cross_val_score(RF_out_tuned, X_ZAF, Y_ZAF.values.ravel())
print('R-squared from each fold :', CV_scores_ZAF)

R-squared from each fold : [0.80488705 0.88518689 0.92474492 0.91204537 0.86323192]


In [37]:
CV_scores_ZAF.mean()

0.8780192320819709

#### Egypt outflows

In [38]:
# Import the sample without EGY and keep complete rows only
Africa_noEGY = feather.read_feather(results_dir + 'Africa_noEGY.feather')
Africa_noEGY_smp = create_smp(Africa_noEGY, select_features)

# Import the EGY sample and keep complete rows only
EGY = feather.read_feather(results_dir + 'EGY.feather')
EGY_smp = create_smp(EGY, select_features)

# Outflow outcome and feature matrix for Africa without EGY (the model is fitted on these)
Y_Africa_noEGY = Africa_noEGY_smp[['ln.Tot_IFF_t']]
X_Africa_noEGY = Africa_noEGY_smp[features]

# Outflow outcome and feature matrix for EGY (the model is evaluated on these)
Y_EGY = EGY_smp[['ln.Tot_IFF_t']]
X_EGY = EGY_smp[features]

In [39]:
# The hyperparameters were found using a cross-validation randomized search strategy on the training sample X_train
RF_out_tuned = RandomForestRegressor(
    random_state=1509,
    n_estimators=1278,
    max_depth=195,
    min_samples_split=12,
    min_samples_leaf=1,
    # 1.0 = consider every feature at each split. This is what 'auto' meant for
    # regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
    max_features=1.0,
    bootstrap=True,
)

In [40]:
# Fit tuned model on training data without EGY
RF_out_tuned.fit(X_Africa_noEGY, Y_Africa_noEGY.values.ravel())

RandomForestRegressor(max_depth=195, min_samples_split=12, n_estimators=1278,
                      random_state=1509)

In [41]:
# EGY R-squared
RF_out_tuned.score(X_EGY, Y_EGY)

0.3425570448698736

In [42]:
# Cross-validated R-squared for EGY
CV_scores_EGY = cross_val_score(RF_out_tuned, X_EGY, Y_EGY.values.ravel())
print('R-squared from each fold :', CV_scores_EGY)

R-squared from each fold : [0.50547577 0.38975005 0.72135843 0.76685198 0.48261659]


In [43]:
CV_scores_EGY.mean()

0.5732105632835115

#### Nigeria inflows

In [45]:
# Import the sample without NGA and keep complete rows only
Africa_noNGA = feather.read_feather(results_dir + 'Africa_noNGA.feather')
Africa_noNGA_smp = create_smp(Africa_noNGA, select_features)

# Import the NGA sample and keep complete rows only
NGA = feather.read_feather(results_dir + 'NGA.feather')
NGA_smp = create_smp(NGA, select_features)

# Inflow outcome and feature matrix for Africa without NGA (the model is fitted on these)
Y_Africa_noNGA = Africa_noNGA_smp[['ln.In_Tot_IFF_t']]
X_Africa_noNGA = Africa_noNGA_smp[features]

# Inflow outcome and feature matrix for NGA (the model is evaluated on these)
Y_NGA = NGA_smp[['ln.In_Tot_IFF_t']]
X_NGA = NGA_smp[features]

In [46]:
# The hyperparameters were found using a cross-validation randomized search strategy on the training sample X_train
# Note: despite its name, this estimator is fitted on the inflow outcome in this section.
RF_out_tuned = RandomForestRegressor(
    random_state=1509,
    n_estimators=1278,
    max_depth=195,
    min_samples_split=12,
    min_samples_leaf=1,
    # 1.0 = consider every feature at each split. This is what 'auto' meant for
    # regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
    max_features=1.0,
    bootstrap=True,
)

In [47]:
# Fit tuned model on training data without NGA
RF_out_tuned.fit(X_Africa_noNGA, Y_Africa_noNGA.values.ravel())

RandomForestRegressor(max_depth=195, min_samples_split=12, n_estimators=1278,
                      random_state=1509)

In [48]:
# NGA R-squared
RF_out_tuned.score(X_NGA, Y_NGA)

0.2785436385753627

In [49]:
# Cross-validated R-squared for NGA
CV_scores_NGA = cross_val_score(RF_out_tuned, X_NGA, Y_NGA.values.ravel())
print('R-squared from each fold :', CV_scores_NGA)

R-squared from each fold : [0.32530725 0.18508736 0.47920391 0.10461769 0.02040674]


In [50]:
CV_scores_NGA.mean()

0.2229245913162742

#### Algeria inflows

In [51]:
# Import the sample without DZA and keep complete rows only
Africa_noDZA = feather.read_feather(results_dir + 'Africa_noDZA.feather')
Africa_noDZA_smp = create_smp(Africa_noDZA, select_features)

# Import the DZA sample and keep complete rows only
DZA = feather.read_feather(results_dir + 'DZA.feather')
DZA_smp = create_smp(DZA, select_features)

# Inflow outcome and feature matrix for Africa without DZA (the model is fitted on these)
Y_Africa_noDZA = Africa_noDZA_smp[['ln.In_Tot_IFF_t']]
X_Africa_noDZA = Africa_noDZA_smp[features]

# Inflow outcome and feature matrix for DZA (the model is evaluated on these)
Y_DZA = DZA_smp[['ln.In_Tot_IFF_t']]
X_DZA = DZA_smp[features]

In [52]:
# The hyperparameters were found using a cross-validation randomized search strategy on the training sample X_train
# Note: despite its name, this estimator is fitted on the inflow outcome in this section.
RF_out_tuned = RandomForestRegressor(
    random_state=1509,
    n_estimators=1278,
    max_depth=195,
    min_samples_split=12,
    min_samples_leaf=1,
    # 1.0 = consider every feature at each split. This is what 'auto' meant for
    # regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
    max_features=1.0,
    bootstrap=True,
)

In [53]:
# Fit tuned model on training data without DZA
RF_out_tuned.fit(X_Africa_noDZA, Y_Africa_noDZA.values.ravel())

RandomForestRegressor(max_depth=195, min_samples_split=12, n_estimators=1278,
                      random_state=1509)

In [54]:
# DZA R-squared
RF_out_tuned.score(X_DZA, Y_DZA)

0.34809948394153345

In [55]:
# Cross-validated R-squared for DZA
CV_scores_DZA = cross_val_score(RF_out_tuned, X_DZA, Y_DZA.values.ravel())
print('R-squared from each fold :', CV_scores_DZA)

R-squared from each fold : [0.57658982 0.62844979 0.52698586 0.24954457 0.39537351]


In [56]:
CV_scores_DZA.mean()

0.47538870998516114

#### Angola outflows

In [60]:
# Import the sample without AGO and keep complete rows only
Africa_noAGO = feather.read_feather(results_dir + 'Africa_noAGO.feather')
Africa_noAGO_smp = create_smp(Africa_noAGO, select_features)

# Import the AGO sample and keep complete rows only
AGO = feather.read_feather(results_dir + 'AGO.feather')
AGO_smp = create_smp(AGO, select_features)

# Outflow outcome and feature matrix for Africa without AGO (the model is fitted on these)
Y_Africa_noAGO = Africa_noAGO_smp[['ln.Tot_IFF_t']]
X_Africa_noAGO = Africa_noAGO_smp[features]

# Outflow outcome and feature matrix for AGO (the model is evaluated on these)
Y_AGO = AGO_smp[['ln.Tot_IFF_t']]
X_AGO = AGO_smp[features]

In [61]:
# The hyperparameters were found using a cross-validation randomized search strategy on the training sample X_train
RF_out_tuned = RandomForestRegressor(
    random_state=1509,
    n_estimators=1278,
    max_depth=195,
    min_samples_split=12,
    min_samples_leaf=1,
    # 1.0 = consider every feature at each split. This is what 'auto' meant for
    # regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
    max_features=1.0,
    bootstrap=True,
)

In [62]:
# Fit tuned model on training data without AGO
RF_out_tuned.fit(X_Africa_noAGO, Y_Africa_noAGO.values.ravel())

RandomForestRegressor(max_depth=195, min_samples_split=12, n_estimators=1278,
                      random_state=1509)

In [63]:
# AGO R-squared
RF_out_tuned.score(X_AGO, Y_AGO)

-0.38009779591092774

In [64]:
# Cross-validated R-squared for AGO
CV_scores_AGO = cross_val_score(RF_out_tuned, X_AGO, Y_AGO.values.ravel())
print('R-squared from each fold :', CV_scores_AGO)

R-squared from each fold : [-1.19720059 -0.42828461 -0.59817001 -0.54257651  0.16855888]


In [65]:
CV_scores_AGO.mean()

-0.5195345692400766

#### Morocco inflows

In [67]:
# Import the sample without MAR and keep complete rows only
Africa_noMAR = feather.read_feather(results_dir + 'Africa_noMAR.feather')
Africa_noMAR_smp = create_smp(Africa_noMAR, select_features)

# Import the MAR sample and keep complete rows only
MAR = feather.read_feather(results_dir + 'MAR.feather')
MAR_smp = create_smp(MAR, select_features)

# Inflow outcome and feature matrix for Africa without MAR (the model is fitted on these)
Y_Africa_noMAR = Africa_noMAR_smp[['ln.In_Tot_IFF_t']]
X_Africa_noMAR = Africa_noMAR_smp[features]

# Inflow outcome and feature matrix for MAR (the model is evaluated on these)
Y_MAR = MAR_smp[['ln.In_Tot_IFF_t']]
X_MAR = MAR_smp[features]

In [68]:
# The hyperparameters were found using a cross-validation randomized search strategy on the training sample X_train
# Note: despite its name, this estimator is fitted on the inflow outcome in this section.
RF_out_tuned = RandomForestRegressor(
    random_state=1509,
    n_estimators=1278,
    max_depth=195,
    min_samples_split=12,
    min_samples_leaf=1,
    # 1.0 = consider every feature at each split. This is what 'auto' meant for
    # regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
    max_features=1.0,
    bootstrap=True,
)

In [69]:
# Fit tuned model on training data without MAR
RF_out_tuned.fit(X_Africa_noMAR, Y_Africa_noMAR.values.ravel())

RandomForestRegressor(max_depth=195, min_samples_split=12, n_estimators=1278,
                      random_state=1509)

In [54]:
# MAR R-squared
# NOTE(review): the stored outputs of this cell and of the two MAR cells that
# follow are identical to the DZA results and carry the DZA execution counts
# (54-56), so they look like leftovers of a copy-paste. Re-run these cells to
# obtain the actual MAR figures.
RF_out_tuned.score(X_MAR, Y_MAR)

0.34809948394153345

In [55]:
# Cross-validated R-squared for MAR
CV_scores_MAR = cross_val_score(RF_out_tuned, X_MAR, Y_MAR.values.ravel())
print('R-squared from each fold :', CV_scores_MAR)

R-squared from each fold : [0.57658982 0.62844979 0.52698586 0.24954457 0.39537351]


In [56]:
CV_scores_MAR.mean()

0.47538870998516114