# Predictions

## Data wrangling

In [2]:
# Preamble
import pandas as pd
import numpy as np
pd.set_option("mode.chained_assignment", None)
import random
random.seed(1509)
import matplotlib.pyplot as plt
import lightgbm as lgb
import pyarrow.feather as feather
from os import chdir, getcwd
import statsmodels.api as sm
from pprint import pprint
from nested_cv import NestedCV

# sci-kit
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score
from sklearn import tree

In [3]:
data_dir = '/home/jovyan/work/Data/'
results_dir = '/home/jovyan/work/Results/'

In [4]:
select_features = ['reporter.ISO', 'partner.ISO', 'year',
                   'ln.Tot_IFF_t', 'ln.In_Tot_IFF_t',
                   'ln.gdp_o', 'ln.gdp_d', 'ln.pop_o', 'ln.pop_d', 
                   'dist', 'contig', 
                   'comlang', 'comcol', 'col45', 
                   'ihs.entry_cost_o', 'ihs.entry_cost_d', 'rta',
                   'rCorrCont', 'pCorrCont',
                   'rRegQual', 'pRegQual', 
                   'rRuleLaw', 'pRuleLaw',
                   'pSecrecyScore',
                   'pFSI.rank',
                   'pKFSI13',
                   'pKFSI17',
                   'pKFSI20',
                   'rFATF', 'pFATF',
                   'ihs.tariff',
                   'kai_o', 'kai_d', 'kao_o', 'kao_d',
                   'cc_o', 'cc_d', 'cci_o', 'cci_d', 'cco_o', 'cco_d',
                   'di_o', 'di_d', 'dii_o', 'dii_d', 'dio_o', 'dio_d']

features = [       'ln.gdp_o', 'ln.gdp_d', 'ln.pop_o', 'ln.pop_d', 
                   'dist', 'contig', 
                   'comlang', 'comcol', 'col45', 
                   'ihs.entry_cost_o', 'ihs.entry_cost_d', 'rta',
                   'rCorrCont', 'pCorrCont',
                   'rRegQual', 'pRegQual', 
                   'rRuleLaw', 'pRuleLaw',
                   'pSecrecyScore',
                   'pFSI.rank',
                   'pKFSI13',
                   'pKFSI17',
                   'pKFSI20',
                   'rFATF', 'pFATF',
                   'ihs.tariff',
                   'kai_o', 'kai_d', 'kao_o', 'kao_d',
                   'cc_o', 'cc_d', 'cci_o', 'cci_d', 'cco_o', 'cco_d',
                   'di_o', 'di_d', 'dii_o', 'dii_d', 'dio_o', 'dio_d']

ids = [       'reporter.ISO', 'partner.ISO', 'year']

## Subset sample

In [5]:
data = feather.read_feather(results_dir + 'Africa_agg.feather')

In [6]:
def create_smp(data, features):
    """
    Create train and test samples that are complete.
    """
    smp = data[features]
    smp.dropna(axis=0, how='any', inplace=True)
    return smp

In [7]:
data_smp = create_smp(data, select_features)

In [8]:
idx = data_smp[ids]
X = data_smp[features]
# Y_out = data_smp[['ln.Tot_IFF_t']]
Y_in = data_smp[['ln.In_Tot_IFF_t']]

In [51]:
feather.write_feather(idx, results_dir + 'idx.feather')
feather.write_feather(X, results_dir + 'X.feather')
# feather.write_feather(Y_out, results_dir + 'Y_out.feather')
feather.write_feather(Y_in, results_dir + 'Y_in.feather')

In [50]:
print('X: ', X.shape)
print('Y_in: ', Y_in.shape)
print('idx: ', idx.shape)

X:  (5333, 42)
Y_out:  (5333, 1)
idx:  (4256, 3)


### Train/test split

In [11]:
train_agg = feather.read_feather(results_dir + 'train_agg.feather')
test_agg = feather.read_feather(results_dir + 'test_agg.feather')

In [12]:
train_agg_smp = create_smp(train_agg, select_features)
test_agg_smp = create_smp(test_agg, select_features)

In [13]:
feather.write_feather(train_agg_smp, results_dir + 'train_agg_smp.feather')
feather.write_feather(test_agg_smp, results_dir + 'test_agg_smp.feather')

In [14]:
# Y_train_out = train_agg_smp[['ln.Tot_IFF_t']]
Y_train_in = train_agg_smp[['ln.In_Tot_IFF_t']]
X_train = train_agg_smp[features]
# Y_test_out = test_agg_smp[['ln.Tot_IFF_t']]
Y_test_in = test_agg_smp[['ln.In_Tot_IFF_t']]
X_test = test_agg_smp[features]

In [48]:
feather.write_feather(X_train, results_dir + 'X_train_in.feather')
feather.write_feather(Y_train_out, results_dir + 'Y_train_in.feather')

In [15]:
print('X_train: ', X_train.shape, '\nX_test: ',  X_test.shape)

X_train:  (4256, 42) 
X_test:  (1077, 42)


In [49]:
idx = train_agg_smp[ids]

## Random Forests

### Fit baseline random forests regression

In [11]:
RF_0_mod_in = RandomForestRegressor(random_state = 1509)
RF_0_mod_in.fit(X, Y_in.values.ravel())

### Tune hyperparameters with randomized search

In [17]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 3000, num = 100)]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 500, num = 100)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [int(x) for x in np.linspace(2, 50, num = 10)]

# Minimum number of samples required at each leaf node
min_samples_leaf = [int(x) for x in np.linspace(1, 100, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'max_features': max_features,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [10, 40, 70, 100, 130, 161, 191, 221, 251, 281, 312, 342, 372, 402, 432, 463, 493, 523, 553, 583, 614, 644, 674, 704, 734, 765, 795, 825, 855, 885, 916, 946, 976, 1006, 1036, 1067, 1097, 1127, 1157, 1187, 1218, 1248, 1278, 1308, 1338, 1369, 1399, 1429, 1459, 1489, 1520, 1550, 1580, 1610, 1640, 1671, 1701, 1731, 1761, 1791, 1822, 1852, 1882, 1912, 1942, 1973, 2003, 2033, 2063, 2093, 2124, 2154, 2184, 2214, 2244, 2275, 2305, 2335, 2365, 2395, 2426, 2456, 2486, 2516, 2546, 2577, 2607, 2637, 2667, 2697, 2728, 2758, 2788, 2818, 2848, 2879, 2909, 2939, 2969, 3000], 'max_depth': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150, 155, 160, 165, 170, 175, 180, 185, 190, 195, 200, 205, 210, 215, 220, 225, 230, 235, 240, 245, 250, 255, 260, 265, 270, 275, 280, 285, 290, 295, 300, 305, 310, 315, 320, 325, 330, 335, 340, 345, 350, 355, 360, 365, 370, 375, 380, 385, 390, 395, 400, 405, 410, 415, 420, 425

In [20]:
# Create the base model to tune
RF_0_mod_out = RandomForestRegressor(random_state = 1509)

# Random search of parameters on base model using 3 fold cross validation 
# Search across 100 different combinations, and use all available cores
RF_random_out = RandomizedSearchCV(random_state = 1509,
                                   estimator = RF_0_mod_out, 
                                   param_distributions = random_grid,
                                   scoring = 'r2',
                                   n_iter = 100,
                                   verbose = 3, n_jobs = -1)

In [21]:
# Fit the random search model
RF_random_out.fit(X_train, Y_train_in.values.ravel())

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 2/5] END bootstrap=True, max_depth=475, max_features=sqrt, min_samples_leaf=45, min_samples_split=12, n_estimators=372;, score=0.530 total time=   2.1s
[CV 3/5] END bootstrap=True, max_depth=475, max_features=sqrt, min_samples_leaf=45, min_samples_split=12, n_estimators=372;, score=0.515 total time=   2.0s
[CV 5/5] END bootstrap=True, max_depth=475, max_features=sqrt, min_samples_leaf=45, min_samples_split=12, n_estimators=372;, score=0.523 total time=   2.0s
[CV 2/5] END bootstrap=True, max_depth=195, max_features=auto, min_samples_leaf=1, min_samples_split=12, n_estimators=1278;, score=0.678 total time=  56.0s
[CV 4/5] END bootstrap=True, max_depth=195, max_features=auto, min_samples_leaf=1, min_samples_split=12, n_estimators=1278;, score=0.689 total time=  57.5s
[CV 1/5] END bootstrap=False, max_depth=430, max_features=auto, min_samples_leaf=78, min_samples_split=2, n_estimators=2093;, score=0.460 total time= 1.2min


RandomizedSearchCV(estimator=RandomForestRegressor(random_state=1509),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [5, 10, 15, 20, 25, 30, 35,
                                                      40, 45, 50, 55, 60, 65,
                                                      70, 75, 80, 85, 90, 95,
                                                      100, 105, 110, 115, 120,
                                                      125, 130, 135, 140, 145,
                                                      150, ...],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 12, 23, 34, 45,
                                                             56, 67, 78, 89,
                                                             100],
                                        'min_samples_split': 

[CV 5/5] END bootstrap=False, max_depth=500, max_features=auto, min_samples_leaf=89, min_samples_split=50, n_estimators=2788;, score=0.497 total time= 1.4min
[CV 3/5] END bootstrap=True, max_depth=110, max_features=auto, min_samples_leaf=100, min_samples_split=39, n_estimators=2214;, score=0.495 total time=  40.8s
[CV 5/5] END bootstrap=True, max_depth=110, max_features=auto, min_samples_leaf=100, min_samples_split=39, n_estimators=2214;, score=0.512 total time=  40.3s
[CV 2/5] END bootstrap=True, max_depth=460, max_features=auto, min_samples_leaf=89, min_samples_split=2, n_estimators=1218;, score=0.508 total time=  22.7s
[CV 4/5] END bootstrap=True, max_depth=460, max_features=auto, min_samples_leaf=89, min_samples_split=2, n_estimators=1218;, score=0.551 total time=  23.4s
[CV 1/5] END bootstrap=True, max_depth=470, max_features=auto, min_samples_leaf=89, min_samples_split=23, n_estimators=644;, score=0.488 total time=  12.1s
[CV 3/5] END bootstrap=True, max_depth=470, max_features=a

In [115]:
# Mean cross-validated score of the best_estimator (5-fold cross-validation by default)
print('Best Score: %s' % RF_random_in.best_score_)

Best Score: 0.6762124915340071


In [116]:
# Save best estimator that gave highest score on left out data
best_random = RF_random_in.best_estimator_

In [35]:
# Check and evaluate score using cross-validation (5-fold cross-validation by default)
scores = cross_val_score(RF_random_in.best_estimator_, X_train, Y_train_in.values.ravel())

In [117]:
scores.mean()

0.6762124915340071

In [52]:
# Check and evaluate score using cross-validation (5-fold cross-validation by default) on full sample
scores = cross_val_score(RF_random_in.best_estimator_, X, Y_in.values.ravel())

In [117]:
scores.mean()

0.6762124915340071

In [366]:
RF_random_in.best_params_

{'n_estimators': 1278,
 'min_samples_split': 12,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 195,
 'bootstrap': True}

In [121]:
print(best_random.score(X_train, Y_train_in))

0.8918552097958046


In [122]:
print(best_random.score(X_test, Y_test_in))

0.7081047138987878


In [123]:
print(best_random.score(X, Y_in))

0.8539825727395456


In [43]:
predictions = cross_val_predict(best_random, X_train, Y_train_in.values.ravel())
r2_score(Y_train_in, predictions)

0.6770733080450901

In [44]:
feather.write_feather(pd.DataFrame(predictions), results_dir + 'preds.RF.CV_in.feather')