# Placebo trials

## Preamble

In [None]:
# Preamble
import pandas as pd
import numpy as np
pd.set_option("mode.chained_assignment", None)
import random
random.seed(1509)
import matplotlib.pyplot as plt
import lightgbm as lgb
import pyarrow.feather as feather
from os import chdir, getcwd
import statsmodels.api as sm
from pprint import pprint
from nested_cv import NestedCV

# sci-kit
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score
from sklearn import tree

In [None]:
data_dir = '/home/jovyan/work/Data/'
results_dir = '/home/jovyan/work/Results/'

In [None]:
# Identifier columns of each observation (reporter, partner, year)
ids = ['reporter.ISO', 'partner.ISO', 'year']

# Outcome columns (selected further down as Y_out and Y_in respectively)
targets = ['ln.Tot_IFF_t', 'ln.In_Tot_IFF_t']

# Predictor columns fed to the models
features = ['ln.gdp_o', 'ln.gdp_d', 'ln.pop_o', 'ln.pop_d',
            'dist', 'contig',
            'comlang', 'comcol', 'col45',
            'ihs.entry_cost_o', 'ihs.entry_cost_d', 'rta',
            'rCorrCont', 'pCorrCont',
            'rRegQual', 'pRegQual',
            'rRuleLaw', 'pRuleLaw',
            'pSecrecyScore',
            'pFSI.rank',
            'pKFSI13',
            'pKFSI17',
            'pKFSI20',
            'rFATF', 'pFATF',
            'ihs.tariff',
            'kai_o', 'kai_d', 'kao_o', 'kao_d',
            'cc_o', 'cc_d', 'cci_o', 'cci_d', 'cco_o', 'cco_d',
            'di_o', 'di_d', 'dii_o', 'dii_d', 'dio_o', 'dio_d']

# Every column that must be non-missing for an observation to enter a sample.
# Built from the lists above (same order as before: ids, outcomes, predictors)
# so the predictor names are written once and the lists cannot drift apart.
select_features = ids + targets + features

In [None]:
def create_smp(data, features):
    """
    Return the complete cases of `data` restricted to the columns in `features`.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table.
    features : list
        Column labels to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the `features` columns, without any row
        that has a missing value in at least one of them. `data` itself is
        left untouched.
    """
    # Non-inplace dropna builds a new frame, so nothing is mutated through a
    # selection of `data` and no chained-assignment warning is involved.
    return data[features].dropna(axis=0, how='any')

## Samples

### Import full sample

In [None]:
data = feather.read_feather(results_dir + 'Africa_agg.feather')

In [None]:
data_smp = create_smp(data, select_features)

In [None]:
idx = data_smp[ids]
X = data_smp[features]
Y_out = data_smp[['ln.Tot_IFF_t']]
Y_in = data_smp[['ln.In_Tot_IFF_t']]

In [None]:
feather.write_feather(idx, results_dir + 'idx.feather')
feather.write_feather(X, results_dir + 'X.feather')
feather.write_feather(Y_out, results_dir + 'Y_out.feather')
feather.write_feather(Y_in, results_dir + 'Y_in.feather')

In [None]:
print('X: ', X.shape, '\nY_out: ',  Y_out.shape)

### Import training and test sets

In [None]:
train_agg = feather.read_feather(results_dir + 'train_agg.feather')
test_agg = feather.read_feather(results_dir + 'test_agg.feather')

In [None]:
train_agg_smp = create_smp(train_agg, select_features)
test_agg_smp = create_smp(test_agg, select_features)

In [None]:
feather.write_feather(train_agg_smp, results_dir + 'train_agg_smp.feather')
feather.write_feather(test_agg_smp, results_dir + 'test_agg_smp.feather')

In [None]:
print('Training set: ', train_agg_smp.shape, '\nTest set: ',  test_agg_smp.shape)

### Create feature set and vector of outcome labels

In [None]:
# Training set
Y_train_out = train_agg_smp[['ln.Tot_IFF_t']]
Y_train_in = train_agg_smp[['ln.In_Tot_IFF_t']]
X_train = train_agg_smp[features]

# Test set
Y_test_out = test_agg_smp[['ln.Tot_IFF_t']]
Y_test_in = test_agg_smp[['ln.In_Tot_IFF_t']]
X_test = test_agg_smp[features]

In [None]:
idx = train_agg_smp[ids]

### Create placebo samples

In [None]:
# Unilateral features of reporter i
Xi = X_train[['ln.gdp_o', 'ln.pop_o', 'ihs.entry_cost_o', 
              'rCorrCont', 'rRegQual', 'rRuleLaw',
              'rFATF', 
              'kai_o', 'kao_o', 'cc_o', 'cci_o', 'cco_o', 'di_o', 'dii_o', 'dio_o']]

In [None]:
# Unilateral features of partner j
Xj = X_train[['ln.gdp_d', 'ln.pop_d', 'ihs.entry_cost_d', 
              'pCorrCont', 'pRegQual', 'pRuleLaw',
              'pSecrecyScore', 'pFSI.rank', 'pKFSI13', 'pKFSI17', 'pKFSI20',
              'pFATF', 
              'kai_d', 'kao_d', 'cc_d', 'cci_d', 'cco_d', 'di_d', 'dii_d', 'dio_d']]

In [None]:
# Bilateral features of i and j
Xij = X_train[['dist', 'contig', 'comlang', 'comcol', 'col45', 'rta', 'ihs.tariff']]

#### Randomly re-assign partner j

In [None]:
# Shuffle partners j, keeping same unilateral characteristics, but randomly assigned to reporter i
# random.seed() in the preamble does not seed the sampler used by pandas,
# so the seed is passed explicitly to make the placebo sample reproducible.
Xj_shuffle = Xj.sample(frac = 1, random_state = 1509)

In [None]:
# Combine all features into placebo predictor matrix
# pd.concat(axis = 1) aligns rows on their index labels, which would put every
# shuffled partner row back next to its original reporter and undo the shuffle.
# Re-label the shuffled rows positionally with the reporters' index first.
Xj_reassigned = Xj_shuffle.copy()
Xj_reassigned.index = Xi.index

# Keep the feature names (no ignore_index) and restore the column order of
# X_train so a model fitted on the placebo matrix can be scored on X_test.
Xj_placebo = pd.concat([Xi, Xj_reassigned, Xij], axis = 1)[X_train.columns]

In [None]:
print('Xi: ', Xi.shape, '\nXj_shuffle: ',  Xj_shuffle.shape, '\nXij: ',  Xij.shape)
print('X_train: ', X_train.shape, '\nXj_placebo: ',  Xj_placebo.shape)

#### Randomly shuffle all bilateral transactions

In [None]:
# Shuffle all rows in the feature set
# random.seed() in the preamble does not seed the sampler used by pandas,
# so the seed is passed explicitly to make the placebo sample reproducible.
X_placebo = X_train.sample(frac = 1, random_state = 1509)

In [None]:
print('X_train: ', X_train.shape, '\nX_placebo: ',  X_placebo.shape)

### Tune Random Forest model

In [None]:
# The hyperparameters were found using a cross-validation randomized search strategy on the training sample X_train
# max_features = 1.0 considers every feature at each split, which is what
# 'auto' meant for regressors; 'auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3, so the explicit value keeps the notebook runnable.
RF_tuned_params = dict(random_state = 1509,
                       n_estimators = 1278,
                       max_depth = 195,
                       min_samples_split = 12,
                       min_samples_leaf = 1,
                       max_features = 1.0,
                       bootstrap = True)

RF_out_tuned = RandomForestRegressor(**RF_tuned_params)

# Set up another RF estimator for placebo tests (exactly the same as above) to not confuse them
RF_out_tuned_placebo = RandomForestRegressor(**RF_tuned_params)

### Results on real samples

In [None]:
# Fit tuned model on real training data
RF_out_tuned.fit(X_train, Y_train_out.values.ravel())

In [None]:
# Training R-squared
RF_out_tuned.score(X_train, Y_train_out)

In [None]:
# Test R-squared
RF_out_tuned.score(X_test, Y_test_out)

In [None]:
# Reset the estimator for cross_val_score
# max_features = 1.0 (all features at each split) replaces 'auto', which meant
# the same for regressors but was removed in scikit-learn 1.3.
RF_out_tuned_placebo = RandomForestRegressor(random_state = 1509,
                                             n_estimators = 1278,
                                             max_depth = 195,
                                             min_samples_split = 12,
                                             min_samples_leaf = 1,
                                             max_features = 1.0,
                                             bootstrap = True)

In [None]:
# Cross-validated R-squared on the full sample using 5 folds
CV_scores = cross_val_score(RF_out_tuned, X, Y_out.values.ravel())
print('R-squared from each fold :', CV_scores)

In [None]:
# Mean cross-validated score
CV_scores.mean()

In [None]:
# Cross-validated R-squared on the training sample using 5 folds
# This is the best score reported by RandomizedSearchCV
# This is the one to report because CV doesn't need train/test split
# X_train here is the one used in hyperparameter tuning
# Error here is from held-out test sets so model was not fit on X_train
CV_train_scores = cross_val_score(RF_out_tuned, X_train, Y_train_out.values.ravel())
print('R-squared from each fold (training):', CV_train_scores)

In [None]:
# Mean cross-validated score
CV_train_scores.mean()

In [None]:
# Cross-validated R-squared on the test sample using 5 folds
CV_test_scores = cross_val_score(RF_out_tuned, X_test, Y_test_out.values.ravel())
print('R-squared from each fold (test):', CV_test_scores)

In [None]:
# Mean cross-validated score
CV_test_scores.mean()

### Results on placebo data where partners are shuffled

In [None]:
# Fit tuned model on placebo data where partners j have been randomly re-assigned
RF_out_tuned_placebo.fit(Xj_placebo, Y_train_out.values.ravel())

In [None]:
# Training R-squared
RF_out_tuned_placebo.score(Xj_placebo, Y_train_out)

In [None]:
# Test R-squared
RF_out_tuned_placebo.score(X_test, Y_test_out)

In [None]:
# Reset the estimator for cross_val_score
# max_features = 1.0 (all features at each split) replaces 'auto', which meant
# the same for regressors but was removed in scikit-learn 1.3.
RF_out_tuned_placebo = RandomForestRegressor(random_state = 1509,
                                             n_estimators = 1278,
                                             max_depth = 195,
                                             min_samples_split = 12,
                                             min_samples_leaf = 1,
                                             max_features = 1.0,
                                             bootstrap = True)

In [None]:
# Cross-validated R-squared on the full sample using 5 folds
# NOTE(review): X and Y_out are the real (unshuffled) full sample, not placebo
# data, so this looks like a repeat of the CV_scores cell in the real-sample
# section rather than a placebo result; confirm that this is intended.
CV_scores_placebo = cross_val_score(RF_out_tuned_placebo, X, Y_out.values.ravel())
print('R-squared from each fold :', CV_scores_placebo)

In [None]:
CV_scores_placebo.mean()

In [None]:
# Cross-validated R-squared on the training sample using 5 folds
CV_train_scores_placebo = cross_val_score(RF_out_tuned_placebo, Xj_placebo, Y_train_out.values.ravel())
print('R-squared from each fold (training):', CV_train_scores_placebo)

In [None]:
# Mean cross-validated score
CV_train_scores_placebo.mean()

In [None]:
# Cross-validated R-squared on the test sample using 5 folds
# NOTE(review): X_test and Y_test_out are the real (unshuffled) test sample, so
# this looks like a repeat of the CV_test_scores cell in the real-sample
# section rather than a placebo result; confirm that this is intended.
CV_test_scores_placebo = cross_val_score(RF_out_tuned_placebo, X_test, Y_test_out.values.ravel())
print('R-squared from each fold (test):', CV_test_scores_placebo)

In [None]:
CV_test_scores_placebo.mean()

### Results on placebo data where bilateral trades are reshuffled

In [None]:
# Fit tuned model on placebo data where all rows of the feature set have been
# randomly shuffled (X_placebo), while the outcome vector keeps its original order
RF_out_tuned_placebo.fit(X_placebo, Y_train_out.values.ravel())

In [None]:
# Training R-squared
RF_out_tuned_placebo.score(X_placebo, Y_train_out)

In [None]:
# Test R-squared
RF_out_tuned_placebo.score(X_test, Y_test_out)

In [None]:
# Reset the estimator for cross_val_score
# max_features = 1.0 (all features at each split) replaces 'auto', which meant
# the same for regressors but was removed in scikit-learn 1.3.
RF_out_tuned_placebo = RandomForestRegressor(random_state = 1509,
                                             n_estimators = 1278,
                                             max_depth = 195,
                                             min_samples_split = 12,
                                             min_samples_leaf = 1,
                                             max_features = 1.0,
                                             bootstrap = True)

In [None]:
# Cross-validated R-squared on the full sample using 5 folds
# NOTE(review): X and Y_out are the real (unshuffled) full sample, not placebo
# data, so this looks like a repeat of the CV_scores cell in the real-sample
# section rather than a placebo result; confirm that this is intended.
CV_scores_placebo = cross_val_score(RF_out_tuned_placebo, X, Y_out.values.ravel())
print('R-squared from each fold :', CV_scores_placebo)

In [None]:
CV_scores_placebo.mean()

In [None]:
# Cross-validated R-squared on the training sample using 5 folds
CV_train_scores_placebo = cross_val_score(RF_out_tuned_placebo, X_placebo, Y_train_out.values.ravel())
print('R-squared from each fold (training):', CV_train_scores_placebo)

In [None]:
# Mean cross-validated score
CV_train_scores_placebo.mean()

In [None]:
# Cross-validated R-squared on the test sample using 5 folds
# NOTE(review): X_test and Y_test_out are the real (unshuffled) test sample, so
# this looks like a repeat of the CV_test_scores cell in the real-sample
# section rather than a placebo result; confirm that this is intended.
CV_test_scores_placebo = cross_val_score(RF_out_tuned_placebo, X_test, Y_test_out.values.ravel())
print('R-squared from each fold (test):', CV_test_scores_placebo)

In [None]:
CV_test_scores_placebo.mean()

### Placebo trials

In [None]:
# Reset the estimator for cross_val_score
# max_features = 1.0 (all features at each split) replaces 'auto', which meant
# the same for regressors but was removed in scikit-learn 1.3.
RF_out_tuned_placebo = RandomForestRegressor(random_state = 1509,
                                             n_estimators = 1278,
                                             max_depth = 195,
                                             min_samples_split = 12,
                                             min_samples_leaf = 1,
                                             max_features = 1.0,
                                             bootstrap = True)

In [None]:
def placebo_trials(estimator, X_train, Y_train, X_test, Y_test, trials, random_state=None):
    """
    Run placebo trials where the model is refit on placebo training data
    and performance is evaluated on a test set.

    In each trial the rows of `X_train` are randomly permuted while `Y_train`
    keeps its order, `estimator` is refit on that placebo sample, and it is
    then scored on the placebo training data and on (`X_test`, `Y_test`).

    Parameters
    ----------
    estimator : object
        Model exposing `fit`, `predict` and `score`. It is refit in place on
        every trial, so after the call it holds the fit of the last trial.
    X_train : pandas.DataFrame
        Training features; a shuffled copy is drawn from it in each trial.
    Y_train : pandas.DataFrame or pandas.Series
        Training outcome (read through `.values`).
    X_test, Y_test
        Test features and outcome passed to `predict` / `score`.
    trials : int
        Number of placebo trials to run.
    random_state : int, optional
        Seed for the sequence of shuffles. When None (default) the shuffles
        are drawn without an explicit seed, exactly as before this parameter
        existed.

    Returns
    -------
    numpy.ndarray
        Array of shape (trials, 3); one row per trial holding
        [training R-squared, test R-squared, test MSE].
    """
    # One generator shared by all trials: every trial gets a different
    # permutation, but the whole sequence is repeatable for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Loop-invariant: the outcome vector is never shuffled
    y_train = Y_train.values.ravel()

    rows = []
    for _ in range(trials):
        # Shuffle all rows in the feature set
        X_placebo = X_train.sample(frac=1, random_state=rng)

        # Fit model on the placebo sample
        estimator.fit(X_placebo, y_train)

        # Test-set predictions are only needed for the MSE
        preds_tst = estimator.predict(X_test)

        rows.append([estimator.score(X_placebo, Y_train),
                     estimator.score(X_test, Y_test),
                     mean_squared_error(Y_test, preds_tst)])

    # reshape keeps the (0, 3) shape when no trial was run
    return np.array(rows, dtype=float).reshape(-1, 3)

In [None]:
placebo_trials(RF_out_tuned_placebo, X_train, Y_train_out, X_test, Y_test_out, 2)