# Linear Model

## Data wrangling

In [1]:
# Preamble
import pandas as pd
import numpy as np
pd.set_option("mode.chained_assignment", None)
import random
random.seed(1509)
import matplotlib.pyplot as plt
import lightgbm as lgb
import pyarrow.feather as feather
from os import chdir, getcwd
import statsmodels.api as sm
from pprint import pprint
from nested_cv import NestedCV

# sci-kit
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score
from sklearn import tree

In [2]:
data_dir = '/home/jovyan/work/Data/'
results_dir = '/home/jovyan/work/Results/'

In [3]:
select_features = ['reporter.ISO', 'partner.ISO', 'year',
                   'ln.Tot_IFF_t', 'ln.In_Tot_IFF_t',
                   'ln.gdp_o', 'ln.gdp_d', 'ln.pop_o', 'ln.pop_d', 
                   'dist', 'contig', 
                   'comlang', 'comcol', 'col45', 
                   'ihs.entry_cost_o', 'ihs.entry_cost_d', 'rta',
                   'rCorrCont', 'pCorrCont',
                   'rRegQual', 'pRegQual', 
                   'rRuleLaw', 'pRuleLaw',
                   'pSecrecyScore',
                   'pFSI.rank',
                   'pKFSI13',
                   'pKFSI17',
                   'pKFSI20',
                   'rFATF', 'pFATF',
                   'ihs.tariff',
                   'kai_o', 'kai_d', 'kao_o', 'kao_d',
                   'cc_o', 'cc_d', 'cci_o', 'cci_d', 'cco_o', 'cco_d',
                   'di_o', 'di_d', 'dii_o', 'dii_d', 'dio_o', 'dio_d']

features = [       'ln.gdp_o', 'ln.gdp_d', 'ln.pop_o', 'ln.pop_d', 
                   'dist', 'contig', 
                   'comlang', 'comcol', 'col45', 
                   'ihs.entry_cost_o', 'ihs.entry_cost_d', 'rta',
                   'rCorrCont', 'pCorrCont',
                   'rRegQual', 'pRegQual', 
                   'rRuleLaw', 'pRuleLaw',
                   'pSecrecyScore',
                   'pFSI.rank',
                   'pKFSI13',
                   'pKFSI17',
                   'pKFSI20',
                   'rFATF', 'pFATF',
                   'ihs.tariff',
                   'kai_o', 'kai_d', 'kao_o', 'kao_d',
                   'cc_o', 'cc_d', 'cci_o', 'cci_d', 'cco_o', 'cco_d',
                   'di_o', 'di_d', 'dii_o', 'dii_d', 'dio_o', 'dio_d']

ids = [       'reporter.ISO', 'partner.ISO', 'year']

In [4]:
def create_smp(data, features):
    """
    Create train and test samples that are complete.
    """
    smp = data[features]
    smp.dropna(axis=0, how='any', inplace=True)
    return smp

## Samples

### Import full sample

In [5]:
data = feather.read_feather(results_dir + 'Africa_agg.feather')

In [6]:
data_smp = create_smp(data, select_features)

In [35]:
idx = data_smp[ids]
X = data_smp[features]
Y_out = data_smp[['ln.Tot_IFF_t']]
Y_in = data_smp[['ln.In_Tot_IFF_t']]

In [36]:
print('X: ', X.shape, '\nY_out: ',  Y_out.shape)

X:  (5333, 42) 
Y_out:  (5333, 1)


### Import training and test sets

In [7]:
train_agg = feather.read_feather(results_dir + 'train_agg.feather')
test_agg = feather.read_feather(results_dir + 'test_agg.feather')

In [8]:
train_agg_smp = create_smp(train_agg, select_features)
test_agg_smp = create_smp(test_agg, select_features)

In [9]:
feather.write_feather(train_agg_smp, results_dir + 'train_agg_smp.feather')
feather.write_feather(test_agg_smp, results_dir + 'test_agg_smp.feather')

In [10]:
print('Training set: ', train_agg_smp.shape, '\nTest set: ',  test_agg_smp.shape)

Training set:  (4256, 47) 
Test set:  (1077, 47)


### Create feature set and vector of outcome labels

In [11]:
# Training set
Y_train_out = train_agg_smp[['ln.Tot_IFF_t']]
Y_train_in = train_agg_smp[['ln.In_Tot_IFF_t']]
X_train = train_agg_smp[features]

# Test set
Y_test_out = test_agg_smp[['ln.Tot_IFF_t']]
Y_test_in = test_agg_smp[['ln.In_Tot_IFF_t']]
X_test = test_agg_smp[features]

In [12]:
idx = train_agg_smp[ids]

## Linear regression

### Fit linear regression model

In [13]:
linear_mod_out = LinearRegression()  
linear_mod_out.fit(X_train.values, Y_train_out.values)

LinearRegression()

In [14]:
linear_mod_in = LinearRegression()  
linear_mod_in.fit(X_train.values, Y_train_in.values)

LinearRegression()

### Print coefficients

In [15]:
print(linear_mod_out.intercept_)
print(linear_mod_out.coef_)

[-31.76690854]
[[ 1.44017621e+00  4.73285338e-01 -6.53251608e-01  7.80489825e-01
  -2.19969261e-04  3.20234393e+00  4.92334684e-01 -4.68218951e-02
   4.08187955e-01  3.73398260e-01  2.67072904e-01  5.93048923e-01
  -3.75487341e-03  2.58208547e-02 -1.02622238e-02  9.90414719e-03
   2.38703745e-02 -6.29400188e-03 -2.27177124e-02 -6.89098597e-03
   1.15326900e+00 -1.60963323e+00  1.83543200e+00  3.54464689e+00
   9.92499305e-01 -1.92548570e-01  1.19257406e+00 -2.84898815e-01
   4.00270172e+00  9.04923521e-01  1.93546485e-01 -8.56771122e-02
  -3.48350370e-01 -5.82263401e-01  7.35443340e-01  4.10909177e-01
  -1.57394039e+00 -4.34301715e-03  5.97881435e-01  8.61853165e-02
  -3.74576221e+00 -9.48713508e-02]]


In [16]:
print(linear_mod_in.intercept_)
print(linear_mod_in.coef_)

[-25.31188105]
[[ 1.14106241e+00  2.92109996e-01 -3.44411560e-01  8.82678096e-01
  -2.46086687e-04  2.75474686e+00  5.63761313e-01 -1.47046144e-02
   1.17458517e+00  1.56377306e-01  1.56810073e-01  8.65111119e-01
  -1.14229593e-02  4.21426180e-02 -1.04543855e-02  5.03802594e-03
   2.71894170e-02 -2.03280029e-02 -8.89087311e-03 -7.48900155e-03
   1.05182561e+00 -1.68239571e+00  1.09670453e+00  2.53443942e+00
   1.38546764e+00 -1.89947113e-01  5.25340039e-01 -3.75436307e-01
   3.05504120e+00  8.49604257e-01  1.52924380e-01 -2.05325935e-01
   1.20206969e-01 -6.33621215e-01  1.85641792e-01  2.22969344e-01
  -1.25261476e+00 -3.88369572e-02  1.38789072e-01 -3.37708301e-01
  -2.64401859e+00  2.60034387e-01]]


In [17]:
Xconst = sm.add_constant(X_train)
est = sm.OLS(Y_train_out, Xconst)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:           ln.Tot_IFF_t   R-squared:                       0.581
Model:                            OLS   Adj. R-squared:                  0.577
Method:                 Least Squares   F-statistic:                     153.6
Date:                Tue, 12 Oct 2021   Prob (F-statistic):               0.00
Time:                        06:36:12   Log-Likelihood:                -9092.6
No. Observations:                4256   AIC:                         1.826e+04
Df Residuals:                    4217   BIC:                         1.851e+04
Df Model:                          38                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const              -31.7669      1.734  

  x = pd.concat(x[::order], 1)


In [18]:
Xconst = sm.add_constant(X_train)
est = sm.OLS(Y_train_in, Xconst)
est2 = est.fit()
print(est2.summary())

  x = pd.concat(x[::order], 1)


                            OLS Regression Results                            
Dep. Variable:        ln.In_Tot_IFF_t   R-squared:                       0.534
Model:                            OLS   Adj. R-squared:                  0.530
Method:                 Least Squares   F-statistic:                     127.2
Date:                Tue, 12 Oct 2021   Prob (F-statistic):               0.00
Time:                        06:36:12   Log-Likelihood:                -9345.6
No. Observations:                4256   AIC:                         1.877e+04
Df Residuals:                    4217   BIC:                         1.902e+04
Df Model:                          38                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const              -25.3119      1.840  

### Predictions

In [19]:
preds_LM_train_out = linear_mod_out.predict(X_train)
preds_LM_test_out = linear_mod_out.predict(X_test)

In [20]:
preds_LM_train_in = linear_mod_in.predict(X_train)
preds_LM_test_in = linear_mod_in.predict(X_test)

In [21]:
feather.write_feather(pd.DataFrame(preds_LM_train_out), results_dir + 'preds.LM.train_out_agg.feather')
feather.write_feather(pd.DataFrame(preds_LM_test_out), results_dir + 'preds.LM.test_out_agg.feather')

In [22]:
feather.write_feather(pd.DataFrame(preds_LM_train_in), results_dir + 'preds.LM.train_in_agg.feather')
feather.write_feather(pd.DataFrame(preds_LM_test_in), results_dir + 'preds.LM.test_in_agg.feather')

### Predictive accuracy

In [33]:
print("MSE of the training set (outflows):", (mean_squared_error(Y_train_out, preds_LM_train_out)))
print("R^2 of the training set (outflows):", r2_score(Y_train_out, preds_LM_train_out))
print("MSE of the test set (outflows):", (mean_squared_error(Y_test_out, preds_LM_test_out)))
print("R^2 of the test set (outflows):", r2_score(Y_test_out, preds_LM_test_out))

MSE of the training set (outflows): 4.199509548761939
R^2 of the training set (outflows): 0.5805291191489295
MSE of the test set (outflows): 4.275250958229833
R^2 of the test set (outflows): 0.5838053963543813


In [34]:
print("MSE of the training set (inflows):", (mean_squared_error(Y_train_in, preds_LM_train_in)))
print("R^2 of the training set (inflows):", r2_score(Y_train_in, preds_LM_train_in))
print("MSE of the test set (inflows):", (mean_squared_error(Y_test_in, preds_LM_test_in)))
print("R^2 of the test set (inflows):", r2_score(Y_test_in, preds_LM_test_in))

MSE of the training set (inflows): 4.729761399361896
R^2 of the training set (inflows): 0.5340113721982074
MSE of the test set (inflows): 4.609102779487201
R^2 of the test set (inflows): 0.5659306281217347


### Cross-validation

In [37]:
linear_mod_out = LinearRegression()  
linear_mod_out.fit(X.values, Y_out.values)

LinearRegression()

In [39]:
CV_scores_out = cross_val_score(linear_mod_out, X, Y_out.values.ravel(), cv = 6)
print('Cross-validated scores:', CV_scores_out)

Cross-validated scores: [0.3260559  0.3998275  0.53365486 0.56711401 0.41115843 0.00178382]


In [40]:
CV_scores_out.mean()

0.3732657533440225

In [41]:
predictions = cross_val_predict(linear_mod_out, X, Y_out.values.ravel(), cv = 6)
r2_score(Y_out, predictions)

0.49905493879044427

In [29]:
linear_mod_out = LinearRegression()  
linear_mod_out.fit(X_train.values, Y_train_out.values)

LinearRegression()

In [31]:
CV_scores_out = cross_val_score(linear_mod_out, X_train, Y_train_out.values.ravel())
print('Cross-validated scores:', CV_scores_out)

Cross-validated scores: [0.56387324 0.55694373 0.55167506 0.58997033 0.58388886]


In [32]:
CV_scores_out.mean()

0.569270244316766

In [45]:
predictions_CV = cross_val_predict(linear_mod_out, X_train, Y_train_out.values.ravel())
r2_score(Y_train_out, predictions_CV)

0.5703527724676976

In [46]:
(mean_squared_error(Y_train_out, predictions_CV))

4.301389481339472

In [47]:
preds_test = linear_mod_out.predict(X_test)

In [48]:
(mean_squared_error(Y_test_out, preds_test))

4.191281430466131