# Linear Model (base)

## Data wrangling

In [1]:
# Preamble
import pandas as pd
import numpy as np
pd.set_option("mode.chained_assignment", None)
import random
random.seed(1509)
import matplotlib.pyplot as plt
import lightgbm as lgb
import pyarrow.feather as feather
from os import chdir, getcwd
import statsmodels.api as sm
from pprint import pprint
from nested_cv import NestedCV

# sci-kit
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score
from sklearn import tree

In [2]:
# Directory prefixes used when building file paths below (string concatenation,
# so the trailing slash is required).
# NOTE(review): absolute, machine-specific paths — confirm they exist in the
# environment where the notebook is re-run.
data_dir = '/home/jovyan/work/Data/'
results_dir = '/home/jovyan/work/Results/'

In [3]:
# Identifier columns kept alongside every sample.
ids = ['reporter.ISO', 'partner.ISO', 'year']

# Predictors of the outflow model.
out_features = [
    'ln.gdp_o', 'ln.gdp_d',
    'comlang', 'comcol', 'rta',
    'rCorrCont', 'pCorrCont', 'pRegQual',
    'rFATF', 'pFATF',
    'ihs.tariff',
    'kao_o', 'kai_d',
]

# Predictors of the inflow model. Differs from `out_features` only in
# 'rRegQual' (instead of 'pRegQual') and 'kai_o'/'kao_d' (instead of
# 'kao_o'/'kai_d').
in_features = [
    'ln.gdp_o', 'ln.gdp_d',
    'comlang', 'comcol', 'rta',
    'rCorrCont', 'pCorrCont', 'rRegQual',
    'rFATF', 'pFATF',
    'ihs.tariff',
    'kai_o', 'kao_d',
]

# Columns required to be complete for each sample:
# identifiers, then the outcome, then the predictors (in that order).
select_out_features = ids + ['ln.Tot_IFF_t'] + out_features
select_in_features = ids + ['ln.In_Tot_IFF_t'] + in_features

In [4]:
def create_smp(data, features):
    """
    Return the complete-case sample of `data` restricted to `features`.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table; it is not modified.
    features : list of str
        Column labels to keep, in the order they should appear.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only `features`, with every row that has a
        missing value in any of those columns removed. The original index
        labels of the surviving rows are preserved.

    Raises
    ------
    KeyError
        If a label in `features` is not a column of `data`.
    """
    # dropna() returns a new object, so nothing is mutated in place on a
    # column slice and the function does not depend on the chained-assignment
    # warning being disabled globally.
    return data[features].dropna(axis=0, how='any')

## Samples

### Import full sample

In [5]:
# Load the full sample from the results directory (Feather file).
data = feather.read_feather(results_dir + 'Africa_agg.feather')

In [6]:
# Complete-case samples of the full data set, one per model (outflows, inflows).
data_out_smp, data_in_smp = (
    create_smp(data, columns)
    for columns in (select_out_features, select_in_features)
)

In [7]:
# Full-sample predictors and outcome (outcome kept as a one-column DataFrame).
X_out, Y_out = data_out_smp[out_features], data_out_smp[['ln.Tot_IFF_t']]
X_in, Y_in = data_in_smp[in_features], data_in_smp[['ln.In_Tot_IFF_t']]

In [8]:
# Report the dimensions of the full-sample predictor and outcome frames.
print('X_out: ', X_out.shape, '\nY_out: ',  Y_out.shape,
      '\nX_in: ', X_in.shape, '\nY_in: ',  Y_in.shape)

X_out:  (7695, 13) 
Y_out:  (7695, 1) 
X_in:  (7342, 13) 
Y_in:  (7342, 1)


### Import training and test sets

In [9]:
# Load the pre-computed training and test partitions from the results directory.
# NOTE(review): how these partitions were drawn is not shown in this notebook.
train_agg = feather.read_feather(results_dir + 'train_agg.feather')
test_agg = feather.read_feather(results_dir + 'test_agg.feather')

In [10]:
# Complete-case samples for each partition, in (outflow, inflow) order.
train_agg_out_smp, train_agg_in_smp = (
    create_smp(train_agg, columns)
    for columns in (select_out_features, select_in_features)
)
test_agg_out_smp, test_agg_in_smp = (
    create_smp(test_agg, columns)
    for columns in (select_out_features, select_in_features)
)

In [11]:
# feather.write_feather(train_agg_smp, results_dir + 'train_agg_smp.feather')
# feather.write_feather(test_agg_smp, results_dir + 'test_agg_smp.feather')

In [12]:
# Report the dimensions of the complete-case training and test samples.
print('Training set out: ', train_agg_out_smp.shape, '\nTest set out: ',  test_agg_out_smp.shape,
      '\nTraining set in: ', train_agg_in_smp.shape, '\nTest set in: ',  test_agg_in_smp.shape)

Training set out:  (6165, 17) 
Test set out:  (1530, 17) 
Training set in:  (5874, 17) 
Test set in:  (1468, 17)


### Create feature set and vector of outcome labels

In [13]:
# Outflow model: predictors and outcome (one-column DataFrame), train then test
X_train_out, Y_train_out = train_agg_out_smp[out_features], train_agg_out_smp[['ln.Tot_IFF_t']]
X_test_out, Y_test_out = test_agg_out_smp[out_features], test_agg_out_smp[['ln.Tot_IFF_t']]

# Inflow model: predictors and outcome (one-column DataFrame), train then test
X_train_in, Y_train_in = train_agg_in_smp[in_features], train_agg_in_smp[['ln.In_Tot_IFF_t']]
X_test_in, Y_test_in = test_agg_in_smp[in_features], test_agg_in_smp[['ln.In_Tot_IFF_t']]

In [14]:
# idx = train_agg_smp[ids]

## Linear regression

### Fit linear regression model

In [15]:
# Fit the outflow model on the training arrays; fit() returns the estimator itself.
linear_mod_out = LinearRegression().fit(X_train_out.values, Y_train_out.values)
linear_mod_out

LinearRegression()

In [16]:
# Fit the inflow model on the training arrays; fit() returns the estimator itself.
linear_mod_in = LinearRegression().fit(X_train_in.values, Y_train_in.values)
linear_mod_in

LinearRegression()

### Print coefficients

In [17]:
# Intercept first, then the coefficient row (ordered like `out_features`).
print(linear_mod_out.intercept_, linear_mod_out.coef_, sep='\n')

[-25.75850241]
[[ 7.47678566e-01  9.73665232e-01  7.14276875e-01  1.14636256e+00
   1.86879791e+00 -7.88709899e-03 -7.99694386e-04 -4.31802907e-03
   1.89981495e+00  8.84608524e-01 -3.43001552e-02  2.67954923e-01
   4.12362257e-01]]


In [18]:
# Intercept first, then the coefficient row (ordered like `in_features`).
print(linear_mod_in.intercept_, linear_mod_in.coef_, sep='\n')

[-22.5391019]
[[ 6.64663495e-01  8.84948094e-01  9.76499302e-01  1.03643124e+00
   2.32941258e+00 -3.67595175e-03  4.03197186e-04 -1.42880206e-02
   1.32772959e+00  1.19378115e+00  3.52201236e-02 -4.33533093e-01
   8.64279487e-01]]


In [38]:
# Re-estimate the outflow model with statsmodels to obtain the full OLS
# summary table for the same training data.
# An explicit constant column is added to the predictors before fitting.
# NOTE: `Xconst`, `est` and `est2` are reassigned by the inflow cell further
# down, so they only refer to the outflow model until that cell runs.
Xconst = sm.add_constant(X_train_out)
est = sm.OLS(Y_train_out, Xconst)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:           ln.Tot_IFF_t   R-squared:                       0.431
Model:                            OLS   Adj. R-squared:                  0.430
Method:                 Least Squares   F-statistic:                     358.5
Date:                Tue, 12 Oct 2021   Prob (F-statistic):               0.00
Time:                        08:04:43   Log-Likelihood:                -14272.
No. Observations:                6165   AIC:                         2.857e+04
Df Residuals:                    6151   BIC:                         2.867e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -25.7585      0.767    -33.565      0.0

  x = pd.concat(x[::order], 1)


In [20]:
# Re-estimate the inflow model with statsmodels to obtain the full OLS
# summary table for the same training data.
# NOTE: this overwrites `Xconst`, `est` and `est2` from the outflow cell above.
Xconst = sm.add_constant(X_train_in)
est = sm.OLS(Y_train_in, Xconst)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:        ln.In_Tot_IFF_t   R-squared:                       0.403
Model:                            OLS   Adj. R-squared:                  0.402
Method:                 Least Squares   F-statistic:                     304.6
Date:                Tue, 12 Oct 2021   Prob (F-statistic):               0.00
Time:                        08:00:29   Log-Likelihood:                -13757.
No. Observations:                5874   AIC:                         2.754e+04
Df Residuals:                    5860   BIC:                         2.764e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -22.5391      0.845    -26.659      0.0

  x = pd.concat(x[::order], 1)


### Predictions

In [21]:
# In-sample and out-of-sample predictions of the outflow model.
# The model was fitted on plain arrays (`.values`), so predict on arrays too:
# passing DataFrames here makes scikit-learn warn that the input has feature
# names the estimator was fitted without. The predicted values are unchanged.
preds_LM_train_out = linear_mod_out.predict(X_train_out.values)
preds_LM_test_out = linear_mod_out.predict(X_test_out.values)

In [22]:
# In-sample and out-of-sample predictions of the inflow model.
# The model was fitted on plain arrays (`.values`), so predict on arrays too:
# passing DataFrames here makes scikit-learn warn that the input has feature
# names the estimator was fitted without. The predicted values are unchanged.
preds_LM_train_in = linear_mod_in.predict(X_train_in.values)
preds_LM_test_in = linear_mod_in.predict(X_test_in.values)

In [23]:
# feather.write_feather(pd.DataFrame(preds_LM_train_out), results_dir + 'preds.LM.train_out_agg.feather')
# feather.write_feather(pd.DataFrame(preds_LM_test_out), results_dir + 'preds.LM.test_out_agg.feather')

In [24]:
# feather.write_feather(pd.DataFrame(preds_LM_train_in), results_dir + 'preds.LM.train_in_agg.feather')
# feather.write_feather(pd.DataFrame(preds_LM_test_in), results_dir + 'preds.LM.test_in_agg.feather')

### Predictive accuracy

In [40]:
# Fit quality of the outflow model on the training and test sets.
outflow_report = [
    ("MSE of the training set (outflows):", mean_squared_error(Y_train_out, preds_LM_train_out)),
    ("R^2 of the training set (outflows):", r2_score(Y_train_out, preds_LM_train_out)),
    ("MSE of the test set (outflows):", mean_squared_error(Y_test_out, preds_LM_test_out)),
    ("R^2 of the test set (outflows):", r2_score(Y_test_out, preds_LM_test_out)),
]
for metric_caption, metric_value in outflow_report:
    print(metric_caption, metric_value)

MSE of the training set (outflows): 6.002425580558888
R^2 of the training set (outflows): 0.4310349784778611
MSE of the test set (outflows): 5.733935786541133
R^2 of the test set (outflows): 0.4449508815928588


In [26]:
# Fit quality of the inflow model on the training and test sets.
inflow_report = [
    ("MSE of the training set (inflows):", mean_squared_error(Y_train_in, preds_LM_train_in)),
    ("R^2 of the training set (inflows):", r2_score(Y_train_in, preds_LM_train_in)),
    ("MSE of the test set (inflows):", mean_squared_error(Y_test_in, preds_LM_test_in)),
    ("R^2 of the test set (inflows):", r2_score(Y_test_in, preds_LM_test_in)),
]
for metric_caption, metric_value in inflow_report:
    print(metric_caption, metric_value)

MSE of the training set (inflows): 6.335397353800778
R^2 of the training set (inflows): 0.4032852743834341
MSE of the test set (inflows): 6.444884104740987
R^2 of the test set (inflows): 0.3887680880549481


### Cross-validation

In [27]:
# Fit on the complete sample under its own name.
# Rebinding `linear_mod_out` here would replace the train-only model with one
# that has seen the test rows, so any prediction or metric cell re-run after
# this point would silently report leaked results. The cross-validation calls
# below only need an estimator object (scikit-learn fits clones of it per
# fold), so they do not depend on this fit.
linear_mod_out_full = LinearRegression()
linear_mod_out_full.fit(X_out.values, Y_out.values)

LinearRegression()

In [28]:
# Cross-validated scores of the linear model on the full outflow sample
# (library defaults for the splitter and the scoring metric).
# NOTE(review): the fold scores printed below vary widely and one is negative,
# unlike the training-set cross-validation further down. That pattern would be
# consistent with ordered rows and unshuffled folds — confirm, and consider
# passing an explicit shuffled splitter via `cv=`.
CV_scores_out = cross_val_score(linear_mod_out, X_out, Y_out.values.ravel())
print('Cross-validated scores:', CV_scores_out)

Cross-validated scores: [ 0.29142436  0.20750024  0.52432337  0.49375608 -0.22217336]


In [29]:
# Average of the per-fold scores currently held in `CV_scores_out`
# (at this point: the full-sample cross-validation).
CV_scores_out.mean()

0.2589661402904098

In [30]:
# Out-of-fold predictions for every row of the full outflow sample, scored
# with a single R^2 over all rows (so it can differ from the mean of the
# per-fold scores above).
predictions = cross_val_predict(linear_mod_out, X_out, Y_out.values.ravel())
r2_score(Y_out, predictions)

0.3733821431659239

In [31]:
# Refit the outflow model on the training arrays only; fit() returns the estimator.
linear_mod_out = LinearRegression().fit(X_train_out.values, Y_train_out.values)
linear_mod_out

LinearRegression()

In [32]:
# Cross-validated scores of the linear model on the outflow training set only.
# NOTE: this overwrites the full-sample `CV_scores_out` computed earlier.
CV_scores_out = cross_val_score(linear_mod_out, X_train_out, Y_train_out.values.ravel())
print('Cross-validated scores:', CV_scores_out)

Cross-validated scores: [0.42139936 0.42728439 0.4025989  0.42570633 0.45792348]


In [33]:
# Average of the per-fold scores currently held in `CV_scores_out`
# (at this point: the training-set cross-validation).
CV_scores_out.mean()

0.42698249103482666

In [34]:
# Out-of-fold predictions for every row of the outflow training set, scored
# with a single R^2 over all rows.
predictions_CV = cross_val_predict(linear_mod_out, X_train_out, Y_train_out.values.ravel())
r2_score(Y_train_out, predictions_CV)

0.4274052405772063

In [35]:
# MSE of the out-of-fold predictions on the outflow training set.
mean_squared_error(Y_train_out, predictions_CV)

6.0407183240518485

In [36]:
# Test-set predictions of the outflow model refitted on the training set.
# The model was fitted on plain arrays (`.values`), so predict on an array too;
# a DataFrame here makes scikit-learn warn about feature names the estimator
# was fitted without. The predicted values are unchanged.
preds_test = linear_mod_out.predict(X_test_out.values)

In [37]:
# MSE of the outflow model on the held-out test set.
mean_squared_error(Y_test_out, preds_test)

5.733935786541133