# Predictions

## Data wrangling

In [1]:
# Preamble
from os import chdir, getcwd

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
import pyarrow.feather as feather

# scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

pd.set_option("mode.chained_assignment", None)

In [2]:
# Base directories for inputs and outputs. File paths in this notebook are built
# by plain string concatenation (e.g. results_dir + 'train.feather'), so the
# trailing slash is required.
# NOTE(review): absolute, container-specific paths — consider making them configurable.
data_dir = '/home/jovyan/work/Data/'
results_dir = '/home/jovyan/work/Results/'

In [None]:
# Column groups used to build the modelling samples. The predictor list is
# written once and both public lists are derived from it, so the columns kept
# in the samples and the columns fed to the models cannot drift apart.

# Identifier / classification columns kept alongside the model inputs.
id_columns = ['reporter.ISO', 'partner.ISO', 'year',
              'commodity.code', 'section', 'section.code', 'SITC.section', 'SITC.code']

# Outcome columns; the models in this notebook use 'GER_Tot_IFF' as the target.
outcome_columns = ['Net_Tot_IFF', 'GER_Tot_IFF', 'In_GER_Tot_IFF',
                   'ln.GER_Tot_IFF', 'ln.In_GER_Tot_IFF']

# Predictors shared by every model.
predictors = ['dist',
              'gdp_o', 'gdp_d', 'pop_o', 'pop_d',
              'entry_cost_o', 'entry_cost_d',
              'tariff',
              'rSecrecyScore', 'pSecrecyScore',
              'rCorrCont', 'pCorrCont',
              'rRegQual', 'pRegQual',
              'rRuleLaw', 'pRuleLaw',
              'Import_value', 'NetExport_value']

# Every column that must be non-missing for a row to enter the samples.
select_features = id_columns + outcome_columns + predictors

# Model inputs: the shared predictors plus the SITC code.
# NOTE(review): 'SITC.code' is passed to the models like any other column —
# confirm that treating it as a numeric feature is intended.
features = predictors + ['SITC.code']

## Split into training and test samples

In [None]:
# Read the training and test tables from the Feather files in results_dir.
train = feather.read_feather(results_dir + 'train.feather')
test = feather.read_feather(results_dir + 'test.feather')

In [None]:
def create_smp(data, features):
    """
    Create a complete-case sample restricted to the given columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table; it is not modified.
    features : list of str
        Column labels to keep.

    Returns
    -------
    pandas.DataFrame
        New frame holding only ``features``, with every row that has a
        missing value in any of those columns removed. The original row
        index labels are kept.

    Raises
    ------
    KeyError
        If a label in ``features`` is not a column of ``data``.
    """
    # dropna() without inplace returns a fresh frame, so nothing is mutated
    # through an intermediate selection and no SettingWithCopyWarning is raised.
    return data[features].dropna(axis=0, how='any')

In [None]:
# Restrict both samples to the selected columns and keep only rows with no
# missing value in any of them.
train_smp = create_smp(train, select_features)
test_smp = create_smp(test, select_features)

In [None]:
# Save the complete-case samples to results_dir as Feather files.
feather.write_feather(train_smp, results_dir + 'train_smp.feather')
feather.write_feather(test_smp, results_dir + 'test_smp.feather')

In [None]:
# Target (GER_Tot_IFF) and predictor frames for both samples. The double
# brackets keep the target as a one-column DataFrame rather than a Series.
Y_train = train_smp[['GER_Tot_IFF']]
X_train = train_smp[features]
Y_test = test_smp[['GER_Tot_IFF']]
X_test = test_smp[features]

In [None]:
# Report the (rows, columns) shape of both predictor frames.
print('X_train: ', X_train.shape, '\nX_test: ',  X_test.shape)

## Linear regression

In [None]:
# Ordinary least squares on the complete-case training sample.
# Fit on the DataFrames themselves (not on .values) so the model records the
# feature names and matches the DataFrame inputs later passed to predict(),
# in line with how the other models in this notebook are fitted.
linear_mod = LinearRegression()
linear_mod.fit(X_train, Y_train)

In [None]:
# Fitted intercept and coefficients of the linear model (coefficients follow
# the column order of X_train).
print(linear_mod.intercept_)
print(linear_mod.coef_)

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [None]:
# Re-estimate the regression of Y_train on X_train with statsmodels OLS and
# print its summary table; an explicit constant column is added first.
# NOTE(review): the papermill run reported an exception at this cell. The cause
# is not visible here (statsmodels may be missing from the environment) — TODO confirm.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
import statsmodels.api as sm
Xconst = sm.add_constant(X_train)
est = sm.OLS(Y_train, Xconst)
est2 = est.fit()
print(est2.summary())

In [None]:
# Linear-model predictions for the training and the test sample.
preds_LM_train = linear_mod.predict(X_train)
preds_LM_test = linear_mod.predict(X_test)

In [None]:
# Wrap the prediction arrays in DataFrames and save them as Feather files.
feather.write_feather(pd.DataFrame(preds_LM_train), results_dir + 'preds.LM.train.feather')
feather.write_feather(pd.DataFrame(preds_LM_test), results_dir + 'preds.LM.test.feather')

In [None]:
# In-sample fit of the linear model: RMSE (square root of the mean squared
# error) and R^2.
print("RMSE of the training set:", np.sqrt(mean_squared_error(Y_train, preds_LM_train)))
print("R^2 of the training set:", r2_score(Y_train, preds_LM_train))

In [None]:
# Out-of-sample fit of the linear model: RMSE and R^2 on the test sample.
print("RMSE of the test set:", np.sqrt(mean_squared_error(Y_test, preds_LM_test)))
print("R^2 of the test set:", r2_score(Y_test, preds_LM_test))

In [None]:
# Further test-set error measures for the linear model: mean absolute error
# and the (unrooted) mean squared error.
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, preds_LM_test))  
print('Mean Squared Error:', mean_squared_error(Y_test, preds_LM_test))  

## LightGBM

In [None]:
# Target and predictor frames, selected from the complete-case samples exactly
# as in the data-wrangling section (presumably repeated so that this section
# can be run on its own).
Y_train = train_smp[['GER_Tot_IFF']]
X_train = train_smp[features]
Y_test = test_smp[['GER_Tot_IFF']]
X_test = test_smp[features]

In [None]:
# Wrap both samples as LightGBM datasets. The test set is tied to the training
# set through `reference`, as the LightGBM documentation asks for datasets used
# for validation, so that it is aligned with the training data.
lightGBM_train = lgb.Dataset(X_train, Y_train)
lightGBM_test = lgb.Dataset(X_test, Y_test, reference = lightGBM_train)

In [None]:
# Hyper-parameters for the gradient-boosted regression, evaluated with RMSE.
params = {
    'objective' : 'regression',
    'metric' : 'rmse',
    'num_leaves' : 100,
    'max_depth': 10,
    'learning_rate' : 0.1,
    'feature_fraction' : 0.6,
    'verbosity' : -1
}

# Train for at most 500 boosting rounds. Early stopping and evaluation logging
# are passed as callbacks: the `early_stopping_rounds` and `verbose_eval`
# keyword arguments of lgb.train() were deprecated and then removed in
# LightGBM 4.0 (the callbacks used here require LightGBM >= 3.3).
# NOTE(review): the "test" set is among the validation sets that drive early
# stopping, so the test metrics reported afterwards are not fully
# out-of-sample — consider a separate validation split.
lightGBM_mod = lgb.train(
    params,
    lightGBM_train,
    num_boost_round = 500,
    valid_sets = [lightGBM_train, lightGBM_test],
    valid_names = ["train", "test"],
    callbacks = [
        lgb.early_stopping(stopping_rounds = 50),   # stop after 50 rounds without improvement
        lgb.log_evaluation(period = 500),           # log the metric every 500 rounds
    ],
)

In [None]:
# LightGBM predictions for the training and the test sample, wrapped in DataFrames.
preds_lightGBM_train = pd.DataFrame(lightGBM_mod.predict(X_train))
preds_lightGBM_test = pd.DataFrame(lightGBM_mod.predict(X_test))

In [None]:
# Save the LightGBM predictions as Feather files.
feather.write_feather(preds_lightGBM_train, results_dir + 'preds.lightGBM.train.feather')
feather.write_feather(preds_lightGBM_test, results_dir + 'preds.lightGBM.test.feather')

In [None]:
# In-sample fit of the LightGBM model: RMSE and R^2.
print("RMSE of the training set:", np.sqrt(mean_squared_error(Y_train, preds_lightGBM_train)))
print("R^2 of the training set:", r2_score(Y_train, preds_lightGBM_train))

In [None]:
print("RMSE of the test set:", np.sqrt(mean_squared_error(Y_test, preds_lightGBM_test)))
print("R^2 of the test set:", r2_score(Y_test, preds_lightGBM_test))

In [None]:
# Plot the feature importances of the trained LightGBM model on a 10x10
# figure, with the grid switched off and a custom title.
fig, ax = plt.subplots(figsize = (10,10))
lgb.plot_importance(lightGBM_mod, height = 0.8, ax = ax)
ax.grid(False)
plt.title("LightGBM - Feature Importance", fontsize = 15)
plt.show()

## Neural Networks

In [None]:
# Target and predictor frames, selected from the complete-case samples exactly
# as in the data-wrangling section (presumably repeated so that this section
# can be run on its own).
Y_train = train_smp[['GER_Tot_IFF']]
X_train = train_smp[features]
Y_test = test_smp[['GER_Tot_IFF']]
X_test = test_smp[features]

In [None]:
# Standardise the predictors for the network. The scaler is fitted on the
# training data only; the same fitted transformation is then applied to the
# test data.
std_sc = StandardScaler()
NN_train = std_sc.fit_transform(X_train)
NN_test = std_sc.transform(X_test)

In [None]:
# Baseline network: one hidden layer of 10 logistic units, trained on the
# standardised predictors. `(10,)` is written with the trailing comma because
# `(10)` is just the integer 10, not a one-element tuple of layer sizes.
# The target is flattened to 1-D for fitting; random_state fixes the seed.
NN_mod = MLPRegressor(hidden_layer_sizes = (10,),
                      activation = "logistic",
                      random_state = 1,
                      max_iter = 1000).fit(NN_train, Y_train.values.ravel())

In [None]:
# Baseline-network predictions on the standardised training and test predictors.
preds_NN_train = NN_mod.predict(NN_train)
preds_NN_test = NN_mod.predict(NN_test)

In [None]:
# Wrap the network predictions in DataFrames and save them as Feather files.
feather.write_feather(pd.DataFrame(preds_NN_train), results_dir + 'preds.NN.train.feather')
feather.write_feather(pd.DataFrame(preds_NN_test), results_dir + 'preds.NN.test.feather')

In [None]:
# In-sample fit of the neural network: RMSE and R^2.
print("RMSE of the training set:", np.sqrt(mean_squared_error(Y_train, preds_NN_train)))
print("R^2 of the training set:", r2_score(Y_train, preds_NN_train))

In [None]:
# Fit of the neural network on the test sample: RMSE and R^2.
print("RMSE of the test set:", np.sqrt(mean_squared_error(Y_test, preds_NN_test)))
print("R^2 of the test set:", r2_score(Y_test, preds_NN_test))

### Tuning hyperparameters

In [None]:
# Estimator and search grid for hyper-parameter tuning.
# random_state is fixed (same seed as the baseline network and the random
# forest) so that the grid search gives the same result on every run.
NN = MLPRegressor(max_iter = 1000, random_state = 1)

# One architecture (two hidden layers of 10 and 20 units) crossed with two
# activation functions. The commented-out entries are further candidate
# dimensions that are currently not searched.
parameter_space = {
    'hidden_layer_sizes': [(10,20)],
    'activation': ['logistic', 'relu'],
    #'solver': ['sgd', 'adam'],
    #'alpha': [0.0001, 0.05],
    #'learning_rate': ['constant','adaptive'],
}

In [None]:
# 3-fold cross-validated grid search over parameter_space; n_jobs = -1 asks
# for all available cores.
NN_mod_tuned = GridSearchCV(NN, parameter_space, n_jobs = -1, cv = 3)

In [None]:
# Run the grid search on the standardised training predictors; the target is
# flattened to 1-D.
NN_mod_tuned.fit(NN_train, Y_train.values.ravel())

In [None]:
# Parameter combination selected by the grid search.
print('Best parameters found:\n', NN_mod_tuned.best_params_)

In [None]:
# All results: mean cross-validated score (+/- two standard deviations) for
# every parameter combination that was tried.
means = NN_mod_tuned.cv_results_['mean_test_score']
stds = NN_mod_tuned.cv_results_['std_test_score']
# The loop variable is deliberately not called `params`: that name holds the
# LightGBM hyper-parameter dict earlier in the notebook and would be
# overwritten in the kernel namespace.
for mean, std, param_set in zip(means, stds, NN_mod_tuned.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, param_set))

In [None]:
# Predictions from the grid-search object on the standardised predictors.
# NOTE: this rebinds preds_NN_train / preds_NN_test, replacing the baseline
# network's predictions that were held under the same names.
preds_NN_train = NN_mod_tuned.predict(NN_train)
preds_NN_test = NN_mod_tuned.predict(NN_test)

In [None]:
# Save the tuned-network predictions as Feather files.
# NOTE(review): these are the same file names the baseline network's
# predictions were written to, so those files are overwritten — confirm intended.
feather.write_feather(pd.DataFrame(preds_NN_train), results_dir + 'preds.NN.train.feather')
feather.write_feather(pd.DataFrame(preds_NN_test), results_dir + 'preds.NN.test.feather')

In [None]:
# In-sample fit of the tuned network: RMSE and R^2.
print("RMSE of the training set:", np.sqrt(mean_squared_error(Y_train, preds_NN_train)))
print("R^2 of the training set:", r2_score(Y_train, preds_NN_train))

In [None]:
# Fit of the tuned network on the test sample: RMSE and R^2.
print("RMSE of the test set:", np.sqrt(mean_squared_error(Y_test, preds_NN_test)))
print("R^2 of the test set:", r2_score(Y_test, preds_NN_test))

## Support Vector Machine

In [None]:
# Support vector regression with default settings. SVMs are not scale
# invariant, so the predictors are standardised inside a pipeline: the scaler
# is fitted on the training data during fit() and re-applied automatically in
# predict(), which lets the raw X_train / X_test frames be passed unchanged.
# The target is flattened to 1-D and is left unscaled.
SVM_mod = make_pipeline(StandardScaler(), svm.SVR())
SVM_mod.fit(X_train, Y_train.values.ravel())

In [None]:
# SVM predictions for the training and the test sample.
preds_SVM_train = SVM_mod.predict(X_train)
preds_SVM_test = SVM_mod.predict(X_test)

In [None]:
# Wrap the SVM predictions in DataFrames and save them as Feather files.
feather.write_feather(pd.DataFrame(preds_SVM_train), results_dir + 'preds.SVM.train.feather')
feather.write_feather(pd.DataFrame(preds_SVM_test), results_dir + 'preds.SVM.test.feather')

In [None]:
# In-sample fit of the SVM: RMSE and R^2.
print("RMSE of the training set:", np.sqrt(mean_squared_error(Y_train, preds_SVM_train)))
print("R^2 of the training set:", r2_score(Y_train, preds_SVM_train))

In [None]:
# Fit of the SVM on the test sample: RMSE and R^2.
print("RMSE of the test set:", np.sqrt(mean_squared_error(Y_test, preds_SVM_test)))
print("R^2 of the test set:", r2_score(Y_test, preds_SVM_test))

## Random Forests

In [None]:
# Random forest with trees limited to depth 10; random_state fixes the seed.
# The target is flattened to 1-D for fitting.
RF_mod = RandomForestRegressor(max_depth = 10, random_state = 1)
RF_mod.fit(X_train, Y_train.values.ravel())

In [None]:
# Random-forest predictions for the training and the test sample.
preds_RF_train = RF_mod.predict(X_train)
preds_RF_test = RF_mod.predict(X_test)

In [None]:
# Wrap the random-forest predictions in DataFrames and save them as Feather files.
feather.write_feather(pd.DataFrame(preds_RF_train), results_dir + 'preds.RF.train.feather')
feather.write_feather(pd.DataFrame(preds_RF_test), results_dir + 'preds.RF.test.feather')

In [None]:
# In-sample fit of the random forest: RMSE and R^2.
print("RMSE of the training set:", np.sqrt(mean_squared_error(Y_train, preds_RF_train)))
print("R^2 of the training set:", r2_score(Y_train, preds_RF_train))

In [None]:
# Fit of the random forest on the test sample: RMSE and R^2.
print("RMSE of the test set:", np.sqrt(mean_squared_error(Y_test, preds_RF_test)))
print("R^2 of the test set:", r2_score(Y_test, preds_RF_test))