# Importing modules

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt  

# !pip install catboost
import catboost

# Data processing

In [35]:
# get the preferences data
data_pref_raw = pd.read_csv("SCI_and_preferences.csv", encoding="utf-8").iloc[:, 1:]
data_pref_raw = data_pref_raw.drop_duplicates(subset=data_pref_raw.columns[2:]).reset_index(drop=True)
data_pref_raw.scaled_sci = np.log(data_pref_raw.scaled_sci)

pref_features = list(data_pref_raw.columns[2:-1])
log_SCI = [data_pref_raw.columns[-1]]

In [36]:
# get the exogenous variables (containing extra regions and duplicates)
data_exog_raw = pd.read_csv("SCI_and_exogenous_variables.csv", encoding="utf-8").iloc[:, 1:]
data_exog_raw["log_distance"] = np.log(data_exog_raw.distance)

exog_features = list(data_exog_raw.columns[2:])

In [37]:
# combine the datasets omitting extra regions and duplicates
data_joint = pd.merge(data_pref_raw, data_exog_raw,  how="inner", left_on=["user_loc", "fr_loc"], right_on=["user_loc", "fr_loc"])

In [38]:
# create two datasets with equal number of rows, but different sets of predictors
data_pref = data_joint.loc[:, log_SCI + pref_features] # preferences
data_exog = data_joint.loc[:, log_SCI + exog_features] # exogenous variables

# Building models for prediction of $log(SCI)$

## 1) $log(SCI)$ ~ preferences

In [41]:
X = data_pref.loc[:, pref_features]
y = data_pref.loc[:, log_SCI]

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, train_size=0.8, random_state=1)

In [43]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [44]:
boosting_reg = catboost.CatBoostRegressor(eval_metric='RMSE', logging_level='Silent', random_state=0)

In [45]:
boosting_reg.grid_search({'iterations': [500, 1000], 'learning_rate': [0.1, 0.3]}, 
                           X_train, y_train, plot=True, refit=True, cv=5);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	loss: 0.7144601	best: 0.7144601 (0)	total: 21.5s	remaining: 1m 4s
1:	loss: 0.7245154	best: 0.7144601 (0)	total: 42.6s	remaining: 42.6s
2:	loss: 0.6756735	best: 0.6756735 (2)	total: 1m 24s	remaining: 28.2s
3:	loss: 0.7151705	best: 0.6756735 (2)	total: 2m 6s	remaining: 0us
Estimating final quality...


In [46]:
# define MAPE score
def mape(actual, pred):
    return np.mean(np.abs((actual - pred) / actual)) * 100

In [47]:
print('train MAE = {}'.format(mean_absolute_error(y_train, boosting_reg.predict(X_train))))

train MAE = 0.2599625916993611


In [48]:
print('test MAE = {}'.format(mean_absolute_error(y_test, boosting_reg.predict(X_test))))

test MAE = 0.4816478655500689


In [49]:
print('train MAPE = {}%'.format(mape(y_train, boosting_reg.predict(X_train))))

train MAPE = 23.71867982563325%


In [50]:
print('test MAPE = {}%'.format(mape(y_test, boosting_reg.predict(X_test))))

test MAPE = 23.279117383529517%


In [51]:
print('train R_squared = {}'.format(r2_score(y_train, boosting_reg.predict(X_train))))

train R_squared = 0.9410961922875284


In [52]:
print('test R_squared = {}'.format(r2_score(y_test, boosting_reg.predict(X_test))))

test R_squared = 0.7878284976336678


In [54]:
lin_reg = LinearRegression()

lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [55]:
lin_reg.score(X_train, y_train)

0.3685975165720533

In [56]:
lin_reg.score(X_test, y_test)

0.37609748763088213

In [57]:
print('train MAPE = {}%'.format(mape(y_train, lin_reg.predict(X_train))))

train MAPE = 14.623900595851513%


In [58]:
print('test MAPE = {}%'.format(mape(y_test, lin_reg.predict(X_test))))

test MAPE = 14.713743361792867%
