# Importing modules

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt  

# !pip install catboost
import catboost

# Data processing

In [4]:
# get the preferences data
data_pref_raw = pd.read_csv("SCI_and_preferences.csv", encoding="utf-8").iloc[:, 1:]
data_pref_raw = data_pref_raw.drop_duplicates(subset=data_pref_raw.columns[2:]).reset_index(drop=True)
data_pref_raw.scaled_sci = np.log(data_pref_raw.scaled_sci)

pref_features = list(data_pref_raw.columns[2:-1])
log_SCI = [data_pref_raw.columns[-1]]

In [5]:
# get the exogenous variables (containing extra regions and duplicates)
data_exog_raw = pd.read_csv("SCI_and_exogenous_variables.csv", encoding="utf-8").iloc[:, 1:]
data_exog_raw["log_distance"] = np.log(data_exog_raw.distance)

exog_features = list(data_exog_raw.columns[2:])

In [6]:
# combine the datasets omitting extra regions and duplicates
data_joint = pd.merge(data_pref_raw, data_exog_raw,  how="inner", left_on=["user_loc", "fr_loc"], right_on=["user_loc", "fr_loc"])

In [7]:
# create two datasets with equal number of rows, but different sets of predictors
data_pref = data_joint.loc[:, log_SCI + pref_features] # preferences
data_exog = data_joint.loc[:, log_SCI + exog_features] # exogenous variables

# Building models for prediction of $log(SCI)$

## 1) $log(SCI)$ ~ preferences

In [37]:
X = data_pref.loc[:, pref_features]
y = data_pref.loc[:, log_SCI]

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, train_size=0.8, random_state=1)

In [39]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [40]:
boosting_reg = catboost.CatBoostRegressor(eval_metric='RMSE', logging_level='Silent', random_state=0)

In [41]:
boosting_reg.grid_search({'iterations': [500, 1000], 'learning_rate': [0.1, 0.3]}, 
                           X_train, y_train, plot=True, refit=True, cv=5);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	loss: 0.7144601	best: 0.7144601 (0)	total: 17.7s	remaining: 53.2s
1:	loss: 0.7245154	best: 0.7144601 (0)	total: 35.5s	remaining: 35.5s
2:	loss: 0.6756735	best: 0.6756735 (2)	total: 1m 12s	remaining: 24.2s
3:	loss: 0.7151705	best: 0.6756735 (2)	total: 1m 48s	remaining: 0us
Estimating final quality...


In [42]:
boosting_reg.get_params()

{'eval_metric': 'RMSE',
 'iterations': 1000,
 'learning_rate': 0.1,
 'logging_level': 'Silent',
 'loss_function': 'RMSE',
 'random_state': 0}

In [13]:
# define MAPE score
def mape(actual, pred):
    return np.mean(np.abs((actual - pred) / actual)) * 100

In [43]:
print('train MAE = {}'.format(mean_absolute_error(y_train, boosting_reg.predict(X_train))))
print('test MAE = {}'.format(mean_absolute_error(y_test, boosting_reg.predict(X_test))))
print('train MAPE = {}%'.format(mape(y_train, boosting_reg.predict(X_train))))
print('test MAPE = {}%'.format(mape(y_test, boosting_reg.predict(X_test))))
print('train R_squared = {}'.format(r2_score(y_train, boosting_reg.predict(X_train))))
print('test R_squared = {}'.format(r2_score(y_test, boosting_reg.predict(X_test))))

train MAE = 0.2599625916993611
test MAE = 0.4816478655500689
train MAPE = 23.71867982563325%
test MAPE = 23.279117383529517%
train R_squared = 0.9410961922875284
test R_squared = 0.7878284976336678


In [15]:
lin_reg = LinearRegression()

lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [16]:
print('train MAPE = {}%'.format(mape(y_train, lin_reg.predict(X_train))))
print('test MAPE = {}%'.format(mape(y_test, lin_reg.predict(X_test))))
print('train R_squared = {}'.format(r2_score(y_train, lin_reg.predict(X_train))))
print('test R_squared = {}'.format(r2_score(y_test, lin_reg.predict(X_test))))

train MAPE = 14.623900595851513%
test MAPE = 14.713743361792867%
train R_squared = 0.3685975165720533
test R_squared = 0.37609748763088213


## 2) $log(SCI)$ ~ exogenous variables

In [44]:
X = data_exog.loc[:, exog_features]
y = data_exog.loc[:, log_SCI]

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, train_size=0.8, random_state=1)

In [46]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [30]:
boosting_reg = catboost.CatBoostRegressor(eval_metric='RMSE', logging_level='Silent', random_state=0)

In [31]:
boosting_reg.grid_search({'iterations': [300, 500, 700], 'learning_rate': [0.1, 0.3, 0.5]}, 
                           X_train, y_train, plot=True, refit=True, cv=5);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	loss: 0.8353100	best: 0.8353100 (0)	total: 957ms	remaining: 7.66s
1:	loss: 0.8345792	best: 0.8345792 (1)	total: 1.93s	remaining: 6.74s
2:	loss: 0.8377145	best: 0.8345792 (1)	total: 2.9s	remaining: 5.8s
3:	loss: 0.8353100	best: 0.8345792 (1)	total: 4.48s	remaining: 5.61s
4:	loss: 0.8345792	best: 0.8345792 (1)	total: 6.08s	remaining: 4.87s
5:	loss: 0.8377145	best: 0.8345792 (1)	total: 7.68s	remaining: 3.84s
6:	loss: 0.8353100	best: 0.8345792 (1)	total: 9.91s	remaining: 2.83s
7:	loss: 0.8345792	best: 0.8345792 (1)	total: 12.1s	remaining: 1.51s
8:	loss: 0.8377145	best: 0.8345792 (1)	total: 14.4s	remaining: 0us
Estimating final quality...


In [35]:
boosting_reg.get_params()

{'eval_metric': 'RMSE',
 'iterations': 300,
 'learning_rate': 0.3,
 'logging_level': 'Silent',
 'loss_function': 'RMSE',
 'random_state': 0}

In [36]:
print('train MAPE = {}%'.format(mape(y_train, boosting_reg.predict(X_train))))
print('test MAPE = {}%'.format(mape(y_test, boosting_reg.predict(X_test))))
print('train R_squared = {}'.format(r2_score(y_train, boosting_reg.predict(X_train))))
print('test R_squared = {}'.format(r2_score(y_test, boosting_reg.predict(X_test))))

train MAPE = 21.207769835043266%
test MAPE = 21.779977453726058%
train R_squared = 0.6629703385885135
test R_squared = 0.6505219247073549


In [47]:
lin_reg = LinearRegression()

lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [48]:
print('train MAPE = {}%'.format(mape(y_train, lin_reg.predict(X_train))))
print('test MAPE = {}%'.format(mape(y_test, lin_reg.predict(X_test))))
print('train R_squared = {}'.format(r2_score(y_train, lin_reg.predict(X_train))))
print('test R_squared = {}'.format(r2_score(y_test, lin_reg.predict(X_test))))

train MAPE = 12.591634870545901%
test MAPE = 12.672519870149001%
train R_squared = 0.5981079185491187
test R_squared = 0.6107835826712049
