# Linear Regreesion/Lasso/Ridge

This notebook shows our model training by using linear regresson, Lasso and ridge.

Import the libraries:

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc

from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

## 1. Read Input

In [2]:
gc.enable()
df_train = pd.read_csv('train_P1.csv', dtype={'fullVisitorId': 'str'})
df_test = pd.read_csv('test_P1.csv', dtype={'fullVisitorId': 'str'})

In [3]:
df_trn, df_val = train_test_split(df_train, test_size=0.33)

y_train = df_train['totals.transactionRevenue']
y_test = np.log1p(df_test['totals.transactionRevenue'].fillna(0))
y_trn = df_trn['totals.transactionRevenue']
y_val = df_val['totals.transactionRevenue']

Id_train = df_train['fullVisitorId']
Id_test  = df_test['fullVisitorId']
Id_trn = df_trn['fullVisitorId']
Id_val  = df_val['fullVisitorId']

X_train = df_train.drop(['fullVisitorId', 'totals.transactionRevenue'], axis=1)
X_test = df_test.drop(['fullVisitorId', 'totals.transactionRevenue'], axis=1)
X_trn = df_trn.drop(['fullVisitorId', 'totals.transactionRevenue'], axis=1)
X_val = df_val.drop(['fullVisitorId', 'totals.transactionRevenue'], axis=1)

del df_train
del df_test
del df_trn
del df_val

standarization:

In [4]:
col_name = X_train.columns

scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns = col_name)
X_test = pd.DataFrame(scaler.transform(X_test), columns = col_name)

Define some functions:

In [5]:
def initialize_list():
    Coef_list = pd.DataFrame(columns=X_train.columns)
    RMSE_list = pd.DataFrame(columns=['RMSE_train','RMSE_val','RMSE_test'])
    
    return Coef_list, RMSE_list

def update_list(Coef_list, RMSE_list, Coef, RMSE):
    Coef_list = Coef_list.append(Coef,ignore_index=True)
    RMSE_list = RMSE_list.append(RMSE,ignore_index=True)
    
    return Coef_list, RMSE_list

def fit_data(reg, X_trn,X_val,X_test,y_trn,y_val,y_test):
    reg.fit(X_trn,y_trn)
    y_pred_trn = reg.predict(X_trn)
    y_pred_val = reg.predict(X_val)
    y_pred_test = reg.predict(X_test)

    RMSE_trn = np.sqrt(mean_squared_error(y_trn, y_pred_trn))   
    RMSE_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
    RMSE_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    Coef = pd.DataFrame.transpose(pd.DataFrame(reg.coef_, index = X_trn.columns))
    RMSE = pd.DataFrame([[RMSE_trn,RMSE_val,RMSE_test]],columns=['RMSE_train','RMSE_val','RMSE_test'])
    
    return Coef, RMSE

def RMSE_group(Id, y, yhat, setName):
    df_final = pd.DataFrame({"fullVisitorId":Id})
    df_final['PredictedLogRevenue'] = np.expm1(yhat)

    df_final = df_final.groupby('fullVisitorId')['PredictedLogRevenue'].sum().reset_index()
    df_final.columns = ['fullVisitorId', 'PredictedLogRevenue']
    df_final['PredictedLogRevenue'] = np.log1p(df_final['PredictedLogRevenue'])
    
    
    df_true = pd.DataFrame({"fullVisitorId":Id})
    df_true['TrueLogRevenue'] = np.expm1(y)
        
    df_true = df_true.groupby('fullVisitorId')['TrueLogRevenue'].sum().reset_index()
    df_true.columns = ['fullVisitorId', 'TrueLogRevenue']
    df_true['TrueLogRevenue'] = np.log1p(df_true['TrueLogRevenue'])

    print('RMSE on ', setName, ' set:', 
      np.sqrt(mean_squared_error(df_final['PredictedLogRevenue'], df_true['TrueLogRevenue'])))


## 2. Linear Regression

In [6]:
lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)

yhat_train = lm.predict(X_train)
yhat_test = lm.predict(X_test)
yhat_train[yhat_train<0] = 0
yhat_test[yhat_test<0] = 0

lm.fit(X_trn, y_trn)
yhat_val=lm.predict(X_val)
yhat_val[yhat_val<0] = 0
lm.fit(X_train, y_train)
yhat_train=lm.predict(X_train)
yhat_train[yhat_train<0] = 0
yhat_test=lm.predict(X_test)
yhat_test[yhat_test<0] = 0
print('RMSE on \ntraining set:{:.6f}\n     val set:{:.6f}\n    test set:{:.6f}'.format(np.sqrt(mean_squared_error(yhat_train, y_train)),
                                                                                np.sqrt(mean_squared_error(yhat_val, y_val)),
                                                               np.sqrt(mean_squared_error(yhat_test, y_test))))

RMSE on 
training set:1.706329
     val set:1.695255
    test set:1.480720


In [7]:
pd.Series(yhat_test).describe()

count    343084.000000
mean          0.333143
std           0.871962
min           0.000000
25%           0.000000
50%           0.000000
75%           0.243247
max          46.676039
dtype: float64

Postprocessing:

In [8]:
RMSE_group(Id_train, y_train, yhat_train, 'train')
RMSE_group(Id_val, y_val, yhat_val, 'validation')
RMSE_group(Id_test, y_test, yhat_test, 'test')

RMSE on  train  set: 1.7426793886092309
RMSE on  validation  set: 1.718036810870003
RMSE on  test  set: 1.5627121971808704


## 3. Ridge

In [9]:
lam_list_ridge = np.arange(0, 100000, 10000)

In [10]:
Coef_ridge_train,RMSE_ridge_train = initialize_list()

for lam in lam_list_ridge:
    print(lam)
    reg = linear_model.Ridge(alpha = lam, fit_intercept=False)
    Coef,RMSE = fit_data(reg, X_trn, X_val, X_test, y_trn, y_val, y_test)
    Coef_ridge_train,RMSE_ridge_train = update_list(Coef_ridge_train, RMSE_ridge_train, Coef, RMSE)
    
Coef_ridge_train = pd.concat([pd.DataFrame(lam_list_ridge, columns=['lambda']), Coef_ridge_train], axis = 1)
Table_ridge_train = pd.concat([pd.DataFrame(lam_list_ridge, columns=['lambda']), RMSE_ridge_train], axis = 1)
Table_ridge_train

0
10000
20000
30000
40000
50000
60000
70000
80000
90000


Unnamed: 0,lambda,RMSE_train,RMSE_val,RMSE_test
0,0,1.716557,1.700471,1.567363
1,10000,1.716606,1.700402,1.567946
2,20000,1.716659,1.700368,1.567694
3,30000,1.716708,1.70034,1.567433
4,40000,1.716753,1.700316,1.567248
5,50000,1.716795,1.700293,1.567127
6,60000,1.716836,1.700272,1.567052
7,70000,1.716876,1.700253,1.567011
8,80000,1.716914,1.700235,1.566995
9,90000,1.716953,1.700219,1.566997


Postprocessing:

In [11]:
reg = linear_model.Ridge(alpha = 0, fit_intercept=False)
Coef,RMSE = fit_data(reg, X_trn, X_val, X_test, y_trn, y_val, y_test)

In [12]:
RMSE_group(Id_train, y_train, yhat_train, 'train')
RMSE_group(Id_val, y_val, yhat_val, 'validation')
RMSE_group(Id_test, y_test, yhat_test, 'test')

RMSE on  train  set: 1.7426793886092309
RMSE on  validation  set: 1.718036810870003
RMSE on  test  set: 1.5627121971808704


## 4. Lasso

In [13]:
lam_list_lasso = np.arange(1, 11, 1)

In [14]:
Coef_lasso_train,RMSE_lasso_train = initialize_list()

for lam in lam_list_lasso:
    print(lam)
    reg = linear_model.Lasso(alpha = lam, fit_intercept=False)
    Coef,RMSE = fit_data(reg, X_trn, X_val, X_test, y_trn, y_val, y_test)
    Coef_lasso_train,RMSE_lasso_train = update_list(Coef_lasso_train, RMSE_lasso_train, Coef, RMSE)
    
Coef_lasso_train = pd.concat([pd.DataFrame(lam_list_lasso, columns=['lambda']), Coef_lasso_train], axis = 1)
Table_lasso_train = pd.concat([pd.DataFrame(lam_list_lasso, columns=['lambda']), RMSE_lasso_train], axis = 1)
Table_lasso_train

1
2
3
4
5
6
7
8
9
10


Unnamed: 0,lambda,RMSE_train,RMSE_val,RMSE_test
0,1,1.749128,1.724481,1.596405
1,2,1.77565,1.749797,1.611463
2,3,1.812421,1.785508,1.623116
3,4,1.828518,1.801122,1.625309
4,5,1.828535,1.801113,1.62531
5,6,1.828556,1.801108,1.625312
6,7,1.828581,1.801107,1.625313
7,8,1.82861,1.801109,1.625314
8,9,1.828643,1.801116,1.625315
9,10,1.82868,1.801126,1.625317


Postprocessing:

In [15]:
reg = linear_model.Lasso(alpha = 1, fit_intercept=False)
Coef,RMSE = fit_data(reg, X_trn, X_val, X_test, y_trn, y_val, y_test)

In [16]:
RMSE_group(Id_train, y_train, yhat_train, 'train')
RMSE_group(Id_val, y_val, yhat_val, 'validation')
RMSE_group(Id_test, y_test, yhat_test, 'test')

RMSE on  train  set: 1.7426793886092309
RMSE on  validation  set: 1.718036810870003
RMSE on  test  set: 1.5627121971808704
