In [1]:
# load package
import numpy as np
import pandas as pd

# set path
import os
default_path = "/Users/francislin/Desktop/Machine_Learning_Workshop/linear_regression/"
os.chdir(default_path)

# skip interation warning
import warnings; 
warnings.filterwarnings('ignore')

In [2]:
# Data Prepararion
# read data
review_data = pd.read_csv("Example_LasVegasTrip.csv")

print(review_data.head())


  User country  Nr. reviews  Nr. hotel reviews  Helpful votes  Score  \
0          USA           11                  4             13      5   
1          USA          119                 21             75      3   
2          USA           36                  9             25      5   
3           UK           14                  7             14      4   
4       Canada            5                  5              2      4   

  Period of stay Traveler type Pool  Gym Tennis court Spa Casino  \
0        Dec-Feb       Friends   NO  YES           NO  NO    YES   
1        Dec-Feb      Business   NO  YES           NO  NO    YES   
2        Mar-May      Families   NO  YES           NO  NO    YES   
3        Mar-May       Friends   NO  YES           NO  NO    YES   
4        Mar-May          Solo   NO  YES           NO  NO    YES   

  Free internet                              Hotel name  Hotel stars  \
0           YES  Circus Circus Hotel & Casino Las Vegas            3   
1           YE

In [3]:

#data description
review_data.head
review_data.values
review_data.shape
review_data.columns
review_data.index
review_data.info()
review_data.dtypes
review_data.describe()

#one hot encoding
category_var = ['User country', 'Period of stay', 'Pool', 'Gym', 'Tennis court','Spa','Casino','Free internet', 'Traveler type','Hotel name','User continent','Review month','Review weekday']
for col in category_var: 
    review_data[col] = review_data[col].astype('category')
    dummies = pd.get_dummies(review_data.loc[:, col], prefix=col ) 
    review_data = pd.concat( [review_data, dummies], axis = 1)

# drop original variable
fields_to_drop =  category_var
review_data = review_data.drop(fields_to_drop, axis = 1 )

# replace missing value with zero
review_data = review_data.fillna(review_data.mean())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 20 columns):
User country         504 non-null object
Nr. reviews          504 non-null int64
Nr. hotel reviews    504 non-null int64
Helpful votes        504 non-null int64
Score                504 non-null int64
Period of stay       504 non-null object
Traveler type        504 non-null object
Pool                 504 non-null object
Gym                  504 non-null object
Tennis court         504 non-null object
Spa                  504 non-null object
Casino               504 non-null object
Free internet        504 non-null object
Hotel name           504 non-null object
Hotel stars          504 non-null int64
Nr. rooms            408 non-null float64
User continent       408 non-null object
Member years         408 non-null float64
Review month         408 non-null object
Review weekday       408 non-null object
dtypes: float64(2), int64(5), object(13)
memory usage: 78.8+ KB


In [4]:
# Split to Training and Testing
#from sklearn import cross_validation
from sklearn.model_selection import train_test_split
seed = 7
test_size = 0.3
X = review_data.loc[:, review_data.columns != 'Score']
y = review_data[['Score']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [5]:
# Linear Regression (OLS)
# load package
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test)
print(regr.score(X_test, y_test, sample_weight=None))

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
regr_mse = mean_squared_error(y_test, y_pred)

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))
regr_score = r2_score(y_test, y_pred)

-0.2366563347014048
Coefficients: 
 [[ 2.64652589e-04  1.60420539e-06  1.79533303e-04  2.33841549e-01
  -1.46983707e-04 -1.09650791e-03  2.81607029e-01 -8.63709753e-01
   1.57877061e+00  1.12318057e-01 -2.12923536e+00  2.46707331e-01
  -3.15563730e-01 -6.90495590e-16  2.56351761e-01 -2.82779079e-01
  -3.21675960e-01  5.53904393e-01  4.11722530e-01 -9.54401478e-01
   2.31491902e-03  7.54306526e-16  4.31824295e-01 -7.16769637e-02
  -1.19388197e-01 -1.19087449e+00  4.31574689e-01 -4.75324254e-01
   2.17650017e-02 -7.60994741e-01 -1.39708987e-15  5.03329834e-01
  -6.79120237e-01  3.23356609e-02 -1.77276555e-01  1.15837561e+00
   1.97374821e-01  1.19983169e-02  1.00348586e+00  5.28300664e-16
   1.99798318e+00 -1.37190094e-15  1.84197616e+00 -4.17365208e-01
  -4.85462680e-01 -1.05901280e+00  5.81995645e-16  2.07311472e-01
  -9.70840496e-01  5.49315236e-01 -1.42752185e-02  1.61149710e-01
   1.83712419e-01 -8.88231695e-01  2.70931593e-01  3.53932504e-01
  -2.71031338e-01 -3.53832759e-01 -2.420

In [6]:
# Ridge regression
# set parameter
alphas = np.logspace(-4, 30, 30) # default base = 10
#May's demo: alphas = np.logspace(-4, -0.5, 30) n_folds = 3 Mean squared error: 1.19 Variance score: -0.19
tuned_parameters = [{'alpha': alphas}]
n_folds = 10

# load package
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# model
model = Ridge()
ridge = GridSearchCV(estimator=model, param_grid = tuned_parameters, cv=n_folds, refit=False)
ridge.fit(X_train, y_train)

# Final Model
ridge_final = Ridge(alpha = ridge.best_params_['alpha'])
ridge_final.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = ridge_final.predict(X_test)
# ridge_score = ridge_final.score(X_test, y_test, sample_weight=None)
print(ridge_final.score(X_test, y_test, sample_weight=None))

# The coefficients
print('Coefficients: \n', ridge_final.coef_)

# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
ridge_mse = mean_squared_error(y_test, y_pred)

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))
ridge_score = r2_score(y_test, y_pred)

-0.011041744621945293
Coefficients: 
 [[ 5.05402594e-07  5.17679071e-09  1.87850756e-07  6.47577755e-08
  -3.69864253e-05 -2.02618255e-06  3.07742922e-09 -1.47153612e-09
   1.87708651e-09 -1.03735381e-08 -3.93733802e-09 -2.79297518e-10
  -1.40500682e-09  0.00000000e+00  1.11661930e-09 -4.46999088e-09
  -2.29220313e-10  1.13952656e-09  3.25493508e-10 -2.66839098e-09
  -5.13293033e-10  0.00000000e+00 -1.30440499e-10  1.33680772e-09
  -2.02618523e-10 -2.11220254e-10  3.95005272e-09 -2.85049737e-10
  -2.32774814e-10 -1.42114116e-09  0.00000000e+00 -8.45551304e-11
  -8.68404050e-11 -1.52194925e-10 -1.36060495e-10  5.28259147e-09
   8.29571163e-10 -4.53114640e-09  2.10100934e-09  0.00000000e+00
   1.15460728e-09  0.00000000e+00  1.13850133e-09  9.75351394e-10
  -2.66840162e-09 -2.75415173e-09  0.00000000e+00  1.18657394e-09
  -1.35353188e-09  1.11678660e-09 -2.88197408e-10  7.20125011e-09
   7.68742957e-09 -1.61075097e-09  1.58980682e-08 -5.92082922e-09
  -3.00155561e-09 -6.97568339e-09 -2.4



In [7]:
# LASSO
# load packages
from sklearn.linear_model import Lasso

# model
lasso = Lasso(random_state=0, normalize = True)
clf = GridSearchCV(lasso,  param_grid= tuned_parameters, cv=n_folds, refit=False)
clf.fit(X_train, y_train)

# Final Model
clf_final = Lasso(alpha = clf.best_params_['alpha'])
clf_final.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = clf_final.predict(X_test)
print(clf_final.score(X_test, y_test, sample_weight=None))

# The coefficients
print('Coefficients: \n', clf_final.coef_)

# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
clf_mse = mean_squared_error(y_test, y_pred)

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))
clf_score = r2_score(y_test, y_pred)

-0.14712049671718175
Coefficients: 
 [ 3.04364359e-04 -1.46105127e-04  1.25021037e-04  2.98834902e-01
 -1.40951976e-04 -1.01677051e-03  2.66961369e-02 -5.39997180e-01
  0.00000000e+00 -9.21545632e-02 -1.12615389e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.74921363e-01
 -1.02600719e-01  0.00000000e+00  9.64139915e-02 -5.06449532e-01
 -0.00000000e+00  0.00000000e+00  0.00000000e+00  2.28099189e-01
  0.00000000e+00 -1.57785401e-02  2.37220474e-01 -0.00000000e+00
 -0.00000000e+00 -5.21234972e-01  0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00  6.48708964e-01
  0.00000000e+00 -1.40145810e-01  4.99521374e-01  0.00000000e+00
  1.10843215e+00  0.00000000e+00  1.22063798e+00  0.00000000e+00
 -0.00000000e+00 -9.25973994e-01  0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00 -8.61334451e-02  3.07532676e-01  3.00223261e-01
 -5.88467606e-02 -9.22803614e-02 -5.32974606e-01  0.0



In [8]:
# compare
print('------------------------------------------------')
print('Compare Score: \n', round(regr_score,4), round(ridge_score,4), round(clf_score,4))
print('Compare MSE: \n', round(regr_mse,4), round(ridge_mse,4), round(clf_mse,4))

------------------------------------------------
Compare Score: 
 -0.2367 -0.011 -0.1471
Compare MSE: 
 1.2315 1.0068 1.1424
