In [129]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import numpy.random as rnd
import os

# to make this notebook's output stable across runs
seednumber = rnd.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "fundamentals"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

    

In [130]:
import pandas as pd

datapath = "/Users/tuanle/DynamicPricing/Data/"

Ha_Noi = pd.read_csv(datapath+"HaNoi_3month_LWT.csv")
#Add an additional column into the table
sLength = len(Ha_Noi['accept_rate'])
Ha_Noi['accept_rate_timeT'] = pd.Series(Ha_Noi['accept_rate'], index=Ha_Noi.index)
#Shift the entries in the accept_rate column upward
Ha_Noi.accept_rate = Ha_Noi.accept_rate.shift(-1)

Ha_Noi = Ha_Noi.dropna(subset = ["longwait_percent4"])
Ha_Noi = Ha_Noi.dropna(subset=["accept_rate"])
Ha_Noi = Ha_Noi.dropna(subset = ["longwait_percent2"])
df2 = pd.DataFrame(Ha_Noi)
 

# threshold for request
# df2 = df2.drop(df2[(df2.request < 15)].index)
#Dong_Da.describe()

In [131]:
corr_matrix = Ha_Noi.corr()
corr_matrix["accept_rate"].sort_values(ascending=False)

accept_rate          1.000000
accept_rate_timeT    0.757362
request             -0.264904
long_waiting        -0.653561
longwait_percent1   -0.735186
longwait_percent2   -0.741524
longwait_percent4   -0.742168
longwait_percent3   -0.742899
Name: accept_rate, dtype: float64

In [132]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random

train_set, test_set = train_test_split(Ha_Noi, test_size=0.2, random_state= random.randint(40, 200))
Xtrain = train_set['longwait_percent2'].reshape(-1,1)
Ytrain = train_set['accept_rate'].reshape(-1,1)

Xtest = test_set['longwait_percent2'].reshape(-1,1)
Ytest = test_set['accept_rate'].reshape(-1,1)

Xtrain2 = train_set[['longwait_percent2', 'accept_rate_timeT']]
Xtest2 = test_set[['longwait_percent2', 'accept_rate_timeT']]





# Linear Regression - ML model

In [133]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
result = model.fit(Xtrain, Ytrain)

model2 = LinearRegression()
result2 = model2.fit(Xtrain2, Ytrain)


In [134]:
Accept_rate_prediction = model.predict(Xtest)
Accept_rate_train  = model.predict(Xtrain)

Accept_rate_prediction2 = model2.predict(Xtest2)
Accept_rate_train2 = model2.predict(Xtrain2)


In [135]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

lin_mse = mean_squared_error(Ytest, Accept_rate_prediction2)
lin_rmse = np.sqrt(lin_mse) #taking square root of MSE converts the units back to the original units of the output variable

lin_mae = mean_absolute_error(Ytest, Accept_rate_prediction2)
print("Mean Square Error:\t",lin_rmse)
# print("Mean Absolute Error:\t", lin_mae)

lin_reg = LinearRegression()
#Evaluate Lin-Reg algo on the whole training set by cross-validation
scores2 = cross_val_score(lin_reg, Xtrain, Ytrain.ravel(), scoring = "neg_mean_squared_error", cv = 100)
linreg_rmse_scores2 = np.sqrt(-scores2)

#Evaluate Lin-Reg algo on the whole test set by cross-validation
scores4 = cross_val_score(lin_reg, Xtest, Ytest.ravel(), scoring = "neg_mean_squared_error", cv = 100)
linreg_rmse_scores4 = np.sqrt(-scores4)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard:", scores.std())
    print("Max:", scores.max())
    print("Min:", scores.min())

# display_scores(-scores2)
display_scores(-scores4)

r_squared = r2_score(Ytest, Accept_rate_prediction) #built-in formula for R^2 
print("R^2:\t", r_squared) 

adjusted_Rsquared = 1 - (1-r_squared)*(len(Xtest)-1)/(len(Xtest)-1-1) #formula for adjusted R^2 by theory
print("Adjusted R^2:\t", adjusted_Rsquared) #correct the error when adding more and more independent variables to the linear regression model


# lin_mse = mean_squared_error(Ytrain, Accept_rate_train)
# lin_rmse = np.sqrt(lin_mse)
# print("Mean Squred Error_train:\t", lin_rmse)

Mean Square Error:	 0.0578004772197
Scores: [ 0.0029248   0.00189466  0.00483605  0.00209622  0.00130198  0.00164246
  0.00231686  0.00362979  0.00180117  0.00461553  0.00250365  0.0010258
  0.00131888  0.00112202  0.00139695  0.00374178  0.00434781  0.00111361
  0.00681554  0.00344193  0.00147712  0.00272718  0.00188037  0.00727646
  0.00319633  0.00429922  0.01359191  0.00206998  0.00207949  0.00836108
  0.00145494  0.00483344  0.00127086  0.00266644  0.00155137  0.01051266
  0.00137555  0.00282087  0.01019255  0.00769242  0.01068033  0.00126362
  0.00262418  0.0045139   0.01044291  0.00545069  0.00709477  0.00817063
  0.00182809  0.00065894  0.00370172  0.00460224  0.00063118  0.00462414
  0.00346366  0.00422658  0.00280235  0.00171392  0.0055925   0.00329271
  0.00714694  0.00246459  0.00276448  0.00621169  0.00287723  0.00310797
  0.00291965  0.00764012  0.00144257  0.00086711  0.00161267  0.00184941
  0.00492247  0.00838847  0.00267388  0.00067411  0.01914487  0.0025709
  0.00265

# Plot of RMSE errors when testing our models on training set vs test set

In [136]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

def plot_learning_curves(model):
    train_errors, test_errors = [], []
    for m in range(1, len(Xtrain)):
        model.fit(Xtrain[:m], Ytrain[:m])
        Ytrain_predict = model.predict(Xtrain[:m])
        Ytest_predict = model.predict(Xtest)
        train_errors.append(mean_squared_error(Ytrain_predict, Ytrain[:m]))
        test_errors.append(mean_squared_error(Ytest_predict, Ytest))
#         plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="training set")
#         plt.plot(np.sqrt(test_errors), "b-", linewidth=3, label="validation set")
    print("Average RMSE on the training set is: %.3f" %np.mean(np.sqrt(train_errors)))
    print("Average RMSE on the test set is: %.3f" %np.mean(np.sqrt(test_errors)))

def plot_learning_curves2(model):   
    train_errors2, test_errors2 = [], []
    for n in range(1, len(Xtrain)):
        model.fit(Xtrain2[:n], Ytrain[:n])
        Ytrain_predict2 = model.predict(Xtrain2[:n])
        Ytest_predict2 = model.predict(Xtest2)
        train_errors2.append(mean_squared_error(Ytrain_predict2, Ytrain[:n]))
        test_errors2.append(mean_squared_error(Ytest_predict2, Ytest))
#         plt.plot(np.sqrt(train_errors2), "g-+", linewidth=2, label="training set")
#         plt.plot(np.sqrt(test_errors2), "o-", linewidth=3, label="validation set")
    print("Average RMSE2 on the training set is: %.3f" %np.mean(np.sqrt(train_errors2)))
    print("Average RMSE2 on the test set is: %.3f" %np.mean(np.sqrt(test_errors2)))
        

polynomial_regression = Pipeline((("poly_features", PolynomialFeatures(degree=2, include_bias=False)),
                                  ("sgd_reg", LinearRegression()),
    ))

        
lin_reg = LinearRegression()
plot_learning_curves2(lin_reg)
plot_learning_curves(lin_reg)









Average RMSE2 on the training set is: 0.060
Average RMSE2 on the test set is: 0.058
Average RMSE on the training set is: 0.063
Average RMSE on the test set is: 0.063


In [137]:
from sklearn.externals import joblib
filename = 'my_model_3monthdata.sav'
joblib.dump(model, filename)

# load the model from disk
loaded_model = joblib.load(filename)
result = loaded_model.score(Xtest, Ytest) #R^2 result
print(result)

0.51053917054
