In [56]:
# A deep neural network (DNNRegressor) regression example using the TensorFlow library.

from __future__ import print_function

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.contrib import learn
from sklearn import metrics
import random

# Shorthand for NumPy's random module.
rng = np.random

# Training hyper-parameters
num_epochs, STEPS, BATCH_SIZE = 1000, 10000, 1000


# Load the raw CSV
datapath = "/Users/tuanle/DynamicPricing/Data/"
csv_file = datapath + "OnlineDrivers_HaNoi_10days.csv"
Ha_Noi = pd.read_csv(csv_file)

# Keep the un-shifted acceptance rate as its own column ...
# sLength = len(Ha_Noi['accept_rate'])
Ha_Noi['accept_rate_timeT'] = Ha_Noi['accept_rate']
# ... then move accept_rate up by one row, so each row pairs its own
# accept_rate_timeT with the accept_rate of the row that follows it
# (presumably the next time step -- confirm the CSV is time-ordered).
Ha_Noi['accept_rate'] = Ha_Noi['accept_rate'].shift(-1)

# Discard rows with a missing value in any column that must be present.
required_columns = ["longwait_percent4", "accept_rate", "longwait_percent2", "DriverBusyRate"]
Ha_Noi = Ha_Noi.dropna(subset=required_columns)

# Discard rows with no change in online drivers or a busy rate below 0.1.
unusable_rows = (Ha_Noi.Percentchange_onlinedrivers == 0) | (Ha_Noi.DriverBusyRate < 0.1)
Ha_Noi = Ha_Noi.drop(Ha_Noi.index[unusable_rows])

# Natural log of the online-driver count as an extra column.
Ha_Noi['log_onlinedrivers'] = np.log(Ha_Noi.online_drivers)

# Normalisation helper, currently disabled:
# def normalize(array):
#     return (array - array.mean()) / array.std()

# Report how many rows survived the cleaning steps.
df2 = pd.DataFrame(Ha_Noi)
print(df2.shape[0])


# Split the dataset into training and testing sets.
# A fixed seed keeps the split -- and every metric derived from it --
# identical across re-runs of the notebook. The previous seed came from
# random.randint(20, 200), so each run trained and evaluated on a
# different partition and the printed errors could not be reproduced.
SPLIT_SEED = 42
train_set, test_set = train_test_split(Ha_Noi, test_size=0.2, random_state=SPLIT_SEED)

# Single source of truth for the model inputs/target, so the training and
# testing arrays can never drift apart.
FEATURE_COLUMNS = ['longwait_percent2', 'accept_rate_timeT', 'Percentchange_onlinedrivers', 'DriverBusyRate']
TARGET_COLUMN = 'accept_rate'

# Training Data (float32 arrays: one row per sample, one column per feature)
train_X = np.array(train_set[FEATURE_COLUMNS], dtype=np.float32)
train_Y = np.array(train_set[TARGET_COLUMN], dtype=np.float32)
n_samples = train_X.shape[0]

# Testing Data (same columns and dtype as the training arrays)
Xtest = np.array(test_set[FEATURE_COLUMNS], dtype=np.float32)
Ytest = np.array(test_set[TARGET_COLUMN], dtype=np.float32)

# Deep Neural Network Regressor: two hidden layers of 50 units each,
# with feature columns inferred from the training matrix.
feature_column1 = learn.infer_real_valued_columns_from_input(train_X)
# feature_column2 = learn.infer_real_valued_columns_from_input(train_X2)
regressor = learn.DNNRegressor(hidden_units=[50, 50], feature_columns=feature_column1)
regressor.fit(train_X, train_Y, batch_size=BATCH_SIZE, max_steps=STEPS)

# predict_scores is asked for an iterable here; materialise it into an array.
score_iter = regressor.predict_scores(Xtest, as_iterable=True)
Ypred = np.asarray(list(score_iter))

# Root mean squared error of the predictions on the held-out test set.
squared_error = np.square(Ypred - Ytest)
rmse = np.sqrt(squared_error.mean(axis=0))
print("Root mean square Error: %.3f" % rmse)






359
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_id': 0, '_master': '', '_save_checkpoints_steps': None, '_save_summary_steps': 100, '_evaluation_master': '', '_num_worker_replicas': 0, '_task_type': None, '_tf_random_seed': None, '_keep_checkpoint_every_n_hours': 10000, '_environment': 'local', '_save_checkpoints_secs': 600, '_model_dir': '/var/folders/tn/4g20x2q119jdjnlf36j_8j_m0000gn/T/tmpr6syfp1v', '_is_chief': True, '_keep_checkpoint_max': 5, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11b79a400>, '_num_ps_replicas': 0, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_session_config': None}
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Inst

  equality = a == b


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/tn/4g20x2q119jdjnlf36j_8j_m0000gn/T/tmpr6syfp1v/model.ckpt.
INFO:tensorflow:loss = 0.655576, step = 1
INFO:tensorflow:global_step/sec: 398.797
INFO:tensorflow:loss = 0.00357647, step = 101 (0.252 sec)
INFO:tensorflow:global_step/sec: 497.357
INFO:tensorflow:loss = 0.0035468, step = 201 (0.200 sec)
INFO:tensorflow:global_step/sec: 569.389
INFO:tensorflow:loss = 0.00352227, step = 301 (0.176 sec)
INFO:tensorflow:global_step/sec: 504.179
INFO:tensorflow:loss = 0.00350649, step = 401 (0.198 sec)
INFO:tensorflow:global_step/sec: 568.757
INFO:tensorflow:loss = 0.00349178, step = 501 (0.176 sec)
INFO:tensorflow:global_step/sec: 526.53
INFO:tensorflow:loss = 0.00348365, step = 601 (0.190 sec)
INFO:tensorflow:global_step/sec: 542.811
INFO:tensorflow:loss = 0.00347786, step = 701 (0.184 sec)
INFO:tensorflow:global_step/sec: 539.231
INFO:tensorflow:loss = 0.00347275, step = 801 (0.186 sec)
INFO:

# Correlation testing between acceptance rate, online drivers and percent change in online drivers

In [18]:
# Pairwise column correlations of the cleaned table; the cell output ranks
# the columns by their correlation with accept_rate, highest first.
corr_matrix = Ha_Noi.corr()
corr_matrix.loc[:, "accept_rate"].sort_values(ascending=False)

# %matplotlib inline
# import matplotlib.pyplot as plt
# df2.hist(bins = 50, figsize = (15, 15))



accept_rate                    1.000000
accept_rate_timeT              0.766752
log_onlinedrivers              0.335491
online_drivers                 0.323698
Percentchange_onlinedrivers    0.265362
DriverBusyRate                -0.081558
request                       -0.089699
long_waiting                  -0.584625
longwait_percent1             -0.736557
longwait_percent2             -0.743146
longwait_percent4             -0.743859
longwait_percent3             -0.744769
Name: accept_rate, dtype: float64

# Random Forest Algorithm and Model Evaluations using Cross-Validation

In [57]:
import numpy as np
from sklearn.preprocessing import LabelEncoder  
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression


# Fixed seed so the forest -- and every score computed from it below --
# is reproducible across re-runs (an unseeded forest changes every run).
FOREST_SEED = 42

forest_reg = RandomForestRegressor(random_state=FOREST_SEED)
forest_reg.fit(train_X, train_Y.ravel())
Ypred2 = forest_reg.predict(Xtest)

lin_reg = LinearRegression()


# Hold-out RMSE of the random forest (Ypred2 are the forest predictions).
forest_mse = mean_squared_error(Ytest, Ypred2)
forest_rmse = np.sqrt(forest_mse)
lin_mse = forest_mse  # legacy name kept for compatibility; this is the forest MSE
print("Root Mean Square Error of RF Algo:\t",forest_rmse)

# Hold-out RMSE of the DNN regressor (Ypred are the DNN predictions).
dnn_mse = mean_squared_error(Ytest, Ypred)
dnn_rmse = np.sqrt(dnn_mse)
lin_mse2, lin_rmse = dnn_mse, dnn_rmse  # legacy names kept for compatibility; these are DNN metrics
print("Root Mean Square Error of DNN Algo:\t", dnn_rmse)

# Evaluate the RF algo on the training set with 10-fold cross-validation.
# The scorer returns negated MSE, hence the sign flip before the square root.
scores = cross_val_score(forest_reg, train_X, train_Y.ravel(), scoring = "neg_mean_squared_error", cv = 10)
forest_rmse_scores = np.sqrt(-scores)

# Evaluate the RF algo on the test set with 10-fold cross-validation.
scores3 = cross_val_score(forest_reg, Xtest, Ytest.ravel(), scoring = "neg_mean_squared_error", cv = 10)
forest_rmse_scores3 = np.sqrt(-scores3)

# Evaluate the Lin-Reg algo on the training set by leave-one-out
# cross-validation: cv = len(train_X) gives one fold per sample, so each
# per-fold "RMSE" is the absolute error on a single held-out sample.
scores2 = cross_val_score(lin_reg, train_X, train_Y.ravel(), scoring = "neg_mean_squared_error", cv = len(train_X))
linreg_rmse_scores2 = np.sqrt(-scores2)

# Evaluate the Lin-Reg algo on the test set, again leave-one-out
# (one fold per test sample).
scores4 = cross_val_score(lin_reg, Xtest, Ytest.ravel(), scoring = "neg_mean_squared_error", cv = len(Xtest))
linreg_rmse_scores4 = np.sqrt(-scores4)


def display_scores(scores):
    """Print an array of scores followed by its mean, std, max and min.

    Args:
        scores: array-like object exposing ``mean``, ``std``, ``max`` and
            ``min`` methods (e.g. a NumPy array of per-fold RMSE values).

    Returns:
        None. The summary is written to standard output, one line per item.
    """
    print("Scores:", scores)
    # Each statistic is looked up and evaluated just before it is printed.
    for label, stat_name in (("Mean:", "mean"), ("Standard", "std"), ("Max:", "max"), ("Min:", "min")):
        print(label, getattr(scores, stat_name)())

# Summarise the leave-one-out linear-regression scores (train, then test).
display_scores(linreg_rmse_scores2)
display_scores(linreg_rmse_scores4)

# Mean absolute error on the held-out test set for each model.
# Ypred2 holds the random-forest predictions and Ypred the DNN predictions;
# these two assignments were previously swapped, so each variable carried
# the other model's error.
lin_mae_RF = mean_absolute_error(Ytest, Ypred2)
lin_mae_DNN = mean_absolute_error(Ytest, Ypred)

# display_scores(Accept_rate_prediction)
# print("Mean Square Error:\t", linreg_rmse_scores2)
# print("Mean Absolute Error:\t", lin_mae)

Root Mean Square Error of RF Algo:	 0.0479646140219
Root Mean Square Error of DNN Algo:	 0.0484971
Scores: [  1.49921952e-02   5.24170954e-02   5.26724005e-02   2.92611346e-02
   1.45782964e-02   2.79729234e-02   2.55782253e-02   4.53886508e-02
   5.69399600e-02   1.04844831e-02   2.84607175e-02   1.65080457e-01
   4.44059223e-02   2.41274281e-02   9.49695115e-04   5.56148100e-03
   2.23169518e-04   1.14140786e-02   5.92193886e-02   3.00681501e-02
   9.21124540e-02   3.75927412e-03   5.00135469e-02   8.45393281e-02
   5.55044475e-03   2.35133487e-01   2.60741736e-02   7.19403784e-02
   1.11308526e-02   2.51296419e-02   5.76826247e-02   5.21830875e-02
   7.03167819e-02   3.56306026e-03   1.04477988e-01   7.08061587e-02
   3.68297783e-02   3.86519696e-02   4.01505412e-02   3.45718963e-02
   1.66753051e-02   1.88675348e-02   1.01839854e-01   1.71553144e-02
   2.99609165e-02   2.94957953e-02   3.80529463e-03   7.54459159e-02
   6.09166671e-02   9.95185710e-02   5.69595077e-02   1.31114782e