# Regression Models in R (tips)

In [1]:
library(readr)
library(mechkar)
library(dplyr)
library(ggplot2)

"package 'dplyr' was built under R version 4.0.2"

Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


"package 'ggplot2' was built under R version 4.0.2"


In [2]:

df <- read.csv(../data/tips.csv)

Parsed with column specification:
cols(
  holiday = [31mcol_character()[39m,
  temp = [32mcol_double()[39m,
  rain_1h = [32mcol_double()[39m,
  snow_1h = [32mcol_double()[39m,
  clouds_all = [32mcol_double()[39m,
  weather_main = [31mcol_character()[39m,
  weather_description = [31mcol_character()[39m,
  date_time = [34mcol_datetime(format = "")[39m,
  traffic_volume = [32mcol_double()[39m
)



In [3]:
head(df)
dim(df)

holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dttm>,<dbl>
,288.28,0,0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
,289.36,0,0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
,289.58,0,0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
,290.13,0,0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
,291.14,0,0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918
,291.72,0,0,1,Clear,sky is clear,2012-10-02 14:00:00,5181


# EDA

In [4]:
summary(df)

   holiday               temp          rain_1h            snow_1h         
 Length:48204       Min.   :  0.0   Min.   :   0.000   Min.   :0.0000000  
 Class :character   1st Qu.:272.2   1st Qu.:   0.000   1st Qu.:0.0000000  
 Mode  :character   Median :282.4   Median :   0.000   Median :0.0000000  
                    Mean   :281.2   Mean   :   0.334   Mean   :0.0002224  
                    3rd Qu.:291.8   3rd Qu.:   0.000   3rd Qu.:0.0000000  
                    Max.   :310.1   Max.   :9831.300   Max.   :0.5100000  
   clouds_all     weather_main       weather_description
 Min.   :  0.00   Length:48204       Length:48204       
 1st Qu.:  1.00   Class :character   Class :character   
 Median : 64.00   Mode  :character   Mode  :character   
 Mean   : 49.36                                         
 3rd Qu.: 90.00                                         
 Max.   :100.00                                         
   date_time                   traffic_volume
 Min.   :2012-10-02 09:00:00  

In [13]:
summary(df)

                      holiday           temp          rain_1h        
 None                     :48143   Min.   :  0.0   Min.   :   0.000  
 Labor Day                :    7   1st Qu.:272.2   1st Qu.:   0.000  
 Christmas Day            :    6   Median :282.4   Median :   0.000  
 Martin Luther King Jr Day:    6   Mean   :281.2   Mean   :   0.334  
 New Years Day            :    6   3rd Qu.:291.8   3rd Qu.:   0.000  
 Thanksgiving Day         :    6   Max.   :310.1   Max.   :9831.300  
 (Other)                  :   30                                     
    snow_1h            clouds_all      weather_main         weather_description
 Min.   :0.0000000   Min.   :  0.00   Clouds :15164   sky is clear    :11665   
 1st Qu.:0.0000000   1st Qu.:  1.00   Clear  :13391   mist            : 5950   
 Median :0.0000000   Median : 64.00   Mist   : 5950   overcast clouds : 5081   
 Mean   :0.0002224   Mean   : 49.36   Rain   : 5672   broken clouds   : 4666   
 3rd Qu.:0.0000000   3rd Qu.: 90.00   Sn

# DATASET PARTITION

In [15]:
tab1 <- train_test(data=df,train_name="train",test_name="test",prop=0.7,seed=5,tableone=TRUE)
tab1

Dataset partitioned into:

 + Train dataset: train

 + Test dataset: test

"The following variables have unique values and will not be included in the analysis: "




 

You got a perfectly balanced training and test datasets

 



V1,V2,Pop,1,2,pval
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Individuals,n,48204,33742,14462,
holiday,Christmas Day,6 (0.0%),5 (0.0%),1 (0.0%),
holiday,Columbus Day,5 (0.0%),2 (0.0%),3 (0.0%),0.716
holiday,Independence Day,5 (0.0%),2 (0.0%),3 (0.0%),
holiday,Labor Day,7 (0.0%),3 (0.0%),4 (0.0%),
holiday,Martin Luther King Jr Day,6 (0.0%),3 (0.0%),3 (0.0%),
holiday,Memorial Day,5 (0.0%),3 (0.0%),2 (0.0%),
holiday,New Years Day,6 (0.0%),4 (0.0%),2 (0.0%),
holiday,,"48,143 (99.9%)","33,710 (99.9%)","14,433 (99.8%)",
holiday,State Fair,5 (0.0%),1 (0.0%),4 (0.0%),


In [16]:
tab1 %>% filter(pval < 0.05)

V1,V2,Pop,1,2,pval
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
traintest_ind_,Median (IQR),1.0 (1.0-2.0),,,0


# MODELS

In [17]:
### The error we will use is the RMSE and RMSLE
rmse <- function(y,y_hat) {
    err <- sqrt(sum((y_hat-y)^2,na.rm=T)/length(y))
    return(err)
}

rmsle <- function(y,y_hat) {
    err <- sqrt(sum((log(y_hat+1)-log(y+1))^2,na.rm=T)/length(y))
    return(err)
}


In [18]:
### Table of resulting errors
### Name, Model, RMSE, RMSLE
err_res <- NULL

## Linear Models

In [20]:
## model with only the original variables
mod1 <- lm(tip ~., data=train)
summary(mod1)


Call:
lm(formula = traffic_volume ~ ., data = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-5171.1 -1627.8   -81.9  1514.8  6491.0 

Coefficients: (11 not defined because of singularities)
                                                         Estimate Std. Error
(Intercept)                                            -3084.3422   850.1374
holidayColumbus Day                                     -572.6340  1527.9083
holidayIndependence Day                                   -7.9221  1527.9341
holidayLabor Day                                         103.4222  1333.9556
holidayMartin Luther King Jr Day                        -127.4329  1333.5332
holidayMemorial Day                                      -95.5211  1333.6732
holidayNew Years Day                                     263.5315  1225.2343
holidayNone                                             1084.4241   816.9740
holidayState Fair                                       -736.0851  2000.1034
holidayThanksgiving Day  

In [21]:
pred1 <- predict(mod1,newdata=test)
rmse(test$tip,pred1)
rmsle(test$tip,pred1)
err_res <- rbind(err_res, data.frame(Name="Base Linear regression", Model="mod1", 
                                     RMSE=rmse(test$tip,pred1), 
                                     RMSLE=rmsle(test$tip,pred1)))

ERROR: Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels): factor weather_description has new levels shower snow, thunderstorm with drizzle


In [46]:
## model with all the variables
mod2 <- lm(tip ~., data=train)
summary(mod2)


Call:
lm(formula = traffic_volume ~ ., data = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-4653.7  -399.4    -7.6   490.3  3175.5 

Coefficients: (3 not defined because of singularities)
                           Estimate Std. Error t value Pr(>|t|)    
(Intercept)               -969.7498   205.5091  -4.719 2.38e-06 ***
holiday1                   -85.0700   125.2393  -0.679 0.496978    
temp                         7.4783     0.7592   9.851  < 2e-16 ***
rain_1h                    -29.3448     4.5463  -6.455 1.10e-10 ***
snow_1h                   -376.3486   550.3242  -0.684 0.494064    
clouds_all                  -1.1722     0.1841  -6.367 1.95e-10 ***
weather_mainClouds          75.7197    27.3652   2.767 0.005660 ** 
weather_mainDrizzle         31.4907   186.8627   0.169 0.866173    
weather_mainFog            -90.5065    40.9239  -2.212 0.027002 *  
weather_mainHaze             8.0339    36.6493   0.219 0.826487    
weather_mainMist            -1.1421    29.3550  

In [47]:
pred2 <- predict(mod2,newdata=test)
rmse(test$tip,pred2)
rmsle(test$tip,pred2)
err_res <- rbind(err_res, data.frame(Name="Extended Linear regression", Model="mod2", 
                                     RMSE=rmse(test$tip,pred2), 
                                     RMSLE=rmsle(test$tip,pred2)))

## Desicion trees

In [48]:
library(tree)
library(rpart)

In [49]:
mod3 <- tree(tip ~., data=train)
mod3

node), split, n, deviance, yval
      * denotes terminal node

 1) root 33735 1.333e+11 3265.0  
   2) hour: 0,1,2,3,4,5,20,21,22,23 14202 1.559e+10 1403.0  
     4) hour: 0,1,2,3,4,23 8567 2.125e+09  716.5 *
     5) hour: 5,20,21,22 5635 3.296e+09 2446.0 *
   3) hour: 6,7,8,9,10,11,12,13,14,15,16,17,18,19 19533 3.266e+10 4619.0  
     6) weekday: Saturday,Sunday 5466 8.371e+09 3460.0  
      12) hour: 6,7,8 1229 6.333e+08 1681.0 *
      13) hour: 9,10,11,12,13,14,15,16,17,18,19 4237 2.718e+09 3976.0 *
     7) weekday: Friday,Monday,Thursday,Tuesday,Wednesday 14067 1.410e+10 5070.0  
      14) hour: 9,10,11,12,13,18,19 6998 3.785e+09 4472.0  
        28) hour: 19 1011 1.865e+08 3288.0 *
        29) hour: 9,10,11,12,13,18 5987 1.940e+09 4673.0 *
      15) hour: 6,7,8,14,15,16,17 7069 5.344e+09 5661.0 *

In [50]:
pred3 <- predict(mod3,newdata=test)
rmse(test$tip,pred3)
rmsle(test$tip,pred3)
err_res <- rbind(err_res, data.frame(Name="Decision Trees-tree", Model="mod3", 
                                     RMSE=rmse(test$tip,pred3), 
                                     RMSLE=rmsle(test$tip,pred3)))

In [51]:
mod4 <- rpart(tip ~., data=train)
mod4

n= 33735 

node), split, n, deviance, yval
      * denotes terminal node

 1) root 33735 133336700000 3265.2030  
   2) hour=0,1,2,3,4,5,20,21,22,23 14202  15585880000 1402.6780  
     4) hour=0,1,2,3,4,23 8567   2124686000  716.5288 *
     5) hour=5,20,21,22 5635   3295873000 2445.8430 *
   3) hour=6,7,8,9,10,11,12,13,14,15,16,17,18,19 19533  32663340000 4619.4030  
     6) weekday=Saturday,Sunday 5466   8370710000 3460.3710  
      12) hour=6,7,8 1229    633278500 1681.1450 *
      13) hour=9,10,11,12,13,14,15,16,17,18,19 4237   2718337000 3976.4600 *
     7) weekday=Friday,Monday,Thursday,Tuesday,Wednesday 14067  14096690000 5069.7660  
      14) hour=9,10,11,12,13,18,19 6998   3784788000 4472.4930  
        28) hour=19 1011    186461500 3287.7770 *
        29) hour=9,10,11,12,13,18 5987   1939718000 4672.5510 *
      15) hour=6,7,8,14,15,16,17 7069   5344096000 5661.0410 *

In [52]:
pred4 <- predict(mod4,newdata=test)
rmse(test$tip,pred4)
rmsle(test$tip,pred4)
err_res <- rbind(err_res, data.frame(Name="Decision Trees-rpart", Model="mod4", 
                                     RMSE=rmse(test$tip,pred4), 
                                     RMSLE=rmsle(test$tip,pred4)))

## Random Forest

In [53]:
library(randomForest)
library(ranger)

randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.

Attaching package: ‘randomForest’

The following object is masked from ‘package:ggplot2’:

    margin

The following object is masked from ‘package:dplyr’:

    combine


Attaching package: ‘ranger’

The following object is masked from ‘package:randomForest’:

    importance



In [54]:
#mod5 <- randomForest(tip ~., data=train)
#mod5

In [55]:
#pred5 <- predict(mod5,newdata=test)
#rmse(test$tip,pred5)
#rmsle(test$tip,pred5)

In [56]:
mod6 <- ranger(tip ~., data=train)
mod6

Ranger result

Call:
 ranger(traffic_volume ~ ., data = train) 

Type:                             Regression 
Number of trees:                  500 
Sample size:                      33735 
Number of independent variables:  15 
Mtry:                             3 
Target node size:                 5 
Variable importance mode:         none 
Splitrule:                        variance 
OOB prediction error (MSE):       329935.8 
R squared (OOB):                  0.9165267 

In [61]:
pred6 <- predict(mod6,data=test)
#head(pred6)
rmse(test$tip,pred6$predictions)
rmsle(test$tip,pred6$predictions)
err_res <- rbind(err_res, data.frame(Name="RandomForest (ranger)", Model="mod6", 
                                     RMSE=rmse(test$tip,pred6$predictions), 
                                     RMSLE=rmsle(test$tip,pred6$predictions)))

## XGBoost

In [90]:
library(xgboost)


Attaching package: ‘xgboost’

The following object is masked from ‘package:dplyr’:

    slice



In [91]:
train1 <- Matrix::sparse.model.matrix(tip ~ .-1, data = train)

In [92]:
test1 <- Matrix::sparse.model.matrix(tip ~ .-1, data = test)

In [93]:
#X_train <- xgb.DMatrix(train1)
X_train <- train1
y_train <- train$tip
mod7 <- xgboost(data=X_train,label=y_train, nrounds=100,print_every_n = 10)

[1]	train-rmse:2857.081055 
[11]	train-rmse:858.049011 
[21]	train-rmse:670.329956 
[31]	train-rmse:602.543091 
[41]	train-rmse:571.487122 
[51]	train-rmse:545.621826 
[61]	train-rmse:525.861023 
[71]	train-rmse:509.539185 
[81]	train-rmse:493.998077 
[91]	train-rmse:480.588318 
[100]	train-rmse:468.334961 


In [94]:
#X_test <- xgb.DMatrix(test1)
X_test <- test1
y_test <- test$tip

pred7 <- predict(mod7,newdata=X_test)
rmse(y_test,pred7)
rmsle(y_test,pred7)
err_res <- rbind(err_res, data.frame(Name="XGBoost", Model="mod7", 
                                     RMSE=rmse(test$tip,pred7), 
                                     RMSLE=rmsle(test$tip,pred7)))

## kNN 

In [95]:
### adaboost needs that values to be normalized
min_max <- function(x) { (x -min(x))/(max(x)-min(x))   }

In [96]:
X_train <- sapply(data.frame(as.matrix(train1)),min_max)

In [97]:
X_test <- sapply(data.frame(as.matrix(test1)),min_max)

In [98]:
summary(X_train)

    holiday0         holiday1             temp           rain_1h        
 Min.   :0.0000   Min.   :0.000000   Min.   :0.0000   Min.   :0.000000  
 1st Qu.:1.0000   1st Qu.:0.000000   1st Qu.:0.4369   1st Qu.:0.000000  
 Median :1.0000   Median :0.000000   Median :0.5932   Median :0.000000  
 Mean   :0.9987   Mean   :0.001275   Mean   :0.5751   Mean   :0.002263  
 3rd Qu.:1.0000   3rd Qu.:0.000000   3rd Qu.:0.7349   3rd Qu.:0.000000  
 Max.   :1.0000   Max.   :1.000000   Max.   :1.0000   Max.   :1.000000  
    snow_1h            clouds_all    weather_mainClouds weather_mainDrizzle
 Min.   :0.0000000   Min.   :0.000   Min.   :0.0000     Min.   :0.00000    
 1st Qu.:0.0000000   1st Qu.:0.010   1st Qu.:0.0000     1st Qu.:0.00000    
 Median :0.0000000   Median :0.640   Median :0.0000     Median :0.00000    
 Mean   :0.0004365   Mean   :0.493   Mean   :0.3146     Mean   :0.03762    
 3rd Qu.:0.0000000   3rd Qu.:0.900   3rd Qu.:1.0000     3rd Qu.:0.00000    
 Max.   :1.0000000   Max.   :1.00

In [99]:
library(class)
mod8 <- knn(X_train,X_test,cl=train$tip)

In [101]:
str(mod8)

 Factor w/ 6491 levels "0","1","2","3",..: 3818 4423 4823 3219 2245 5110 179 203 5038 4739 ...


In [102]:
pred8 <- as.numeric(as.character(mod8))

rmse(test$tip,pred8)
rmsle(test$tip,pred8)
err_res <- rbind(err_res, data.frame(Name="kNN", Model="mod8", 
                                     RMSE=rmse(test$tip,pred8), 
                                     RMSLE=rmsle(test$tip,pred8)))

## SVM

In [103]:
#install.packages("liquidSVM")
library(liquidSVM)

mod9 <- svm(tip ~., train)

In [104]:
pred9 <- predict(mod9, newdata=test)

rmse(test$tip,pred9)
rmsle(test$tip,pred9)
err_res <- rbind(err_res, data.frame(Name="SVM", Model="mod9", 
                                     RMSE=rmse(test$tip,pred9), 
                                     RMSLE=rmsle(test$tip,pred9)))

In [105]:
err_res %>% arrange(RMSLE)

Name,Model,RMSE,RMSLE
XGBoost,mod7,525.4015,0.3331306
RandomForest (ranger),mod6,563.4734,0.3636757
Decision Trees-tree,mod3,684.0402,0.440038
Decision Trees-rpart,mod4,684.0402,0.440038
SVM,mod9,778.8321,0.4847811
Extended Linear regression,mod2,813.0075,0.5508977
kNN,mod8,2069.2175,0.9964388
Base Linear regression,mod1,1940.8681,1.0118488
