# Regression Models in R (tips)

In [1]:
library(readr)
library(mechkar)
library(dplyr)
library(ggplot2)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [3]:

df <- read.csv("../data/tips.csv")

In [4]:
head(df)
dim(df)

total_bill,tip,sex,smoker,day,time,size
<dbl>,<dbl>,<fct>,<fct>,<fct>,<fct>,<int>
16.99,1.01,Female,No,Sun,Dinner,2
10.34,1.66,Male,No,Sun,Dinner,3
21.01,3.5,Male,No,Sun,Dinner,3
23.68,3.31,Male,No,Sun,Dinner,2
24.59,3.61,Female,No,Sun,Dinner,4
25.29,4.71,Male,No,Sun,Dinner,4


# EDA

In [5]:
summary(df)

   total_bill         tip             sex      smoker      day         time    
 Min.   : 3.07   Min.   : 1.000   Female: 87   No :151   Fri :19   Dinner:176  
 1st Qu.:13.35   1st Qu.: 2.000   Male  :157   Yes: 93   Sat :87   Lunch : 68  
 Median :17.80   Median : 2.900                          Sun :76               
 Mean   :19.79   Mean   : 2.998                          Thur:62               
 3rd Qu.:24.13   3rd Qu.: 3.562                                                
 Max.   :50.81   Max.   :10.000                                                
      size     
 Min.   :1.00  
 1st Qu.:2.00  
 Median :2.00  
 Mean   :2.57  
 3rd Qu.:3.00  
 Max.   :6.00  

# DATASET PARTITION

In [75]:
tab1 <- train_test(data=df,train_name="train",test_name="test",prop=0.7,seed=5,tableone=TRUE)
tab1

Dataset partitioned into:

 + Train dataset: train

 + Test dataset: test





You got a perfectly balanced training and test datasets

 



V1,V2,Pop,1,2,pval
<chr>,<chr>,<chr>,<fct>,<fct>,<fct>
Individuals,n,244,170,74,
total_bill,Mean (SD),19.8 (8.9),19.6 (8.9),20.3 (8.9),
total_bill,Median (IQR),17.8 (13.3-24.1),17.8 (13.4-23.2),17.7 (13.4-25.5),0.545
tip,Mean (SD),3.0 (1.4),3.0 (1.4),3.0 (1.3),
tip,Median (IQR),2.9 (2.0-3.6),3.0 (2.0-3.6),2.9 (2.0-3.5),0.932
sex,Female,87 (35.7%),63 (37.1%),24 (32.4%),
sex,Male,157 (64.3%),107 (62.9%),50 (67.6%),0.584
smoker,Yes,93 (38.1%),65 (38.2%),28 (37.8%),1.0
day,Fri,19 (7.8%),16 (9.4%),3 (4.1%),
day,Sat,87 (35.7%),58 (34.1%),29 (39.2%),0.944


In [11]:
tab1 %>% filter(pval < 0.05)

V1,V2,Pop,1,2,pval
<chr>,<chr>,<chr>,<fct>,<fct>,<fct>


In [12]:
summary(train)

   total_bill         tip             sex      smoker      day         time    
 Min.   : 3.07   Min.   : 1.000   Female: 63   No :105   Fri :16   Dinner:119  
 1st Qu.:13.38   1st Qu.: 2.000   Male  :107   Yes: 65   Sat :58   Lunch : 51  
 Median :17.80   Median : 3.000                          Sun :51               
 Mean   :19.56   Mean   : 3.003                          Thur:45               
 3rd Qu.:23.15   3rd Qu.: 3.578                                                
 Max.   :50.81   Max.   :10.000                                                
      size      
 Min.   :1.000  
 1st Qu.:2.000  
 Median :2.000  
 Mean   :2.553  
 3rd Qu.:3.000  
 Max.   :6.000  

# MODELS

In [6]:
### The error we will use is the RMSE and RMSLE
rmse <- function(y,y_hat) {
    err <- sqrt(sum((y_hat-y)^2,na.rm=T)/length(y))
    return(err)
}

rmsle <- function(y,y_hat) {
    err <- sqrt(sum((log(y_hat+1)-log(y+1))^2,na.rm=T)/length(y))
    return(err)
}


In [7]:
### Table of resulting errors
### Name, Model, RMSE, RMSLE
err_res <- NULL

## Linear Models

In [13]:
## model with only the original variables
mod1 <- lm(tip ~., data=train)
summary(mod1)


Call:
lm(formula = tip ~ ., data = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.9615 -0.4672 -0.0690  0.3670  3.9368 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.74383    0.40027   1.858    0.065 .  
total_bill   0.09734    0.01168   8.331 3.32e-14 ***
sexMale     -0.21030    0.16936  -1.242    0.216    
smokerYes    0.17068    0.17518   0.974    0.331    
daySat      -0.13139    0.35826  -0.367    0.714    
daySun       0.15808    0.36983   0.427    0.670    
dayThur     -0.10149    0.40477  -0.251    0.802    
timeLunch   -0.05436    0.46934  -0.116    0.908    
size         0.18151    0.10640   1.706    0.090 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.024 on 161 degrees of freedom
Multiple R-squared:  0.5034,	Adjusted R-squared:  0.4787 
F-statistic:  20.4 on 8 and 161 DF,  p-value: < 2.2e-16


In [14]:
pred1 <- predict(mod1,data=test)
rmse(test$tip,pred1)
rmsle(test$tip,pred1)
err_res <- rbind(err_res, data.frame(Name="Base Linear regression", Model="mod1", 
                                     RMSE=rmse(test$tip,pred1), 
                                     RMSLE=rmsle(test$tip,pred1)))

## Desicion trees

In [15]:
library(tree)
library(rpart)

In [16]:
mod3 <- tree(tip ~., data=train)
mod3

node), split, n, deviance, yval
      * denotes terminal node

 1) root 170 339.700 3.003  
   2) total_bill < 20.47 111  77.120 2.437  
     4) total_bill < 16.315 71  34.920 2.105  
       8) total_bill < 13.2 40  21.540 1.906  
        16) day: Fri,Sat,Thur 32   4.464 1.720 *
        17) day: Sun 8  11.550 2.649 *
       9) total_bill > 13.2 31   9.727 2.363 *
     5) total_bill > 16.315 40  20.490 3.027  
      10) smoker: No 29  13.540 2.821 *
      11) smoker: Yes 11   2.463 3.570 *
   3) total_bill > 20.47 59 160.200 4.068  
     6) total_bill < 44.23 54  88.870 3.811  
      12) day: Fri,Sat 20  29.560 3.182 *
      13) day: Sun,Thur 34  46.760 4.180  
        26) size < 3.5 16  14.340 3.611 *
        27) size > 3.5 18  22.620 4.687  
          54) total_bill < 29.825 9  13.210 4.238 *
          55) total_bill > 29.825 9   5.777 5.136 *
     7) total_bill > 44.23 5  29.200 6.846 *

In [17]:
pred3 <- predict(mod3,newdata=test)
rmse(test$tip,pred3)
rmsle(test$tip,pred3)
err_res <- rbind(err_res, data.frame(Name="Decision Trees-tree", Model="mod3", 
                                     RMSE=rmse(test$tip,pred3), 
                                     RMSLE=rmsle(test$tip,pred3)))

In [18]:
mod4 <- rpart(tip ~., data=train)
mod4

n= 170 

node), split, n, deviance, yval
      * denotes terminal node

 1) root 170 339.747800 3.003294  
   2) total_bill< 20.47 111  77.119490 2.437477  
     4) total_bill< 16.315 71  34.918760 2.105493  
       8) total_bill< 13.2 40  21.536380 1.905750  
        16) day=Fri,Sat,Thur 32   4.463800 1.720000 *
        17) day=Sun 8  11.552090 2.648750 *
       9) total_bill>=13.2 31   9.727277 2.363226 *
     5) total_bill>=16.315 40  20.485880 3.026750  
      10) smoker=No 29  13.544990 2.820690 *
      11) smoker=Yes 11   2.463200 3.570000 *
   3) total_bill>=20.47 59 160.235000 4.067797  
     6) total_bill< 40.87 52  85.926900 3.764808  
      12) day=Fri,Sat 20  29.555520 3.182000 *
      13) day=Sun,Thur 32  45.332270 4.129063  
        26) size< 3.5 16  14.336890 3.610625 *
        27) size>=3.5 16  22.394500 4.647500 *
     7) total_bill>=40.87 7  34.072490 6.318571 *

In [19]:
pred4 <- predict(mod4,newdata=test)
rmse(test$tip,pred4)
rmsle(test$tip,pred4)
err_res <- rbind(err_res, data.frame(Name="Decision Trees-rpart", Model="mod4", 
                                     RMSE=rmse(test$tip,pred4), 
                                     RMSLE=rmsle(test$tip,pred4)))

## Random Forest

In [20]:
library(randomForest)
library(ranger)

randomForest 4.6-14

Type rfNews() to see new features/changes/bug fixes.


Attaching package: ‘randomForest’


The following object is masked from ‘package:ggplot2’:

    margin


The following object is masked from ‘package:dplyr’:

    combine



Attaching package: ‘ranger’


The following object is masked from ‘package:randomForest’:

    importance




In [21]:
mod5 <- randomForest(tip ~., data=train)
mod5


Call:
 randomForest(formula = tip ~ ., data = train) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 2

          Mean of squared residuals: 1.143062
                    % Var explained: 42.8

In [24]:
pred5 <- predict(mod5,newdata=test)
rmse(test$tip,pred5)
rmsle(test$tip,pred5)
err_res <- rbind(err_res, data.frame(Name="RandomForest (RF)", Model="mod5", 
                                     RMSE=rmse(test$tip,pred5), 
                                     RMSLE=rmsle(test$tip,pred5)))

In [28]:
mod6 <- ranger(tip ~., data=train)
mod6

Ranger result

Call:
 ranger(tip ~ ., data = train) 

Type:                             Regression 
Number of trees:                  500 
Sample size:                      170 
Number of independent variables:  6 
Mtry:                             2 
Target node size:                 5 
Variable importance mode:         none 
Splitrule:                        variance 
OOB prediction error (MSE):       1.160737 
R squared (OOB):                  0.4226173 

In [30]:
pred6 <- predict(mod6,data=test)
#head(pred6)
rmse(test$tip,pred6$predictions)
rmsle(test$tip,pred6$predictions)
err_res <- rbind(err_res, data.frame(Name="RandomForest (ranger)", Model="mod6", 
                                     RMSE=rmse(test$tip,pred6$predictions), 
                                     RMSLE=rmsle(test$tip,pred6$predictions)))

## XGBoost

In [31]:
library(xgboost)


Attaching package: ‘xgboost’


The following object is masked from ‘package:dplyr’:

    slice




In [91]:
#train1 <- Matrix::sparse.model.matrix(tip ~ .-1, data = train)

In [92]:
#test1 <- Matrix::sparse.model.matrix(tip ~ .-1, data = test)

In [39]:
summary(train)

   total_bill         tip             sex      smoker      day         time    
 Min.   : 3.07   Min.   : 1.000   Female: 63   No :105   Fri :16   Dinner:119  
 1st Qu.:13.38   1st Qu.: 2.000   Male  :107   Yes: 65   Sat :58   Lunch : 51  
 Median :17.80   Median : 3.000                          Sun :51               
 Mean   :19.56   Mean   : 3.003                          Thur:45               
 3rd Qu.:23.15   3rd Qu.: 3.578                                                
 Max.   :50.81   Max.   :10.000                                                
      size      
 Min.   :1.000  
 1st Qu.:2.000  
 Median :2.000  
 Mean   :2.553  
 3rd Qu.:3.000  
 Max.   :6.000  

In [76]:
train1 <- train
train1$sex <- ifelse(train$sex=='Female',1,0)
train1$smoker <- ifelse(train$smoker=='Yes',1,0)
train1$time <- ifelse(train$time=='Dinner',1,0) 
train1$day_Fri <- ifelse(train$day=='Fri',1,0) 
train1$day_Sat <- ifelse(train$day=='Sat',1,0) 
train1$day_Sun <- ifelse(train$day=='Sun',1,0) 
train1$day <- NULL

In [77]:
test1 <- test
test1$sex <- ifelse(test$sex=='Female',1,0)
test1$smoker <- ifelse(test$smoker=='Yes',1,0)
test1$time <- ifelse(test$time=='Dinner',1,0) 
test1$day_Fri <- ifelse(test$day=='Fri',1,0) 
test1$day_Sat <- ifelse(test$day=='Sat',1,0) 
test1$day_Sun <- ifelse(test$day=='Sun',1,0) 
test1$day <- NULL
summary(test1)

   total_bill         tip             sex             smoker      
 Min.   : 7.25   Min.   :1.000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:13.37   1st Qu.:2.000   1st Qu.:0.0000   1st Qu.:0.0000  
 Median :17.74   Median :2.855   Median :0.0000   Median :0.0000  
 Mean   :20.31   Mean   :2.987   Mean   :0.3243   Mean   :0.3784  
 3rd Qu.:25.47   3rd Qu.:3.538   3rd Qu.:1.0000   3rd Qu.:1.0000  
 Max.   :44.30   Max.   :7.580   Max.   :1.0000   Max.   :1.0000  
      time             size          day_Fri           day_Sat      
 Min.   :0.0000   Min.   :1.000   Min.   :0.00000   Min.   :0.0000  
 1st Qu.:1.0000   1st Qu.:2.000   1st Qu.:0.00000   1st Qu.:0.0000  
 Median :1.0000   Median :2.000   Median :0.00000   Median :0.0000  
 Mean   :0.7703   Mean   :2.608   Mean   :0.04054   Mean   :0.3919  
 3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:0.00000   3rd Qu.:1.0000  
 Max.   :1.0000   Max.   :5.000   Max.   :1.00000   Max.   :1.0000  
    day_Sun      
 Min.   :0.0000  
 1st Qu.:0.0

In [78]:
nm <- setdiff(names(train1),"tip")
nm

In [45]:
summary(train1)

   total_bill         tip              sex             smoker      
 Min.   : 3.07   Min.   : 1.000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:13.38   1st Qu.: 2.000   1st Qu.:0.0000   1st Qu.:0.0000  
 Median :17.80   Median : 3.000   Median :0.0000   Median :0.0000  
 Mean   :19.56   Mean   : 3.003   Mean   :0.3706   Mean   :0.3824  
 3rd Qu.:23.15   3rd Qu.: 3.578   3rd Qu.:1.0000   3rd Qu.:1.0000  
 Max.   :50.81   Max.   :10.000   Max.   :1.0000   Max.   :1.0000  
      time          size          day_Fri           day_Sat          day_Sun   
 Min.   :0.0   Min.   :1.000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0  
 1st Qu.:0.0   1st Qu.:2.000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0  
 Median :1.0   Median :2.000   Median :0.00000   Median :0.0000   Median :0.0  
 Mean   :0.7   Mean   :2.553   Mean   :0.09412   Mean   :0.3412   Mean   :0.3  
 3rd Qu.:1.0   3rd Qu.:3.000   3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:1.0  
 Max.   :1.0   Max.   :6.000   Max.   :1.000

In [80]:
#X_train <- xgb.DMatrix(data=train1[,nm],labels=train$tip)
#X_train <- xgb.DMatrix(data=as.matrix(train1[,nm]))
#X_train <- train1
#y_train <- train1$tip
#mod7 <- xgboost(data=X_train,label=y_train, nrounds=100,print_every_n = 10)

In [68]:
X_train <- data.matrix(train1[,nm])
y_train <- train1$tip
mod7 <- xgboost(data=X_train, label = y_train, nrounds=100,print_every_n = 10)

[1]	train-rmse:2.152302 
[11]	train-rmse:0.473452 
[21]	train-rmse:0.318180 
[31]	train-rmse:0.228823 
[41]	train-rmse:0.159094 
[51]	train-rmse:0.119651 
[61]	train-rmse:0.094507 
[71]	train-rmse:0.066804 
[81]	train-rmse:0.049393 
[91]	train-rmse:0.036365 
[100]	train-rmse:0.026892 


In [79]:
#X_test <- xgb.DMatrix(test1)
X_test <- data.matrix(test1[,nm])
y_test <- test1$tip

pred7 <- predict(mod7,newdata=X_test)
rmse(y_test,pred7)
rmsle(y_test,pred7)
err_res <- rbind(err_res, data.frame(Name="XGBoost", Model="mod7", 
                                     RMSE=rmse(test$tip,pred7), 
                                     RMSLE=rmsle(test$tip,pred7)))

## kNN 

In [54]:
### adaboost needs that values to be normalized
min_max <- function(x) { (x -min(x))/(max(x)-min(x))   }

In [96]:
#X_train <- sapply(data.frame(as.matrix(train1)),min_max)


In [97]:
#X_test <- sapply(data.frame(as.matrix(test1)),min_max)

In [98]:
#summary(X_train)

    holiday0         holiday1             temp           rain_1h        
 Min.   :0.0000   Min.   :0.000000   Min.   :0.0000   Min.   :0.000000  
 1st Qu.:1.0000   1st Qu.:0.000000   1st Qu.:0.4369   1st Qu.:0.000000  
 Median :1.0000   Median :0.000000   Median :0.5932   Median :0.000000  
 Mean   :0.9987   Mean   :0.001275   Mean   :0.5751   Mean   :0.002263  
 3rd Qu.:1.0000   3rd Qu.:0.000000   3rd Qu.:0.7349   3rd Qu.:0.000000  
 Max.   :1.0000   Max.   :1.000000   Max.   :1.0000   Max.   :1.000000  
    snow_1h            clouds_all    weather_mainClouds weather_mainDrizzle
 Min.   :0.0000000   Min.   :0.000   Min.   :0.0000     Min.   :0.00000    
 1st Qu.:0.0000000   1st Qu.:0.010   1st Qu.:0.0000     1st Qu.:0.00000    
 Median :0.0000000   Median :0.640   Median :0.0000     Median :0.00000    
 Mean   :0.0004365   Mean   :0.493   Mean   :0.3146     Mean   :0.03762    
 3rd Qu.:0.0000000   3rd Qu.:0.900   3rd Qu.:1.0000     3rd Qu.:0.00000    
 Max.   :1.0000000   Max.   :1.00

In [82]:
library(class)
#mod8 <- knn(X_train,X_test,cl=train$tip)
mod8 <- knn(train1,test1,cl=train1$tip)

In [83]:
str(mod8)

 Factor w/ 89 levels "1","1.1","1.17",..: 42 16 11 50 40 47 64 55 48 39 ...


In [84]:
pred8 <- as.numeric(as.character(mod8))

rmse(test$tip,pred8)
rmsle(test$tip,pred8)
err_res <- rbind(err_res, data.frame(Name="kNN", Model="mod8", 
                                     RMSE=rmse(test$tip,pred8), 
                                     RMSLE=rmsle(test$tip,pred8)))

## SVM

In [59]:
#install.packages("liquidSVM")
library(liquidSVM)

mod9 <- svm(tip ~., train)

In [60]:
pred9 <- predict(mod9, newdata=test)

rmse(test$tip,pred9)
rmsle(test$tip,pred9)
err_res <- rbind(err_res, data.frame(Name="SVM", Model="mod9", 
                                     RMSE=rmse(test$tip,pred9), 
                                     RMSLE=rmsle(test$tip,pred9)))

In [85]:
err_res %>% arrange(RMSLE)

Name,Model,RMSE,RMSLE
<fct>,<fct>,<dbl>,<dbl>
kNN,mod8,0.6296846,0.1600974
SVM,mod9,1.0404241,0.2486826
RandomForest (ranger),mod6,1.0574537,0.2530595
RandomForest (RF),mod5,1.0494992,0.2532754
Decision Trees-rpart,mod4,1.193916,0.2719324
Decision Trees-tree,mod3,1.2105324,0.2737575
XGBoost,mod7,1.2259525,0.296692
Base Linear regression,mod1,2.4786077,0.5920905
