In [3]:
import pycaret
import pandas as pd 
from pycaret.classification import *
from pycaret.regression import *

In [5]:
# Load the grouped taxi/weather data set from a CSV file in the working directory.
csv_path = "taxi_weather_grouped.csv"
df = pd.read_csv(csv_path)

In [7]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0.1,Unnamed: 0,time,temperature_2m (°C),relativehumidity_2m (%),dewpoint_2m (°C),apparent_temperature (°C),precipitation (mm),rain (mm),snowfall (cm),cloudcover (%),month,dayofweek,hour,DOLocationID,passenger_count,trip_distance
0,0,2022-02-01 00:00:00,-5.8,67,-11.0,-10.6,0.0,0.0,0.0,0,2,1,0,4,13.0,13.1
1,1,2022-02-01 00:00:00,-5.8,67,-11.0,-10.6,0.0,0.0,0.0,0,2,1,0,13,5.0,8.91
2,2,2022-02-01 00:00:00,-5.8,67,-11.0,-10.6,0.0,0.0,0.0,0,2,1,0,24,13.0,38.4
3,3,2022-02-01 00:00:00,-5.8,67,-11.0,-10.6,0.0,0.0,0.0,0,2,1,0,41,22.0,77.32
4,4,2022-02-01 00:00:00,-5.8,67,-11.0,-10.6,0.0,0.0,0.0,0,2,1,0,42,16.0,65.17


In [8]:
# Drop the leftover "Unnamed: 0" column (presumably an index that was written out when the
# CSV was saved - its values in the preview are just 0, 1, 2, ...).
# The column is named through `columns=` rather than passing the axis positionally
# (`df.drop("Unnamed: 0", 1)`): the positional form was deprecated in pandas 1.3 and
# raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns="Unnamed: 0")

In [9]:
# Preview the first five rows again to confirm that the column is gone.
df.head()

Unnamed: 0,time,temperature_2m (°C),relativehumidity_2m (%),dewpoint_2m (°C),apparent_temperature (°C),precipitation (mm),rain (mm),snowfall (cm),cloudcover (%),month,dayofweek,hour,DOLocationID,passenger_count,trip_distance
0,2022-02-01 00:00:00,-5.8,67,-11.0,-10.6,0.0,0.0,0.0,0,2,1,0,4,13.0,13.1
1,2022-02-01 00:00:00,-5.8,67,-11.0,-10.6,0.0,0.0,0.0,0,2,1,0,13,5.0,8.91
2,2022-02-01 00:00:00,-5.8,67,-11.0,-10.6,0.0,0.0,0.0,0,2,1,0,24,13.0,38.4
3,2022-02-01 00:00:00,-5.8,67,-11.0,-10.6,0.0,0.0,0.0,0,2,1,0,41,22.0,77.32
4,2022-02-01 00:00:00,-5.8,67,-11.0,-10.6,0.0,0.0,0.0,0,2,1,0,42,16.0,65.17


In [12]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the data frame's
# numeric columns.
df.describe()

Unnamed: 0,temperature_2m (°C),relativehumidity_2m (%),dewpoint_2m (°C),apparent_temperature (°C),precipitation (mm),rain (mm),snowfall (cm),cloudcover (%),month,dayofweek,hour,DOLocationID,passenger_count,trip_distance
count,536123.0,536123.0,536123.0,536123.0,536123.0,536123.0,536123.0,536123.0,536123.0,536123.0,536123.0,536123.0,536123.0,536123.0
mean,12.75874,70.383375,6.908693,11.067024,0.120472,0.117634,0.001987,41.953707,6.522078,3.000341,11.657349,150.335733,91.07303,177.71856
std,9.734032,19.19897,9.729113,12.157467,0.446178,0.440409,0.037569,37.839403,3.444259,1.996231,6.890956,74.021208,100.955257,169.896176
min,-13.8,22.0,-23.8,-21.3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,1.0,0.0
25%,4.8,55.0,-0.6,0.9,0.0,0.0,0.0,4.0,4.0,1.0,6.0,88.0,16.0,45.36
50%,12.4,72.0,8.1,10.3,0.0,0.0,0.0,30.0,7.0,3.0,12.0,151.0,54.0,130.81
75%,20.6,87.0,15.1,21.4,0.0,0.0,0.0,81.0,10.0,5.0,18.0,230.0,135.0,256.96
max,36.0,100.0,23.9,39.3,7.1,7.1,1.96,100.0,12.0,6.0,23.0,263.0,899.0,1616.74


In [44]:
# Data frame shape as (number of rows, number of columns).
df.shape

(536123, 15)

# Busyness Index
- Using percentiles for the busyness because the mean can be skewed by outliers
- For the model, busyness will be defined in the following ways:
    - Not Busy: passenger count is between the minimum passenger count and the 25th percentile for the given zone
    - A little Busy: passenger count is between the 25th percentile and the 50th percentile
    - Busy: passenger count is between the 50th percentile and the 75th percentile
    - Very Busy: passenger count is greater than the 75th percentile

# Sources used to get the percentile passenger_count for each taxi zone 
- https://sparkbyexamples.com/pandas/pandas-groupby-aggregate-explained/
- https://pbpython.com/groupby-agg.html

In [39]:
# Build a per-zone table of passenger_count quartiles: one row per DOLocationID with the
# 25th, 50th and 75th percentile. Because the aggregators are anonymous functions, pandas
# labels the resulting columns "<lambda_0>", "<lambda_1>" and "<lambda_2>".
passenger_count_by_zone = df.groupby("DOLocationID")["passenger_count"]
percentiles = passenger_count_by_zone.agg(
    [
        lambda counts: counts.quantile(0.25),
        lambda counts: counts.quantile(0.5),
        lambda counts: counts.quantile(0.75),
    ]
).reset_index()

In [40]:
# Preview the first five rows of the per-zone percentile table.
percentiles.head()

Unnamed: 0,DOLocationID,<lambda_0>,<lambda_1>,<lambda_2>
0,4,9.0,21.0,37.0
1,12,2.0,5.0,11.0
2,13,25.0,55.0,75.0
3,24,11.0,23.0,35.0
4,41,19.0,39.0,60.0


In [41]:
# Shape of the percentile table as (number of rows, number of columns); there is one row
# per DOLocationID group.
percentiles.shape

(66, 4)

In [65]:
# Give the auto-generated "<lambda_i>" quantile columns descriptive names.
# The result is reassigned instead of using inplace=True: the cell stays safe to re-run,
# and the frame is no longer mutated behind any output that already displayed it.
percentiles = percentiles.rename(
    columns={
        "<lambda_0>": "25th_percentile",
        "<lambda_1>": "50th_percentile",
        "<lambda_2>": "75th_percentile",
    }
)

In [67]:
# Preview the first five rows to confirm the column name change.
percentiles.head()

Unnamed: 0,DOLocationID,25th_percentile,50th_percentile,75th_percentile
0,4,9.0,21.0,37.0
1,12,2.0,5.0,11.0
2,13,25.0,55.0,75.0
3,24,11.0,23.0,35.0
4,41,19.0,39.0,60.0


In [70]:
# Attach each zone's percentile columns to the original rows. The join is on DOLocationID
# and is a right join, so every row of df (the right-hand frame) is kept.
merged_df = percentiles.merge(df, how="right", on="DOLocationID")

In [71]:
# Shape of the merged frame as (number of rows, number of columns).
merged_df.shape

(536123, 18)

In [72]:
# Preview the first five rows of the merged frame.
merged_df.head()

Unnamed: 0,DOLocationID,25th_percentile,50th_percentile,75th_percentile,time,temperature_2m (°C),relativehumidity_2m (%),dewpoint_2m (°C),apparent_temperature (°C),precipitation (mm),rain (mm),snowfall (cm),cloudcover (%),month,dayofweek,hour,passenger_count,trip_distance
0,4,9.0,21.0,37.0,2022-02-01 00:00:00,-5.8,67,-11.0,-10.6,0.0,0.0,0.0,0,2,1,0,13.0,13.1
1,13,25.0,55.0,75.0,2022-02-01 00:00:00,-5.8,67,-11.0,-10.6,0.0,0.0,0.0,0,2,1,0,5.0,8.91
2,24,11.0,23.0,35.0,2022-02-01 00:00:00,-5.8,67,-11.0,-10.6,0.0,0.0,0.0,0,2,1,0,13.0,38.4
3,41,19.0,39.0,60.0,2022-02-01 00:00:00,-5.8,67,-11.0,-10.6,0.0,0.0,0.0,0,2,1,0,22.0,77.32
4,42,14.0,24.0,37.0,2022-02-01 00:00:00,-5.8,67,-11.0,-10.6,0.0,0.0,0.0,0,2,1,0,16.0,65.17


In [73]:
# Check the data type of every column in the merged frame.
merged_df.dtypes

DOLocationID                   int64
25th_percentile              float64
50th_percentile              float64
75th_percentile              float64
time                          object
temperature_2m (°C)          float64
relativehumidity_2m (%)        int64
dewpoint_2m (°C)             float64
apparent_temperature (°C)    float64
precipitation (mm)           float64
rain (mm)                    float64
snowfall (cm)                float64
cloudcover (%)                 int64
month                          int64
dayofweek                      int64
hour                           int64
passenger_count              float64
trip_distance                float64
dtype: object

- Source for displaying max rows 
- https://builtin.com/data-science/pandas-show-all-columns

In [74]:
# Count how often each passenger_count value occurs, to find values with only one instance.
# Fixed: this cell used to read from `merge`, a name that no cell above defines, so it
# raised a NameError on a fresh kernel; the merged frame is called `merged_df`.
pd.set_option("display.max_rows", None)  # lift the row limit so the whole table is printed
values = merged_df["passenger_count"].value_counts()
print(values)

1.0      17473
2.0      13448
3.0      11433
4.0      10166
5.0       9599
6.0       8995
7.0       8336
8.0       7853
9.0       7357
10.0      7016
11.0      6626
12.0      6347
13.0      6041
14.0      5819
15.0      5585
16.0      5391
17.0      5168
18.0      5062
19.0      4930
20.0      4866
21.0      4661
22.0      4525
23.0      4415
24.0      4399
25.0      4232
26.0      4122
27.0      3985
29.0      3889
28.0      3869
30.0      3640
32.0      3615
31.0      3600
33.0      3520
34.0      3362
35.0      3354
36.0      3306
38.0      3247
39.0      3211
37.0      3172
40.0      3073
41.0      3002
42.0      2994
44.0      2897
43.0      2870
45.0      2813
46.0      2689
48.0      2678
47.0      2621
49.0      2604
50.0      2556
51.0      2546
53.0      2426
54.0      2420
52.0      2416
55.0      2338
58.0      2284
57.0      2278
56.0      2243
61.0      2190
59.0      2189
60.0      2156
64.0      2102
62.0      2088
63.0      2068
66.0      2053
67.0      2030
65.0      

In [75]:
# Keep only the rows whose passenger_count is at most 667. According to the value counts
# inspected above, 668 is the lowest passenger count that occurs only once, and the aim is
# for every remaining passenger_count value to have at least two instances, because PyCaret
# cannot process classes with a single instance.
# NOTE(review): the setup summary further down reports "Target type: Regression", so this
# class-count constraint may not actually apply - confirm whether the filter is still needed.
max_passenger_count = 667
keep_row = merged_df["passenger_count"].le(max_passenger_count)
merged_df = merged_df.loc[keep_row]

In [77]:
# Re-run the frequency count on the filtered frame to confirm that no passenger_count
# value appears only once.
pd.options.display.max_rows = None  # no row limit, so the full table is printed
values = merged_df["passenger_count"].value_counts()
print(values)

1.0      17473
2.0      13448
3.0      11433
4.0      10166
5.0       9599
6.0       8995
7.0       8336
8.0       7853
9.0       7357
10.0      7016
11.0      6626
12.0      6347
13.0      6041
14.0      5819
15.0      5585
16.0      5391
17.0      5168
18.0      5062
19.0      4930
20.0      4866
21.0      4661
22.0      4525
23.0      4415
24.0      4399
25.0      4232
26.0      4122
27.0      3985
29.0      3889
28.0      3869
30.0      3640
32.0      3615
31.0      3600
33.0      3520
34.0      3362
35.0      3354
36.0      3306
38.0      3247
39.0      3211
37.0      3172
40.0      3073
41.0      3002
42.0      2994
44.0      2897
43.0      2870
45.0      2813
46.0      2689
48.0      2678
47.0      2621
49.0      2604
50.0      2556
51.0      2546
53.0      2426
54.0      2420
52.0      2416
55.0      2338
58.0      2284
57.0      2278
56.0      2243
61.0      2190
59.0      2189
60.0      2156
64.0      2102
62.0      2088
63.0      2068
66.0      2053
67.0      2030
65.0      

# Model
- Pycaret Sources
- https://towardsdatascience.com/introduction-to-regression-in-python-with-pycaret-d6150b540fc4
- https://github.com/pycaret/pycaret/blob/master/tutorials/Tutorial%20-%20Binary%20Classification.ipynb

In [78]:
# Prepare the data for modelling with PyCaret; passenger_count is the target column.
# `setup` resolves to pycaret.regression.setup here, because the regression star-import
# comes after the classification one at the top of the notebook.
#
# NOTE(review): "time" is not declared numeric and the recorded setup summary reports one
# categorical feature, so it is presumably encoded as a categorical string - confirm this
# is intended, given that month / dayofweek / hour are already separate columns.
# NOTE(review): the three percentile features were computed from passenger_count (the
# target) over the full frame before setup() makes its train/test split, so held-out rows
# contributed to their own features - consider deriving them from training rows only.

# Columns PyCaret should treat as numeric rather than infer a type for.
numeric_feature_names = [
    "DOLocationID",
    "25th_percentile",
    "50th_percentile",
    "75th_percentile",
    "temperature_2m (°C)",
    "relativehumidity_2m (%)",
    "dewpoint_2m (°C)",
    "apparent_temperature (°C)",
    "precipitation (mm)",
    "rain (mm)",
    "snowfall (cm)",
    "cloudcover (%)",
    "month",
    "dayofweek",
    "hour",
    "trip_distance",
]

all_models = setup(
    merged_df,
    target="passenger_count",
    fold=5,  # fewer cross-validation folds to decrease processing time
    numeric_features=numeric_feature_names,
    # Pin the seed so the train/test split and the fitted models are reproducible on
    # Restart & Run All. 2414 is the session id that the recorded run drew at random.
    session_id=2414,
)

Unnamed: 0,Description,Value
0,Session id,2414
1,Target,passenger_count
2,Target type,Regression
3,Original data shape,"(535902, 18)"
4,Transformed data shape,"(535902, 18)"
5,Transformed train set shape,"(375131, 18)"
6,Transformed test set shape,"(160771, 18)"
7,Numeric features,16
8,Categorical features,1
9,Preprocess,True


In [79]:
# Compare the candidate models on the data prepared by setup(); the comparison table is
# displayed below and the return value is kept in best_models.
# NOTE(review): called with default arguments, compare_models presumably returns only the
# single top-ranked model despite the plural variable name - confirm.
best_models = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,10.4944,283.1988,16.8281,0.9718,0.2562,0.2378,60.734
rf,Random Forest Regressor,10.9681,312.973,17.6906,0.9689,0.2613,0.2437,61.276
lightgbm,Light Gradient Boosting Machine,11.9149,346.6516,18.6181,0.9655,0.3167,0.3075,1.378
knn,K Neighbors Regressor,13.7726,501.9988,22.4051,0.9501,0.3263,0.3175,54.09
gbr,Gradient Boosting Regressor,14.5991,517.536,22.7488,0.9485,0.3961,0.4369,16.734
dt,Decision Tree Regressor,15.47,633.1909,25.1626,0.937,0.3591,0.3186,1.176
lr,Linear Regression,22.0185,1013.9217,31.8415,0.8991,0.6877,1.2301,1.154
ridge,Ridge Regression,22.0189,1013.9801,31.8424,0.8991,0.6877,1.2302,0.544
br,Bayesian Ridge,22.0193,1014.0388,31.8433,0.8991,0.6877,1.2302,0.3
lasso,Lasso Regression,22.0102,1015.4137,31.8649,0.899,0.6864,1.2253,1.524


In [87]:
# Train an Extra Trees regressor ("et") with 5-fold cross-validation, using 70 trees and
# a maximum tree depth of 14.
extra_trees = create_model(
    "et",
    fold=5,
    n_estimators=70,
    max_depth=14,
)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,11.67,343.6721,18.5384,0.9655,0.2788,0.2651
1,11.7729,351.5011,18.7484,0.9653,0.2775,0.2652
2,11.7768,349.3646,18.6913,0.9653,0.2762,0.264
3,11.8255,361.0588,19.0015,0.9642,0.2756,0.2633
4,11.739,349.1446,18.6854,0.9652,0.2783,0.2669
Mean,11.7569,350.9482,18.733,0.9651,0.2773,0.2649
Std,0.0514,5.6786,0.1511,0.0005,0.0012,0.0012


In [None]:
# The model's max_depth had to be reduced, because the model built with these parameters:
# extra_trees = create_model("et", n_estimators = 70, max_depth = 15, fold = 5)
# produced a pickle file of 154MB, which is too large to be uploaded to GitHub

In [88]:
# Save the transformation pipeline together with the trained Extra Trees model.
# NOTE(review): "exta_trees_model" looks like a typo for "extra_trees_model". It is left
# unchanged because anything that loads the saved model may rely on this exact name -
# confirm before renaming.
save_model(extra_trees, "exta_trees_model")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/var/folders/6b/tc94k9n157g920l0_fmhzz000000gn/T/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['DOLocationID', '25th_percentile',
                                              '50th_percentile',
                                              '75th_percentile',
                                              'temperature_2m (°C)',
                                              'relativehumidity_2m (%)',
                                              'dewpoint_2m (°C)',
                                              'apparent_temperature (°C)',
                                              'precipitation (mm)', 'rain (mm)',
                                              'snowfall (...
                  TransformerWrapper(include=['time'],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('rest_encoding',
                  TransformerWrapper(include=['t