## Import Statements ##

In [123]:
import pandas as pd
import numpy as np
import datetime
import seaborn as sns                     
import matplotlib.pyplot as plt           
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## Making a List of all the files from the directory ##

In [124]:
file_path = "/Users/vedantigulalkari/Documents/Fall Module 2/WDA/WDA Final Project/Monthly Datasets"
file_list = os.listdir(file_path)
for f in file_list:
    print(f)

202208-divvy-tripdata.csv
202205-divvy-tripdata.csv
202109-divvy-tripdata.csv
202104-divvy-tripdata.csv
.DS_Store
202107-divvy-tripdata.csv
202206-divvy-tripdata.csv
202210-divvy-tripdata.csv
202111-divvy-tripdata.csv
202101-divvy-tripdata.csv
202102-divvy-tripdata.csv
202112-divvy-tripdata.csv
202203-divvy-tripdata.csv
202103-divvy-tripdata.csv
202202-divvy-tripdata.csv
202201-divvy-tripdata.csv
202110-divvy-tripdata.csv
202106-divvy-tripdata.csv
202207-divvy-tripdata.csv
202209-divvy-tripdata.csv
202204-divvy-tripdata.csv
202108-divvy-tripdata.csv
202105-divvy-tripdata.csv


In [125]:
# Remove Unnecessary files
if (".DS_Store" in file_list):
    file_list.remove(".DS_Store")
len(file_list)

22

## Combining all data files to get a combined dataset ##

In [126]:
df = pd.concat([pd.read_csv("/Users/vedantigulalkari/Documents/Fall Module 2/WDA/WDA Final Project/Monthly Datasets/"+f) for f in file_list], ignore_index=True)
df.shape

(10743239, 13)

In [127]:
#checking dtypes
df.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

## Data Cleaning ##

In [128]:
#missing values
df.isna().sum()

ride_id                     0
rideable_type               0
started_at                  0
ended_at                    0
start_station_name    1442633
start_station_id      1442630
end_station_name      1546495
end_station_id        1546495
start_lat                   0
start_lng                   0
end_lat                 10271
end_lng                 10271
member_casual               0
dtype: int64

In [129]:
#convert datatype datetime format
df['started_at'] = pd.to_datetime(df['started_at'])
df['ended_at'] = pd.to_datetime(df['ended_at'])

In [130]:
# drop as not needed
df = df.drop(['ride_id', 'start_station_id', 'end_station_id'], axis = 1)

In [131]:
#Calculate ride-length column for future analysis 
df['Ride_Length'] = abs(((df['ended_at'] - df['started_at']).dt.days) * 24 + ((df['ended_at'] - df['started_at']).dt.seconds)/ 3600)

In [132]:
df.describe
df_clean = df[df['Ride_Length']>=0.0166667]

In [133]:
df_clean['Day_of_Week'] = df_clean['started_at'].dt.day_name()
print(df.head())

   rideable_type          started_at            ended_at start_station_name  \
0  electric_bike 2022-08-07 21:34:15 2022-08-07 21:41:46                NaN   
1  electric_bike 2022-08-08 14:39:21 2022-08-08 14:53:23                NaN   
2  electric_bike 2022-08-08 15:29:50 2022-08-08 15:40:34                NaN   
3  electric_bike 2022-08-08 02:43:50 2022-08-08 02:58:53                NaN   
4  electric_bike 2022-08-07 20:24:06 2022-08-07 20:29:58                NaN   

  end_station_name  start_lat  start_lng  end_lat  end_lng member_casual  \
0              NaN      41.93     -87.69    41.94   -87.72        casual   
1              NaN      41.89     -87.64    41.92   -87.64        casual   
2              NaN      41.97     -87.69    41.97   -87.66        casual   
3              NaN      41.94     -87.65    41.97   -87.69        casual   
4              NaN      41.85     -87.65    41.84   -87.66        casual   

   Ride_Length  
0     0.125278  
1     0.233889  
2     0.178889  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Day_of_Week'] = df_clean['started_at'].dt.day_name()


## Random Forest Modelling ##

In [134]:
#Cleaning up our columns:
df_ML = df_clean.drop(['start_lat', 'end_lat', 'start_lng', 'end_lng','start_station_name','end_station_name','ended_at'], axis='columns')
df_ML.head()

Unnamed: 0,rideable_type,started_at,member_casual,Ride_Length,Day_of_Week
0,electric_bike,2022-08-07 21:34:15,casual,0.125278,Sunday
1,electric_bike,2022-08-08 14:39:21,casual,0.233889,Monday
2,electric_bike,2022-08-08 15:29:50,casual,0.178889,Monday
3,electric_bike,2022-08-08 02:43:50,casual,0.250833,Monday
4,electric_bike,2022-08-07 20:24:06,casual,0.097778,Sunday


In [135]:
#Convert Ride Length into minutes
df_ML['Ride_Length'] = df_ML['Ride_Length'] * 60

In [136]:
df_ML['Ride_Length'] = df_ML['Ride_Length'].astype(int)

In [138]:
#Break datetime into integer value for Month
df_ML['Month'] = df_ML['started_at'].dt.month

In [139]:
#Season variable to analyse effect of seasonality
df_ML['Season'] = pd.cut(df_ML['Month'],
                            [0,3,9,12],
                            labels = ['Cold', 'Warm', 'Cold'],
                             include_lowest=True,
                            ordered=False)

In [140]:
df_ML['Part_of_Weekend'] = np.where((df_ML['Day_of_Week'] == 'Saturday') | (df_ML['Day_of_Week'] == 'Sunday'), 1,0)

In [141]:
#Break datetime into integer value for hour
df_ML['Hours'] = df_ML['started_at'].dt.hour

In [144]:
df_ML['Ride_Type'] = np.where(df_ML['member_casual'] == 'member',1,0)

In [146]:
#Creating Dummies for categorical variables
bt_dummy = pd.get_dummies(df_ML['rideable_type'])
seasons_dummy = pd.get_dummies(df_ML['Season'])

In [148]:
df_ML = pd.concat([df_ML, bt_dummy, seasons_dummy], axis=1)

In [149]:
df_ML = df_ML.drop(columns = ['Day_of_Week', 'Season','started_at', 'rideable_type', 'member_casual'])

In [150]:
#Fill relevant information for the missing values
df_ML.fillna("NotatDock", inplace=True)
df_ML.isna().sum()

Ride_Length        0
Month              0
Part_of_Weekend    0
Hours              0
Ride_Type          0
classic_bike       0
docked_bike        0
electric_bike      0
Cold               0
Warm               0
dtype: int64

In [152]:
y = df_ML['Ride_Type']
X = df_ML.drop(['Ride_Type'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=30)

In [153]:
#check training data 
y_train.value_counts()

1    4736738
0    3701298
Name: Ride_Type, dtype: int64

In [154]:
X_train.value_counts()

Ride_Length  Month  Part_of_Weekend  Hours  classic_bike  docked_bike  electric_bike  Cold  Warm
6            8      0                17     1             0            0              0     1       3299
7            8      0                17     1             0            0              0     1       3195
5            8      0                17     1             0            0              0     1       3168
6            9      0                17     1             0            0              0     1       3096
8            8      0                17     1             0            0              0     1       3041
                                                                                                    ... 
47           2      1                10     0             0            1              1     0          1
                    0                23     1             0            0              1     0          1
                                     22     1             0    

In [156]:
clf = RandomForestClassifier(n_estimators = 100, random_state = 20)
rf_train = clf.fit(X_train,y_train)

In [157]:
y_pred = clf.predict(X_test)

In [158]:
clf.predict_proba(X)[0:5]

array([[0.4795562 , 0.5204438 ],
       [0.60430254, 0.39569746],
       [0.48388385, 0.51611615],
       [0.704748  , 0.295252  ],
       [0.54043426, 0.45956574]])

In [159]:
importance = list(zip(X, clf.feature_importances_))
importance

[('Ride_Length', 0.39154347872954026), ('Month', 0.053679617383315135), ('Part_of_Weekend', 0.07509425305963378), ('Hours', 0.11440092760657988), ('classic_bike', 0.050534737624370184), ('docked_bike', 0.209362178260185), ('electric_bike', 0.050399884633083446), ('Cold', 0.028255707833497245), ('Warm', 0.026729214869795202)]


In [160]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.51      0.57    924897
           1       0.67      0.80      0.73   1184612

    accuracy                           0.67   2109509
   macro avg       0.67      0.65      0.65   2109509
weighted avg       0.67      0.67      0.66   2109509



In [161]:
accuracy = (metrics.accuracy_score(y_test, y_pred)) * 100
accuracy

67.13405820975402

Random Forest Model 2: Seasonality is taken into consideration to figure out the relation in bikes given seasonality and ride length

In [162]:
dfML_time_bike = df_ML[['Ride_Type','Ride_Length', 'Month', 'Part_of_Weekend', 'classic_bike','electric_bike','docked_bike']].copy()

In [163]:
yt = dfML_time_bike['Ride_Type']
Xt = dfML_time_bike.drop(['Ride_Type'], axis=1)
Xt_train, Xt_test, yt_train, yt_test = train_test_split(Xt,yt, test_size=0.3, random_state=30)

In [164]:
clft = RandomForestClassifier(n_estimators = 100, random_state = 20)
rft_train = clft.fit(Xt_train,yt_train)

In [165]:
yt_pred = clft.predict(Xt_test)

In [166]:
clft.predict_proba(Xt)[0:5]

array([[0.52866453, 0.47133547],
       [0.50422339, 0.49577661],
       [0.47697651, 0.52302349],
       [0.49850814, 0.50149186],
       [0.49352996, 0.50647004]])

In [167]:
importance = list(zip(Xt, clft.feature_importances_))
importance

[('Ride_Length', 0.4234856396202953),
 ('Month', 0.11646362692367655),
 ('Part_of_Weekend', 0.09601708847300512),
 ('classic_bike', 0.06321905399542105),
 ('electric_bike', 0.05716287816071255),
 ('docked_bike', 0.24365171282688944)]

In [168]:
print(classification_report(yt_test, yt_pred))

              precision    recall  f1-score   support

           0       0.67      0.46      0.55   1388914
           1       0.66      0.82      0.73   1775350

    accuracy                           0.66   3164264
   macro avg       0.67      0.64      0.64   3164264
weighted avg       0.67      0.66      0.65   3164264



In [169]:
acct = (metrics.accuracy_score(yt_test, yt_pred)) * 100
acct

66.4368080539424