# download the data



In [4]:
import requests
import datetime

import pandas as pd
import zipfile
from tqdm import tqdm


import mlflow
import os

In [5]:
# (month, destination directory) pairs for the Divvy trip archives to fetch.
files = [('202304', './data/raw'), ('202305', './data/raw')]

for file, path in files:
    url = f'https://divvy-tripdata.s3.amazonaws.com/{file}-divvy-tripdata.zip'
    zip_save_path = f'{path}/{file}.zip'

    os.makedirs(path, exist_ok=True)

    # Stream the archive to disk; the context manager releases the connection
    # even if the download fails part-way through.
    with requests.get(url, stream=True, timeout=60) as resp:
        # Fail loudly on 4xx/5xx instead of saving an error page as a .zip.
        resp.raise_for_status()

        # Content-Length is a byte count. When the header is absent, None makes
        # tqdm show a running counter instead of a percentage.
        total_bytes = int(resp.headers.get("Content-Length", 0)) or None

        with open(zip_save_path, "wb") as handle, tqdm(
            desc=f'{file}',
            postfix=f"save to {zip_save_path}",
            total=total_bytes,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
        ) as progress:
            for chunk in resp.iter_content(chunk_size=1024 * 1024):
                handle.write(chunk)
                # Advance by bytes written so the bar matches total_bytes.
                progress.update(len(chunk))

    with zipfile.ZipFile(zip_save_path, 'r') as zip_ref:
        zip_ref.extractall(path)

    # The extracted CSV is all that is needed; drop the archive.
    os.remove(zip_save_path)

202304:   0%|          | 15038/15398190 [00:02<51:00, 5026.24it/s, save to ./data/raw/202304.zip] 
202305:   0%|          | 22894/23442906 [00:04<1:14:30, 5239.28it/s, save to ./data/raw/202305.zip]


In [6]:
# Load the 202304 trip CSV from the raw data directory.
data = pd.read_csv("data/raw/202304-divvy-tripdata.csv")

In [7]:
data.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,8FE8F7D9C10E88C7,electric_bike,2023-04-02 08:37:28,2023-04-02 08:41:37,,,,,41.8,-87.6,41.79,-87.6,member
1,34E4ED3ADF1D821B,electric_bike,2023-04-19 11:29:02,2023-04-19 11:52:12,,,,,41.87,-87.65,41.93,-87.68,member
2,5296BF07A2F77CB5,electric_bike,2023-04-19 08:41:22,2023-04-19 08:43:22,,,,,41.93,-87.66,41.93,-87.66,member
3,40759916B76D5D52,electric_bike,2023-04-19 13:31:30,2023-04-19 13:35:09,,,,,41.92,-87.65,41.91,-87.65,member
4,77A96F460101AC63,electric_bike,2023-04-19 12:05:36,2023-04-19 12:10:26,,,,,41.91,-87.65,41.91,-87.63,member


In [8]:
data.describe()

Unnamed: 0,start_lat,start_lng,end_lat,end_lng
count,426590.0,426590.0,426155.0,426155.0
mean,41.901507,-87.646961,41.902002,-87.647225
std,0.047098,0.027689,0.047241,0.027808
min,41.648501,-87.83,41.648501,-88.11
25%,41.88,-87.660224,41.880317,-87.660984
50%,41.897733,-87.64414,41.898969,-87.644336
75%,41.93,-87.629859,41.93,-87.629912
max,42.07,-87.52,42.08,-87.528232


In [9]:
# Count missing values per column: the gaps are in the station name/id columns
# and the end coordinates; the timestamp columns are complete.
data.isnull().sum()

ride_id                   0
rideable_type             0
started_at                0
ended_at                  0
start_station_name    63814
start_station_id      63814
end_station_name      68630
end_station_id        68630
start_lat                 0
start_lng                 0
end_lat                 435
end_lng                 435
member_casual             0
dtype: int64

In [10]:
# Keep only rides whose start and end stations are both known.
station_name_columns = ['start_station_name', 'end_station_name']
data = data.dropna(subset=station_name_columns)


In [11]:
data.isnull().sum() #no missing data left

ride_id               0
rideable_type         0
started_at            0
ended_at              0
start_station_name    0
start_station_id      0
end_station_name      0
end_station_id        0
start_lat             0
start_lng             0
end_lat               0
end_lng               0
member_casual         0
dtype: int64

In [12]:
data.describe() #enough data to train my model

Unnamed: 0,start_lat,start_lng,end_lat,end_lng
count,324197.0,324197.0,324197.0,324197.0
mean,41.900372,-87.644843,41.900953,-87.645155
std,0.04451,0.025272,0.044666,0.025395
min,41.648501,-87.83,41.648501,-87.83
25%,41.879644,-87.658416,41.88033,-87.658617
50%,41.895748,-87.642746,41.896373,-87.642884
75%,41.926756,-87.628594,41.92883,-87.629155
max,42.064854,-87.528232,42.064854,-87.528232


# Target engineering

The target is the net usage of bikes per station, per hour of the day and day of the week.
This target will be called `net_usage`

## Define target variable
The target is the number of rentals starting at a station minus the number of returns ending there, per hour of the day and day of the week.


## Features

1 - Time features extracted from `started_at` and `ended_at` (hour of the day, day of the week)
2 - Rideable type (optional)
3 - Start station name
4 - End station name
    

In [13]:
data.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual'],
      dtype='object')

In [14]:
# Parse the ride timestamps so the .dt accessor can be used on them.
for timestamp_column in ('started_at', 'ended_at'):
    data[timestamp_column] = pd.to_datetime(data[timestamp_column])

In [17]:
# Derive the hour of day and weekday name for both ends of every ride.
for prefix in ('started', 'ended'):
    timestamps = data[f'{prefix}_at'].dt
    data[f'{prefix}_hour'] = timestamps.hour
    data[f'{prefix}_day_of_week'] = timestamps.day_name()



In [28]:
# Count rides per (station, hour, weekday) slot. Despite the "average_" prefix,
# both columns hold the raw ride counts produced by size().
rental_keys = ['start_station_name', 'started_hour', 'started_day_of_week']
return_keys = ['end_station_name', 'ended_hour', 'ended_day_of_week']

rides_by_start = data.groupby(rental_keys).size()
rides_by_end = data.groupby(return_keys).size()

rentals = rides_by_start.reset_index(name='average_rentals')
returns = rides_by_end.reset_index(name='average_returns')


In [29]:
# Outer join so a station/hour/weekday slot that appears only among the
# rentals or only among the returns is still kept.
usage_data = rentals.merge(
    returns,
    how='outer',
    left_on=['start_station_name', 'started_hour', 'started_day_of_week'],
    right_on=['end_station_name', 'ended_hour', 'ended_day_of_week'],
)


In [31]:
# A slot with no match on one side of the outer join had zero rides there.
count_columns = ['average_rentals', 'average_returns']
usage_data[count_columns] = usage_data[count_columns].fillna(0)


In [32]:
# Positive: more rides started than ended in that slot; negative: the reverse.
usage_data['net_usage'] = usage_data['average_rentals'] - usage_data['average_returns']


In [43]:
usage_data.head()

Unnamed: 0,start_station_name,started_hour,started_day_of_week,average_rentals,end_station_name,ended_hour,ended_day_of_week,average_returns,net_usage
0,2112 W Peterson Ave,0.0,Sunday,1.0,,,,0.0,1.0
1,,,,0.0,2112 W Peterson Ave,0.0,Thursday,1.0,-1.0
2,,,,0.0,2112 W Peterson Ave,0.0,Tuesday,1.0,-1.0
3,,,,0.0,2112 W Peterson Ave,0.0,Wednesday,1.0,-1.0
4,,,,0.0,2112 W Peterson Ave,1.0,Sunday,1.0,-1.0


In [21]:
data.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,started_hour,started_day_of_week,ended_hour,ended_day_of_week
227,5B6500E1E58655C0,classic_bike,2023-04-10 17:34:35,2023-04-10 18:02:36,Avenue O & 134th St,20214,Avenue O & 134th St,20214,41.651868,-87.539671,41.651868,-87.539671,member,17,Monday,18,Monday
383,AA65D25D69AF771F,classic_bike,2023-04-12 12:29:46,2023-04-12 12:54:00,Cottage Grove Ave & 51st St,TA1309000067,Cottage Grove Ave & 51st St,TA1309000067,41.803038,-87.606615,41.803038,-87.606615,member,12,Wednesday,12,Wednesday
409,079FB2C196414482,electric_bike,2023-04-13 17:39:23,2023-04-13 17:40:57,Morgan Ave & 14th Pl,TA1306000002,Morgan Ave & 14th Pl,TA1306000002,41.86243,-87.651152,41.862378,-87.651062,member,17,Thursday,17,Thursday
561,599623864C871207,classic_bike,2023-04-29 20:57:10,2023-04-29 20:57:13,Cottage Grove Ave & 51st St,TA1309000067,Cottage Grove Ave & 51st St,TA1309000067,41.803038,-87.606615,41.803038,-87.606615,member,20,Saturday,20,Saturday
692,63ECC8A13D11A76A,classic_bike,2023-04-20 17:03:11,2023-04-20 17:24:58,California Ave & Division St,13256,California Ave & Milwaukee Ave,13084,41.903029,-87.697474,41.922695,-87.697153,casual,17,Thursday,17,Thursday


In [48]:
#extract station, hour and day of the week from either the start or the end columns

def extract_station_info(df):
    """Collapse the start-side and end-side key columns into a single set.

    For each row the end-side values (``end_station_name``, ``ended_hour``,
    ``ended_day_of_week``) are used when ``end_station_name`` is present;
    otherwise the start-side values (``start_station_name``, ``started_hour``,
    ``started_day_of_week``) are used.

    Args:
        df: DataFrame holding the six key columns named above plus
            ``net_usage``. The columns ``station_name``, ``hour`` and
            ``day_of_week`` are added to it in place, so pass a copy if the
            caller's frame must stay untouched.

    Returns:
        DataFrame with the columns ``net_usage``, ``station_name``, ``hour``
        and ``day_of_week``.
    """
    # One mask drives all three columns. Vectorised ``where`` replaces three
    # row-wise ``apply`` passes and is also well-defined on an empty frame.
    has_end_station = df['end_station_name'].notna()

    df['station_name'] = df['end_station_name'].where(has_end_station, df['start_station_name'])
    df['hour'] = df['ended_hour'].where(has_end_station, df['started_hour'])
    df['day_of_week'] = df['ended_day_of_week'].where(has_end_station, df['started_day_of_week'])

    return df[['net_usage', 'station_name', 'hour', 'day_of_week']]

In [49]:
# Pass a copy: extract_station_info adds columns to the frame it receives.
usage_data_2 = extract_station_info(usage_data.copy()) 

In [52]:
usage_data_2.shape

(70050, 4)

In [53]:
usage_data_2.head()

Unnamed: 0,net_usage,station_name,hour,day_of_week
0,1.0,2112 W Peterson Ave,0.0,Sunday
1,-1.0,2112 W Peterson Ave,0.0,Thursday
2,-1.0,2112 W Peterson Ave,0.0,Tuesday
3,-1.0,2112 W Peterson Ave,0.0,Wednesday
4,-1.0,2112 W Peterson Ave,1.0,Sunday


In [None]:
# Persist the engineered dataset alongside the raw data so it can be reloaded
# without recomputing the features.
processed_dir = './data/processed'
os.makedirs(processed_dir, exist_ok=True)
usage_data_2.to_parquet(f'{processed_dir}/usage_data.parquet')

# Model training


In [56]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [54]:
# Split the table into the regression target and one-hot encoded predictors.
feature_columns = ['station_name', 'hour', 'day_of_week']
categorical_columns = ['station_name', 'day_of_week']

target = usage_data_2['net_usage']
features = pd.get_dummies(usage_data_2[feature_columns], columns=categorical_columns)


   hour  station_name_2112 W Peterson Ave  station_name_410  \
0   0.0                              True             False   
1   0.0                              True             False   
2   0.0                              True             False   
3   0.0                              True             False   
4   1.0                              True             False   

   station_name_63rd St Beach  station_name_900 W Harrison St  \
0                       False                           False   
1                       False                           False   
2                       False                           False   
3                       False                           False   
4                       False                           False   

   station_name_Aberdeen St & Jackson Blvd  \
0                                    False   
1                                    False   
2                                    False   
3                                    False   


In [57]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)


In [58]:
# Random forest with 100 trees; the fixed seed makes training repeatable.
model = RandomForestRegressor(n_estimators=100, random_state=42)


In [59]:
model.fit(X_train, y_train)


In [60]:
predictions = model.predict(X_test)


In [61]:
# Error on the held-out split; MSE is expressed in squared net_usage units.
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 17.831992748037116
