In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last
# one; later cells rely on this to display several results from a single cell.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
from pathlib import Path
import pandas as pd

# Period to process; the raw file name is built from these two values.
year = 2023
month = 1

# Raw monthly ride file, addressed by a relative path.
raw_dir = Path("..", "data", "raw")
path = raw_dir / f"rides_{year}_{month:02}.parquet"

rides = pd.read_parquet(path)
rides.head()


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,0905B18B365C9D20,classic_bike,2023-01-28 09:18:10,2023-01-28 09:28:52,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Hamilton Park,JC009,40.735938,-74.030305,40.727596,-74.044247,member
1,B4F0562B05CB5404,electric_bike,2023-01-23 20:10:12,2023-01-23 20:18:27,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Southwest Park - Jackson St & Observer Hwy,HB401,40.735938,-74.030305,40.737551,-74.041664,member
2,5ABF032895F5D87E,classic_bike,2023-01-29 15:27:04,2023-01-29 15:32:38,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Marshall St & 2 St,HB408,40.735944,-74.030383,40.740802,-74.042521,member
3,E7E1F9C53976D2F9,classic_bike,2023-01-24 18:35:08,2023-01-24 18:42:13,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Hamilton Park,JC009,40.735986,-74.030364,40.727596,-74.044247,member
4,323165780CA0734B,classic_bike,2023-01-21 20:44:09,2023-01-21 20:48:08,Hamilton Park,JC009,Manila & 1st,JC082,40.727596,-74.044247,40.721651,-74.042884,member


In [3]:
# Build a separate working frame (the loaded `rides` frame is not modified):
# parse both timestamp columns, then derive each ride's duration as the
# difference between end and start. `assign` evaluates the keyword arguments
# in order, so `duration` sees the already-parsed columns.
rides_cp = rides.assign(
    started_at=lambda frame: pd.to_datetime(frame["started_at"]),
    ended_at=lambda frame: pd.to_datetime(frame["ended_at"]),
    duration=lambda frame: frame["ended_at"] - frame["started_at"],
)
rides_cp.head()


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration
0,0905B18B365C9D20,classic_bike,2023-01-28 09:18:10,2023-01-28 09:28:52,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Hamilton Park,JC009,40.735938,-74.030305,40.727596,-74.044247,member,0 days 00:10:42
1,B4F0562B05CB5404,electric_bike,2023-01-23 20:10:12,2023-01-23 20:18:27,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Southwest Park - Jackson St & Observer Hwy,HB401,40.735938,-74.030305,40.737551,-74.041664,member,0 days 00:08:15
2,5ABF032895F5D87E,classic_bike,2023-01-29 15:27:04,2023-01-29 15:32:38,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Marshall St & 2 St,HB408,40.735944,-74.030383,40.740802,-74.042521,member,0 days 00:05:34
3,E7E1F9C53976D2F9,classic_bike,2023-01-24 18:35:08,2023-01-24 18:42:13,Hoboken Terminal - Hudson St & Hudson Pl,HB101,Hamilton Park,JC009,40.735986,-74.030364,40.727596,-74.044247,member,0 days 00:07:05
4,323165780CA0734B,classic_bike,2023-01-21 20:44:09,2023-01-21 20:48:08,Hamilton Park,JC009,Manila & 1st,JC082,40.727596,-74.044247,40.721651,-74.042884,member,0 days 00:03:59


In [4]:
# Summary statistics (count, mean, spread, quartiles, extremes) of ride durations.
duration_stats = rides_cp["duration"].describe()
duration_stats

count                        56075
mean     0 days 00:11:47.619580918
std      0 days 01:08:03.871820543
min                0 days 00:00:00
25%                0 days 00:03:51
50%                0 days 00:05:52
75%                0 days 00:09:02
max                3 days 03:38:17
Name: duration, dtype: object

In [5]:
# Look at both tails of the duration distribution: the minimum, the 1st
# percentile, and the 99.5th / 99.9th percentiles. Each call is kept as its
# own top-level expression so that every value is displayed.
durations = rides_cp["duration"]
durations.quantile(0)
durations.quantile(0.01)
durations.quantile(0.995)
durations.quantile(0.999)

Timedelta('0 days 00:00:00')

Timedelta('0 days 00:00:19')

Timedelta('0 days 01:47:45.519999999')

Timedelta('1 days 00:59:39.926000')

In [6]:
# Duration bounds for a ride to be kept: strictly longer than the minimum and
# no longer than the maximum.
min_duration = pd.Timedelta(minutes=1)
max_duration = pd.Timedelta(hours=3)

duration_filter = (
    (rides_cp["duration"] > min_duration) &
    (rides_cp["duration"] <= max_duration)
)
# Number of rides rejected by this filter. The vectorised Series.sum() replaces
# the builtin sum(), which walks the mask element by element in Python; int()
# keeps the displayed value a plain Python integer.
int((~duration_filter).sum())

1849

In [7]:
# Keep rides for which both the start and the end station id are present.
location_filter = (
    rides_cp["start_station_id"].notna() &
    rides_cp["end_station_id"].notna()
)
# Number of rides rejected by this filter. The vectorised Series.sum() replaces
# the builtin sum(), which walks the mask element by element in Python; int()
# keeps the displayed value a plain Python integer.
int((~location_filter).sum())


202

In [8]:
# Keep only rides that started inside the month being processed. The bounds
# are derived from `year` / `month` (set when the raw file was loaded) instead
# of being hard-coded, so the check stays correct when the period changes.
period_start = pd.Timestamp(year=year, month=month, day=1)
period_end = period_start + pd.DateOffset(months=1)  # first instant of the next month

filter_date_range = (
    (rides_cp["started_at"] >= period_start) &
    (rides_cp["started_at"] < period_end)
)
# Number of rides that started outside the period (vectorised count, shown as a
# plain Python integer).
int((~filter_date_range).sum())


0

In [9]:
# A ride is kept only if it passes the duration, location and date-range filters.
final_filter = duration_filter & location_filter & filter_date_range

# Report how many rows the combined filter removes. Series.sum() counts the
# kept rows in vectorised form; int() keeps the arithmetic on Python integers.
total_rows = len(rides_cp)
numbers_dropped = total_rows - int(final_filter.sum())
print("Dropped rows:", numbers_dropped)
# Guard the percentage so an empty input frame reports 0.0 instead of raising
# ZeroDivisionError.
print("Dropped %:", numbers_dropped / total_rows * 100 if total_rows else 0.0)


Dropped rows: 1966
Dropped %: 3.50601872492198


In [10]:
# Apply the combined filter, keep only the start time and start station, and
# rename them to the pickup_* column names. Done as a single chain that builds
# a new frame, instead of renaming a derived frame in place; the original row
# index is preserved.
rides = (
    rides_cp
    .loc[final_filter, ["started_at", "start_station_id"]]
    .rename(columns={
        "started_at": "pickup_datetime",
        "start_station_id": "pickup_location_id",
    })
)
rides.head()


Unnamed: 0,pickup_datetime,pickup_location_id
0,2023-01-28 09:18:10,HB101
1,2023-01-23 20:10:12,HB101
2,2023-01-29 15:27:04,HB101
3,2023-01-24 18:35:08,HB101
4,2023-01-21 20:44:09,JC009


In [11]:
# Count rides per pickup station and take the ids of the three busiest ones.
pickup_counts = rides["pickup_location_id"].value_counts()
top_locations = pickup_counts.nlargest(3).index.tolist()

# Restrict the data set to rides that start at one of those stations.
starts_at_top_location = rides["pickup_location_id"].isin(top_locations)
rides = rides.loc[starts_at_top_location]

print("Top 3 locations:", top_locations)
rides.head()
rides.shape


Top 3 locations: ['JC115', 'HB102', 'HB103']


Unnamed: 0,pickup_datetime,pickup_location_id
3546,2023-01-13 09:22:36,JC115
3946,2023-01-02 11:26:33,JC115
3947,2023-01-09 10:11:56,JC115
3948,2023-01-23 19:37:41,JC115
4017,2023-01-30 18:55:06,JC115


(7246, 2)

In [12]:
# Period being written (same values as used when the raw file was loaded).
month = 1
year = 2023

# Output location for the filtered rides.
processed_dir = Path("..", "data", "processed")
path = processed_dir / f"rides_{year}_{month:02}.parquet"

# Create the output directory if needed; nothing happens when it already exists.
processed_dir.mkdir(parents=True, exist_ok=True)
rides.to_parquet(path, engine="pyarrow", index=False)
print(f"Saved: {path}")


Saved: ..\data\processed\rides_2023_01.parquet
