In [1]:
""" ðŸ“Œ 02 â€” Feature Engineering & Preprocessing  
 This notebook prepares the Uber dataset for machine learning by creating time-based, statistical, and lag features."""

' ðŸ“Œ 02 â€” Feature Engineering & Preprocessing  \n This notebook prepares the Uber dataset for machine learning by creating time-based, statistical, and lag features.'

In [2]:
import pandas as pd

In [3]:
# Load dataset
data = pd.read_csv("Data/Uber-Jan-Feb-FOIL.csv")

# Convert to datetime and sort
data['date'] = pd.to_datetime(data['date'])
data = data.sort_values('date')

In [4]:
""" ðŸ“Œ Day of Week Feature
 Extract the weekday name (Mondayâ€“Sunday). """

' ðŸ“Œ Day of Week Feature\n Extract the weekday name (Mondayâ€“Sunday). '

In [5]:
data['day_of_week'] = data['date'].dt.day_name()

In [6]:
""" ðŸ“Œ Weekend Indicator  
 Binary feature:  
 - **1** â†’ Saturday or Sunday  
 - **0** â†’ Weekday """


' ðŸ“Œ Weekend Indicator  \n Binary feature:  \n - **1** â†’ Saturday or Sunday  \n - **0** â†’ Weekday '

In [7]:
data['is_weekend'] = data['day_of_week'].isin(['Saturday','Sunday']).astype(int)

In [8]:
""" ðŸ“Œ Date-Based Features  
 Extract basic time components:
 - Month  
 - Day of month """  

' ðŸ“Œ Date-Based Features  \n Extract basic time components:\n - Month  \n - Day of month '

In [9]:
data['month'] = data['date'].dt.month
data['day']   = data['date'].dt.day

In [10]:
""" ðŸ“Œ Rolling Mean Features  
 Capture short-term and medium-term trip trends:
 - `rolling_mean_3` â†’ last 3 days average  
 - `rolling_mean_7` â†’ last 7 days average """  

' ðŸ“Œ Rolling Mean Features  \n Capture short-term and medium-term trip trends:\n - `rolling_mean_3` â†’ last 3 days average  \n - `rolling_mean_7` â†’ last 7 days average '

In [11]:
data['trips_rolling_mean_3'] = data['trips'].rolling(window=3).mean()
data['trips_rolling_mean_7'] = data['trips'].rolling(window=7).mean()

In [12]:
""" ðŸ“Œ Lag Features  
 Help the model understand past behavior:
 - Trips yesterday (`lag_1`)
 - Trips 2 days ago (`lag_2`)
 - Trips 3 days ago (`lag_3`) """

' ðŸ“Œ Lag Features  \n Help the model understand past behavior:\n - Trips yesterday (`lag_1`)\n - Trips 2 days ago (`lag_2`)\n - Trips 3 days ago (`lag_3`) '

In [13]:
data['lag_1'] = data['trips'].shift(1)
data['lag_2'] = data['trips'].shift(2)
data['lag_3'] = data['trips'].shift(3)

In [14]:
""" ðŸ“Œ Clean Data  
 Drop the initial rows where rolling/lag features generate NaN values. """

' ðŸ“Œ Clean Data  \n Drop the initial rows where rolling/lag features generate NaN values. '

In [15]:
data = data.dropna().reset_index(drop=True)

In [16]:
display(data.head(10))

Unnamed: 0,dispatching_base_number,date,active_vehicles,trips,day_of_week,is_weekend,month,day,trips_rolling_mean_3,trips_rolling_mean_7,lag_1,lag_2,lag_3
0,B02764,2015-01-02,3147,19974,Friday,0,1,2,12138.0,10915.857143,6903.0,9537.0,7679.0
1,B02765,2015-01-02,196,1001,Friday,0,1,2,9292.666667,10897.142857,19974.0,6903.0,9537.0
2,B02682,2015-01-02,890,5506,Friday,0,1,2,8827.0,11431.571429,1001.0,19974.0,6903.0
3,B02617,2015-01-02,1137,7065,Friday,0,1,2,4524.0,8237.857143,5506.0,1001.0,19974.0
4,B02598,2015-01-02,785,4768,Friday,0,1,2,5779.666667,7822.0,7065.0,5506.0,1001.0
5,B02512,2015-01-02,175,875,Friday,0,1,2,4236.0,6584.571429,4768.0,7065.0,5506.0
6,B02765,2015-01-03,201,1526,Saturday,1,1,3,2389.666667,5816.428571,875.0,4768.0,7065.0
7,B02617,2015-01-03,1188,10664,Saturday,1,1,3,4355.0,4486.428571,1526.0,875.0,4768.0
8,B02598,2015-01-03,818,7432,Saturday,1,1,3,6540.666667,5405.142857,10664.0,1526.0,875.0
9,B02682,2015-01-03,915,8010,Saturday,1,1,3,8702.0,5762.857143,7432.0,10664.0,1526.0


In [17]:
""" Feature Engineering Summary

This notebook created:

- **Time Features:** month, day, day_of_week  
- **Weekend Indicator:** captures behavior shift on weekends  
- **Rolling Features:** 3-day and 7-day moving averages  
- **Lag Features:** previous 1, 2, 3 days of trips  
- **Cleaned Data:** removed NaN rows caused by rolling/lagging  

These features significantly boost model accuracy in the next notebook. """ 


' Feature Engineering Summary\n\nThis notebook created:\n\n- **Time Features:** month, day, day_of_week  \n- **Weekend Indicator:** captures behavior shift on weekends  \n- **Rolling Features:** 3-day and 7-day moving averages  \n- **Lag Features:** previous 1, 2, 3 days of trips  \n- **Cleaned Data:** removed NaN rows caused by rolling/lagging  \n\nThese features significantly boost model accuracy in the next notebook. '