# Data Cleaning & Feature Engineering

The goal of this notebook is to clean and transform the data into the panel format ready for analysis

In [64]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 

In [65]:
df = pd.read_csv('../data/trips.csv')
df.head()

Unnamed: 0,usertype,zip_code_start,borough_start,neighborhood_start,zip_code_end,borough_end,neighborhood_end,start_time,stop_time,day_mean_temperature,day_mean_wind_speed,day_total_precipitation,trip_minutes,avg_trip_minutes
0,Customer,10167,Manhattan,Gramercy Park and Murray Hill,10021,Manhattan,Upper East Side,2020-08-15 10:48:39,2020-08-15 11:02:13,80.7,3.3,0.0,10.0,13.566667
1,Customer,10021,Manhattan,Upper East Side,10167,Manhattan,Gramercy Park and Murray Hill,2020-09-29 08:52:46,2020-09-29 09:14:27,75.6,2.7,0.0,20.0,21.683333
2,Customer,10110,Manhattan,Chelsea and Clinton,10024,Manhattan,Upper West Side,2020-11-09 10:32:08,2020-11-09 11:43:11,51.1,1.7,0.0,70.0,71.05
3,Customer,10103,Manhattan,Chelsea and Clinton,10016,Manhattan,Gramercy Park and Murray Hill,2020-08-19 17:49:51,2020-08-19 17:59:38,80.0,4.0,0.0,10.0,9.766667
4,Customer,11238,Brooklyn,Central Brooklyn,11217,Brooklyn,Northwest Brooklyn,2020-08-16 14:06:32,2020-08-16 14:21:35,82.8,1.2,0.0,20.0,15.033333


## 1. Data Cleaning

* Datatypes
* Missing Data
* Duplicates
* Outliers

In [66]:
# Summary statistics of the dataset
df.describe()

Unnamed: 0,zip_code_start,zip_code_end,day_mean_temperature,day_mean_wind_speed,day_total_precipitation,trip_minutes,avg_trip_minutes
count,1251254.0,1251254.0,1251254.0,1251254.0,1251254.0,1251254.0,1251254.0
mean,10162.78,10166.29,68.25411,8.130439,0.08303776,33.48319,32.938
std,372.4637,376.5618,11.34571,66.57515,0.2543189,229.9335,229.9198
min,10001.0,10001.0,9.8,0.7,0.0,0.0,1.0
25%,10009.0,10009.0,60.0,2.5,0.0,10.0,12.91667
50%,10016.0,10016.0,72.2,3.6,0.0,20.0,20.46667
75%,10028.0,10036.0,76.7,4.5,0.0,30.0,29.05
max,11238.0,11238.0,88.0,999.9,2.1,86240.0,86238.48


In [67]:
# Check datatypes
df.dtypes
df['start_time'] = pd.to_datetime(df['start_time'])

In [68]:
# Check for missing values
df.isna().sum() 

usertype                   0
zip_code_start             0
borough_start              0
neighborhood_start         0
zip_code_end               0
borough_end                0
neighborhood_end           0
start_time                 0
stop_time                  0
day_mean_temperature       0
day_mean_wind_speed        0
day_total_precipitation    0
trip_minutes               0
avg_trip_minutes           0
dtype: int64

In [69]:
# Check for duplicates
df.duplicated().sum()

0

## 2. Feature Engineering

Some features already generated during the inital data pull.

Adding a few more features for further analysis

In [73]:
# Create holiday dummy variable based on US federal holidays 1 if not 0 
from pandas.tseries.holiday import USFederalHolidayCalendar
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start='2018-01-01', end='2018-12-31')
df['start_time'] = pd.to_datetime(df['start_time'])
df['holiday'] = df['start_time'].dt.normalize().isin(holidays).astype(int)

# Create day of week variable as numeric and string
df['day_of_week_num'] = df['start_time'].dt.dayofweek
df['day_of_week'] = df['start_time'].dt.day_name()  

# Encode neighborhood names as numeric ids
neighborhoods = df['neighborhood_start'].unique()
neighborhood_to_id = {name: idx for idx, name in enumerate(neighborhoods)}
df['neighborhood_id'] = df['neighborhood_start'].map(neighborhood_to_id)

# Create a day variable include 365 days from Jan 1 to Dec 31
df['day'] = df['start_time'].dt.dayofyear   

In [81]:

# Set up panel data format with neighborhood and day of week as index
df_panel = df.groupby(['neighborhood_id', 'day']).agg({
    'avg_trip_minutes': 'mean',
    'day_mean_temperature': 'mean',
    'day_mean_wind_speed': 'mean',
    'day_total_precipitation': 'mean',
    'holiday': 'max'  # if any day in the group is a holiday, mark the whole group as holiday
}).reset_index()

df_panel.to_csv('../data/trips_panel.csv', index=False)
df.to_csv('../data/trips_cleaned.csv', index=False)
df_panel.head().sort_values(by=['neighborhood_id', 'day'])

Unnamed: 0,neighborhood_id,day,avg_trip_minutes,day_mean_temperature,day_mean_wind_speed,day_total_precipitation,holiday
0,0,1,33.270238,31.4,5.3,0.0,0
1,0,2,31.995732,38.7,7.0,0.0,0
2,0,3,33.845699,35.5,5.0,0.0,0
3,0,4,23.940504,45.4,5.1,0.89,0
4,0,5,18.516975,40.9,9.7,0.12,0


In [80]:
# display count of days per neighborhood    
df_panel.groupby('neighborhood_id')['day'].count()

neighborhood_id
0     364
1     184
2     365
3     317
4     134
5     147
6      71
7       2
8     364
9     362
10    362
11    352
12    331
13    354
14    326
Name: day, dtype: int64