# Data Cleaning & Feature Engineering

The goal of this notebook is to clean the trip data and transform it into a panel format that is ready for analysis.

In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 

In [2]:
# Load the raw trip records and preview the first few rows
trips_path = '../data/trips.csv'
df = pd.read_csv(trips_path)
df.head()

Unnamed: 0,usertype,zip_code_start,borough_start,neighborhood_start,zip_code_end,borough_end,neighborhood_end,start_time,stop_time,day_mean_temperature,day_mean_wind_speed,day_total_precipitation,trip_minutes,trip_count,avg_trip_minutes
0,Customer,11206,Brooklyn,Bushwick and Williamsburg,11206,Brooklyn,Bushwick and Williamsburg,2020-08-26 03:11:43,2020-08-26 03:32:38,76.0,3.7,0.0,20.0,1,20.9
1,Customer,11233,Brooklyn,Central Brooklyn,11233,Brooklyn,Central Brooklyn,2020-08-29 15:45:10,2020-08-29 16:13:57,75.7,1.2,0.0,30.0,1,28.766667
2,Customer,11206,Brooklyn,Bushwick and Williamsburg,11221,Brooklyn,Bushwick and Williamsburg,2020-11-08 23:11:00,2020-11-08 23:14:14,51.9,4.0,0.0,0.0,1,3.216667
3,Customer,11101,Queens,Northwest Queens,11101,Queens,Northwest Queens,2020-11-15 15:06:18,2020-11-15 15:14:38,46.5,4.1,0.0,10.0,1,8.333333
4,Customer,11238,Brooklyn,Central Brooklyn,11216,Brooklyn,Central Brooklyn,2020-09-26 14:12:44,2020-09-26 14:29:40,66.5,5.2,0.0,20.0,1,16.916667


## 1. Data Cleaning

* Datatypes
* Missing Data
* Duplicates
* Outliers

In [3]:
# Descriptive statistics for the dataset's numeric columns.
# NOTE(review): the displayed max of day_mean_wind_speed is 999.9, far above the
# 75th percentile (4.5) -- this looks like a missing-value sentinel rather than a
# real measurement; TODO confirm against the weather data source before modelling.
summary_stats = df.describe()
summary_stats

Unnamed: 0,zip_code_start,zip_code_end,day_mean_temperature,day_mean_wind_speed,day_total_precipitation,trip_minutes,trip_count,avg_trip_minutes
count,1251254.0,1251254.0,1251254.0,1251254.0,1251254.0,1251254.0,1251254.0,1251254.0
mean,10162.78,10166.29,68.25411,8.130439,0.08303776,33.48319,1.016014,32.938
std,372.4637,376.5618,11.34571,66.57515,0.2543189,229.9335,0.1348016,229.9198
min,10001.0,10001.0,9.8,0.7,0.0,0.0,1.0,1.0
25%,10009.0,10009.0,60.0,2.5,0.0,10.0,1.0,12.91667
50%,10016.0,10016.0,72.2,3.6,0.0,20.0,1.0,20.46667
75%,10028.0,10036.0,76.7,4.5,0.0,30.0,1.0,29.05
max,11238.0,11238.0,88.0,999.9,2.1,86240.0,6.0,86238.48


In [4]:
# Parse both timestamp columns into datetimes (read_csv above does not parse dates).
# pd.to_datetime is a no-op on already-parsed columns, so this cell is safe to re-run.
df['start_time'] = pd.to_datetime(df['start_time'])
df['stop_time'] = pd.to_datetime(df['stop_time'])

# Check datatypes -- kept as the last expression so the cell actually displays
# them (a bare expression in the middle of a cell produces no output)
df.dtypes

In [5]:
# Count missing values in each column
missing_per_column = df.isna().sum()
missing_per_column

usertype                   0
zip_code_start             0
borough_start              0
neighborhood_start         0
zip_code_end               0
borough_end                0
neighborhood_end           0
start_time                 0
stop_time                  0
day_mean_temperature       0
day_mean_wind_speed        0
day_total_precipitation    0
trip_minutes               0
trip_count                 0
avg_trip_minutes           0
dtype: int64

In [6]:
# Count fully duplicated rows (every column identical to an earlier row)
n_duplicate_rows = df.duplicated().sum()
n_duplicate_rows

0

## 2. Feature Engineering

Some features were already generated during the initial data pull.

This section adds a few more features for further analysis.

In [7]:
# Create holiday dummy variable based on US federal holidays: 1 if holiday, else 0
from pandas.tseries.holiday import USFederalHolidayCalendar

# Idempotent parse so the .dt accessors below work even if this cell is run on its own
df['start_time'] = pd.to_datetime(df['start_time'])

# Build the holiday calendar from the date range actually present in the data.
# A hard-coded 2018 window never matches trips from other years, which leaves
# the holiday flag at 0 for every row.
trip_dates = df['start_time'].dt.normalize()
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start=trip_dates.min(), end=trip_dates.max())
df['holiday'] = trip_dates.isin(holidays).astype(int)

# Create day of week variable as numeric (Monday=0 ... Sunday=6) and string
df['day_of_week_num'] = df['start_time'].dt.dayofweek
df['day_of_week'] = df['start_time'].dt.day_name()

# Encode neighborhood names as numeric ids (ids follow order of first appearance)
neighborhoods = df['neighborhood_start'].unique()
neighborhood_to_id = {name: idx for idx, name in enumerate(neighborhoods)}
df['neighborhood_id'] = df['neighborhood_start'].map(neighborhood_to_id)

# Create a day-of-year variable: 1-365, or 1-366 in a leap year.
# NOTE(review): dayofyear repeats across calendar years, so if the data spans
# more than one year the same 'day' value would merge different dates --
# confirm the data covers a single year.
df['day'] = df['start_time'].dt.dayofyear

In [9]:

# Build the panel: one row per (neighborhood_id, day), where `day` is the day of year
panel_keys = ['neighborhood_id', 'day']
df_panel = (
    df.groupby(panel_keys)
    .agg(
        trip_count=('trip_count', 'sum'),
        avg_trip_minutes=('avg_trip_minutes', 'mean'),
        day_mean_temperature=('day_mean_temperature', 'mean'),
        day_mean_wind_speed=('day_mean_wind_speed', 'mean'),
        day_total_precipitation=('day_total_precipitation', 'mean'),
        # flag the group as a holiday if any trip in it falls on a holiday
        holiday=('holiday', 'max'),
    )
    .reset_index()
)

# Persist both the aggregated panel and the row-level data with the new features
df_panel.to_csv('../data/trips_panel.csv', index=False)
df.to_csv('../data/trips_cleaned.csv', index=False)

# Preview the first rows of the panel, ordered by the panel keys
df_panel.head().sort_values(by=panel_keys)

Unnamed: 0,neighborhood_id,day,trip_count,avg_trip_minutes,day_mean_temperature,day_mean_wind_speed,day_total_precipitation,holiday
0,0,233,12,18.219444,80.5,4.2,0.0,0
1,0,234,36,17.390741,78.7,4.0,0.33,0
2,0,235,61,20.819399,76.9,3.7,0.32,0
3,0,236,82,25.040041,76.0,3.6,0.0,0
4,0,237,41,23.995935,78.2,2.1,0.0,0


In [80]:
# Number of panel rows (observed days) for each neighborhood; unequal counts
# mean the panel is unbalanced
days_per_neighborhood = df_panel.groupby('neighborhood_id')['day'].count()
days_per_neighborhood

neighborhood_id
0     364
1     184
2     365
3     317
4     134
5     147
6      71
7       2
8     364
9     362
10    362
11    352
12    331
13    354
14    326
Name: day, dtype: int64