# Data Cleaning & Feature Engineering

In [3]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 

In [4]:
# Read the raw trip extract into memory and preview its first five rows
trips_path = '../data/trips.csv'
df = pd.read_csv(trips_path)
df.head(5)

Unnamed: 0,usertype,zip_code_start,borough_start,neighborhood_start,zip_code_end,borough_end,neighborhood_end,start_time,stop_time,day_mean_temperature,day_mean_wind_speed,day_total_precipitation,trip_minutes,trip_count,unique_bikes_used,total_trip_minutes,avg_trip_minutes,median_trip_minutes,min_trip_minutes,max_trip_minutes
0,Subscriber,10029,Manhattan,East Harlem,10115,Manhattan,Morningside Heights,2023-01-26 13:16:37.154,2023-01-26 13:31:12.562,30.7,3.2,0.0,20.0,1,1,14.583333,14.583333,14.583333,14.583333,14.583333
1,Subscriber,10010,Manhattan,Gramercy Park and Murray Hill,10278,Manhattan,Lower Manhattan,2023-05-09 07:25:21.833,2023-05-09 07:42:36.540,63.0,1.8,0.0,20.0,1,1,17.233333,17.233333,17.233333,17.233333,17.233333
2,Subscriber,11217,Brooklyn,Northwest Brooklyn,11233,Brooklyn,Central Brooklyn,2023-03-26 21:13:02.938,2023-03-26 21:29:07.304,40.6,5.5,0.0,20.0,1,1,16.066667,16.066667,16.066667,16.066667,16.066667
3,Subscriber,10001,Manhattan,Chelsea and Clinton,10103,Manhattan,Chelsea and Clinton,2023-05-11 07:39:13.479,2023-05-11 07:56:35.768,64.2,2.8,0.25,20.0,1,1,17.366667,17.366667,17.366667,17.366667,17.366667
4,Subscriber,10019,Manhattan,Chelsea and Clinton,10199,Manhattan,Chelsea and Clinton,2023-02-09 15:20:10.838,2023-02-09 15:32:23.252,29.9,4.5,0.0,10.0,1,1,12.2,12.2,12.2,12.2,12.2


## 1. Data Cleaning

* Datatypes
* Missing Data
* Duplicates
* Outliers

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,zip_code_start,zip_code_end,day_mean_temperature,day_mean_wind_speed,day_total_precipitation,trip_minutes,trip_count,unique_bikes_used,total_trip_minutes,avg_trip_minutes,median_trip_minutes,min_trip_minutes,max_trip_minutes
count,5506273.0,5506273.0,5506273.0,5506273.0,5506273.0,5506273.0,5506273.0,5506273.0,5506273.0,5506273.0,5506273.0,5506273.0,5506273.0
mean,10242.5,10242.55,50.95836,4.494994,0.129057,16.7301,1.0,1.0,16.12945,16.12945,16.12945,16.12945,16.12945
std,456.0281,456.071,14.38971,2.076705,0.283195,440.9013,0.0,0.0,440.8894,440.8894,440.8894,440.8894,440.8894
min,10001.0,10001.0,9.7,1.0,0.0,0.0,1.0,1.0,1.016667,1.016667,1.016667,1.016667,1.016667
25%,10010.0,10010.0,39.8,2.7,0.0,10.0,1.0,1.0,5.75,5.75,5.75,5.75,5.75
50%,10018.0,10018.0,48.7,4.3,0.0,10.0,1.0,1.0,9.6,9.6,9.6,9.6,9.6
75%,10065.0,10065.0,63.0,5.7,0.1,20.0,1.0,1.0,16.65,16.65,16.65,16.65,16.65
max,11238.0,11238.0,80.9,11.5,1.68,325170.0,1.0,1.0,325167.5,325167.5,325167.5,325167.5,325167.5


In [12]:
# Check datatypes -- note that start_time / stop_time are read in as object (string)
# columns, so they must go through pd.to_datetime before any datetime (.dt) operation
df.dtypes

usertype                    object
zip_code_start               int64
borough_start               object
neighborhood_start          object
zip_code_end                 int64
borough_end                 object
neighborhood_end            object
start_time                  object
stop_time                   object
day_mean_temperature       float64
day_mean_wind_speed        float64
day_total_precipitation    float64
trip_minutes               float64
trip_count                   int64
unique_bikes_used            int64
total_trip_minutes         float64
avg_trip_minutes           float64
median_trip_minutes        float64
min_trip_minutes           float64
max_trip_minutes           float64
dtype: object

In [13]:
# Count missing (NaN/None) values in each column
df.isna().sum() 

usertype                   0
zip_code_start             0
borough_start              0
neighborhood_start         0
zip_code_end               0
borough_end                0
neighborhood_end           0
start_time                 0
stop_time                  0
day_mean_temperature       0
day_mean_wind_speed        0
day_total_precipitation    0
trip_minutes               0
trip_count                 0
unique_bikes_used          0
total_trip_minutes         0
avg_trip_minutes           0
median_trip_minutes        0
min_trip_minutes           0
max_trip_minutes           0
dtype: int64

In [14]:
# Count rows that exactly repeat an earlier row (all columns are compared)
df.duplicated().sum()

0

## 2. Data Transformation

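1. Normalization and Standardization: Rescale numeric features to a common range (normalization) or to zero mean and unit variance (standardization).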
2. Aggregation: For instance, daily sales data could be aggregated to a monthly or quarterly level.
3. Discretization: This process converts continuous data into a finite number of intervals or bins.

## 3. Feature Engineering

Some features were already generated during the initial data pull.

The cell below adds a few more features for further analysis.

In [8]:
# Holiday dummy variable: 1 if the trip starts on a US federal holiday, 0 otherwise.
from pandas.tseries.holiday import USFederalHolidayCalendar

# start_time is loaded as a string column; convert it once so .dt accessors work.
df['start_time'] = pd.to_datetime(df['start_time'])
trip_dates = df['start_time'].dt.normalize()  # midnight of each trip's start date

# Build the holiday list from the date range actually present in the data.
# A hard-coded 2018 window can never match the 2023 timestamps shown in the
# preview above, which would leave the flag at 0 for every row.
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start=trip_dates.min(), end=trip_dates.max())
df['holiday'] = trip_dates.isin(holidays).astype(int)

# Weekday of the trip start, stored both as an integer code and as the day's name
start_dt = df['start_time'].dt
df['day_of_week_num'] = start_dt.weekday
df['day_of_week'] = start_dt.day_name()

# Create peak-hours flag: 1 if the trip starts during peak hours, 0 otherwise
def peak_hours(hour):
    """Return 1 if `hour` lies in a peak window (7-9 or 16-18, inclusive), else 0."""
    in_morning_window = 7 <= hour <= 9
    in_evening_window = 16 <= hour <= 18
    return 1 if (in_morning_window or in_evening_window) else 0
    
# Hour of day in which each trip starts. start_time was already converted to
# datetime earlier in this cell, so it is not parsed a second time here.
df['start_hour'] = df['start_time'].dt.hour

# Peak flag computed column-wise rather than calling peak_hours() once per row
# (df.describe() above reports ~5.5M rows). Uses the same inclusive windows as
# peak_hours(): start hours 7-9 and 16-18.
is_peak = df['start_hour'].between(7, 9) | df['start_hour'].between(16, 18)
df['peak_hours'] = is_peak.astype(int)

In [None]:

# Persist the feature-enriched frame (row index omitted), then preview the result
cleaned_path = '../data/trips_cleaned.csv'
df.to_csv(cleaned_path, index=False)
df.head(5)

In [None]:
# Dimensions of the frame as (rows, columns)
df.shape