# Data Cleaning & Feature Engineering

In [None]:
import pandas as pd 
import numpy as np 

In [3]:
# Read the raw trips extract into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='../data/trips.csv')
df.head(n=5)

Unnamed: 0,usertype,zip_code_start,borough_start,neighborhood_start,zip_code_end,borough_end,neighborhood_end,start_day,stop_day,day_mean_temperature,day_mean_wind_speed,day_total_precipitation,trip_minutes,trip_count,unique_bikes_used,total_trip_minutes,avg_trip_minutes,median_trip_minutes,min_trip_minutes,max_trip_minutes
0,Subscriber,10022,Manhattan,Gramercy Park and Murray Hill,10022,Manhattan,Gramercy Park and Murray Hill,2023-01-17,2023-01-17,33.6,4.5,0.0,0.0,22,20,61.366667,2.789394,2.433333,1.033333,4.483333
1,Subscriber,10075,Manhattan,Upper East Side,10021,Manhattan,Upper East Side,2023-01-30,2023-01-30,33.1,6.4,0.09,0.0,7,7,25.583333,3.654762,3.983333,1.416667,4.416667
2,Subscriber,10036,Manhattan,Chelsea and Clinton,10025,Manhattan,Upper West Side,2023-01-02,2023-01-02,17.9,7.0,0.0,30.0,6,6,181.05,30.175,30.383333,25.133333,33.416667
3,Subscriber,10005,Manhattan,Lower Manhattan,10038,Manhattan,Lower Manhattan,2023-01-13,2023-01-13,43.6,7.8,0.59,0.0,4,4,9.783333,2.445833,1.783333,1.433333,3.45
4,Subscriber,11101,Queens,Northwest Queens,11101,Queens,Northwest Queens,2023-01-31,2023-01-31,24.9,6.0,0.02,10.0,63,46,443.333333,7.037037,6.166667,4.5,14.1


## 1. Data Cleaning

* Datatypes
* Missing Data
* Duplicates
* Outliers

In [6]:
# Summary statistics of the dataset (numeric columns only).
# Things to look for in the output: the duration columns (trip_minutes,
# avg/median/min/max_trip_minutes) have a maximum several orders of magnitude
# above their 75th percentile, which points at outliers that the
# "Outliers" cleaning step still has to deal with.
df.describe()

Unnamed: 0,zip_code_start,zip_code_end,day_mean_temperature,day_mean_wind_speed,day_total_precipitation,trip_minutes,trip_count,unique_bikes_used,total_trip_minutes,avg_trip_minutes,median_trip_minutes,min_trip_minutes,max_trip_minutes
count,730088.0,730088.0,730088.0,730088.0,730088.0,730088.0,730088.0,730088.0,730088.0,730088.0,730088.0,730088.0,730088.0
mean,10276.190185,10276.061024,50.414297,4.589558,0.132375,49.07544,7.541931,7.321202,121.647167,48.528721,48.223902,47.294187,49.935777
std,468.591262,468.560471,14.516224,2.101441,0.285154,1209.579436,20.866344,19.522027,1221.123223,1209.567546,1209.573931,1209.598581,1209.533893
min,10001.0,10001.0,9.7,1.0,0.0,0.0,1.0,1.0,1.016667,1.016667,1.016667,1.016667,1.016667
25%,10012.0,10012.0,39.5,2.8,0.0,10.0,1.0,1.0,27.45,12.876124,12.916667,12.35,14.316667
50%,10024.0,10023.0,48.3,4.5,0.0,20.0,2.0,2.0,47.716667,21.544444,21.533333,20.766667,23.75
75%,10168.0,10173.0,62.5,5.8,0.11,30.0,5.0,5.0,102.416667,31.683333,31.666667,31.333333,33.35
max,11238.0,11238.0,80.9,11.5,1.68,325170.0,488.0,408.0,325167.483333,325167.483333,325167.483333,325167.483333,325167.483333


In [8]:
# Check datatypes inferred by read_csv for every column.
# Note: start_day and stop_day come back as object (plain strings), so they
# must be converted with pd.to_datetime before any .dt accessor is used.
df.dtypes

usertype                    object
zip_code_start               int64
borough_start               object
neighborhood_start          object
zip_code_end                 int64
borough_end                 object
neighborhood_end            object
start_day                   object
stop_day                    object
day_mean_temperature       float64
day_mean_wind_speed        float64
day_total_precipitation    float64
trip_minutes               float64
trip_count                   int64
unique_bikes_used            int64
total_trip_minutes         float64
avg_trip_minutes           float64
median_trip_minutes        float64
min_trip_minutes           float64
max_trip_minutes           float64
dtype: object

In [9]:
# Count missing values per column (0 everywhere means nothing to impute).
df.isnull().sum()

usertype                   0
zip_code_start             0
borough_start              0
neighborhood_start         0
zip_code_end               0
borough_end                0
neighborhood_end           0
start_day                  0
stop_day                   0
day_mean_temperature       0
day_mean_wind_speed        0
day_total_precipitation    0
trip_minutes               0
trip_count                 0
unique_bikes_used          0
total_trip_minutes         0
avg_trip_minutes           0
median_trip_minutes        0
min_trip_minutes           0
max_trip_minutes           0
dtype: int64

In [5]:
# Count fully duplicated rows; the first occurrence of each row is not counted.
df.duplicated(keep='first').sum()

0

## 2. Data Transformation

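1. Normalization and Standardization: Rescaling numeric features to a common range (normalization) or to zero mean and unit variance (standardization) so that they are comparable.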
2. Aggregation: For instance, daily sales data could be aggregated to a monthly or quarterly level.
3. Discretization: This process converts continuous data into a finite number of intervals or bins.

In [None]:
# Pairwise scatter/histogram grid over the columns of df.
# NOTE(review): `sns` and `plt` are not imported in the visible imports cell
# (only pandas and numpy are) - add `import seaborn as sns` and
# `import matplotlib.pyplot as plt` there, otherwise this cell raises
# NameError on a fresh kernel.
# NOTE(review): df has ~730k rows according to the describe() output, so a
# pairplot over the full frame will likely be very slow - consider plotting a
# sample and/or a subset of columns.
sns.pairplot(df)
plt.show()

## 3. Feature Engineering

Some features were already generated during the initial data pull.

This section adds a few more features for further analysis.

In [None]:
# Holiday dummy variable: 1 if the trip's start day is a US federal holiday
# (as observed in USFederalHolidayCalendar), otherwise 0.
from pandas.tseries.holiday import USFederalHolidayCalendar

# start_day is read from CSV as a string; convert it once so .dt works below.
df['start_day'] = pd.to_datetime(df['start_day'])

# Compare on midnight-normalized dates so a time component can never cause a miss.
start_dates = df['start_day'].dt.normalize()

# Build the holiday list for the period actually covered by the data. The
# previous hard-coded 2018 window did not overlap the trip dates, which made
# the flag 0 for every row.
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start=start_dates.min(), end=start_dates.max())
df['holiday'] = start_dates.isin(holidays).astype(int)

# Weekday of the trip start, both as an integer code (Monday=0 ... Sunday=6)
# and as the English day name. Requires start_day to already be datetime64.
df['day_of_week_num'] = df['start_day'].dt.weekday
df['day_of_week'] = df['start_day'].dt.day_name()

# Peak-hours indicator: 1 during commuting windows, 0 otherwise
def peak_hours(hour):
    """Return 1 if ``hour`` lies in 7-9 or 16-18 (both ends inclusive), else 0."""
    in_morning_window = 7 <= hour <= 9
    in_evening_window = 16 <= hour <= 18
    return 1 if (in_morning_window or in_evening_window) else 0
    
# Derive the hour of day at which the trip started and flag peak hours.
# Both features need a 'start_time' column. The frame loaded from trips.csv
# does not list one in its dtypes, so guard the lookup: an unguarded
# df['start_time'] raises KeyError and stops the notebook before the export cell.
if 'start_time' in df.columns:
    df['start_hour'] = pd.to_datetime(df['start_time']).dt.hour
    df['peak_hours'] = df['start_hour'].apply(peak_hours)
else:
    print("Skipping 'start_hour' and 'peak_hours': df has no 'start_time' column.")

In [None]:

# Write the feature-enriched frame to disk (row index omitted) and preview it.
df.to_csv(path_or_buf='../data/trips_cleaned.csv', index=False)
df.head(n=5)

Unnamed: 0,usertype,zip_code_start,borough_start,neighborhood_start,zip_code_end,borough_end,neighborhood_end,start_day,stop_day,day_mean_temperature,...,trip_minutes,trip_count,unique_bikes_used,total_trip_minutes,avg_trip_minutes,median_trip_minutes,min_trip_minutes,max_trip_minutes,holiday,day_of_week
0,Subscriber,10065,Manhattan,Upper East Side,10168,Manhattan,Gramercy Park and Murray Hill,2023-03-22,2023-03-22,37.7,...,10,4,4,44.95,11.2375,11.066667,8.383333,13.933333,0,Wednesday
1,Subscriber,10024,Manhattan,Upper West Side,10022,Manhattan,Gramercy Park and Murray Hill,2023-01-07,2023-01-07,9.7,...,20,2,2,37.266667,18.633333,15.016667,15.016667,22.25,0,Saturday
2,Subscriber,10023,Manhattan,Upper West Side,10035,Manhattan,East Harlem,2023-01-13,2023-01-13,43.6,...,20,3,3,63.316667,21.105556,21.233333,20.766667,21.316667,0,Friday
3,Subscriber,10001,Manhattan,Chelsea and Clinton,10199,Manhattan,Chelsea and Clinton,2023-01-31,2023-01-31,24.9,...,10,12,12,67.883333,5.656944,5.266667,4.6,7.633333,0,Tuesday
4,Subscriber,11103,Queens,Northwest Queens,11101,Queens,Northwest Queens,2023-02-22,2023-02-22,48.7,...,20,6,6,103.066667,17.177778,17.116667,15.366667,19.283333,0,Wednesday
