# Data Cleansing Step 3 (NYC)

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Load the cleaned NYC weather table; it is joined to the 2018 trip aggregates
# on its "DATE" column further down.
weather_csv_path = "./Resources/Weather/NYC_Weather_Clean.csv"
nyc_weather = pd.read_csv(weather_csv_path)

In [3]:
# Accumulator for the per-month merged (trips + weather) frames of 2018.
nyc_merged_list_2018 = []

# One Step_2 trip file per calendar month: NYC_Trip_201801.csv ... NYC_Trip_201812.csv
nyc_file_list_2018 = ["NYC_Trip_2018{:02d}.csv".format(month) for month in range(1, 13)]

In [4]:
# Aggregate every monthly 2018 trip file per (date, hour range, start station)
# and attach that day's weather readings.

# Start from an empty list so re-running this cell does not append each month twice.
nyc_merged_list_2018 = []

# pd.cut builds right-closed intervals here: (0, 6], (6, 9], (9, 16], (16, 19], (19, 24].
# Hour 0 falls outside the first interval and is labelled explicitly inside the loop.
bins = [0, 6, 9, 16, 19, 24]
label_names = ["1am-6am", "7am-9am", "10am-4pm", "5pm-7pm", "8pm_12am"]
group_keys = ["start_date", "start_hour_range", "start_station_name"]

for file_name in nyc_file_list_2018:

    file_path = "./Resources/Bike_Data/NYC_Citibike/2018/Step_2/" + file_name
    nyc_trip_df = pd.read_csv(file_path)

    nyc_trip_df["start_hour_range"] = pd.cut(nyc_trip_df["start_hour"], bins, labels=label_names)
    # Hour 0 is left unlabelled by the cut above; count it with the late-evening bucket.
    nyc_trip_df.loc[nyc_trip_df["start_hour"] == 0, "start_hour_range"] = "8pm_12am"

    # start_hour_range is categorical, so observed=True keeps only the key
    # combinations that actually occur. With observed=False pandas can emit
    # unobserved combinations whose mean is NaN, which breaks astype(int) below.
    nyc_trip_groupby = nyc_trip_df.groupby(group_keys, observed=True)
    nyc_trip_agg = (
        nyc_trip_groupby["trip_duration"]
        .agg(["count", "mean"])
        .rename(columns={"count": "trip_count", "mean": "trip_duration_avg"})
        .reset_index()
    )
    # astype(int) truncates the fractional part of the mean duration.
    nyc_trip_agg["trip_duration_avg"] = nyc_trip_agg["trip_duration_avg"].astype(int)

    # Inner join: aggregate rows whose start_date has no matching weather DATE are dropped.
    nyc_merged_df = (
        nyc_trip_agg
        .merge(nyc_weather, how="inner", left_on="start_date", right_on="DATE")
        .drop(columns=["DATE"])
    )

    nyc_merged_list_2018.append(nyc_merged_df)

In [5]:
# Stack the monthly 2018 frames into one table, save it, and preview the first rows.
# ignore_index=True renumbers the rows 0..n-1; without it each month's own 0-based
# index repeats in the combined frame, leaving duplicate index labels.
# The CSV itself is unaffected because it is written with index=False.
nyc_merged_2018 = pd.concat(nyc_merged_list_2018, ignore_index=True)
nyc_merged_2018.to_csv("./Resources/Bike_Data/NYC_Citibike/2018/Step_3/nyc_merged_2018.csv", index=False)
nyc_merged_2018.head()

Unnamed: 0,start_date,start_hour_range,start_station_name,trip_count,trip_duration_avg,TAVG,TMAX,TMIN,PRCP,AWND,SNWD
0,01/01/2018,1am-6am,1 Ave & E 110 St,1,459,13,19,8,0.0,17.0,0.0
1,01/01/2018,1am-6am,1 Ave & E 16 St,9,228,13,19,8,0.0,17.0,0.0
2,01/01/2018,1am-6am,1 Ave & E 18 St,2,555,13,19,8,0.0,17.0,0.0
3,01/01/2018,1am-6am,1 Ave & E 30 St,1,800,13,19,8,0.0,17.0,0.0
4,01/01/2018,1am-6am,1 Ave & E 62 St,1,737,13,19,8,0.0,17.0,0.0


In [6]:
# Accumulator for the per-month aggregated trip frames of 2014.
nyc_list_2014 = []

# One Step_2 trip file per calendar month: NYC_Trip_201401.csv ... NYC_Trip_201412.csv
nyc_file_list_2014 = ["NYC_Trip_2014{:02d}.csv".format(month) for month in range(1, 13)]

In [7]:
# Aggregate every monthly 2014 trip file per (date, hour range, start station).
# No weather join is applied to the 2014 data in this cell.

# Start from an empty list so re-running this cell does not append each month twice.
nyc_list_2014 = []

# pd.cut builds right-closed intervals here: (0, 6], (6, 9], (9, 16], (16, 19], (19, 24].
# Hour 0 falls outside the first interval and is labelled explicitly inside the loop.
bins = [0, 6, 9, 16, 19, 24]
label_names = ["1am-6am", "7am-9am", "10am-4pm", "5pm-7pm", "8pm_12am"]
group_keys = ["start_date", "start_hour_range", "start_station_name"]

for file_name in nyc_file_list_2014:

    file_path = "./Resources/Bike_Data/NYC_Citibike/2014/Step_2/" + file_name
    nyc_trip_df = pd.read_csv(file_path)

    nyc_trip_df["start_hour_range"] = pd.cut(nyc_trip_df["start_hour"], bins, labels=label_names)
    # Hour 0 is left unlabelled by the cut above; count it with the late-evening bucket.
    nyc_trip_df.loc[nyc_trip_df["start_hour"] == 0, "start_hour_range"] = "8pm_12am"

    # start_hour_range is categorical, so observed=True keeps only the key
    # combinations that actually occur. With observed=False pandas can emit
    # unobserved combinations whose mean is NaN, which breaks astype(int) below.
    nyc_trip_groupby = nyc_trip_df.groupby(group_keys, observed=True)
    nyc_trip_agg = (
        nyc_trip_groupby["trip_duration"]
        .agg(["count", "mean"])
        .rename(columns={"count": "trip_count", "mean": "trip_duration_avg"})
        .reset_index()
    )
    # astype(int) truncates the fractional part of the mean duration.
    nyc_trip_agg["trip_duration_avg"] = nyc_trip_agg["trip_duration_avg"].astype(int)

    nyc_list_2014.append(nyc_trip_agg)

In [8]:
# Stack the monthly 2014 aggregates into one table, save it, and preview the first rows.
# ignore_index=True renumbers the rows 0..n-1; without it each month's own 0-based
# index repeats in the combined frame, leaving duplicate index labels.
# The CSV itself is unaffected because it is written with index=False.
nyc_2014 = pd.concat(nyc_list_2014, ignore_index=True)
nyc_2014.to_csv("./Resources/Bike_Data/NYC_Citibike/2014/Step_3/nyc_agg_2014.csv", index=False)
nyc_2014.head()

Unnamed: 0,start_date,start_hour_range,start_station_name,trip_count,trip_duration_avg
0,01/01/2014,1am-6am,1 Ave & E 15 St,12,407
1,01/01/2014,1am-6am,1 Ave & E 18 St,1,304
2,01/01/2014,1am-6am,1 Ave & E 44 St,1,1170
3,01/01/2014,1am-6am,10 Ave & W 28 St,7,505
4,01/01/2014,1am-6am,11 Ave & W 27 St,3,1270


In [9]:
# Write a second copy of both yearly tables into the shared "Cleaned" folder,
# again without the row index.
cleaned_outputs = [
    (nyc_2014, "./Resources/Bike_Data/Cleaned/nyc_agg_2014.csv"),
    (nyc_merged_2018, "./Resources/Bike_Data/Cleaned/nyc_merged_2018.csv"),
]
for yearly_frame, cleaned_path in cleaned_outputs:
    yearly_frame.to_csv(cleaned_path, index=False)