## Merging hourly taxi, weather, and incoming passenger data for 2017

### Taxi Data

In [1]:
import numpy as np
from tqdm import tqdm
import pandas as pd

In [110]:
def taxi_resample(path):
    """Count taxi pickups per hour from a gzip-compressed trip CSV.

    Parameters
    ----------
    path : str or path-like
        Location of a gzip-compressed CSV containing at least the columns
        ``tpep_pickup_datetime`` and ``passenger_count``.

    Returns
    -------
    pandas.DataFrame
        One row per hour between the first and last pickup, indexed by the
        hour start localized to ``America/New_York`` (index name
        ``tpep_pickup_datetime``), with a single ``pickup_count`` column.
        Hours in that range without any trip have a count of 0.

    Notes
    -----
    ``pickup_count`` is the number of non-null ``passenger_count`` values in
    the hour, so a trip whose passenger count is missing is not counted.
    Localization uses the pandas defaults, so wall-clock times that are
    ambiguous or nonexistent around DST transitions raise an error.
    """
    # Only these two columns are used below; skipping the rest of the file
    # keeps memory usage down on large trip extracts.
    taxi_df = pd.read_csv(
        path,
        compression='gzip',
        usecols=["tpep_pickup_datetime", "passenger_count"],
    )
    taxi_df["tpep_pickup_datetime"] = pd.DatetimeIndex(
        taxi_df["tpep_pickup_datetime"]
    ).tz_localize('America/New_York')

    # 'h' is the hourly alias accepted by both old and new pandas; the
    # uppercase 'H' spelling is deprecated since pandas 2.2.
    taxi_hourly = (
        taxi_df.set_index("tpep_pickup_datetime")
        .resample('h')['passenger_count']
        .count()
    )
    return taxi_hourly.to_frame(name='pickup_count')

In [111]:
# Build the hourly pickup-count frame from the compressed taxi trip file.
taxi_hourly_df = taxi_resample(path='../clean_data/TaxiData_Jan17-Jun17.gz')

### Weather Data

In [113]:
def clean_weather(path):
    """Load the weather CSV, index it by New York local time and trim columns.

    Parameters
    ----------
    path : str or path-like
        CSV file with a ``dt`` column of epoch seconds plus the descriptive
        and precipitation columns listed in ``discarded`` below.

    Returns
    -------
    pandas.DataFrame
        The remaining weather columns, indexed by ``date``: the ``dt`` value
        read as seconds since the epoch, localized to UTC and converted to
        ``America/New_York``.  Rows are neither filtered nor de-duplicated.

    Raises
    ------
    KeyError
        If any column named in ``discarded`` is absent from the file.
    """
    discarded = [
        "dt", "dt_iso", "city_id", "city_name", "lat", "lon",
        "weather_icon", "grnd_level", "sea_level", "clouds_all", "weather_id",
        "rain_1h", "rain_3h", "rain_24h", "rain_today",
        "snow_1h", "snow_3h", "snow_24h", "snow_today",
    ]

    readings = pd.read_csv(path)

    # Epoch seconds -> naive timestamps -> UTC-aware -> New York wall clock.
    naive_utc = pd.to_datetime(readings["dt"], unit="s")
    local_times = pd.DatetimeIndex(naive_utc).tz_localize('UTC').tz_convert('America/New_York')

    indexed = readings.assign(date=local_times).set_index("date")
    return indexed.drop(columns=discarded)

In [114]:
# Load the weather observations, indexed by New York local time.
weather_df = clean_weather(path="../clean_data/weather_NY.csv")

### Merge Taxi and Weather Data

In [116]:
# Inner join on the shared timestamp index: only timestamps present in both
# frames are kept.
# NOTE(review): weather_df can carry more than one row for the same timestamp
# (the displayed head() output lists 02:00 twice, Rain and Snow); every such
# row is paired with the taxi row, so pickup_count is repeated for that hour.
# Confirm this is intended before aggregating pickups downstream.
taxi_weather = taxi_hourly_df.merge(
    weather_df,
    how="inner",
    left_index=True,
    right_index=True,
)

In [117]:
# Express the merged index in New York local time.
taxi_weather = taxi_weather.tz_convert('America/New_York')

In [118]:
# Preview the first five merged rows.
taxi_weather.head(n=5)

Unnamed: 0,pickup_count,temp,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,weather_main,weather_description
2017-01-01 00:00:00-05:00,53,279.07,277.15,280.15,1013,45,2,250,Clouds,overcast clouds
2017-01-01 01:00:00-05:00,3,279.14,277.15,281.15,1013,45,2,260,Clouds,overcast clouds
2017-01-01 02:00:00-05:00,0,279.21,276.15,283.15,1012,45,3,270,Rain,light rain
2017-01-01 02:00:00-05:00,0,279.21,276.15,283.15,1012,45,3,270,Snow,light snow
2017-01-01 03:00:00-05:00,4,279.43,277.15,285.15,1013,45,3,270,Clouds,overcast clouds


### Passenger seat data

In [120]:
def clean_seats(path):
    """Load the header-less hourly seat CSV and index it by New York time.

    Parameters
    ----------
    path : str or path-like
        CSV without a header row whose two columns are a timestamp (naive
        New York wall-clock time) and a passenger count.

    Returns
    -------
    pandas.DataFrame
        A single ``Passengers`` column indexed by ``Time``, localized to
        ``America/New_York``.  Rows whose wall-clock time does not exist
        (spring-forward gap) or is ambiguous (fall-back hour) are removed.
    """
    seats_per_hour = pd.read_csv(path, header=None)
    seats_per_hour.columns = ['Time', 'Passengers']
    seats_per_hour["Time"] = pd.DatetimeIndex(seats_per_hour["Time"])
    seats_per_hour = seats_per_hour.set_index("Time")

    # Daylight-saving transitions: instead of dropping a hard-coded list of
    # timestamps (which raises KeyError when one is absent from the file and
    # misses transitions in other years), let pandas mark every nonexistent
    # or ambiguous wall-clock time as NaT and discard those rows.  Per the
    # original note, taxi pickups and passenger counts are NaN at these times.
    seats_per_hour.index = seats_per_hour.index.tz_localize(
        'America/New_York', ambiguous='NaT', nonexistent='NaT'
    )
    return seats_per_hour[seats_per_hour.index.notna()]

In [121]:
# Load the hourly incoming-passenger (seat) counts.
seats_per_hour = clean_seats(path="../clean_data/seats_per_hour.csv")

### Merging all dataframes

In [124]:
# Inner join of the taxi+weather frame with the seat counts on the timestamp
# index; timestamps missing from either side are dropped.
merged_df = taxi_weather.merge(
    seats_per_hour,
    how="inner",
    left_index=True,
    right_index=True,
)

In [125]:
# Preview the first twenty rows of the combined frame.
merged_df.head(n=20)

Unnamed: 0,pickup_count,temp,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,weather_main,weather_description,Passengers
2017-01-01 00:00:00-05:00,53,279.07,277.15,280.15,1013,45,2,250,Clouds,overcast clouds,
2017-01-01 01:00:00-05:00,3,279.14,277.15,281.15,1013,45,2,260,Clouds,overcast clouds,150.0
2017-01-01 02:00:00-05:00,0,279.21,276.15,283.15,1012,45,3,270,Rain,light rain,
2017-01-01 02:00:00-05:00,0,279.21,276.15,283.15,1012,45,3,270,Snow,light snow,
2017-01-01 03:00:00-05:00,4,279.43,277.15,285.15,1013,45,3,270,Clouds,overcast clouds,
2017-01-01 04:00:00-05:00,4,279.5,276.15,286.15,1014,42,4,280,Clouds,scattered clouds,
2017-01-01 05:00:00-05:00,6,279.21,276.15,286.15,1015,48,2,280,Clouds,overcast clouds,
2017-01-01 06:00:00-05:00,4,278.8,276.15,286.15,1016,52,2,290,Clear,sky is clear,
2017-01-01 07:00:00-05:00,35,277.84,275.15,281.15,1017,55,2,220,Clouds,scattered clouds,272.0
2017-01-01 08:00:00-05:00,101,277.3,274.15,281.15,1018,64,2,220,Clear,sky is clear,1130.0


In [126]:
# Persist the combined hourly dataset (the timestamp index is written as the
# first column).
merged_df.to_csv(path_or_buf="../clean_data/2017_combined_data.csv")