In [61]:
import numpy as np
from tqdm import tqdm
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

In [62]:
def taxi_resample(path, tz='America/New_York', ambiguous='raise', nonexistent='raise'):
    """Aggregate a gzip-compressed trip-level CSV into hourly pickup totals.

    Parameters
    ----------
    path : str
        Location of a gzip-compressed CSV that contains at least the columns
        ``tpep_pickup_datetime``, ``tpep_dropoff_datetime`` and
        ``passenger_count``.
    tz : str, default 'America/New_York'
        Time zone the wall-clock pickup timestamps are localized to.
    ambiguous, nonexistent : default 'raise'
        Forwarded to ``DatetimeIndex.tz_localize``. The defaults keep the
        original behaviour (raise on wall-clock times that are ambiguous or do
        not exist because of a DST transition); pass e.g.
        ``nonexistent='shift_forward'`` or ``ambiguous='NaT'`` to process files
        that span such a transition.

    Returns
    -------
    pandas.DataFrame
        Indexed by the tz-aware pickup hour (index name
        ``tpep_pickup_datetime``) with two columns: ``num_pickups`` (number of
        rows with a non-null dropoff value in that hour) and
        ``num_passengers`` (sum of ``passenger_count`` in that hour).
    """
    # Only three columns are ever used, so skip loading the rest of the file.
    trips = pd.read_csv(
        path,
        compression='gzip',
        usecols=['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count'],
    )

    pickup_times = pd.DatetimeIndex(trips['tpep_pickup_datetime']).tz_localize(
        tz, ambiguous=ambiguous, nonexistent=nonexistent
    )

    # The dropoff column is only counted (non-null values per hour), so it is
    # deliberately left unparsed: converting and localizing it would be wasted
    # work and an extra place for DST errors to surface.
    hourly = (
        trips.assign(tpep_pickup_datetime=pickup_times)
        .set_index('tpep_pickup_datetime')
        .resample('h')  # lowercase alias; uppercase 'H' is deprecated in recent pandas
        .agg({'tpep_dropoff_datetime': 'count', 'passenger_count': 'sum'})
        .rename(columns={'tpep_dropoff_datetime': 'num_pickups',
                         'passenger_count': 'num_passengers'})
    )
    return hourly

In [47]:
# Hourly pickup/passenger totals for the January-June 2017 trip file.
taxi_hourly_df = taxi_resample(path='../clean_data/TaxiData_Jan17-Jun17.gz')

In [48]:
# Preview the first five hourly rows of the 2017 aggregate.
taxi_hourly_df.head(n=5)

Unnamed: 0_level_0,num_pickups,num_passengers
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01 00:00:00-05:00,53,85.0
2017-01-01 01:00:00-05:00,3,3.0
2017-01-01 02:00:00-05:00,0,
2017-01-01 03:00:00-05:00,4,6.0
2017-01-01 04:00:00-05:00,4,8.0


In [63]:
# Pre-aggregated hourly counts for 2014-2016 (one row per yr/mnth/d/hr).
taxi_old = pd.read_csv(filepath_or_buffer='../raw_data/taxi_pickups_2014-2016.csv')

In [64]:
# Preview the raw 2014-2016 table before the timestamp column is built.
taxi_old.head(n=5)

Unnamed: 0,num_pickups,num_passengers,yr,mnth,d,hr
0,107,210,2014,1,1,0
1,34,45,2014,1,1,1
2,34,70,2014,1,1,2
3,12,18,2014,1,1,3
4,25,37,2014,1,1,4


In [65]:
from datetime import datetime

In [66]:
# Assemble the hourly pickup timestamp from the yr/mnth/d/hr columns in one
# vectorized call rather than constructing a datetime per row with apply();
# pd.to_datetime builds datetimes from columns named year/month/day/hour.
taxi_old['tpep_pickup_datetime'] = pd.to_datetime(
    taxi_old[['yr', 'mnth', 'd', 'hr']].rename(
        columns={'yr': 'year', 'mnth': 'month', 'd': 'day', 'hr': 'hour'}
    )
)

In [67]:
# Use the assembled pickup timestamp as the row index.
taxi_old = taxi_old.set_index(keys='tpep_pickup_datetime')

In [68]:
# The separate date-part columns are redundant now that the index holds the timestamp.
taxi_old = taxi_old.drop(columns=['yr', 'mnth', 'd', 'hr'])

In [69]:
# Preview the 2014-2016 table once it is indexed by pickup hour.
taxi_old.head(n=5)

Unnamed: 0_level_0,num_pickups,num_passengers
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-01-01 00:00:00,107,210
2014-01-01 01:00:00,34,45
2014-01-01 02:00:00,34,70
2014-01-01 03:00:00,12,18
2014-01-01 04:00:00,25,37


In [57]:
# taxi_old is indexed by tz-naive wall-clock timestamps, whereas taxi_hourly_df
# comes back from taxi_resample localized to America/New_York. Concatenating
# them unchanged mixes naive and aware timestamps in one index (and writes two
# different timestamp formats to the CSV). Strip the zone from the 2017 frame,
# keeping its local wall-clock hours, so both halves share one convention.
# NOTE(review): this assumes downstream consumers expect naive New York local
# time, as in the 2014-2016 file; a range that spans a fall-back transition
# would yield a repeated 01:00 label after stripping - confirm before reuse.
taxi_full = pd.concat([taxi_old, taxi_hourly_df.tz_localize(None)])

In [60]:
# Persist the combined 2014-2017 hourly table for later use.
taxi_full.to_csv(path_or_buf='../clean_data/aggregate_taxis_all.csv')