In [1]:
import pandas as pd 
import geopandas as gp
import altair as alt 
import gpdvega 
import json
import matplotlib.pyplot as plt
# Select the Altair renderer once, up front, so any chart cells display inline.
alt.renderers.enable('notebook') # render for Jupyter Notebook

RendererRegistry.enable('notebook')

Read the turnstile-to-outage data and collect the stations that have equipment

In [2]:
# Load the turnstile-to-outage CSV.
turnstile_to_outage = pd.read_csv(
    '/nas/dft/ire/susanw/out/turnstile_to_outage_elevations.csv'
)
# Distinct values of the Turnstile_station column; later cells use this array
# to restrict the turnstile data to these stations.
stations_with_equipments = turnstile_to_outage['Turnstile_station'].unique()

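Read all turnstile data and keep only rows with positive entry and exit counts, at stations that have equipment, recorded after 2019-01-01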

In [96]:
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
turnstile = pd.read_pickle('/nas/dft/ire/kmohan/hackday_2019_q4_mta_accessibility/data/turnstile_new.pkl.gz')

# Lower bound for the analysis window. The comparison below is strict, so rows
# stamped exactly 2019-01-01 00:00:00 are excluded.
START_DATE = pd.Timestamp('2019-01-01')

# Keep rows with positive entry and exit deltas at stations that appear in
# stations_with_equipments.
keep_rows = (
    (turnstile.entry_diff > 0)
    & (turnstile.exit_diff > 0)
    & (turnstile.STATION.isin(stations_with_equipments))
)

# reset_index() returns a new frame instead of mutating a slice in place, which
# avoids SettingWithCopyWarning. As before, the previous index is moved into
# the columns rather than dropped.
filtered_turnstile = turnstile.loc[keep_rows].reset_index()
filtered_turnstile = filtered_turnstile.loc[filtered_turnstile['datetime'] > START_DATE]

Aggregate turnstile data by LINENAME, STATION and datetime

In [97]:
# Sum the numeric measurements for every (LINENAME, STATION, datetime) key.
# numeric_only=True is spelled out because pandas >= 2.0 no longer drops
# non-numeric columns by default; without it, string columns are concatenated
# or the call raises. Older pandas versions behave the same as before.
filtered_turnstile = (
    filtered_turnstile
    .groupby(['LINENAME', 'STATION', 'datetime'])
    .sum(numeric_only=True)
    .reset_index()
)

In [98]:
def interpolate(data):
    """Upsample one group of turnstile records onto a regular hourly grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Records containing a ``datetime`` column plus ``entry_diff`` and
        ``exit_diff`` columns. Any other columns are carried along.

    Returns
    -------
    pandas.DataFrame
        A frame indexed by an hourly ``DatetimeIndex``. ``entry_diff`` and
        ``exit_diff`` are linearly interpolated across inserted hours; every
        other column is forward-filled from the last observed row. The
        ``datetime`` column mirrors the index, so inserted rows carry their own
        timestamp rather than a forward-filled one.

    Notes
    -----
    NOTE(review): ``asfreq()`` acts like a reindex onto the hourly grid, so
    observations whose timestamp is not exactly on the hour are presumably
    dropped -- confirm the input timestamps are hour-aligned.
    """
    hourly = data.set_index(pd.DatetimeIndex(data['datetime']))
    # A Timedelta rule means the same as the '1H' alias but is unaffected by
    # the alias deprecation introduced in pandas 2.2.
    hourly = hourly.resample(pd.Timedelta(hours=1)).asfreq()
    for column in ('entry_diff', 'exit_diff'):
        hourly[column] = hourly[column].interpolate(method='linear')
    # .ffill() replaces the deprecated fillna(method='ffill') spelling.
    hourly = hourly.ffill()
    # The forward fill above also copied stale timestamps into the inserted
    # rows of the 'datetime' column; realign the column with the index.
    hourly['datetime'] = hourly.index
    return hourly

In [108]:
# Interpolate each (LINENAME, STATION) series independently, then stitch the
# pieces together once. Collecting frames in a list and calling pd.concat a
# single time avoids the quadratic cost of growing a DataFrame inside the loop,
# and DataFrame.append no longer exists in pandas >= 2.0.
interpolated_frames = []
for ids, group in filtered_turnstile.groupby(['LINENAME', 'STATION']):
    try:
        interpolated_frames.append(interpolate(group))
    except Exception as err:
        # Best-effort: report the failing group and its cause, then move on.
        # Catching Exception (not a bare except) lets Ctrl-C still interrupt.
        print('issues with ', ids, '-', repr(err))

# pd.concat raises on an empty list, so fall back to an empty frame when every
# group failed or there were no groups at all.
results = pd.concat(interpolated_frames) if interpolated_frames else pd.DataFrame()

In [109]:
# Preview the first five rows of the interpolated result.
results.head(n=5)

In [110]:
# Directory that receives the aggregated hourly output.
DATA_DIR = '/nas/dft/ire/jujiang/DataClinic2019Q4HackDay'

# Write the interpolated frame, index included, as a CSV file under DATA_DIR.
output_csv = '/'.join([DATA_DIR, "hourly_turnstile_aggregated_by_station.csv"])
results.to_csv(output_csv)