In [110]:
import pandas as pd

In [111]:
# Source: http://web.mta.info/developers/turnstile.html
def get_data(week_nums):
    """Read the weekly MTA turnstile files and stack them into one frame.

    Parameters
    ----------
    week_nums : iterable
        Week identifiers; each one is substituted into the turnstile file
        URL and the resulting file is read with ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The rows of every requested file, concatenated in the order given.
        Each file keeps its own row labels, so index values repeat across
        files.
    """
    template = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    weekly_frames = [pd.read_csv(template.format(week)) for week in week_nums]
    return pd.concat(weekly_frames)
        
# Weekly files to load; each number is substituted into the turnstile_{}.txt URL.
# NOTE(review): the values look like YYMMDD dates of weekly files (Sept 2016) — confirm.
week_nums = [160903, 160910, 160917]
# Read the selected weekly files and combine them into a single raw DataFrame.
df = get_data(week_nums)

In [112]:
# Clean column names.
# Names are matched after stripping surrounding whitespace rather than by
# position: the EXITS column used to be picked as df.columns[10], presumably
# because its raw header is space-padded (verify against the file). Matching
# by stripped name no longer depends on the column order.
column_names = {
    'C/A': 'c_a', 'UNIT': 'unit', 'SCP': 'scp', 'STATION': 'station',
    'LINENAME': 'linename', 'DIVISION': 'division', 'DATE': 'date',
    'TIME': 'time', 'DESC': 'desc', 'ENTRIES': 'entries', 'EXITS': 'exits',
}
df = df.rename(columns=lambda name: column_names.get(name.strip(), name))

# Keep only subway divisions. The explicit .copy() makes the filtered frame
# independent of the raw one, so the column assignments below do not raise
# pandas' SettingWithCopyWarning.
df = df[df['division'].isin(['BMT', 'IND', 'IRT'])].copy()

# Add formatted timestamp column (date and time strings joined, then parsed)
df['timestamp'] = pd.to_datetime(df['date'] + " " + df['time'])

# Add unique turnstile id built from c_a + unit, station and scp
df['unique_turnstile_id'] = df['c_a'] + df['unit'] + ' ' + df['station'] + ' ' + df['scp']

# Add day of week column
df['day_of_week'] = df['timestamp'].dt.day_name()

# Renumber the rows 0..n-1; the previous row labels are kept in an "index" column
df = df.reset_index()

In [113]:
#df.desc.value_counts()

In [114]:
# Remove repeated readings of the same turnstile at the same timestamp
# (e.g. caused by "RECOVER AUD" rows in the desc column, if any).
# Rows are ordered descending by turnstile key and timestamp first, and the
# first row of each duplicated key is the one that survives.
turnstile_time_key = ["c_a", "unit", "scp", "station", "timestamp"]
df = (
    df.sort_values(turnstile_time_key, ascending=False)
      .drop_duplicates(subset=turnstile_time_key)
)

In [115]:
# Narrow the frame to the columns the traffic calculation uses
needed_columns = ['station', 'entries', 'exits', 'timestamp', 'unique_turnstile_id']
df = df.loc[:, needed_columns]

In [116]:
# Calculate actual volumes for each record: the absolute change of the
# entries/exits counters relative to the previous row of the SAME turnstile.
# Diffing within each turnstile leaves the first row of every turnstile as
# NaN instead of comparing it with the last row of a different turnstile.
counter_deltas = df.groupby('unique_turnstile_id')[['entries', 'exits']].diff().abs()
df['diff_entries'] = counter_deltas['entries']
df['diff_exits'] = counter_deltas['exits']
df['total_traffic'] = df['diff_entries'] + df['diff_exits']

# Record index in new column (needed to identify first instance of turnstile data)
df['record_index'] = df.index

In [117]:
# One row per turnstile holding the first non-null value of every column
df_first_rows = df.groupby('unique_turnstile_id').first()

# Row labels of each turnstile's first record, read back from record_index
list_index = df_first_rows['record_index'].values

# Remove those first records from the main frame
df = df.drop(index=list_index)

In [118]:
# Keep only records with 0 < total_traffic < 14400.
# 14400 = 4 h x 3600 s, i.e. one person per second
# (NOTE(review): assumes readings are four hours apart — confirm).
has_traffic = df['total_traffic'].gt(0)
below_ceiling = df['total_traffic'].lt(14400)
df = df[has_traffic & below_ceiling]

In [119]:
# Renumber rows 0..n-1 and keep the previous row labels as an "index" column
df = df.reset_index(drop=False)


In [120]:
# Drop the record_index column; it no longer applies after re-indexing.
# DataFrame.drop returns a new frame, so the result must be assigned back
# for the column to actually leave df. errors='ignore' keeps the cell
# re-runnable once the column is already gone.
df = df.drop(columns='record_index', errors='ignore')
df

Unnamed: 0,index,station,entries,exits,timestamp,unique_turnstile_id,diff_entries,diff_exits,total_traffic
0,545020,EASTCHSTER/DYRE,2446837,1015318,2016-09-16 17:00:00,R730R431 EASTCHSTER/DYRE 00-00-04,76.0,104.0,180.0
1,545019,EASTCHSTER/DYRE,2446752,1015247,2016-09-16 13:00:00,R730R431 EASTCHSTER/DYRE 00-00-04,85.0,71.0,156.0
2,545018,EASTCHSTER/DYRE,2446696,1015230,2016-09-16 09:00:00,R730R431 EASTCHSTER/DYRE 00-00-04,56.0,17.0,73.0
3,545017,EASTCHSTER/DYRE,2446383,1015188,2016-09-16 05:00:00,R730R431 EASTCHSTER/DYRE 00-00-04,313.0,42.0,355.0
4,545016,EASTCHSTER/DYRE,2446380,1015178,2016-09-16 01:00:00,R730R431 EASTCHSTER/DYRE 00-00-04,3.0,10.0,13.0
...,...,...,...,...,...,...,...,...,...
480214,4,59 ST,5799833,1966214,2016-08-27 16:00:00,A002R051 59 ST 02-00-00,288.0,57.0,345.0
480215,3,59 ST,5799610,1966155,2016-08-27 12:00:00,A002R051 59 ST 02-00-00,223.0,59.0,282.0
480216,2,59 ST,5799492,1966079,2016-08-27 08:00:00,A002R051 59 ST 02-00-00,118.0,76.0,194.0
480217,1,59 ST,5799463,1966044,2016-08-27 04:00:00,A002R051 59 ST 02-00-00,29.0,35.0,64.0


In [121]:
# Total traffic per turnstile and timestamp, busiest records first
df_by_station = (
    df.groupby(["unique_turnstile_id", "timestamp"])["total_traffic"]
      .sum()
      .reset_index()
      .sort_values(by="total_traffic", ascending=False)
)
df_by_station

Unnamed: 0,unique_turnstile_id,timestamp,total_traffic
157763,N095R014 FULTON ST 00-03-03,2016-09-13 09:00:00,13960.0
104713,JFK01R535 HOWARD BCH JFK 00-00-03,2016-09-12 09:00:00,9895.0
105670,JFK03R536 JFK JAMAICA CT1 00-00-02,2016-09-06 09:00:00,8392.0
405862,R323R387 WEST FARMS SQ 00-00-03,2016-09-14 14:59:38,4952.0
260280,N519R461 B'WAY-LAFAYETTE 00-00-00,2016-09-08 13:00:00,4257.0
...,...,...,...
205921,N311R339 36 ST 01-00-01,2016-09-11 01:00:00,1.0
214033,N325AR218 ELMHURST AV 00-00-01,2016-08-27 20:00:00,1.0
214032,N325AR218 ELMHURST AV 00-00-01,2016-08-27 12:00:00,1.0
428858,R509R121 QUEENSBORO PLZ 00-00-05,2016-09-07 00:00:00,1.0
