In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [58]:
def get_mta_data(weeks):
    """Download weekly MTA turnstile files and stack them into one DataFrame.

    Parameters
    ----------
    weeks : iterable
        Week identifiers; each one is substituted into the
        ``turnstile_{}.txt`` URL template (e.g. ``170506``).

    Returns
    -------
    pandas.DataFrame
        The rows of every weekly file, concatenated in the order given.
        ``pd.concat`` is called without ``ignore_index``, so the row labels
        of each file are kept as read.
    """
    url_template = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt'
    weekly_frames = [pd.read_csv(url_template.format(week_id)) for week_id in weeks]
    return pd.concat(weekly_frames)

In [59]:
# Load four weekly files; each code is substituted into the turnstile_{}.txt URL.
# The codes look like YYMMDD dates -- TODO confirm which day of the week they denote.
mta_df = get_mta_data([170506, 170513, 170520, 170527])
# Quicker single-week alternative while iterating:
#mta_df = get_mta_data([170506])

In [60]:
# Preview the first ten rows of the combined frame.
mta_df.head(10)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,00:00:00,REGULAR,6157740,2085315
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,04:00:00,REGULAR,6157777,2085319
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,08:00:00,REGULAR,6157810,2085353
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,12:00:00,REGULAR,6157963,2085453
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,16:00:00,REGULAR,6158212,2085529
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,20:00:00,REGULAR,6158521,2085589
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/30/2017,00:00:00,REGULAR,6158637,2085612
7,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/30/2017,04:00:00,REGULAR,6158654,2085617
8,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/30/2017,08:00:00,REGULAR,6158661,2085641
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/30/2017,12:00:00,REGULAR,6158703,2085730


In [72]:
# Number of rows recorded for each (LINENAME, STATION) pair.
mta_df.groupby(['LINENAME', 'STATION']).size()

LINENAME  STATION        
1         103 ST             1008
          116 ST-COLUMBIA    1008
          125 ST              855
          137 ST CITY COL    1685
          145 ST             1176
          14TH STREET        1759
          157 ST             1360
          18 ST              1344
          181 ST              840
          191 ST              676
          207 ST             1002
          215 ST              672
          23 ST              1172
          231 ST             1689
          238 ST              504
          28 ST              1846
          50 ST              3007
          66 ST-LINCOLN      3862
          79 ST              1520
          86 ST              2345
          9TH STREET          780
          CANAL ST           1006
          CATHEDRAL PKWY     1175
          CHRISTOPHER ST     1638
          CITY / BUS         3640
          DYCKMAN ST         1008
          EXCHANGE PLACE     4144
          FRANKLIN ST        1350
          GROVE STREET

In [62]:
def get_entries(mta_df):
    """Compute the spread of the entries counter per turnstile per date.

    Columns are read by position from ``itertuples()`` rows (position 0 is
    the index): positions 1-5 identify the turnstile (C/A, unit, SCP,
    station, line name), position 7 is the date and position 10 is the
    entries reading.

    Returns a dict mapping ``(C/A, unit, SCP, station, linename, date)`` to a
    one-element list holding ``abs(max - min)`` of that group's readings.
    """
    readings_by_key = {}
    for record in mta_df.itertuples():
        # Positions 1..5 plus the date at position 7 form the grouping key.
        group_key = tuple(record[1:6]) + (record[7],)
        readings_by_key.setdefault(group_key, []).append(record[10])
    return {
        group_key: [abs(max(readings) - min(readings))]
        for group_key, readings in readings_by_key.items()
    }

In [63]:
# Map each (C/A, unit, SCP, station, linename, date) key to its entries-counter spread.
d = get_entries(mta_df)

In [64]:
# Turn the {key tuple: [diff]} mapping into a one-column frame indexed by the
# key tuples, labelling that column 'Entries'.
dict_df = pd.DataFrame.from_dict(d, orient='index').rename(columns={0: 'Entries'})
dict_df.head(10)

Unnamed: 0,Entries
"(A002, R051, 02-00-00, 59 ST, NQR456W, 04/29/2017)",781
"(A002, R051, 02-00-00, 59 ST, NQR456W, 04/30/2017)",378
"(A002, R051, 02-00-00, 59 ST, NQR456W, 05/01/2017)",1267
"(A002, R051, 02-00-00, 59 ST, NQR456W, 05/02/2017)",1322
"(A002, R051, 02-00-00, 59 ST, NQR456W, 05/03/2017)",1193
"(A002, R051, 02-00-00, 59 ST, NQR456W, 05/04/2017)",1348
"(A002, R051, 02-00-00, 59 ST, NQR456W, 05/05/2017)",1072
"(A002, R051, 02-00-01, 59 ST, NQR456W, 04/29/2017)",646
"(A002, R051, 02-00-01, 59 ST, NQR456W, 04/30/2017)",538
"(A002, R051, 02-00-01, 59 ST, NQR456W, 05/01/2017)",976


In [65]:
# Empty frame to be filled column by column from dict_df.
turnstile_df = pd.DataFrame(columns=[])

In [66]:
# Each index entry of dict_df is a 6-item key tuple; fan its positions out
# into separate columns, then copy over the single value column.
turnstile_df['C/A'] = [key[0] for key in dict_df.index]
turnstile_df['Unit'] = [key[1] for key in dict_df.index]
turnstile_df['SCP'] = [key[2] for key in dict_df.index]
turnstile_df['Station'] = [key[3] for key in dict_df.index]
turnstile_df['Linename'] = [key[4] for key in dict_df.index]
turnstile_df['Date'] = [key[5] for key in dict_df.index]
turnstile_df['Entries'] = dict_df.iloc[:, 0].tolist()

In [67]:
# Preview the first ten rows of the flattened per-turnstile, per-date table.
turnstile_df.head(10)

Unnamed: 0,C/A,Unit,SCP,Station,Linename,Date,Entries
0,A002,R051,02-00-00,59 ST,NQR456W,04/29/2017,781
1,A002,R051,02-00-00,59 ST,NQR456W,04/30/2017,378
2,A002,R051,02-00-00,59 ST,NQR456W,05/01/2017,1267
3,A002,R051,02-00-00,59 ST,NQR456W,05/02/2017,1322
4,A002,R051,02-00-00,59 ST,NQR456W,05/03/2017,1193
5,A002,R051,02-00-00,59 ST,NQR456W,05/04/2017,1348
6,A002,R051,02-00-00,59 ST,NQR456W,05/05/2017,1072
7,A002,R051,02-00-01,59 ST,NQR456W,04/29/2017,646
8,A002,R051,02-00-01,59 ST,NQR456W,04/30/2017,538
9,A002,R051,02-00-01,59 ST,NQR456W,05/01/2017,976


In [87]:
# Mean of the per-turnstile, per-date 'Entries' values for each
# (Linename, Station) pair.
# Only 'Entries' is selected before averaging: the remaining columns
# (C/A, Unit, SCP, Date) hold strings, and GroupBy.mean raises TypeError on
# non-numeric columns in pandas >= 2.0 instead of silently dropping them.
# NOTE: this rebinds `d`, which earlier held the dict returned by get_entries.
d = turnstile_df.groupby(['Linename', 'Station'])[['Entries']].mean()