In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [15]:
def get_mta_data(weeks):
    """Download weekly MTA turnstile files and combine them into one DataFrame.

    Parameters
    ----------
    weeks : iterable
        Week identifiers; each one is substituted into the
        ``turnstile_{}.txt`` file name (e.g. ``170506``).

    Returns
    -------
    pandas.DataFrame
        The rows of every requested file, in the order the weeks were given,
        relabelled with a single 0..n-1 index.

    Raises
    ------
    ValueError
        Propagated from ``pd.concat`` when ``weeks`` is empty.
    """
    url = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt'
    df_list = [pd.read_csv(url.format(week)) for week in weeks]
    # Every file is read with its own 0-based index; renumber on concat so
    # row labels stay unique across weeks instead of repeating per file.
    return pd.concat(df_list, ignore_index=True)

In [16]:
# Load four weekly turnstile files and stack them into a single frame.
# Each integer is substituted into the file name by get_mta_data; they look
# like YYMMDD dates spaced one week apart — TODO confirm against the data source.
mta_df = get_mta_data([170506, 170513, 170520, 170527])
# Single-week variant (presumably kept for quicker runs while iterating):
#mta_df = get_mta_data([170506])

In [74]:
# Preview the first ten rows of the loaded turnstile data.
mta_df.head(10)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,00:00:00,REGULAR,6157740,2085315
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,04:00:00,REGULAR,6157777,2085319
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,08:00:00,REGULAR,6157810,2085353
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,12:00:00,REGULAR,6157963,2085453
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,16:00:00,REGULAR,6158212,2085529
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/29/2017,20:00:00,REGULAR,6158521,2085589
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/30/2017,00:00:00,REGULAR,6158637,2085612
7,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/30/2017,04:00:00,REGULAR,6158654,2085617
8,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/30/2017,08:00:00,REGULAR,6158661,2085641
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/30/2017,12:00:00,REGULAR,6158703,2085730


In [81]:
def get_entries(mta_df):
    """Return the spread of ENTRIES readings per turnstile and date.

    Rows are read positionally from ``mta_df.itertuples()``: fields 1-4 and 7
    form the grouping key and field 10 is the reading.  For the turnstile
    frame layout (C/A, UNIT, SCP, STATION, LINENAME, DIVISION, DATE, TIME,
    DESC, ENTRIES, ...) that is (C/A, UNIT, SCP, STATION, DATE) -> ENTRIES.

    Returns
    -------
    dict
        Maps each key tuple, in order of first appearance, to a one-element
        list holding ``abs(max - min)`` of the readings seen for that key.
    """
    readings_by_key = {}
    for record in mta_df.itertuples():
        # Position 0 of each record is the frame index, so columns start at 1.
        key = (record[1], record[2], record[3], record[4], record[7])
        readings_by_key.setdefault(key, []).append(record[10])
    return {
        key: [abs(max(readings) - min(readings))]
        for key, readings in readings_by_key.items()
    }

In [115]:
# Per-turnstile, per-date entry spreads keyed by (C/A, UNIT, SCP, STATION, DATE) tuples.
d = get_entries(mta_df)

In [117]:
# Turn the {key tuple: [entries]} mapping into a one-column frame whose index
# holds the key tuples, naming the single value column 'Entries'.
dict_df = (
    pd.DataFrame.from_dict(d, orient='index')
    .rename(columns={0: 'Entries'})
)
dict_df.head(10)

Unnamed: 0,Entries
"(A002, R051, 02-00-00, 59 ST, 04/29/2017)",781
"(A002, R051, 02-00-00, 59 ST, 04/30/2017)",378
"(A002, R051, 02-00-00, 59 ST, 05/01/2017)",1267
"(A002, R051, 02-00-00, 59 ST, 05/02/2017)",1322
"(A002, R051, 02-00-00, 59 ST, 05/03/2017)",1193
"(A002, R051, 02-00-00, 59 ST, 05/04/2017)",1348
"(A002, R051, 02-00-00, 59 ST, 05/05/2017)",1072
"(A002, R051, 02-00-01, 59 ST, 04/29/2017)",646
"(A002, R051, 02-00-01, 59 ST, 04/30/2017)",538
"(A002, R051, 02-00-01, 59 ST, 05/01/2017)",976


In [118]:
# Empty frame (no columns yet) that will hold the flattened per-turnstile table.
turnstile_df = pd.DataFrame(columns=[])

In [119]:
# Build the flat per-turnstile table in a single pass and from scratch, so the
# cell gives the same result however many times it is re-run.
# The index of dict_df holds (C/A, Unit, SCP, Station, Date) tuples; each tuple
# becomes one row with those five columns.
turnstile_df = pd.DataFrame(
    list(dict_df.index),
    columns=['C/A', 'Unit', 'SCP', 'Station', 'Date'],
)
# The first column of dict_df carries the entry counts.  Raw values are used so
# they line up by position with the rows above rather than by index label.
turnstile_df['Entries'] = dict_df.iloc[:, 0].values

In [120]:
# Preview the first ten rows of the flattened per-turnstile table.
turnstile_df.head(10)

Unnamed: 0,C/A,Unit,SCP,Station,Date,Entries
0,A002,R051,02-00-00,59 ST,04/29/2017,781
1,A002,R051,02-00-00,59 ST,04/30/2017,378
2,A002,R051,02-00-00,59 ST,05/01/2017,1267
3,A002,R051,02-00-00,59 ST,05/02/2017,1322
4,A002,R051,02-00-00,59 ST,05/03/2017,1193
5,A002,R051,02-00-00,59 ST,05/04/2017,1348
6,A002,R051,02-00-00,59 ST,05/05/2017,1072
7,A002,R051,02-00-01,59 ST,04/29/2017,646
8,A002,R051,02-00-01,59 ST,04/30/2017,538
9,A002,R051,02-00-01,59 ST,05/01/2017,976


In [121]:
# Sum the per-date Entries values over all dates for each (C/A, SCP) pair.
# NOTE(review): Unit and Station are not part of the grouping key — confirm
# that C/A + SCP alone identifies a turnstile in this data.
turnstile_df.groupby(['C/A', 'SCP'])['Entries'].sum()

C/A    SCP     
A002   02-00-00    31915
       02-00-01    21520
       02-03-00    11701
       02-03-01    28609
       02-03-02    25370
       02-03-03    27908
       02-03-04    29725
       02-03-05    44742
       02-03-06    36191
       02-05-00        8
       02-05-01        0
       02-06-00    35761
A006   00-00-00    38968
       00-00-01    36975
       00-00-02    28035
       00-00-03    23247
       00-00-04    22534
       00-03-00    31818
       00-03-01    41933
       00-03-02    55608
A007   01-05-00        9
       01-05-01        0
       01-06-00    16861
       01-06-01    33857
       01-06-02    28491
       01-06-03    43567
A010   00-00-00    62761
       00-00-01    43211
       00-00-02    31494
       00-00-03    26473
                   ...  
S101A  01-00-00     4077
       01-00-01     6651
       01-00-02     9287
       01-00-03    11612
       01-00-04    11977
       01-00-05    13989
       01-00-06    15378
       01-03-00    25856
       01