# NYC MTA Turnstile Usage Dataset

Have a look at / download the following dataset:

- Go to http://web.mta.info/developers/turnstile.html
- This dataset shows entry & exit counter values for each turnstile-device in each station in the NYC Subway System.
- Note these aren’t counts per interval, but equivalent to an “odometer” reading for each device.
 

# Step 0 - data download

Tool used: Firefox plugin --- DownThemAll!!!

Since the analytics and visualization tasks are based on 2013 data, I only downloaded the last day of 2011, all 2012 data, all 2013 data, and the first day of 2014.

# Step 1 - raw data load

In [191]:
import numpy  as np
import pandas as pd

import os, sys
import csv
import glob

from datetime import datetime
from glob     import glob

# working place: defaults to the original local checkout; set the MTA_WORK_PATH
# environment variable to run the notebook against a different data directory
work_path = os.environ.get( "MTA_WORK_PATH", "/Users/Wenjie/Projects/GitHub/Tinny/TS/MTA" )
# glob() gives no ordering guarantee, so sort to get the same file order on every run
file_list = sorted( glob( work_path + "/*.txt" ) )

In [196]:
file_list = ["turnstile_130601.txt", "turnstile_130608.txt", "turnstile_130615.txt"]

# header used: the three key columns, then eight numbered
# (DATE, TIME, DESC, ENTRIES, EXITS) groups -> 43 column names in total
header = ["C/A", "UNIT", "SCP"] + [
    field + str(slot)
    for slot in range(1, 9)
    for field in ("DATE", "TIME", "DESC", "ENTRIES", "EXITS")
]

df_raw_list = []
for filename in file_list:
    # Passing names= lets the parser apply the header while reading, rather than
    # inferring the width from the first line and renaming afterwards (which
    # fails with a length mismatch whenever the inferred width is not 43).
    df_raw = pd.read_csv(os.path.join(work_path, filename), sep = ',', header = None, names = header)
    df_raw_list.append( df_raw )

# number of raw frames loaded (one per entry in file_list)
len(df_raw_list)

3

In [198]:
def mta_df_normalization( df ):
    """Reshape a wide raw frame into one row per (date, time) reading.

    The input carries the key columns ``C/A``, ``UNIT``, ``SCP`` followed by
    eight numbered groups ``DATE<i>, TIME<i>, DESC<i>, ENTRIES<i>, EXITS<i>``
    (i = 1..8).  Each group is cut out together with the key columns, relabelled
    to the common names below, and the eight pieces are stacked in group order.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the 43 columns described above.

    Returns
    -------
    pandas.DataFrame
        Columns ``C/A, UNIT, SCP, Date, Time, Desc, Entries, Exits`` and
        ``8 * len(df)`` rows.  No rows are dropped, and the original index
        labels are kept, so every input label appears once per group.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from ``df``.
    """
    key_cols     = ["C/A", "UNIT", "SCP"]
    value_fields = ["Date", "Time", "Desc", "Entries", "Exits"]
    columns_name = key_cols + value_fields

    records = []
    for slot in range(1, 9):
        # e.g. slot 3 -> C/A, UNIT, SCP, DATE3, TIME3, DESC3, ENTRIES3, EXITS3
        slot_cols = key_cols + [field.upper() + str(slot) for field in value_fields]
        # rename() returns a new frame, so the caller's df is left untouched
        records.append( df[slot_cols].rename(columns = dict(zip(slot_cols, columns_name))) )

    return pd.concat( records )

In [200]:
# Reshape every raw frame to the long layout, then stack them into one table.
norm_list = list( map( mta_df_normalization, df_raw_list ) )
df_norm   = pd.concat( norm_list )

#test = mta_df_normalization( df_raw )
#test.shape
#test[70:80]
#test["DateTime"] = test.apply( lambda x: convert_date_time( x["Date"] + " " + x["Time"] ) )

In [201]:
#test["DateTime"] = test.apply( lambda x: convert_date_time( str(x["Date"]) + " " + str(x["Time"]) ) )
#test["DateTime"] = test.apply( lambda x: convert_date_time(str(x['Date']), str(x['Time'])), axis=1 )
# (rows, columns) of the combined long-format frame
df_norm.shape

(718160, 8)

# Step 2 - Pre-processing

In [204]:
def convert_date_time( dt_str, tm_str ):
    """Combine a date string and a time string into a ``datetime``.

    The two pieces are joined with a single space and parsed as
    ``MM-DD-YY HH:MM:SS``.  Text that does not match that layout yields
    ``None`` instead of raising.
    """
    layout = '%m-%d-%y %H:%M:%S'
    stamp  = dt_str + " " + tm_str

    try:
        parsed = datetime.strptime(stamp, layout)
    except ValueError:
        # not a valid timestamp in the expected layout
        parsed = None
    return parsed

# Build the timestamp for the whole column at once instead of calling a Python
# function once per row; the layout is the same 'MM-DD-YY HH:MM:SS' as before.
df_norm["DateTime"] = pd.to_datetime(
    df_norm["Date"].astype(str) + " " + df_norm["Time"].astype(str),
    format = '%m-%d-%y %H:%M:%S',
    errors = 'coerce'   # text that does not match the layout becomes NaT rather than raising
)


In [205]:
# preview the first rows, including the newly added DateTime column
df_norm.head()

Unnamed: 0,C/A,UNIT,SCP,Date,Time,Desc,Entries,Exits,DateTime
0,A002,R051,02-00-00,05-25-13,00:00:00,REGULAR,4134240.0,1421141.0,2013-05-25 00:00:00
1,A002,R051,02-00-00,05-26-13,08:00:00,REGULAR,4135142.0,1421455.0,2013-05-26 08:00:00
2,A002,R051,02-00-00,05-27-13,16:00:00,REGULAR,4136076.0,1421853.0,2013-05-27 16:00:00
3,A002,R051,02-00-00,05-29-13,00:00:00,REGULAR,4138040.0,1422467.0,2013-05-29 00:00:00
4,A002,R051,02-00-00,05-30-13,08:00:00,REGULAR,4139863.0,1423127.0,2013-05-30 08:00:00


In [206]:
# group the readings by (C/A, UNIT, SCP) -- presumably one group per turnstile device; TODO confirm
df_norm_grp = df_norm.groupby( ["C/A","UNIT","SCP"] )


In [211]:
# non-null value count per column for the first five (C/A, UNIT, SCP) groups
print(df_norm_grp.count().head()) 

                    Date  Time  Desc  Entries  Exits  DateTime
C/A  UNIT SCP                                                 
A002 R051 02-00-00   133   133   133      133    133       133
          02-00-01   135   135   135      135    135       135
          02-03-00   131   131   131      131    131       131
          02-03-01   133   133   133      133    133       133
          02-03-02   133   133   133      133    133       133
