# NYC MTA Turnstile Usage Dataset

Have a look at / download the following dataset :

- Go to http://web.mta.info/developers/turnstile.html
- This dataset shows entry & exit counter values for each turnstile-device in each station in the NYC Subway System.
- Note these aren’t counts per interval, but equivalent to an “odometer” reading for each device.
 

# Step 0 - data download

Tool used: Firefox plugin --- DownThemAll!!!

Since the analytics and visualization tasks are based on 2013 data, I only downloaded the last day of 2011, all of the 2012 data, all of the 2013 data, and the first day of 2014.

In [1]:
# Directory the data files were downloaded to. The turnstile_*.txt files and
# Remote-Booth-Station.xls are read from here. This is a machine-specific
# absolute path: change it to your own download location before running.
work_path = "/Users/Wenjie/Projects/GitHub/Tinny/TS/MTA"

# Step 1 - raw data load

In [2]:
import numpy  as np
import pandas as pd

import os, sys
import csv
import glob

from datetime import datetime
from glob     import glob

In [3]:
#file_list = glob( work_path + "/*.txt" )
file_list = ["turnstile_130601.txt", "turnstile_130608.txt", "turnstile_130615.txt"]

# Column names for the raw files, which are read without a header row: three
# device keys followed by eight (DATE, TIME, DESC, ENTRIES, EXITS) groups.
header = ("C/A,UNIT,SCP," + \
          "DATE1,TIME1,DESC1,ENTRIES1,EXITS1,DATE2,TIME2,DESC2,ENTRIES2,EXITS2," + \
          "DATE3,TIME3,DESC3,ENTRIES3,EXITS3,DATE4,TIME4,DESC4,ENTRIES4,EXITS4," + \
          "DATE5,TIME5,DESC5,ENTRIES5,EXITS5,DATE6,TIME6,DESC6,ENTRIES6,EXITS6," + \
          "DATE7,TIME7,DESC7,ENTRIES7,EXITS7,DATE8,TIME8,DESC8,ENTRIES8,EXITS8").split(",")

# Load every file into its own frame. Loading stays best-effort: a file that
# is missing, cannot be parsed, or does not have the expected number of
# columns is skipped, but it is reported instead of being dropped silently.
df_raw_list = []
for filename in file_list:
    # os.path.join leaves `filename` untouched when it is already an absolute
    # path (which is what the glob() alternative above returns).
    file_path = os.path.join(work_path, filename)
    try:
        df_raw = pd.read_csv(file_path, sep = ',', header = None)
        # raises ValueError when the column count differs from the header
        df_raw.columns = header
    except (IOError, OSError, ValueError) as err:
        print("skipping %s: %s" % (file_path, err))
        continue
    df_raw_list.append( df_raw )

# number of files loaded successfully
len(df_raw_list)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


105

In [4]:
# convert format from

# ca,unit,scp,rec1 ... rec8
# .
# .
# to
# ca,unit,scp,rec1
# ca,unit,scp,rec2
# .
# .

def mta_df_normalization( df, n_records = 8 ):
    """Reshape a wide raw turnstile frame into one row per record.

    Each raw row carries the device keys (C/A, UNIT, SCP) followed by
    ``n_records`` groups of (DATEi, TIMEi, DESCi, ENTRIESi, EXITSi) columns.
    The groups are stacked vertically so that every output row holds the
    device keys plus a single record.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the device key columns and the numbered record
        columns for i = 1..n_records.
    n_records : int, optional
        Number of record groups per raw row (default 8).

    Returns
    -------
    pandas.DataFrame
        Columns C/A, UNIT, SCP, Date, Time, Desc, Entries, Exits. Rows are
        ordered group by group (all first records, then all second records,
        and so on) and keep the index labels of ``df``, so every label
        appears once per group.

    Raises
    ------
    KeyError
        If an expected column is missing from ``df``.
    ValueError
        If ``n_records`` is smaller than 1 (nothing to concatenate).
    """
    key_columns   = ["C/A", "UNIT", "SCP"]
    record_fields = ["DATE", "TIME", "DESC", "ENTRIES", "EXITS"]
    columns_name  = key_columns + ["Date", "Time", "Desc", "Entries", "Exits"]

    records = []
    for i in range(1, n_records + 1):
        # e.g. i = 3 selects DATE3, TIME3, DESC3, ENTRIES3, EXITS3
        wide_names = key_columns + [ field + str(i) for field in record_fields ]
        rec = df[ wide_names ]      # list indexing returns a new frame
        rec.columns = columns_name  # give every group the same column names
        records.append( rec )

    return pd.concat( records )

# Normalize every raw frame, then stack the results into one long frame.
norm_list = list( map( mta_df_normalization, df_raw_list ) )
df_norm   = pd.concat( norm_list )
df_norm.shape

(24276984, 8)

# Step 2 - data pre-processing / cleaning

- definition of "station"
- datetime conversion
- hourly / daily traffic calculation
- anomalies and outliers cleaning

In [5]:
# The per-file normalized frames have already been concatenated into df_norm,
# so the list holding them is no longer needed; remove the name.
del norm_list

In [6]:
# definition of "station"

# data source: Remote-Booth-Station.xls
remote_station_xls = pd.ExcelFile(work_path + "/" + "Remote-Booth-Station.xls")
df_remote_station  = remote_station_xls.parse( "Remote-Booth-Station (3)" )

# We need to link the following key in the main data set:
#     UNIT = Remote Unit for a station (e.g. R051)
# to the Station column in this data source.

# Note that "Remote" means the entrance of a station. Conceptually, a station has many entrances and
# many train lines. Therefore, we assume station-to-remote is a "one-to-many" mapping, which can be
# verified from the data. Occasionally, however, a remote is mapped to more than one station. We treat
# those cases as data oddities for now, for the sake of simplicity.

# remote => station mapping; when a remote is listed more than once, the
# first station found for it is kept
dic_remote_station = {}
for remote, station in zip(df_remote_station["Remote"], df_remote_station["Station"]):
    dic_remote_station.setdefault(remote, station)

#df_remote_station.head()
#dic_remote_station

# Vectorized lookup on the UNIT column. A row-wise df_norm.apply(..., axis=1)
# calls a Python lambda once per row, which is very slow on a frame of this
# size (tens of millions of rows). Units that are missing from the mapping
# get NaN, which pd.isnull() detects just like None.
df_norm["Station"] = df_norm["UNIT"].map( dic_remote_station )

# check the missing values of Station
#df_norm[ pd.isnull(df_norm["Station"]) ].shape

df_norm.head()

KeyboardInterrupt: 

In [None]:
# Count the rows with a missing Date or Time (there are many of them).
# Series.sum() counts the True values in one vectorized pass; the builtin
# sum() would iterate over the rows one by one in Python.
(pd.isnull(df_norm["Date"]) | pd.isnull(df_norm["Time"])).sum()

In [None]:
# Keep only the rows in which both Date and Time are present.
df_norm = df_norm.dropna( subset=["Date", "Time"] )
df_norm.shape

In [None]:
# datetime conversion

def convert_date_time( dt_str, tm_str ):
    """ parse a 'MM-DD-YY' date string and an 'HH:MM:SS' time string into one
    datetime object; return None when the combined text does not match """

    stamp = " ".join( (dt_str, tm_str) )
    try:
        parsed = datetime.strptime(stamp, '%m-%d-%y %H:%M:%S')
    except ValueError:
        # text that does not match the expected layout
        parsed = None
    return parsed

# Build the DateTime column with one vectorized call instead of a row-wise
# apply, which runs a Python function once per row. errors='coerce' turns text
# that does not match the format into NaT, so unparsable rows are still
# recognised as null below (convert_date_time() returns None for them).
df_norm["DateTime"] = pd.to_datetime( df_norm["Date"].astype(str) + " " + df_norm["Time"].astype(str),
                                      format='%m-%d-%y %H:%M:%S', errors='coerce' )

# drop the entries whose DateTime could not be parsed
df_norm = df_norm[ pd.notnull(df_norm["DateTime"]) ]
df_norm.shape

In [None]:
# preview the first rows of the cleaned frame
df_norm.head()

In [None]:
# hourly / daily traffic calculation for each device
# not sure about the 'Desc' column; here we calculate the traffic numbers for all types of 'Desc'.

In [None]:
# hourly calculation (at device level)

# grouped at device level: one group per distinct
# (Station, C/A, UNIT, SCP) combination
df_norm_grp = df_norm.groupby( ["Station","C/A","UNIT","SCP"] )
#len(df_norm_grp)

def cal_hourly_traffic( df ):
    """ derive per-interval traffic from the cumulative counters of one device

    The rows are ordered by DateTime and every reading is compared with the
    previous one. Five columns are added: EntriesLag / ExitsLag (previous
    readings), EntryTraffic / ExitTraffic (reading minus previous reading)
    and HourlyTraffic (sum of both deltas). The earliest row has no previous
    reading, so its added columns are NaN. A new frame is returned; the input
    frame is left unchanged.
    """
    ordered = df.sort_values( by="DateTime" )

    # previous reading of each counter, in time order
    prev_entries = ordered["Entries"].shift(periods=1)
    prev_exits   = ordered["Exits"].shift(periods=1)
    ordered["EntriesLag"] = prev_entries
    ordered["ExitsLag"]   = prev_exits

    # counter increase since the previous reading
    ordered["EntryTraffic"] = ordered["Entries"] - prev_entries
    ordered["ExitTraffic"]  = ordered["Exits"] - prev_exits

    ordered["HourlyTraffic"] = ordered["EntryTraffic"] + ordered["ExitTraffic"]

    return ordered.copy()


# Compute the traffic deltas device by device, then stack the results.
# Iterating over the GroupBy already yields each group's frame, so it is used
# directly instead of being looked up a second time with get_group(name).
df_hourly_list = [ cal_hourly_traffic( group ) for _name, group in df_norm_grp ]
df_hourly = pd.concat(df_hourly_list)
df_hourly.shape

#df_norm_grp.get_group(("59 ST", "A002","R051","02-00-00")).head()

In [None]:
# anomalies and outliers cleaning

# 1. check the monotonic trend of the accumulated values, i.e., negative deltas
#    We did find negative EntryTraffic and ExitTraffic, which means Entries and Exits do NOT
#    increase monotonically over time. There are three ways to handle this situation:
#       a. replace with zeros    - misleading, because it can be interpreted as a device shutdown.
#       b. take the absolute value - may work, but the impact is unknown; it may distort the stats later.
#       c. drop negative entries - this solution is adopted here for simplicity.

# Drop negative / missing entries. BOTH deltas must be non-negative: with "|"
# a row with one negative delta would be kept as long as the other delta is
# non-negative. Comparisons against NaN are False, so rows without a previous
# reading (NaN deltas) are dropped as well.
df_hourly = df_hourly[ (df_hourly["EntryTraffic"] >= 0)  &  (df_hourly["ExitTraffic"] >= 0)  ]
df_hourly.shape

df_hourly.to_pickle('df_hourly.pickle')

In [None]:
# 2. check outliers
# An "outlier" is a daily total more than 6 standard deviations away from the
# device's mean.
# NOTE(review): only the upper tail (mean + 6 * std) is filtered below - confirm
# whether the lower tail should be filtered too.

# daily traffic calculation (at device level)

# create a device ID for further analysis
df_hourly["DeviceID"] = df_hourly["C/A"] + "-" + df_hourly["UNIT"] + "-" + df_hourly["SCP"]

# grouped at device/daily level
df_daily_grp = df_hourly.groupby( ["Station", "DeviceID", "Date"] )

# Calculate daily traffic at device level. Passing a renaming dict to agg() on
# a single column ({'DailyTraffic': np.sum}) was deprecated in pandas 0.20 and
# raises SpecificationError since pandas 1.0, so the sum is named explicitly.
df_daily = df_daily_grp["HourlyTraffic"].sum().reset_index( name="DailyTraffic" )

#df_daily.head()

# get daily stats per device (same dict-renaming issue as above, hence the
# list of aggregations followed by an explicit rename)
daily_stats = df_daily.groupby(["Station", "DeviceID"])["DailyTraffic"].agg( ["mean", "std"] )
daily_stats = daily_stats.rename( columns={"mean": "DailyMean", "std": "DailyStd"} )
daily_stats = daily_stats.reset_index()
#daily_stats.head()

df_daily = pd.merge(df_daily, daily_stats, on=["Station","DeviceID"], how='inner')
#df_daily.head()

# Drop outliers.
# NOTE(review): a device with a single day of data has a NaN DailyStd, so the
# comparison is False and that row is dropped too - confirm this is intended.
df_daily = df_daily[ df_daily["DailyTraffic"] <= (df_daily["DailyMean"] + 6.0 * df_daily["DailyStd"]) ]
df_daily.shape

# Step 3 - data analysis:
-                 Which station has the most number of units?
-                 What is the total number of entries & exits across the subway system for August 1, 2013?
-                 Let’s define the busy-ness as sum of entry & exit count. What station was the busiest on August 1, 2013? What turnstile was the busiest on that date?
-                 What stations have seen the most usage growth/decline in 2013?
-                 What dates are the least busy? Could you identify days on which stations were not operating at full capacity or closed entirely?

# Step 4 - data visualization:
-                 Plot the daily row counts for data files in Q3 2013.
-                 Plot the daily total number of entries & exits across the system for Q3 2013.
-                 Plot the mean and standard deviation of the daily total number of entries & exits for each month in Q3 2013 for station 34 ST-PENN STA.
-                 Plot 25/50/75 percentile of the daily total number of entries & exits for each month in Q3 2013 for station 34 ST-PENN STA.
-                 Plot the daily number of closed stations and number of stations that were not operating at full capacity in Q3 2013.
 