In [47]:
# Author: Yijun Xiao <ryjxiao@nyu.edu>

## 0. Retrieve data from the MTA website and load it

In [79]:
from __future__ import division, print_function
import pandas as pd
import urllib2
import datetime
import os
import cPickle as pickle

In [111]:
# Scratch load of the station lookup spreadsheet (the same file that
# TurnstileDataLoader.__init__ downloads when no local cache exists).
df = pd.read_excel(
    "http://web.mta.info/developers/resources/nyct/turnstile/Remote-Booth-Station.xls"
)

In [115]:
# Use the same labels, in the same order, that TurnstileDataLoader.__init__
# assigns to this spreadsheet: UNIT first, then C/A.  The previous order had
# the two swapped, which mislabels both key columns.
df.columns = ["UNIT", "C/A", "STATION", "LINENAME", "DIVISION"]

In [170]:
class TurnstileDataLoader(object):
    """Download weekly MTA turnstile files and load them as DataFrames.

    One text file is published per week and is named after a Saturday
    (``turnstile_YYMMDD.txt``).  Files dated before ``new_era`` use an older
    layout with several readings packed on each line and no station columns;
    they are reshaped by ``_load_old`` and annotated with the station lookup
    table.  Files dated on or after ``new_era`` are read directly with
    ``pd.read_csv``.
    """

    def __init__(self):
        """Set up URL/date constants and load the station lookup table.

        The lookup table is read from ``data/station.pkl`` when that file
        exists; otherwise it is downloaded from the MTA website and cached
        there for later runs.
        """
        # format of links to the txt files
        self.url_base = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{0}.txt"
        # first day of data
        self.begining_of_time = datetime.date(2010, 5, 1)
        # date when format of data changed
        self.new_era = datetime.date(2014, 10, 18)
        # captured once, at construction time; used to reject requests for
        # weeks whose file is not expected to exist yet
        self.today = datetime.date.today()

        # prepare station df for old format data
        self.data_dir = "data/"
        station_df_path = os.path.join(self.data_dir, "station.pkl")
        if os.path.isfile(station_df_path):
            # Open in binary mode to match the "wb" mode used when the cache
            # is written below.
            # NOTE: unpickling can execute arbitrary code; station.pkl is a
            # cache written by this class, so keep only trusted files in data/.
            with open(station_df_path, "rb") as f:
                self.station_df = pickle.load(f)
        else:
            self.station_df = pd.read_excel("http://web.mta.info/developers/resources/nyct/turnstile/Remote-Booth-Station.xls")
            self.station_df.columns = ["UNIT", "C/A", "STATION", "LINENAME", "DIVISION"]
            # save to data directory
            if not os.path.exists(self.data_dir):
                os.makedirs(self.data_dir)
            with open(station_df_path, "wb") as f:
                pickle.dump(self.station_df, f)

    def _next_saturday(self, date):
        """Return the first Saturday strictly after ``date``.

        A Saturday maps to the Saturday one week later, never to itself.
        """
        weekday = date.weekday()  # Monday == 0 ... Saturday == 5, Sunday == 6
        # (weekday + 2) % 7 is the number of days elapsed since the most
        # recent Saturday, so 7 minus that is the distance to the next one.
        delta = datetime.timedelta(7 - (weekday + 2) % 7)
        return date + delta

    def _find_files(self, start_date, end_date):
        """Return the URLs of the weekly files covering the given date range.

        Raises AssertionError if the range is reversed, starts before
        ``begining_of_time``, or ends in a week whose Saturday is after
        ``today``.
        """
        assert start_date <= end_date, "start_date must not be after end_date"
        assert start_date >= self.begining_of_time, \
            "start_date is earlier than the first day of data"
        assert self._next_saturday(end_date) <= self.today, \
            "the file covering end_date is dated after today"

        files = []
        start_saturday = self._next_saturday(start_date)
        end_saturday = self._next_saturday(end_date)
        while start_saturday <= end_saturday:
            datestr = start_saturday.strftime("%y%m%d")
            files.append(self.url_base.format(datestr))
            start_saturday += datetime.timedelta(7)
        return files

    def _parse_old_lines(self, lines):
        """Turn old-format text lines into a list of one-reading dicts.

        Each line is ``C/A,UNIT,SCP`` followed by repeated groups of five
        fields: ``date,time,desc,entries,exits``.  Lines with fewer than 8
        fields are skipped.  A trailing group with fewer than five fields is
        ignored rather than raising on unpacking.
        """
        records = []
        for line in lines:
            row = line.strip().split(",")
            if len(row) < 8:
                continue
            ca, unit, scp = row[:3]
            # Visit only start offsets that leave a complete 5-field group.
            for i in range(3, len(row) - 4, 5):
                date, time_str, desc, entries, exits = row[i:i + 5]
                date_time = datetime.datetime.strptime(date + " " + time_str, "%m-%d-%y %H:%M:%S")
                record = dict(DATE_TIME=date_time, UNIT=unit, SCP=scp,
                              DESC=desc, ENTRIES=int(entries), EXITS=int(exits))
                # "C/A" is not a valid keyword name, so it is set separately
                record["C/A"] = ca
                records.append(record)
        return records

    def _load_old(self, txtfileurl):
        """Load an old-format txt file and return it indexed by DATE_TIME.

        The readings are left-joined with the station lookup table on
        ``C/A`` and ``UNIT`` to add the station columns.
        """
        txtfile = urllib2.urlopen(txtfileurl)
        try:
            records = self._parse_old_lines(txtfile)
        finally:
            # always release the connection, even if parsing fails
            txtfile.close()
        old_df = pd.DataFrame.from_records(records)
        merged = pd.merge(old_df, self.station_df, how="left", on=["C/A", "UNIT"])
        return merged.set_index(["DATE_TIME"])

    def load(self, txtfileurl):
        """Load the txt file at ``txtfileurl`` as a DataFrame indexed by DATE_TIME.

        The file date is taken from the six characters before the ``.txt``
        suffix of the URL and decides which loader is used.
        """
        datestr = txtfileurl[-10:-4]
        # detect whether this file is of the newer, cleaner format
        is_new = datetime.datetime.strptime(datestr, "%y%m%d").date() >= self.new_era
        if is_new:
            # columns 6 and 7 are parsed together into one datetime column,
            # which pandas names DATE_TIME and which becomes the index
            return pd.read_csv(txtfileurl, parse_dates=[[6,7]], index_col=["DATE_TIME"])
        else:
            return self._load_old(txtfileurl)

    def retrieve(self, start_date, end_date):
        """Retrieve the data for the weeks covering ``start_date``..``end_date``.

        Every weekly file returned by ``_find_files`` is loaded in full and
        the frames are concatenated; rows are not trimmed to the requested
        dates, so the result can include readings outside that range.
        """
        files = self._find_files(start_date, end_date)
        frames = [self.load(fileurl) for fileurl in files]
        return pd.concat(frames)

In [55]:
# Weekly file dated 2014-10-18, the date TurnstileDataLoader.new_era marks as
# the switch to the new data format.
txtfileurl = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_141018.txt"

In [58]:
# Scratch read of the file above: columns 6 and 7 are parsed together as one
# datetime column, and the first five columns form a MultiIndex.
df = pd.read_csv(
    txtfileurl,
    parse_dates=[[6, 7]],
    index_col=[0, 1, 2, 3, 4],
)

{'CA': 1}

In [171]:
# Building the loader reads data/station.pkl when it exists; otherwise it
# downloads the station spreadsheet and writes that cache file.
data_loader = TurnstileDataLoader()

In [172]:
# Request 1-2 July 2013.  retrieve() loads the whole weekly file(s) covering
# these dates, so the frame also holds readings from outside the range.
start_date, end_date = datetime.date(2013, 7, 1), datetime.date(2013, 7, 2)
df = data_loader.retrieve(start_date=start_date, end_date=end_date)

In [173]:
# Preview the first rows instead of rendering the whole frame
df.head(10)

Unnamed: 0_level_0,C/A,DESC,ENTRIES,EXITS,SCP,UNIT,STATION,LINENAME,DIVISION
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013-06-29 00:00:00,A002,REGULAR,4174592,1433672,02-00-00,R051,LEXINGTON AVE,456NQR,BMT
2013-06-29 04:00:00,A002,REGULAR,4174628,1433675,02-00-00,R051,LEXINGTON AVE,456NQR,BMT
2013-06-29 08:00:00,A002,REGULAR,4174641,1433706,02-00-00,R051,LEXINGTON AVE,456NQR,BMT
2013-06-29 12:00:00,A002,REGULAR,4174741,1433775,02-00-00,R051,LEXINGTON AVE,456NQR,BMT
2013-06-29 16:00:00,A002,REGULAR,4174936,1433826,02-00-00,R051,LEXINGTON AVE,456NQR,BMT
2013-06-29 20:00:00,A002,REGULAR,4175270,1433877,02-00-00,R051,LEXINGTON AVE,456NQR,BMT
2013-06-30 00:00:00,A002,REGULAR,4175403,1433908,02-00-00,R051,LEXINGTON AVE,456NQR,BMT
2013-06-30 04:00:00,A002,REGULAR,4175441,1433914,02-00-00,R051,LEXINGTON AVE,456NQR,BMT
2013-06-30 08:00:00,A002,REGULAR,4175457,1433928,02-00-00,R051,LEXINGTON AVE,456NQR,BMT
2013-06-30 12:00:00,A002,REGULAR,4175520,1433981,02-00-00,R051,LEXINGTON AVE,456NQR,BMT


In [132]:
# Preview the first rows instead of rendering the whole frame
df.head(10)

Unnamed: 0_level_0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,ENTRIES,EXITS
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-06-27 00:00:00,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,REGULAR,5203784,1759879
2015-06-27 04:00:00,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,REGULAR,5203840,1759886
2015-06-27 08:00:00,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,REGULAR,5203860,1759910
2015-06-27 12:00:00,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,REGULAR,5203987,1759995
2015-06-27 16:00:00,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,REGULAR,5204270,1760060
2015-06-27 20:00:00,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,REGULAR,5204563,1760125
2015-06-28 00:00:00,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,REGULAR,5204723,1760168
2015-06-28 04:00:00,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,REGULAR,5204748,1760170
2015-06-28 08:00:00,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,REGULAR,5204757,1760188
2015-06-28 12:00:00,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,REGULAR,5204849,1760240
