In [1]:
from __future__ import division
import csv
from datetime import datetime
from collections import Counter

In [2]:
#!curl -O http://web.mta.info/developers/data/nyct/turnstile/turnstile_160319.txt    #download the week's data file that is opened in the next cell

In [3]:
# Load the local turnstile data file into `rows`: one list of cell strings per
# CSV line, with leading/trailing whitespace stripped from every cell.
with open('turnstile_160319.txt') as f:
    rows = []
    for row in csv.reader(f):
        rows.append([cell.strip() for cell in row])

In [4]:
# Remove the header row first, then validate it. Doing the pop inside the
# assert expression would skip the removal entirely when Python runs with -O
# (assert statements are stripped), leaving the header mixed in with the data.
header = rows.pop(0)
assert header == ['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME',
                  'DIVISION', 'DATE', 'TIME', 'DESC', 'ENTRIES',
                  'EXITS'], 'unexpected header row: %r' % (header,)    #check to make sure that we have the correct variable names

In [5]:
# Group readings by turnstile: the key is the first four columns of a row
# (C/A, UNIT, SCP, STATION) and the value is a list of the remaining columns
# of every row that shares that key, in file order.
# NOTE(review): only the first 100 data rows are grouped here - presumably a
# development shortcut; confirm before using the results as full totals.
raw_readings = {}
for row in rows[0:100]:
    turnstile_id = tuple(row[:4])
    if turnstile_id not in raw_readings:
        raw_readings[turnstile_id] = []    #first reading seen for this turnstile
    raw_readings[turnstile_id].append(tuple(row[4:]))

In [6]:
#raw_readings.keys()    #the dictionary raw_readings solves challenge 1

In [7]:
# Build a new dictionary keyed by turnstile; each value is a list of
# (timestamp, cumulative entries) tuples, one per reading.
# Each reading holds the seven columns that follow STATION; only the DATE,
# TIME and ENTRIES columns are used here.
# The time is parsed with an explicit %H:%M:%S instead of %X: %X means "the
# locale's time representation", so the original format would stop matching
# the file as soon as a non-default locale was active in the kernel.
datetime_cumulative = {
    turnstile: [(datetime.strptime(date + ' ' + time, '%m/%d/%Y %H:%M:%S'),
                 int(in_cumulative))
                for _, _, date, time, _, in_cumulative, _ in readings]
    for turnstile, readings in raw_readings.items()}

In [9]:
# Ensure every turnstile's time series is already in ascending order. Tuples
# compare element by element, so this orders by timestamp first.
# The loop variable is deliberately not called `rows`: rebinding that name
# would overwrite the raw rows list and break re-running the cell that builds
# raw_readings from it.
for series in datetime_cumulative.values():
    assert series == sorted(series)

In [10]:
# Preview the first six turnstiles and their time series, each followed by a
# blank line. Written so it runs unchanged on Python 2 and Python 3:
# items() instead of iteritems(), and print called with a single
# pre-formatted string, which produces the same text as `print key,value`.
for shown, (key, value) in enumerate(datetime_cumulative.items()):
    if shown > 5:
        break
    print("%s %s" % (key, value))
    print("")

('A002', 'R051', '02-00-00', '59 ST') [(datetime.datetime(2016, 3, 12, 3, 0), 5583673), (datetime.datetime(2016, 3, 12, 7, 0), 5583689), (datetime.datetime(2016, 3, 12, 11, 0), 5583785), (datetime.datetime(2016, 3, 12, 15, 0), 5584037), (datetime.datetime(2016, 3, 12, 19, 0), 5584482), (datetime.datetime(2016, 3, 12, 23, 0), 5584768), (datetime.datetime(2016, 3, 13, 4, 0), 5584835), (datetime.datetime(2016, 3, 13, 8, 0), 5584851), (datetime.datetime(2016, 3, 13, 12, 0), 5584909), (datetime.datetime(2016, 3, 13, 16, 0), 5585119), (datetime.datetime(2016, 3, 13, 20, 0), 5585346), (datetime.datetime(2016, 3, 14, 0, 0), 5585473), (datetime.datetime(2016, 3, 14, 4, 0), 5585482), (datetime.datetime(2016, 3, 14, 8, 0), 5585531), (datetime.datetime(2016, 3, 14, 12, 0), 5585680), (datetime.datetime(2016, 3, 14, 16, 0), 5585933), (datetime.datetime(2016, 3, 14, 20, 0), 5586481), (datetime.datetime(2016, 3, 15, 0, 0), 5586640), (datetime.datetime(2016, 3, 15, 4, 0), 5586649), (datetime.datetime(2

In [13]:
# Make a new dictionary keyed by turnstile. Each value is a list of
# [timestamp, change in cumulative count, elapsed time] lists, one per pair
# of consecutive readings; the timestamp is that of the earlier reading and
# both differences are taken as later minus earlier.
datetime_count_times = {
    turnstile: [[earlier[0],
                 later[1] - earlier[1],
                 later[0] - earlier[0]]
                for earlier, later in zip(series, series[1:])]
    for turnstile, series in datetime_cumulative.items()}

In [14]:
#datetime_count_times

In [15]:
# Make one more dictionary keyed by turnstile. Each value is a list of
# (timestamp, change in entries) tuples; the elapsed time is dropped and any
# interval whose change is negative or greater than 5000 is discarded.
datetime_counts = {
    turnstile: [tuple(interval[:2])
                for interval in intervals
                if not (interval[1] < 0 or interval[1] > 5000)]
    for turnstile, intervals in datetime_count_times.items()}

In [67]:
#datetime_counts.keys()    #the dictionary datetime_counts solves challenge 2

In [76]:
# Total the per-interval counts by calendar day for every turnstile.
# day_counts maps turnstile -> list of (date, total) pairs sorted by date.
# The loop variables avoid the names `rows`, `time` and `count` so that this
# cell does not overwrite the raw rows list (or other earlier names) and the
# earlier cells stay re-runnable.
day_counts = {}
for turnstile, counts in datetime_counts.items():
    by_day = {}    #running total per date for this turnstile
    for timestamp, entries in counts:
        day = timestamp.date()    #keep only the day info
        by_day[day] = by_day.get(day, 0) + entries    #start from 0 when <day> is not in the dictionary yet
    day_counts[turnstile] = sorted(by_day.items())

In [80]:
#day_counts.items()    #the dictionary day_counts solves challenge 3