In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from datetime import datetime as dt
import re

%matplotlib inline

In [2]:
df1 = pd.read_csv("http://web.mta.info/developers/data/nyct/turnstile/turnstile_190504.txt")
df2 = pd.read_csv("http://web.mta.info/developers/data/nyct/turnstile/turnstile_190511.txt")
df3 = pd.read_csv("http://web.mta.info/developers/data/nyct/turnstile/turnstile_190518.txt")
df4 = pd.read_csv("http://web.mta.info/developers/data/nyct/turnstile/turnstile_190525.txt")
raw_data = pd.concat([df1,df2,df3,df4])

We found that the `EXITS` column contains trailing spaces, so we get rid of those

In [3]:
raw_data.columns = raw_data.columns.str.rstrip()

Adding a Datetime column

In [4]:
raw_data["DATE_TIME"] = pd.to_datetime(raw_data.DATE + " " + raw_data.TIME, 
                                            format="%m/%d/%Y %H:%M:%S")

The data is organized by turnstile, but does not have a direct unique identifier for each turnstile. To create one we add the column `ID`, which concatenates into a unique turnstile ID.

In [5]:
raw_data["ID"] = raw_data['C/A']+raw_data['UNIT']+raw_data['SCP']+raw_data['STATION']

The data contains duplicates, so we deal with those next

In [6]:
# Sanity Check to verify that "C/A", "UNIT", "SCP", "STATION", "DATE_TIME" is unique
(raw_data
 .groupby(["ID", "DATE_TIME"])
 .ENTRIES.count()
 .reset_index()
 .sort_values("ENTRIES", ascending=False)).head(5)

Unnamed: 0,ID,DATE_TIME,ENTRIES
118612,G009R15102-00-04CONEY IS-STILLW,2019-05-16 17:00:00,2
118444,G009R15102-00-03CONEY IS-STILLW,2019-05-16 17:00:00,2
408244,N525R14201-00-03DELANCEY/ESSEX,2019-05-11 05:00:00,2
408415,N525R14201-00-04DELANCEY/ESSEX,2019-05-11 05:00:00,2
0,A002R05102-00-0059 ST,2019-04-27 00:00:00,1


In [7]:
# Get rid of the duplicate entry
raw_data.sort_values(["ID", "DATE_TIME"], 
                          inplace=True, ascending=False)
raw_data.drop_duplicates(subset=["ID", "DATE_TIME"], inplace=True)

In [8]:
#check to see if TIME is as neat as it first appears
raw_data.TIME.value_counts().head(40)

00:00:00    69096
04:00:00    69092
08:00:00    69075
16:00:00    69061
12:00:00    69049
20:00:00    68972
21:00:00    50879
09:00:00    50873
17:00:00    50864
05:00:00    50851
13:00:00    50833
01:00:00    50828
18:00:00     3137
22:00:00     3136
10:00:00     3132
02:00:00     3131
06:00:00     3128
14:00:00     3125
15:00:00     1076
19:00:00     1076
07:00:00     1074
11:00:00     1073
03:00:00     1066
23:00:00     1064
20:22:00     1038
12:22:00     1037
08:22:00     1037
00:22:00     1037
16:22:00     1036
04:22:00     1030
22:30:00      563
10:30:00      562
06:30:00      560
18:30:00      560
02:30:00      560
14:30:00      560
08:30:00      340
04:30:00      339
16:30:00      338
00:30:00      336
Name: TIME, dtype: int64

In [9]:
raw_data.TIME.str.match('..:00:00').value_counts()

True     744691
False     78294
Name: TIME, dtype: int64

Turns out the time intervals aren't all neatly every 4 hours. I will need to group the column into more regular intervals to proceed to a simpler analysis. Let's try making a function that will categorize the column that way.

In [10]:
def get_time_slot(time):
    timestr = str(time)
    timenum = int(re.sub('[^0-9]', '', timestr))
    if 0 <= timenum < 40000:
        return '12am - 4am'
    elif 40000 <= timenum < 80000:
        return '4am - 8am'
    elif 80000 <= timenum < 120000:
        return '8am - 12pm'
    elif 120000 <= timenum < 160000:
        return '12pm - 4pm'
    elif 160000 <= timenum < 200000:
        return '4pm - 8pm'
    else:
        return '8pm - 12am'

In [11]:
raw_data['TIMESLOT'] = raw_data.TIME.apply(get_time_slot)

In [12]:
raw_data.TIMESLOT.value_counts()

8am - 12pm    145660
4am - 8am     136927
4pm - 8pm     136921
12pm - 4pm    134958
8pm - 12am    134298
12am - 4am    134221
Name: TIMESLOT, dtype: int64

Now there are 6 roughly equal groups of time intervals to analyze.

In [13]:
#time_mask = (raw_data.TIME.str.match('..:00:00'))
# used this line earlier to just work with 

Let's now create columns which show entries and exits since the last tally, rather than cumulative entries and exits.

In [14]:
raw_data['NEWENTRIES'] = raw_data['ENTRIES'].diff()
raw_data.dropna(subset=['NEWENTRIES'], axis=0, inplace=True)

raw_data['NEWEXITS'] = raw_data['EXITS'].diff()
raw_data.dropna(subset=['NEWEXITS'], axis=0, inplace=True)

In [15]:
raw_data[['NEWENTRIES', 'NEWEXITS']]

Unnamed: 0,NEWENTRIES,NEWEXITS
203360,0.0,0.0
203359,0.0,0.0
203358,0.0,0.0
203357,0.0,0.0
203356,0.0,0.0
...,...,...
4,-279.0,-50.0
3,-259.0,-69.0
2,-100.0,-76.0
1,-23.0,-35.0


Our group's analysis of the resulting data shows some problems
* negative values, indicating turnstile tickers that count down instead of up
* Very large values that indicate a reset of a ticker

We take the absolute values for the negative ticker data and set a limit for the maximum reasonable value of people going through a turnstile. Our limit is based on a max of 1 person per second, or 14400 within each 4 hour time window. Anything higher than this number, we replace with 0.

In [16]:
def get_timeslot_counts_entries(row, max_counter):
    if row["NEWENTRIES"] < 0:
        # Maybe counter is reversed?
        row["NEWENTRIES"] = abs(row["NEWENTRIES"])
    if row["NEWENTRIES"] > max_counter:
        # Check it again to make sure we're not still giving a counter that's too big
        row["NEWENTRIES"] = 0
    return row["NEWENTRIES"]

raw_data["NEWENTRIES"] = raw_data.apply(get_timeslot_counts_entries, axis=1, max_counter=14400)

In [17]:
def get_timeslot_counts_exits(row, max_counter):
    if row["NEWEXITS"] < 0:
        # Maybe counter is reversed?
        row["NEWEXITS"] = abs(row["NEWEXITS"])
    if row["NEWEXITS"] > max_counter:
        # Check it again to make sure we're not still giving a counter that's too big
        row["NEWEXITS"] = 0
    return row["NEWEXITS"]

raw_data["NEWEXITS"] = raw_data.apply(get_timeslot_counts_exits, axis=1, max_counter=14400)

Add up entries and exits for total station traffic.

In [18]:
raw_data["TRAFFIC"] = raw_data.NEWENTRIES + raw_data.NEWEXITS

In [19]:
raw_data['TRAFFIC'].head(40)

203360    0.0
203359    0.0
203358    0.0
203357    0.0
203356    0.0
203355    0.0
203354    0.0
203353    0.0
203352    0.0
203351    0.0
203350    0.0
203349    0.0
203348    0.0
203347    0.0
203346    0.0
203345    0.0
203344    0.0
203343    0.0
203342    0.0
203341    0.0
203340    0.0
203339    0.0
203338    0.0
203337    0.0
203336    0.0
203335    0.0
203334    0.0
203333    0.0
203332    0.0
203331    0.0
203330    0.0
203329    0.0
203328    0.0
203327    0.0
203326    0.0
203325    0.0
203324    0.0
203323    0.0
203322    0.0
203321    0.0
Name: TRAFFIC, dtype: float64

Now we can look at the MTA's overall busiest time slots.

In [22]:
raw_data.groupby(['TIMESLOT']).TRAFFIC.mean()

TIMESLOT
12am - 4am     38.451747
12pm - 4pm    438.518050
4am - 8am     293.508497
4pm - 8pm     527.529506
8am - 12pm    400.324722
8pm - 12am    201.061364
Name: TRAFFIC, dtype: float64

Next we can group by station of the busiest stations and find out the best times of day *for those stations*.

In [23]:
#need busiest_stations dataframe to be supplied to it

data_by_timeslot = busiest_stations.groupby(['STATION', 'ID', 'TIMESLOT'])['TRAFFIC'].mean()

NameError: name 'busiest_stations' is not defined