<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
from __future__ import print_function, division

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
import datetime

In [4]:
# Source: http://web.mta.info/developers/turnstile.html
def get_data(week_nums):
    url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    dfs = []

    for week_num in week_nums:
        file_url = url.format(week_num)
        dfs.append(pd.read_csv(file_url))
    return pd.concat(dfs)
        
week_nums = [190907, 190914, 190921]
turnstiles_df = get_data(week_nums)

In [5]:
turnstiles_df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,00:00:00,REGULAR,7183242,2433142
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,04:00:00,REGULAR,7183258,2433149
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,08:00:00,REGULAR,7183278,2433176
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,12:00:00,REGULAR,7183393,2433262
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,16:00:00,REGULAR,7183572,2433312


In [6]:
# TODO: We're not going to use this
mta_stations_csv_path = '../data/mta_stations.csv'

mta_stations = pd.read_csv(mta_stations_csv_path)

mta_stations.rename(columns={'GTFS Stop ID': 'UNIT', 'Division': 'DIVISION'}, inplace=True)

mta_stations

Unnamed: 0,Station ID,Complex ID,UNIT,DIVISION,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,North Direction Label,South Direction Label
0,1,1,R01,BMT,Astoria,Astoria - Ditmars Blvd,Q,N W,Elevated,40.775036,-73.912034,,Manhattan
1,2,2,R03,BMT,Astoria,Astoria Blvd,Q,N W,Elevated,40.770258,-73.917843,Ditmars Blvd,Manhattan
2,3,3,R04,BMT,Astoria,30 Av,Q,N W,Elevated,40.766779,-73.921479,Astoria - Ditmars Blvd,Manhattan
3,4,4,R05,BMT,Astoria,Broadway,Q,N W,Elevated,40.761820,-73.925508,Astoria - Ditmars Blvd,Manhattan
4,5,5,R06,BMT,Astoria,36 Av,Q,N W,Elevated,40.756804,-73.929575,Astoria - Ditmars Blvd,Manhattan
5,6,6,R08,BMT,Astoria,39 Av,Q,N W,Elevated,40.752882,-73.932755,Astoria - Ditmars Blvd,Manhattan
6,7,613,R11,BMT,Astoria,Lexington Av/59 St,M,N W R,Subway,40.762660,-73.967258,Queens,Downtown & Brooklyn
7,8,8,R13,BMT,Astoria,5 Av/59 St,M,N W R,Subway,40.764811,-73.973347,Queens,Downtown & Brooklyn
8,9,9,R14,BMT,Broadway - Brighton,57 St - 7 Av,M,N Q R W,Subway,40.764664,-73.980658,Uptown & Queens,Downtown & Brooklyn
9,10,10,R15,BMT,Broadway - Brighton,49 St,M,N R W,Subway,40.759901,-73.984139,Uptown & Queens,Downtown & Brooklyn


In [7]:
# TODO: We're also not going to use this
turnstiles_df_with_borough = \
    pd.merge(turnstiles_df, mta_stations[[ 'UNIT', 'Borough' ]], on='UNIT', how='left')

turnstiles_df_with_borough.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,Borough
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,00:00:00,REGULAR,7183242,2433142,
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,04:00:00,REGULAR,7183258,2433149,
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,08:00:00,REGULAR,7183278,2433176,
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,12:00:00,REGULAR,7183393,2433262,
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,16:00:00,REGULAR,7183572,2433312,


In [8]:
# Remove any leading and trailing spaces from all column names (The `EXITS` column name contained a large number
# of trailing spaces)
#
# Based on a solution I found here...
# https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
turnstiles_df.columns = turnstiles_df.columns.str.strip()

print('\nColumn names: {}'.format(turnstiles_df.columns))


Column names: Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')


In [9]:
import datetime as dt

# Add the extra columns that we need
turnstiles_df["DATE_TIME"] = pd.to_datetime(turnstiles_df.DATE + " " + turnstiles_df.TIME, 
                                            format="%m/%d/%Y %H:%M:%S")
# Add a DAY_OF_WEEK column
turnstiles_df['DAY_OF_WEEK'] = (turnstiles_df['DATE_TIME']
                                   .dt.dayofweek.map({0: "Monday", 
                                       1: "Tuesday", 
                                       2: "Wednesday", 
                                       3: "Thursday", 
                                       4: "Friday", 
                                       5: "Saturday", 
                                       6: "Sunday"}))

# Time
time = turnstiles_df['TIME']

turnstiles_df['HOUR'] = pd.to_datetime(time).dt.hour
turnstiles_df['MINUTE'] = pd.to_datetime(time).dt.minute

turnstiles_df['ENTRIES_PER_HOUR'] = (turnstiles_df['ENTRIES'] / 4).astype(int)
turnstiles_df['EXITS_PER_HOUR'] = (turnstiles_df['EXITS'] / 4).astype(int)

turnstiles_df.head(20)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME,DAY_OF_WEEK,HOUR,MINUTE,ENTRIES_PER_HOUR,EXITS_PER_HOUR
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,00:00:00,REGULAR,7183242,2433142,2019-08-31 00:00:00,Saturday,0,0,1795810,608285
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,04:00:00,REGULAR,7183258,2433149,2019-08-31 04:00:00,Saturday,4,0,1795814,608287
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,08:00:00,REGULAR,7183278,2433176,2019-08-31 08:00:00,Saturday,8,0,1795819,608294
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,12:00:00,REGULAR,7183393,2433262,2019-08-31 12:00:00,Saturday,12,0,1795848,608315
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,16:00:00,REGULAR,7183572,2433312,2019-08-31 16:00:00,Saturday,16,0,1795893,608328
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,20:00:00,REGULAR,7183842,2433348,2019-08-31 20:00:00,Saturday,20,0,1795960,608337
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/01/2019,00:00:00,REGULAR,7184008,2433376,2019-09-01 00:00:00,Sunday,0,0,1796002,608344
7,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/01/2019,04:00:00,REGULAR,7184025,2433380,2019-09-01 04:00:00,Sunday,4,0,1796006,608345
8,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/01/2019,08:00:00,REGULAR,7184042,2433397,2019-09-01 08:00:00,Sunday,8,0,1796010,608349
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,09/01/2019,12:00:00,REGULAR,7184137,2433450,2019-09-01 12:00:00,Sunday,12,0,1796034,608362


In [20]:
turnstiles_df.LINENAME.unique()

array(['R', '1', '5', '25', '3', '34', '2345S', '23', 'BDNQR2345',
       '2345BDNQR', '2345', '2345R', '7', '7EFMR', 'NQW', '7NQW', '7BDFM',
       '6', '2', '4', '4BD', '245', '45', '456', '456NQRW', '4567S',
       '456LNQRW', '6DF', '456JZ', '2345ACJZ', '123', '1ABCD',
       '1237ACENQRSW', '1237ACENQRS', '123ACE', '123FLM', '23ACE', '1RW',
       'Q', 'EJZ', 'E', 'F', 'FG', 'DFGMNR', 'FJMZ', 'BD', 'BDFQ6',
       'FLM123', 'FM', 'BDFMNQRW', 'BDFM7', 'BDFM', 'G', 'GL', 'EMG',
       'EF', 'EFMR', 'MR', 'EFMR7', 'EMR', 'EM6', 'EM', 'BDE', 'D', 'BD4',
       'A', 'AS', 'AC', 'C', 'ACJLZ', 'ACS', 'ACG', 'ACF', 'ACJZ2345',
       'ACE23', 'ACE', 'CE', 'ABCDEFM', 'ACEL', 'ACENQRS1237W',
       'ACENGRS1237W', 'ABCD1', 'BC', 'ACBD', 'ABCD', 'AC1', 'M', 'JZ',
       'J', 'JMZ', 'JM', 'L', 'LM', 'LG', 'FQ', 'DFNQ', 'N', 'ND', 'NRW',
       'DNR', 'BDNQR', 'R2345', 'BQ', 'BQS', 'S2345', 'S', 'JZ456',
       'JNQRZ6W', 'R1W', 'RNW', 'LNQR456W', 'NQRW', 'NQR456W'],
      dtype=object)

In [21]:
turnstiles_df.STATION.unique()

array(['RIT-ROOSEVELT', 'RIT-MANHATTAN', 'TOMPKINSVILLE', 'ST. GEORGE',
       'EASTCHSTER/DYRE', 'BAYCHESTER AV', 'GUN HILL RD', 'PELHAM PKWY',
       'MORRIS PARK', 'FLATBUSH AV-B.C', 'NEWKIRK AV', 'BEVERLY RD',
       'CHURCH AV', 'WINTHROP ST', 'STERLING ST', 'PRESIDENT ST',
       'NEW LOTS AV', 'VAN SICLEN AV', 'PENNSYLVANIA AV', 'JUNIUS ST',
       'ROCKAWAY AV', 'SARATOGA AV', 'SUTTER AV-RUTLD', 'CROWN HTS-UTICA',
       'KINGSTON AV', 'NOSTRAND AV', 'FRANKLIN AV', 'EASTN PKWY-MUSM',
       'GRAND ARMY PLAZ', 'BERGEN ST', 'ATL AV-BARCLAY', 'NEVINS ST',
       'HOYT ST', 'BOROUGH HALL', 'CLARK ST', '34 ST-HUDSON YD',
       'FLUSHING-MAIN', 'METS-WILLETS PT', '111 ST', '103 ST-CORONA',
       'JUNCTION BLVD', '90 ST-ELMHURST', '82 ST-JACKSON H',
       '74 ST-BROADWAY', '69 ST', '61 ST WOODSIDE', '52 ST',
       '46 ST BLISS ST', '40 ST LOWERY ST', '33 ST-RAWSON ST',
       'ASTORIA DITMARS', '30 AV', 'BROADWAY', '36 AV', '39 AV',
       'QUEENSBORO PLZ', 'COURT SQ', 'HUNTERS PT

In [10]:
# Get rid of the duplicate entries
turnstiles_df.sort_values(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"], 
                          inplace=True, ascending=False)
turnstiles_df.drop_duplicates(subset=["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"], inplace=True)

# Sanity Check to verify that "C/A", "UNIT", "SCP", "STATION", "DATE_TIME" is unique for ENTRIES
(turnstiles_df
 .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"])
 .ENTRIES.count()
 .reset_index()
 .sort_values("ENTRIES", ascending=False)).head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE_TIME,ENTRIES
0,A002,R051,02-00-00,59 ST,2019-08-31 00:00:00,1
410196,R138,R293,00-03-02,34 ST-PENN STA,2019-09-05 10:00:00,1
410198,R138,R293,00-03-02,34 ST-PENN STA,2019-09-05 18:00:00,1
410199,R138,R293,00-03-02,34 ST-PENN STA,2019-09-05 22:00:00,1
410200,R138,R293,00-03-02,34 ST-PENN STA,2019-09-06 02:00:00,1


In [11]:
# Get an overview of how much data we have - i.e. Print the number of records for each date in the dataset
turnstiles_df.DATE.value_counts().sort_index()

08/31/2019    29375
09/01/2019    29243
09/02/2019    29283
09/03/2019    29290
09/04/2019    29366
09/05/2019    29132
09/06/2019    29106
09/07/2019    29225
09/08/2019    29356
09/09/2019    29394
09/10/2019    29552
09/11/2019    29476
09/12/2019    29359
09/13/2019    29221
09/14/2019    29224
09/15/2019    29173
09/16/2019    29174
09/17/2019    29600
09/18/2019    29232
09/19/2019    29262
09/20/2019    29261
Name: DATE, dtype: int64

In [12]:
# Add the total number of entries and exits... This indicates how many people are in the area (either going into
# the subway or coming out of it)
# turnstiles_df['ENTRIES_AND_EXITS'] = (turnstiles_df['ENTRIES'] + turnstiles_df['EXITS']).astype(int)

# turnstiles_df.head()

turnstiles_daily = (turnstiles_df
                        .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE"],as_index=False).ENTRIES.first())

# turnstiles_daily[["PREV_DATE", "PREV_ENTRIES"]] = (turnstiles_daily
#                                                        .groupby(["C/A", "UNIT", "SCP", "STATION"])["DATE", "ENTRIES", "EXITS"]
#                                                        .apply(lambda grp: grp.shift(1)))

turnstiles_daily.head(20)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES
0,A002,R051,02-00-00,59 ST,08/31/2019,7183842
1,A002,R051,02-00-00,59 ST,09/01/2019,7184559
2,A002,R051,02-00-00,59 ST,09/02/2019,7185132
3,A002,R051,02-00-00,59 ST,09/03/2019,7186355
4,A002,R051,02-00-00,59 ST,09/04/2019,7187672
5,A002,R051,02-00-00,59 ST,09/05/2019,7189025
6,A002,R051,02-00-00,59 ST,09/06/2019,7190331
7,A002,R051,02-00-00,59 ST,09/07/2019,7191261
8,A002,R051,02-00-00,59 ST,09/08/2019,7191906
9,A002,R051,02-00-00,59 ST,09/09/2019,7193205


In [13]:
# print(turnstiles_df.HOUR.value_counts())
# print(turnstiles_df.MINUTE.value_counts())

# There are many different MINUTE logging times so... For now, we're only going to use records where MINUTE == 0
#
# Meal time assumptions (These need to be improved)
#
# Breakfast - 8:00 - 12:00
# Lunch - 12:00 - 16:00
# Dinner - 16:00 - 20:00
breakfast_mask = ((turnstiles_df["HOUR"] >= 8) & (turnstiles_df["HOUR"] <= 12) & (turnstiles_df["MINUTE"] == 0))
lunch_mask = ((turnstiles_df["HOUR"] >= 12) & (turnstiles_df["HOUR"] <= 16) & (turnstiles_df["MINUTE"] == 0))
dinner_mask = ((turnstiles_df["HOUR"] >= 16) & (turnstiles_df["HOUR"] <= 20) & (turnstiles_df["MINUTE"] == 0))

morning_mask = ((turnstiles_df["HOUR"] < 12) & (turnstiles_df["MINUTE"] == 0))
afternoon_mask = ((turnstiles_df["HOUR"] >= 12) & (turnstiles_df["MINUTE"] == 0))

In [14]:
breakfast_df = turnstiles_df[breakfast_mask]
lunch_df = turnstiles_df[lunch_mask]
dinner_df = turnstiles_df[dinner_mask]

morning_df = turnstiles_df[morning_mask]
afternoon_df = turnstiles_df[afternoon_mask]

# Print how many breakfast records we have for each day
# print(breakfast_df.DATE.value_counts(ascending=True))
# print(lunch_df.STATION.value_counts(ascending=True))

# breakfast_df.head(20)
# breakfast_df.tail(20)

# turnstiles_breakfast = (breakfast_df
#                         .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE"],as_index=False)
#                         .ENTRIES.first())

breakfast_df.head(20)

# turnstiles_breakfast.head()

# turnstiles_breakfast[["PREV_DATE", "PREV_ENTRIES"]] = (turnstiles_daily
#                                                        .groupby(["C/A", "UNIT", "SCP", "STATION"])["DATE", "ENTRIES"]
#                                                        .apply(lambda grp: grp.shift(1)))

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME,DAY_OF_WEEK,HOUR,MINUTE,ENTRIES_PER_HOUR,EXITS_PER_HOUR
204924,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,09/20/2019,09:00:00,REGULAR,5554,420,2019-09-20 09:00:00,Friday,9,0,1388,105
204918,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,09/19/2019,09:00:00,REGULAR,5554,420,2019-09-19 09:00:00,Thursday,9,0,1388,105
204912,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,09/18/2019,09:00:00,REGULAR,5554,420,2019-09-18 09:00:00,Wednesday,9,0,1388,105
204905,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,09/17/2019,09:00:00,REGULAR,5554,420,2019-09-17 09:00:00,Tuesday,9,0,1388,105
204899,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,09/16/2019,09:00:00,REGULAR,5554,420,2019-09-16 09:00:00,Monday,9,0,1388,105
204893,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,09/15/2019,09:00:00,REGULAR,5554,420,2019-09-15 09:00:00,Sunday,9,0,1388,105
204887,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,09/14/2019,09:00:00,REGULAR,5554,420,2019-09-14 09:00:00,Saturday,9,0,1388,105
205581,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,09/13/2019,09:00:00,REGULAR,5554,420,2019-09-13 09:00:00,Friday,9,0,1388,105
205575,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,09/12/2019,09:00:00,REGULAR,5554,420,2019-09-12 09:00:00,Thursday,9,0,1388,105
205569,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,09/11/2019,09:00:00,REGULAR,5554,420,2019-09-11 09:00:00,Wednesday,9,0,1388,105


In [15]:
breakfast_df.groupby(["C/A", "UNIT", "SCP", "STATION", "DAY_OF_WEEK"])['ENTRIES', 'EXITS', 'ENTRIES_AND_EXITS'].sum()

KeyError: "Columns not found: 'ENTRIES_AND_EXITS'"

In [None]:
breakfast_df.groupby(["C/A", "UNIT", "SCP", "STATION", "DAY_OF_WEEK"])['ENTRIES', 'EXITS', 'ENTRIES_AND_EXITS'].sum()

In [121]:
busiest_breakfast_stations = \
    (breakfast_df.groupby(['STATION', 'DAY_OF_WEEK'])['ENTRIES_AND_EXITS'].sum()
                   .reset_index()
                   .sort_values(by='ENTRIES_AND_EXITS',ascending=False))

print(busiest_breakfast_stations.head(20))

              STATION DAY_OF_WEEK  ENTRIES_AND_EXITS
456   42 ST-PORT AUTH      Friday        81260140766
460   42 ST-PORT AUTH    Thursday        81259369206
462   42 ST-PORT AUTH   Wednesday        81258601460
461   42 ST-PORT AUTH     Tuesday        81257842024
457   42 ST-PORT AUTH      Monday        81257169602
459   42 ST-PORT AUTH      Sunday        81256660901
458   42 ST-PORT AUTH    Saturday        81256187855
396   34 ST-HERALD SQ      Sunday        62399882576
395   34 ST-HERALD SQ    Saturday        62399075573
393   34 ST-HERALD SQ      Friday        62383315855
399   34 ST-HERALD SQ   Wednesday        62380485691
398   34 ST-HERALD SQ     Tuesday        62379093158
394   34 ST-HERALD SQ      Monday        62377913475
397   34 ST-HERALD SQ    Thursday        62375976649
2383   TIMES SQ-42 ST      Friday        61881832414
2387   TIMES SQ-42 ST    Thursday        61880641683
2389   TIMES SQ-42 ST   Wednesday        61879454542
2388   TIMES SQ-42 ST     Tuesday        61878

In [117]:
# Rinse and repeat for lunch
# lunch_df.groupby(["C/A", "UNIT", "SCP", "STATION", "DAY_OF_WEEK"])['ENTRIES_AND_EXITS'].sum()

busiest_lunch_stations = \
    (lunch_df.groupby(['STATION', 'DATE_TIME'])['EXITS'].sum()
                   .reset_index()
                   .sort_values(by='EXITS', ascending=False))

# busiest_lunch_stations.head(20)
busiest_lunch_stations.head(20)

Unnamed: 0,STATION,DATE_TIME,EXITS
2643,42 ST-PORT AUTH,2019-09-20 16:00:00,6089882479
2641,42 ST-PORT AUTH,2019-09-20 12:00:00,6089871704
2640,42 ST-PORT AUTH,2019-09-19 16:00:00,6089830775
2638,42 ST-PORT AUTH,2019-09-19 12:00:00,6089821263
2637,42 ST-PORT AUTH,2019-09-18 16:00:00,6089780604
2635,42 ST-PORT AUTH,2019-09-18 12:00:00,6089771385
2634,42 ST-PORT AUTH,2019-09-17 16:00:00,6089730002
2632,42 ST-PORT AUTH,2019-09-17 12:00:00,6089720937
2631,42 ST-PORT AUTH,2019-09-16 16:00:00,6089680232
2629,42 ST-PORT AUTH,2019-09-16 12:00:00,6089671389


In [113]:
# Rinse and repeat for dinner
dinner_df.groupby(["C/A", "UNIT", "SCP", "STATION", "DAY_OF_WEEK"])['ENTRIES_AND_EXITS'].sum()

busiest_dinner_stations = \
    (dinner_df.groupby(['STATION'])['ENTRIES_AND_EXITS'].sum()
                   .reset_index()
                   .sort_values(by='ENTRIES_AND_EXITS',ascending=False))

busiest_dinner_stations.head()

Unnamed: 0,STATION,ENTRIES_AND_EXITS
65,42 ST-PORT AUTH,568808055489
56,34 ST-HERALD SQ,436717216981
347,TIMES SQ-42 ST,430436540110
195,DEKALB AV,389742678913
43,23 ST,367076084659


In [132]:
# Mornings
morning_df.groupby(["STATION", "DAY_OF_WEEK"])['ENTRIES', 'ENTRIES_AND_EXITS'].sum()

# busiest_morning_stations = \
#     (morning_df.groupby(['STATION'])['ENTRIES_AND_EXITS'].sum()
#                    .reset_index()
#                    .sort_values(by='ENTRIES_AND_EXITS',ascending=False))

# busiest_morning_stations.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ENTRIES,ENTRIES_AND_EXITS
STATION,DAY_OF_WEEK,Unnamed: 2_level_1,Unnamed: 3_level_1
1 AV,Friday,12737416136,25928693781
1 AV,Monday,12736803598,25927410329
1 AV,Saturday,12736738110,25927253709
1 AV,Sunday,12736773613,25927337417
1 AV,Thursday,12737244370,25928335976
1 AV,Tuesday,12736922027,25927664440
1 AV,Wednesday,12737081351,25927995841
103 ST,Friday,1292908546,2353652970
103 ST,Monday,1291825351,2351799086
103 ST,Saturday,1291514529,2351253662


In [None]:
# TODO: Late night coding (to correctly align recording periods to standard 00:00 - 04:00, 04:00 - 08:00, etc...
#
# This is a work in progress

print(result.head(20))
# turnstiles_df[mask]['HOUR'].value_counts()

def align_hours(row, lower_boundary, upper_boundary):
    # For exmaple, the boundary is 8(:00) but row['HOUR'] is 9(:00)...
    # For exmaple, the boundary is 8(:00) but row['HOUR'] is 7(:00)...
    hour = row['HOUR']
    entries_per_hour = row['ENTRIES_PER_HOUR']
    
    if (hour < upper_boundary):
        if (hour < lower_boundary):
            return (4 - (lower_boundary - hour)) * entries_per_hour
        elif (hour > lower_boundary):
            return (4 - (hour - lower_boundary)) * entries_per_hour
        else:
            return row['ENTRIES']
    else:
        return 0

result['ALIGNED_HOURS'] = result.apply(align_hours, axis=1, lower_boundary=8, upper_boundary=12)

print(result.head(50))
print(result.tail(50))

result.groupby(["C/A", "UNIT", "SCP", "STATION", "DATE"], as_index=False)['ALIGNED_HOURS'].sum()
#result.groupby(["C/A", "UNIT", "SCP", "STATION", "DATE"], as_index=False)['ALIGNED_HOURS']

# turnstiles_daily

# turnstiles_hourly = (turnstiles_df
#                        .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE"], as_index=False)

# morning = turnstiles_df[turnstiles_df['TIME'] > '08:00:00' & turnstiles_df['TIME'] < '12:00:00']

# print(morning['TIME'].value_counts().head(20))

# morning

# after_noon = turnstiles_df[turnstiles_df['TIME'] > '12:00:00']

# print(after_noon['TIME'].value_counts().head())

# after_noon

In [48]:
# Get an overview of how much data we have - i.e. Print the number of records for each date in the dataset
turnstiles_df.DATE.value_counts().sort_index()

08/31/2019    29375
09/01/2019    29243
09/02/2019    29283
09/03/2019    29290
09/04/2019    29366
09/05/2019    29132
09/06/2019    29106
09/07/2019    29225
09/08/2019    29356
09/09/2019    29394
09/10/2019    29552
09/11/2019    29476
09/12/2019    29359
09/13/2019    29223
09/14/2019    29224
09/15/2019    29173
09/16/2019    29174
09/17/2019    29602
09/18/2019    29232
09/19/2019    29262
09/20/2019    29261
Name: DATE, dtype: int64