# MTA Turnstile Analysis

In [2]:
#######################
# standard code block #
#######################

%pylab inline
# see https://ipython.readthedocs.io/en/stable/interactive/magics.html

%config InlineBackend.figure_format = 'svg'

Populating the interactive namespace from numpy and matplotlib


### Section 1: Describing and Preparing DataFrame for Manipulation

In [3]:
import numpy as np
import pandas as pd
import pickle

In [4]:
# Pasrsing the data for the four months March till June that lead up to summer #
# We focused on these four months --> Ideal timing to spread awareness of the event #
df = pd.read_csv('mar_jun_mta_2019_turnstile_data_compiled.csv',index_col=False)
df.head(7)

Unnamed: 0.1,Unnamed: 0,C/A,DATE,DESC,DIVISION,ENTRIES,EXITS,LINENAME,SCP,STATION,TIME,UNIT
0,0,A002,02/23/2019,REGULAR,BMT,6955483,2359112,NQR456W,02-00-00,59 ST,03:00:00,R051
1,1,A002,02/23/2019,REGULAR,BMT,6955494,2359125,NQR456W,02-00-00,59 ST,07:00:00,R051
2,2,A002,02/23/2019,REGULAR,BMT,6955554,2359199,NQR456W,02-00-00,59 ST,11:00:00,R051
3,3,A002,02/23/2019,REGULAR,BMT,6955714,2359248,NQR456W,02-00-00,59 ST,15:00:00,R051
4,4,A002,02/23/2019,REGULAR,BMT,6956004,2359292,NQR456W,02-00-00,59 ST,19:00:00,R051
5,5,A002,02/23/2019,REGULAR,BMT,6956147,2359325,NQR456W,02-00-00,59 ST,23:00:00,R051
6,6,A002,02/24/2019,REGULAR,BMT,6956177,2359332,NQR456W,02-00-00,59 ST,03:00:00,R051


#### Section 1.1: Describing the Data

In [4]:
df.describe()

Unnamed: 0,ENTRIES,EXITS
count,205263.0,205263.0
mean,42361810.0,34986730.0
std,216159800.0,199924000.0
min,0.0,0.0
25%,286397.5,124770.0
50%,2066203.0,1179877.0
75%,6640564.0,4554985.0
max,2129132000.0,2123825000.0


In [5]:
# Checking for null data within our dataframe #
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205263 entries, 0 to 205262
Data columns (total 11 columns):
C/A                                                                     205263 non-null object
UNIT                                                                    205263 non-null object
SCP                                                                     205263 non-null object
STATION                                                                 205263 non-null object
LINENAME                                                                205263 non-null object
DIVISION                                                                205263 non-null object
DATE                                                                    205263 non-null object
TIME                                                                    205263 non-null object
DESC                                                                    205263 non-null object
ENTRIES                           

In [12]:
def basic_clean(df):
    """ 
    Performing a basic cleaning and  of our dataframe, by removing whitespace in columns,
    removing unnecessary columns and changing date to UNIX format.
    """
    
    # Removing whitespace from column names #
    df.columns = df.columns.str.strip()
    
    df.drop(["Unnamed: 0", "LINENAME"], axis = 1, inplace=True)

    # Changing DATE to UNIX #
    df["DATE"] = pd.to_datetime(df.DATE, format = "%m/%d/%Y")

    return df

In [13]:
basic_clean(df)

Unnamed: 0,C/A,DATE,DESC,DIVISION,ENTRIES,EXITS,SCP,STATION,TIME,UNIT
0,A002,2019-02-23,REGULAR,BMT,6955483,2359112,02-00-00,59 ST,03:00:00,R051
1,A002,2019-02-23,REGULAR,BMT,6955494,2359125,02-00-00,59 ST,07:00:00,R051
2,A002,2019-02-23,REGULAR,BMT,6955554,2359199,02-00-00,59 ST,11:00:00,R051
3,A002,2019-02-23,REGULAR,BMT,6955714,2359248,02-00-00,59 ST,15:00:00,R051
4,A002,2019-02-23,REGULAR,BMT,6956004,2359292,02-00-00,59 ST,19:00:00,R051
...,...,...,...,...,...,...,...,...,...,...
3680528,TRAM2,2019-06-28,REGULAR,RIT,5554,379,00-05-01,RIT-ROOSEVELT,05:00:00,R469
3680529,TRAM2,2019-06-28,REGULAR,RIT,5554,379,00-05-01,RIT-ROOSEVELT,09:00:00,R469
3680530,TRAM2,2019-06-28,REGULAR,RIT,5554,379,00-05-01,RIT-ROOSEVELT,13:00:00,R469
3680531,TRAM2,2019-06-28,REGULAR,RIT,5554,379,00-05-01,RIT-ROOSEVELT,17:00:00,R469


### Section 2: Manipulating Data

We understand that the turnstiles update their counter every 4 hours.  
Therefore, the total entry traffic for a single day would be to deduct 00:00 present day from 00:00 from a previous day.  
Therefore, we want to find the minimum for a given a day and subtract it with the minimum from the previous day.  
We first groupby `CA,UNIT,SCP,STATION,DATE` and apply a .agg to the `ENTRIES` column in order to find the minimum entries for the day.

In [15]:
# We have now grouped the dataframe by date
df_ordered_date = df.groupby(['C/A','UNIT','SCP','STATION', 'DATE'])['ENTRIES'].agg({'MIN ENTRIES':'min'})
df_ordered_date

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,MIN ENTRIES
C/A,UNIT,SCP,STATION,DATE,Unnamed: 5_level_1
A002,R051,02-00-00,59 ST,2019-02-23,6955483
A002,R051,02-00-00,59 ST,2019-02-24,6956177
A002,R051,02-00-00,59 ST,2019-02-25,6956582
A002,R051,02-00-00,59 ST,2019-02-26,6957937
A002,R051,02-00-00,59 ST,2019-02-27,6959345
...,...,...,...,...,...
TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-24,5554
TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-25,5554
TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-26,5554
TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-27,5554


In [16]:
# Resetting index in order to perform more operations on each CA/UNIT/SCP
df_min_daily = df_ordered_date.reset_index()
df_min_daily

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,MIN ENTRIES
0,A002,R051,02-00-00,59 ST,2019-02-23,6955483
1,A002,R051,02-00-00,59 ST,2019-02-24,6956177
2,A002,R051,02-00-00,59 ST,2019-02-25,6956582
3,A002,R051,02-00-00,59 ST,2019-02-26,6957937
4,A002,R051,02-00-00,59 ST,2019-02-27,6959345
...,...,...,...,...,...,...
609685,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-24,5554
609686,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-25,5554
609687,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-26,5554
609688,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-27,5554


In [19]:
# Finding out the first daily entry per day and applying this to only the most right column of the dataframe (MIN ENTRIES)
# Creating a new column for all this data
# Now we are interested in the entry PER STATION. Not by date anymore.

df_min_daily['DAILY ENTRIES'] = df_min_daily.groupby(by=['C/A','UNIT','SCP','STATION'])['MIN ENTRIES'].diff().shift(-1)
df_min_daily

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,MIN ENTRIES,DAILY ENTRIES
0,A002,R051,02-00-00,59 ST,2019-02-23,6955483,694.0
1,A002,R051,02-00-00,59 ST,2019-02-24,6956177,405.0
2,A002,R051,02-00-00,59 ST,2019-02-25,6956582,1355.0
3,A002,R051,02-00-00,59 ST,2019-02-26,6957937,1408.0
4,A002,R051,02-00-00,59 ST,2019-02-27,6959345,1345.0
...,...,...,...,...,...,...,...
609685,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-24,5554,0.0
609686,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-25,5554,0.0
609687,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-26,5554,0.0
609688,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-27,5554,0.0


In [20]:
# We now drop the `MIN ENTRIES' column because we don't need it anymore
df_min_daily.drop('MIN ENTRIES',axis=1, inplace=True)
df_min_daily

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,DAILY ENTRIES
0,A002,R051,02-00-00,59 ST,2019-02-23,694.0
1,A002,R051,02-00-00,59 ST,2019-02-24,405.0
2,A002,R051,02-00-00,59 ST,2019-02-25,1355.0
3,A002,R051,02-00-00,59 ST,2019-02-26,1408.0
4,A002,R051,02-00-00,59 ST,2019-02-27,1345.0
...,...,...,...,...,...,...
609685,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-24,0.0
609686,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-25,0.0
609687,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-26,0.0
609688,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-27,0.0


In [21]:
# Sorting the values for more data exploration/understanding the dataset more
df_min_daily.sort_values(by='DAILY ENTRIES')

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,DAILY ENTRIES
491301,R252,R180,00-03-02,103 ST,2019-04-16,-2.071640e+09
225590,N300,R113,01-00-04,7 AV,2019-04-23,-1.835262e+09
420965,R160A,R164,00-06-00,66 ST-LINCOLN,2019-05-30,-1.661416e+09
501611,R288,R275,00-00-03,183 ST,2019-05-14,-1.437240e+09
463304,R228,R143,00-00-01,28 ST,2019-03-22,-1.428035e+09
...,...,...,...,...,...,...
609189,TRAM2,R469,00-00-01,RIT-ROOSEVELT,2019-06-28,
609314,TRAM2,R469,00-03-00,RIT-ROOSEVELT,2019-06-28,
609439,TRAM2,R469,00-03-01,RIT-ROOSEVELT,2019-06-28,
609564,TRAM2,R469,00-05-00,RIT-ROOSEVELT,2019-06-28,


There seem to be anomalies in the data. Negative values and also nan and zero values. Now perform cleaning operations on this dataframe.

In [22]:
df_dailyEntries = df_min_daily
# Removing negative entries within the column DAILY ENTRIES #
df_dailyEntries.loc[df_dailyEntries['DAILY ENTRIES'] < 0, 'DAILY ENTRIES'] = np.nan

# Removing extremely large values that are likely to be anomalous #
df_dailyEntries.loc[df_dailyEntries['DAILY ENTRIES'] > 1000000, 'DAILY ENTRIES'] = np.nan

df_dailyEntries

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,DAILY ENTRIES
0,A002,R051,02-00-00,59 ST,2019-02-23,694.0
1,A002,R051,02-00-00,59 ST,2019-02-24,405.0
2,A002,R051,02-00-00,59 ST,2019-02-25,1355.0
3,A002,R051,02-00-00,59 ST,2019-02-26,1408.0
4,A002,R051,02-00-00,59 ST,2019-02-27,1345.0
...,...,...,...,...,...,...
609685,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-24,0.0
609686,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-25,0.0
609687,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-26,0.0
609688,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2019-06-27,0.0


Combine turnstiles into a single C/A, UNIT, STATION

In [23]:
df_dailyEntries.drop('SCP',axis=1)
df_TurnCombined = df_dailyEntries.groupby(['C/A','UNIT','STATION','DATE'])['DAILY ENTRIES'].agg({'DAILY ENTRIES':'sum'})
df_TurnCombined

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,DAILY ENTRIES
C/A,UNIT,STATION,DATE,Unnamed: 4_level_1
A002,R051,59 ST,2019-02-23,6863.0
A002,R051,59 ST,2019-02-24,4488.0
A002,R051,59 ST,2019-02-25,12239.0
A002,R051,59 ST,2019-02-26,12980.0
A002,R051,59 ST,2019-02-27,12732.0
...,...,...,...,...
TRAM2,R469,RIT-ROOSEVELT,2019-06-24,3603.0
TRAM2,R469,RIT-ROOSEVELT,2019-06-25,2843.0
TRAM2,R469,RIT-ROOSEVELT,2019-06-26,3716.0
TRAM2,R469,RIT-ROOSEVELT,2019-06-27,3441.0


In [27]:
df_StationCombined = df_TurnCombined.reset_index()
df_StationCombined.sort_values(by="DAILY ENTRIES", ascending=False)

Unnamed: 0,C/A,UNIT,STATION,DATE,DAILY ENTRIES
27871,N071,R013,34 ST-PENN STA,2019-02-28,1743125.0
27922,N071,R013,34 ST-PENN STA,2019-04-20,1090011.0
31126,N111,R284,CLINTON-WASH AV,2019-06-13,857187.0
80861,R401,R445,3 AV 138 ST,2019-04-08,793294.0
85631,R519,R223,46 ST BLISS ST,2019-03-02,696818.0
...,...,...,...,...,...
20265,J023,R436,NORWOOD AV,2019-05-26,0.0
20298,J023,R436,NORWOOD AV,2019-06-28,0.0
55418,PTH05,R543,EXCHANGE PLACE,2019-06-28,0.0
20391,J024,R437,CRESCENT ST,2019-05-26,0.0


In [26]:
with open('condensed_by_entries.pickle', 'wb') as to_write:
    pickle.dump(df_StationCombined, to_write)

In [17]:
df_StationCombined.drop(['C/A','UNIT'],axis=1)
df_StationCombined = df_StationCombined.groupby(['STATION','DATE'])['DAILY ENTRIES'].agg({'DAILY ENTRIES':'sum'})
df_StationCombined

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  


Unnamed: 0_level_0,Unnamed: 1_level_0,DAILY ENTRIES
STATION,DATE,Unnamed: 2_level_1
1 AV,08/24/2019,6260.0
1 AV,08/25/2019,4463.0
1 AV,08/26/2019,14736.0
1 AV,08/27/2019,15819.0
1 AV,08/28/2019,15969.0
...,...,...
ZEREGA AV,08/26/2019,2397.0
ZEREGA AV,08/27/2019,2519.0
ZEREGA AV,08/28/2019,2300.0
ZEREGA AV,08/29/2019,2318.0


In [18]:
df_StationCombined = df_StationCombined.groupby(['STATION'])['DAILY ENTRIES'].agg({'DAILY ENTRIES':'sum'})
df_StationCombined.sort_values(by='DAILY ENTRIES', ascending=False)

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,DAILY ENTRIES
STATION,Unnamed: 1_level_1
34 ST-PENN STA,752978.0
GRD CNTRL-42 ST,656563.0
57 ST-7 AV,628529.0
34 ST-HERALD SQ,602630.0
BROOKLYN BRIDGE,552649.0
...,...
TOMPKINSVILLE,2676.0
BEACH 105 ST,2034.0
BROAD CHANNEL,1703.0
NEWARK HM HE,790.0
