# Deriving the top 5 stations

In [1]:
#######################
# standard code block #
#######################

# NOTE(review): %pylab pulls numpy and matplotlib names into the interactive
# namespace (see the "Populating the interactive namespace" message it prints).
%pylab inline
# see https://ipython.readthedocs.io/en/stable/interactive/magics.html

# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'

Populating the interactive namespace from numpy and matplotlib


### Section 1: Describing and Preparing DataFrame for Manipulation

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load only one week's worth of data as a preliminary data exploration.
# index_col=False tells pandas not to use any CSV column as the row index.
df = pd.read_csv('turnstile_190824.csv',index_col=False)
# Preview the first 7 rows.
df.head(7)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/17/2019,00:00:00,REGULAR,7170294,2427642
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/17/2019,04:00:00,REGULAR,7170310,2427649
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/17/2019,08:00:00,REGULAR,7170330,2427678
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/17/2019,12:00:00,REGULAR,7170410,2427748
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/17/2019,16:00:00,REGULAR,7170594,2427804
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/17/2019,20:00:00,REGULAR,7170816,2427842
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/18/2019,00:00:00,REGULAR,7170943,2427869


#### Section 1.1: Describing the Data

In [4]:
# Summary statistics of the numeric columns (ENTRIES and EXITS).
df.describe()

Unnamed: 0,ENTRIES,EXITS
count,205669.0,205669.0
mean,42337710.0,34973760.0
std,216003500.0,199949100.0
min,0.0,0.0
25%,291641.0,125712.0
50%,2080118.0,1187266.0
75%,6671913.0,4580819.0
max,2129142000.0,2123841000.0


In [5]:
# Column names, non-null counts and dtypes of the raw dataframe.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205669 entries, 0 to 205668
Data columns (total 11 columns):
C/A                                                                     205669 non-null object
UNIT                                                                    205669 non-null object
SCP                                                                     205669 non-null object
STATION                                                                 205669 non-null object
LINENAME                                                                205669 non-null object
DIVISION                                                                205669 non-null object
DATE                                                                    205669 non-null object
TIME                                                                    205669 non-null object
DESC                                                                    205669 non-null object
ENTRIES                           

In [6]:
# Here, we realize that the last column name ('EXITS') carries trailing whitespace
df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES',
       'EXITS                                                               '],
      dtype='object')

In [7]:
# Remove leading/trailing whitespace from every column name, then re-check them
df.columns = df.columns.map(str.strip)
df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

### Section 2: Manipulating Data

In [69]:
# Preliminary inspection of the dataframe: first 7 rows
df.head(7)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/17/2019,00:00:00,REGULAR,7170294,2427642
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/17/2019,04:00:00,REGULAR,7170310,2427649
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/17/2019,08:00:00,REGULAR,7170330,2427678
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/17/2019,12:00:00,REGULAR,7170410,2427748
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/17/2019,16:00:00,REGULAR,7170594,2427804
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/17/2019,20:00:00,REGULAR,7170816,2427842
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/18/2019,00:00:00,REGULAR,7170943,2427869


We understand that the turnstiles update their cumulative counter every 4 hours.  
Therefore, the total entry traffic for a single day is the counter reading at 00:00 of the following day minus the reading at 00:00 of that day.  
To compute it, we find the minimum reading for a given day and subtract it from the minimum reading of the following day.  
We first group by `C/A, UNIT, SCP, STATION, DATE` and apply `.agg` to the `ENTRIES` column in order to find the minimum entries for each day.

In [70]:
# Minimum ENTRIES counter reading for every turnstile (C/A, UNIT, SCP, STATION) and DATE.
# The dict-renaming form `.agg({'MIN ENTRIES': 'min'})` on a single column is
# deprecated (see the warning it used to emit) and is rejected by newer pandas,
# so we aggregate with .min() and name the resulting column explicitly.
df_ordered_date = (
    df.groupby(['C/A', 'UNIT', 'SCP', 'STATION', 'DATE'])['ENTRIES']
    .min()
    .to_frame('MIN ENTRIES')
)
df_ordered_date

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,MIN ENTRIES
C/A,UNIT,SCP,STATION,DATE,Unnamed: 5_level_1
A002,R051,02-00-00,59 ST,08/17/2019,7170294
A002,R051,02-00-00,59 ST,08/18/2019,7170943
A002,R051,02-00-00,59 ST,08/19/2019,7171363
A002,R051,02-00-00,59 ST,08/20/2019,7172478
A002,R051,02-00-00,59 ST,08/21/2019,7173517
...,...,...,...,...,...
TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/19/2019,5554
TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/20/2019,5554
TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/21/2019,5554
TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/22/2019,5554


In [71]:
# Resetting the index turns the group keys (C/A, UNIT, SCP, STATION, DATE) back into
# ordinary columns, in order to perform more operations on each CA/UNIT/SCP
df_min_daily = df_ordered_date.reset_index()
df_min_daily

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,MIN ENTRIES
0,A002,R051,02-00-00,59 ST,08/17/2019,7170294
1,A002,R051,02-00-00,59 ST,08/18/2019,7170943
2,A002,R051,02-00-00,59 ST,08/19/2019,7171363
3,A002,R051,02-00-00,59 ST,08/20/2019,7172478
4,A002,R051,02-00-00,59 ST,08/21/2019,7173517
...,...,...,...,...,...,...
34176,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/19/2019,5554
34177,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/20/2019,5554
34178,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/21/2019,5554
34179,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/22/2019,5554


In [72]:
# Daily entries for each turnstile (C/A, UNIT, SCP, STATION):
# the following day's minimum counter reading minus this day's minimum reading.
# The shift is done inside each turnstile group, so a value never leaks across
# turnstile boundaries regardless of how the rows are laid out; the last
# recorded day of every turnstile has no following day and is therefore NaN.
# Rows are assumed to be in ascending DATE order within each turnstile.

turnstile_keys = ['C/A', 'UNIT', 'SCP', 'STATION']
next_day_min = df_min_daily.groupby(by=turnstile_keys)['MIN ENTRIES'].shift(-1)
df_min_daily['DAILY ENTRIES'] = next_day_min - df_min_daily['MIN ENTRIES']
df_min_daily

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,MIN ENTRIES,DAILY ENTRIES
0,A002,R051,02-00-00,59 ST,08/17/2019,7170294,649.0
1,A002,R051,02-00-00,59 ST,08/18/2019,7170943,420.0
2,A002,R051,02-00-00,59 ST,08/19/2019,7171363,1115.0
3,A002,R051,02-00-00,59 ST,08/20/2019,7172478,1039.0
4,A002,R051,02-00-00,59 ST,08/21/2019,7173517,1108.0
...,...,...,...,...,...,...,...
34176,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/19/2019,5554,0.0
34177,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/20/2019,5554,0.0
34178,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/21/2019,5554,0.0
34179,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/22/2019,5554,0.0


In [73]:
# `MIN ENTRIES` has served its purpose (computing `DAILY ENTRIES`), so remove it in place
df_min_daily.drop(columns=['MIN ENTRIES'], inplace=True)
df_min_daily

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,DAILY ENTRIES
0,A002,R051,02-00-00,59 ST,08/17/2019,649.0
1,A002,R051,02-00-00,59 ST,08/18/2019,420.0
2,A002,R051,02-00-00,59 ST,08/19/2019,1115.0
3,A002,R051,02-00-00,59 ST,08/20/2019,1039.0
4,A002,R051,02-00-00,59 ST,08/21/2019,1108.0
...,...,...,...,...,...,...
34176,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/19/2019,0.0
34177,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/20/2019,0.0
34178,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/21/2019,0.0
34179,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/22/2019,0.0


In [74]:
# Order rows by ascending daily entries to expose extreme values (display only;
# df_min_daily itself is not modified)
df_min_daily.sort_values('DAILY ENTRIES', ascending=True)

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,DAILY ENTRIES
3020,B026,R230,00-06-00,NECK RD,08/18/2019,-67989081.0
26599,R238,R046,00-06-02,GRD CNTRL-42 ST,08/20/2019,-10211443.0
9968,N094,R029,01-00-03,WORLD TRADE CTR,08/21/2019,-986871.0
5396,H009,R235,00-00-00,BEDFORD AV,08/21/2019,-915652.0
25556,R210A,R044,03-06-00,BROOKLYN BRIDGE,08/20/2019,-545472.0
...,...,...,...,...,...,...
34152,TRAM2,R469,00-00-01,RIT-ROOSEVELT,08/23/2019,
34159,TRAM2,R469,00-03-00,RIT-ROOSEVELT,08/23/2019,
34166,TRAM2,R469,00-03-01,RIT-ROOSEVELT,08/23/2019,
34173,TRAM2,R469,00-05-00,RIT-ROOSEVELT,08/23/2019,


There seem to be anomalies in the data: negative values, as well as NaN and zero values. We now perform cleaning operations on this dataframe.

In [75]:
# Largest daily count per turnstile that is still kept; anything above this is
# treated as an anomaly and blanked out.
MAX_DAILY_ENTRIES = 1000000

# Work on a copy: a plain assignment would only alias df_min_daily, so the
# cleaning below would silently modify that dataframe as well.
df_dailyEntries = df_min_daily.copy()

# Negative values and values above the threshold are replaced with NaN.
# Existing NaN values compare False on both sides and stay NaN.
is_anomaly = (
    (df_dailyEntries['DAILY ENTRIES'] < 0)
    | (df_dailyEntries['DAILY ENTRIES'] > MAX_DAILY_ENTRIES)
)
df_dailyEntries.loc[is_anomaly, 'DAILY ENTRIES'] = np.nan
df_dailyEntries

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,DAILY ENTRIES
0,A002,R051,02-00-00,59 ST,08/17/2019,649.0
1,A002,R051,02-00-00,59 ST,08/18/2019,420.0
2,A002,R051,02-00-00,59 ST,08/19/2019,1115.0
3,A002,R051,02-00-00,59 ST,08/20/2019,1039.0
4,A002,R051,02-00-00,59 ST,08/21/2019,1108.0
...,...,...,...,...,...,...
34176,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/19/2019,0.0
34177,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/20/2019,0.0
34178,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/21/2019,0.0
34179,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/22/2019,0.0


Combine the turnstiles (SCP) of each C/A, UNIT, STATION into a single daily total.

In [76]:
# Sum the daily entries of all turnstiles (SCP) that share the same
# C/A, UNIT, STATION and DATE. Leaving SCP out of the group keys is what
# combines them; the former `df_dailyEntries.drop('SCP', axis=1)` call discarded
# its result and had no effect, so it is removed.
# The deprecated dict-renaming `.agg({...})` is replaced by .sum(); NaN daily
# values are skipped by the sum.
df_TurnCombined = (
    df_dailyEntries.groupby(['C/A', 'UNIT', 'STATION', 'DATE'])['DAILY ENTRIES']
    .sum()
    .to_frame()
)
df_TurnCombined

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,DAILY ENTRIES
C/A,UNIT,STATION,DATE,Unnamed: 4_level_1
A002,R051,59 ST,08/17/2019,7640.0
A002,R051,59 ST,08/18/2019,6135.0
A002,R051,59 ST,08/19/2019,11387.0
A002,R051,59 ST,08/20/2019,11906.0
A002,R051,59 ST,08/21/2019,11957.0
...,...,...,...,...
TRAM2,R469,RIT-ROOSEVELT,08/19/2019,3461.0
TRAM2,R469,RIT-ROOSEVELT,08/20/2019,3406.0
TRAM2,R469,RIT-ROOSEVELT,08/21/2019,3447.0
TRAM2,R469,RIT-ROOSEVELT,08/22/2019,3420.0


In [77]:
# Turn the (C/A, UNIT, STATION, DATE) index back into ordinary columns.
# NOTE(review): despite the name, each row here is still one C/A + UNIT per DATE,
# not yet a whole station.
df_StationCombined = df_TurnCombined.reset_index()
df_StationCombined

Unnamed: 0,C/A,UNIT,STATION,DATE,DAILY ENTRIES
0,A002,R051,59 ST,08/17/2019,7640.0
1,A002,R051,59 ST,08/18/2019,6135.0
2,A002,R051,59 ST,08/19/2019,11387.0
3,A002,R051,59 ST,08/20/2019,11906.0
4,A002,R051,59 ST,08/21/2019,11957.0
...,...,...,...,...,...
5219,TRAM2,R469,RIT-ROOSEVELT,08/19/2019,3461.0
5220,TRAM2,R469,RIT-ROOSEVELT,08/20/2019,3406.0
5221,TRAM2,R469,RIT-ROOSEVELT,08/21/2019,3447.0
5222,TRAM2,R469,RIT-ROOSEVELT,08/22/2019,3420.0


In [78]:
# Sum all C/A + UNIT rows of a station for each DATE.
# Leaving C/A and UNIT out of the group keys is what combines them; the former
# `df_StationCombined.drop(['C/A','UNIT'], axis=1)` call discarded its result
# (and raised KeyError when this cell was re-run), so it is removed.
# The deprecated dict-renaming `.agg({...})` is replaced by .sum().
# NOTE(review): grouping by STATION name alone merges any distinct stations that
# share a name — confirm whether LINENAME should be part of the key.
df_StationCombined = (
    df_StationCombined.groupby(['STATION', 'DATE'])['DAILY ENTRIES']
    .sum()
    .to_frame()
)
df_StationCombined

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  


Unnamed: 0_level_0,Unnamed: 1_level_0,DAILY ENTRIES
STATION,DATE,Unnamed: 2_level_1
1 AV,08/17/2019,6208.0
1 AV,08/18/2019,4367.0
1 AV,08/19/2019,14726.0
1 AV,08/20/2019,15737.0
1 AV,08/21/2019,15953.0
...,...,...
ZEREGA AV,08/19/2019,2415.0
ZEREGA AV,08/20/2019,2487.0
ZEREGA AV,08/21/2019,2425.0
ZEREGA AV,08/22/2019,2428.0


In [87]:
# Total entries per station over all dates in the dataset.
# The deprecated dict-renaming `.agg({...})` is replaced by .sum().
df_StationCombined = (
    df_StationCombined.groupby(['STATION'])['DAILY ENTRIES']
    .sum()
    .to_frame()
)
# Busiest stations first.
df_StationCombined.sort_values(by='DAILY ENTRIES', ascending=False)

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,DAILY ENTRIES
STATION,Unnamed: 1_level_1
DEKALB AV,965193.0
34 ST-PENN STA,760480.0
JKSN HT-ROOSVLT,669339.0
GRD CNTRL-42 ST,657435.0
34 ST-HERALD SQ,609703.0
...,...
TOMPKINSVILLE,3446.0
BEACH 105 ST,2371.0
NEWARK HM HE,1838.0
BROAD CHANNEL,1525.0
