# Deriving the top 5 stations

In [1]:
#######################
# standard code block #
#######################

%pylab inline
# see https://ipython.readthedocs.io/en/stable/interactive/magics.html

%config InlineBackend.figure_format = 'svg'

Populating the interactive namespace from numpy and matplotlib


### Section 1: Describing and Preparing DataFrame for Manipulation

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Only loading 1 weeks' worth of data as a preliminary data exploration
df = pd.read_csv('turnstile_190831.csv',index_col=False)
df.head(7)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/24/2019,00:00:00,REGULAR,7176756,2430372
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/24/2019,04:00:00,REGULAR,7176772,2430375
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/24/2019,08:00:00,REGULAR,7176786,2430408
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/24/2019,12:00:00,REGULAR,7176862,2430481
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/24/2019,16:00:00,REGULAR,7177008,2430520
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/24/2019,20:00:00,REGULAR,7177221,2430563
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/25/2019,00:00:00,REGULAR,7177326,2430585


#### Section 1.1: Describing the Data

In [4]:
df.describe()

Unnamed: 0,ENTRIES,EXITS
count,205263.0,205263.0
mean,42361810.0,34986730.0
std,216159800.0,199924000.0
min,0.0,0.0
25%,286397.5,124770.0
50%,2066203.0,1179877.0
75%,6640564.0,4554985.0
max,2129132000.0,2123825000.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205263 entries, 0 to 205262
Data columns (total 11 columns):
C/A                                                                     205263 non-null object
UNIT                                                                    205263 non-null object
SCP                                                                     205263 non-null object
STATION                                                                 205263 non-null object
LINENAME                                                                205263 non-null object
DIVISION                                                                205263 non-null object
DATE                                                                    205263 non-null object
TIME                                                                    205263 non-null object
DESC                                                                    205263 non-null object
ENTRIES                           

In [6]:
# Here, we realize there is a white space in the columns
df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES',
       'EXITS                                                               '],
      dtype='object')

In [7]:
# Whitespace cleaned
df.columns = df.columns.str.strip()
df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

### Section 2: Manipulating Data

In [8]:
# Preliminary inspection of dataframe
df.head(7)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/24/2019,00:00:00,REGULAR,7176756,2430372
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/24/2019,04:00:00,REGULAR,7176772,2430375
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/24/2019,08:00:00,REGULAR,7176786,2430408
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/24/2019,12:00:00,REGULAR,7176862,2430481
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/24/2019,16:00:00,REGULAR,7177008,2430520
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/24/2019,20:00:00,REGULAR,7177221,2430563
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/25/2019,00:00:00,REGULAR,7177326,2430585


We understand that the turnstiles update their counter every 4 hours.  
Therefore, the total entry traffic for a single day would be to deduct 00:00 present day from 00:00 from a previous day.  
Therefore, we want to find the minimum for a given a day and subtract it with the minimum from the previous day.  
We first groupby `CA,UNIT,SCP,STATION,DATE` and apply a .agg to the `ENTRIES` column in order to find the minimum entries for the day.

In [9]:
# We have now grouped the dataframe by date
df_ordered_date = df.groupby(['C/A','UNIT','SCP','STATION', 'DATE'])['ENTRIES'].agg({'MIN ENTRIES':'min'})
df_ordered_date

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,MIN ENTRIES
C/A,UNIT,SCP,STATION,DATE,Unnamed: 5_level_1
A002,R051,02-00-00,59 ST,08/24/2019,7176756
A002,R051,02-00-00,59 ST,08/25/2019,7177326
A002,R051,02-00-00,59 ST,08/26/2019,7177742
A002,R051,02-00-00,59 ST,08/27/2019,7178840
A002,R051,02-00-00,59 ST,08/28/2019,7179982
...,...,...,...,...,...
TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/26/2019,5554
TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/27/2019,5554
TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/28/2019,5554
TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/29/2019,5554


In [10]:
# Resetting index in order to perform more operations on each CA/UNIT/SCP
df_min_daily = df_ordered_date.reset_index()
df_min_daily

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,MIN ENTRIES
0,A002,R051,02-00-00,59 ST,08/24/2019,7176756
1,A002,R051,02-00-00,59 ST,08/25/2019,7177326
2,A002,R051,02-00-00,59 ST,08/26/2019,7177742
3,A002,R051,02-00-00,59 ST,08/27/2019,7178840
4,A002,R051,02-00-00,59 ST,08/28/2019,7179982
...,...,...,...,...,...,...
34221,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/26/2019,5554
34222,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/27/2019,5554
34223,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/28/2019,5554
34224,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/29/2019,5554


In [11]:
# Finding out the daily entry per day and applying this to only the most right column of the dataframe (MIN ENTRIES)
# Creating a new column for all this data
# Now we are interested in the entry PER STATION. Not by date anymore.

df_min_daily['DAILY ENTRIES'] = df_min_daily.groupby(by=['C/A','UNIT','SCP','STATION'])['MIN ENTRIES'].diff().shift(-1)
df_min_daily

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,MIN ENTRIES,DAILY ENTRIES
0,A002,R051,02-00-00,59 ST,08/24/2019,7176756,570.0
1,A002,R051,02-00-00,59 ST,08/25/2019,7177326,416.0
2,A002,R051,02-00-00,59 ST,08/26/2019,7177742,1098.0
3,A002,R051,02-00-00,59 ST,08/27/2019,7178840,1142.0
4,A002,R051,02-00-00,59 ST,08/28/2019,7179982,1069.0
...,...,...,...,...,...,...,...
34221,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/26/2019,5554,0.0
34222,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/27/2019,5554,0.0
34223,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/28/2019,5554,0.0
34224,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/29/2019,5554,0.0


In [12]:
# We now drop the `MIN ENTRIES' column because we don't need it anymore
df_min_daily.drop('MIN ENTRIES',axis=1, inplace=True)
df_min_daily

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,DAILY ENTRIES
0,A002,R051,02-00-00,59 ST,08/24/2019,570.0
1,A002,R051,02-00-00,59 ST,08/25/2019,416.0
2,A002,R051,02-00-00,59 ST,08/26/2019,1098.0
3,A002,R051,02-00-00,59 ST,08/27/2019,1142.0
4,A002,R051,02-00-00,59 ST,08/28/2019,1069.0
...,...,...,...,...,...,...
34221,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/26/2019,0.0
34222,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/27/2019,0.0
34223,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/28/2019,0.0
34224,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/29/2019,0.0


In [13]:
# Sorting the values for more data exploration/understanding the dataset more
df_min_daily.sort_values(by='DAILY ENTRIES')

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,DAILY ENTRIES
23741,R161A,R452,01-00-01,72 ST,08/26/2019,-5631322.0
14811,N339A,R114,00-00-01,PARSONS BLVD,08/28/2019,-3838655.0
14805,N339A,R114,00-00-00,PARSONS BLVD,08/29/2019,-3376307.0
23747,R161A,R452,01-00-02,72 ST,08/25/2019,-2543402.0
12054,N196,R285,00-03-01,FAR ROCKAWAY,08/28/2019,-2129633.0
...,...,...,...,...,...,...
34197,TRAM2,R469,00-00-01,RIT-ROOSEVELT,08/30/2019,
34204,TRAM2,R469,00-03-00,RIT-ROOSEVELT,08/30/2019,
34211,TRAM2,R469,00-03-01,RIT-ROOSEVELT,08/30/2019,
34218,TRAM2,R469,00-05-00,RIT-ROOSEVELT,08/30/2019,


There seem to be anomalies in the data. Negative values and also nan and zero values. Now perform cleaning operations on this dataframe.

In [14]:
df_dailyEntries = df_min_daily
df_dailyEntries.loc[df_dailyEntries['DAILY ENTRIES'] < 0, 'DAILY ENTRIES'] = np.nan
df_dailyEntries.loc[df_dailyEntries['DAILY ENTRIES'] > 1000000, 'DAILY ENTRIES'] = np.nan
df_dailyEntries

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,DAILY ENTRIES
0,A002,R051,02-00-00,59 ST,08/24/2019,570.0
1,A002,R051,02-00-00,59 ST,08/25/2019,416.0
2,A002,R051,02-00-00,59 ST,08/26/2019,1098.0
3,A002,R051,02-00-00,59 ST,08/27/2019,1142.0
4,A002,R051,02-00-00,59 ST,08/28/2019,1069.0
...,...,...,...,...,...,...
34221,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/26/2019,0.0
34222,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/27/2019,0.0
34223,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/28/2019,0.0
34224,TRAM2,R469,00-05-01,RIT-ROOSEVELT,08/29/2019,0.0


Combine turnstiles into a single C/A, UNIT, STATION

In [15]:
df_dailyEntries.drop('SCP',axis=1)
df_TurnCombined = df_dailyEntries.groupby(['C/A','UNIT','STATION','DATE'])['DAILY ENTRIES'].agg({'DAILY ENTRIES':'sum'})
df_TurnCombined

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,DAILY ENTRIES
C/A,UNIT,STATION,DATE,Unnamed: 4_level_1
A002,R051,59 ST,08/24/2019,7986.0
A002,R051,59 ST,08/25/2019,6836.0
A002,R051,59 ST,08/26/2019,11750.0
A002,R051,59 ST,08/27/2019,12285.0
A002,R051,59 ST,08/28/2019,11830.0
...,...,...,...,...
TRAM2,R469,RIT-ROOSEVELT,08/26/2019,3977.0
TRAM2,R469,RIT-ROOSEVELT,08/27/2019,3608.0
TRAM2,R469,RIT-ROOSEVELT,08/28/2019,3095.0
TRAM2,R469,RIT-ROOSEVELT,08/29/2019,3857.0


In [16]:
df_StationCombined = df_TurnCombined.reset_index()
df_StationCombined

Unnamed: 0,C/A,UNIT,STATION,DATE,DAILY ENTRIES
0,A002,R051,59 ST,08/24/2019,7986.0
1,A002,R051,59 ST,08/25/2019,6836.0
2,A002,R051,59 ST,08/26/2019,11750.0
3,A002,R051,59 ST,08/27/2019,12285.0
4,A002,R051,59 ST,08/28/2019,11830.0
...,...,...,...,...,...
5222,TRAM2,R469,RIT-ROOSEVELT,08/26/2019,3977.0
5223,TRAM2,R469,RIT-ROOSEVELT,08/27/2019,3608.0
5224,TRAM2,R469,RIT-ROOSEVELT,08/28/2019,3095.0
5225,TRAM2,R469,RIT-ROOSEVELT,08/29/2019,3857.0


In [17]:
df_StationCombined.drop(['C/A','UNIT'],axis=1)
df_StationCombined = df_StationCombined.groupby(['STATION','DATE'])['DAILY ENTRIES'].agg({'DAILY ENTRIES':'sum'})
df_StationCombined

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  


Unnamed: 0_level_0,Unnamed: 1_level_0,DAILY ENTRIES
STATION,DATE,Unnamed: 2_level_1
1 AV,08/24/2019,6260.0
1 AV,08/25/2019,4463.0
1 AV,08/26/2019,14736.0
1 AV,08/27/2019,15819.0
1 AV,08/28/2019,15969.0
...,...,...
ZEREGA AV,08/26/2019,2397.0
ZEREGA AV,08/27/2019,2519.0
ZEREGA AV,08/28/2019,2300.0
ZEREGA AV,08/29/2019,2318.0


In [18]:
df_StationCombined = df_StationCombined.groupby(['STATION'])['DAILY ENTRIES'].agg({'DAILY ENTRIES':'sum'})
df_StationCombined.sort_values(by='DAILY ENTRIES', ascending=False)

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,DAILY ENTRIES
STATION,Unnamed: 1_level_1
34 ST-PENN STA,752978.0
GRD CNTRL-42 ST,656563.0
57 ST-7 AV,628529.0
34 ST-HERALD SQ,602630.0
BROOKLYN BRIDGE,552649.0
...,...
TOMPKINSVILLE,2676.0
BEACH 105 ST,2034.0
BROAD CHANNEL,1703.0
NEWARK HM HE,790.0
