# Project Benson

## Loading and Cleaning 

In [34]:
import pandas as pd
col_names = ['BOOTH','UNIT','SCP','STATION','LINENAME','DIVISION','DATE','TIME','DESC', 'ENTRIES','EXITS']
filenames = ['turnstile_190504.txt','turnstile_190511.txt','turnstile_190518.txt','turnstile_190525.txt']

In [35]:
def merge_data(list_of_files):
    new_df = pd.DataFrame()
    for file in list_of_files:
        df = pd.read_table('data/'+file, sep = ",", header = 0, names = col_names)
        #print(len(df)) -- just for checking 
        new_df = new_df.append(df)
    return new_df


In [36]:
df = merge_data(filenames)

In [37]:
###convert DATE and TIME to datetime objects 
df['DATE'] =  pd.to_datetime(df['DATE'], format= "%m/%d/%Y")
df['DATE'].dt.date

df['TIME'] =  pd.to_datetime(df['TIME'], format = "%H:%M:%S")
df['TIME'].dt.time

0         00:00:00
1         04:00:00
2         08:00:00
3         12:00:00
4         16:00:00
            ...   
203358    05:00:00
203359    09:00:00
203360    13:00:00
203361    17:00:00
203362    21:00:00
Name: TIME, Length: 822989, dtype: object

In [38]:
### create new variable that will uniquely identify stations because some stations have the same name 
df['STATLINE'] = df['STATION'] + '-' + df['LINENAME']

In [39]:
### to group entries and exits by date, unit, turnstile and then sort by time 
### this is to apply the diff() function in chronological order by unique turnstile 
grouped = df.groupby(['STATLINE', 'UNIT', 'SCP', 'DATE'])
grouped = grouped.apply(lambda _df: _df.sort_values(by=['TIME']))

In [49]:
### add day of week to dataframe to identify weekdays and weekends 
grouped['DAY'] = grouped['DATE'].dt.dayofweek

In [50]:
### calculate absolute number of entries and exits because data is cumulative 
grouped['entry_diff'] = grouped.ENTRIES.diff()
grouped['exit_diff'] = grouped.EXITS.diff()

In [51]:
### remove rows that where difference is negative for both entries and exits 
grouped_entry = grouped[(grouped.entry_diff > 0)]
grouped_exit = grouped[(grouped.exit_diff > 0)]

# EDA 

In [52]:
### define morning peakhours using start and end times 
import datetime

start_time = datetime.datetime.strptime('07:00:00', '%H:%M:%S')
start_time.time
end_time = datetime.datetime.strptime('10:00:00', '%H:%M:%S')
end_time.time

<function datetime.time>

In [53]:
### find the busiest stations in terms of exits during weekends for mornings 
grouped_exit2 = grouped_exit[(grouped_exit.DAY > 5) & (grouped_exit.TIME >= start_time ) & ((grouped_exit.TIME <= end_time ))]
grouped_exit2.groupby(grouped_exit2.STATLINE).exit_diff.sum().sort_values(ascending = False).head(15)

STATLINE
TWENTY THIRD ST-1               1372976.0
GRD CNTRL-42 ST-4567S             11271.0
34 ST-PENN STA-123ACE              9753.0
34 ST-HERALD SQ-BDFMNQRW           9726.0
42 ST-PORT AUTH-ACENQRS1237W       9103.0
14 ST-UNION SQ-LNQR456W            7918.0
72 ST-123                          7289.0
34 ST-PENN STA-ACE                 7226.0
96 ST-123                          6207.0
125 ST-456                         6143.0
CANAL ST-JNQRZ6W                   5975.0
59 ST COLUMBUS-ABCD1               5696.0
W 4 ST-WASH SQ-ABCDEFM             5520.0
TIMES SQ-42 ST-1237ACENQRSW        5518.0
14 ST-UNION SQ-456LNQRW            5458.0
Name: exit_diff, dtype: float64

In [55]:
### find the busiest stations in terms of exits during weekdays for mornings 
grouped_exit3 = grouped_exit[(grouped_exit.DAY >= 0) & (grouped_exit.DAY < 5) & (grouped_exit.TIME >= start_time ) & ((grouped_exit.TIME <= end_time ))]
grouped_exit3.groupby(grouped_exit3.STATLINE).exit_diff.sum().sort_values(ascending = False).head(15)

STATLINE
KINGSBRIDGE RD-BD              16730176.0
TWENTY THIRD ST-1               2795245.0
FULTON ST-ACJZ2345              1055810.0
GRD CNTRL-42 ST-4567S            632320.0
5 AV/53 ST-EM                    365904.0
14 ST-UNION SQ-LNQR456W          303822.0
FULTON ST-2345ACJZ               303744.0
PATH NEW WTC-1                   265155.0
34 ST-HERALD SQ-BDFMNQRW         224174.0
47-50 STS ROCK-BDFM              221612.0
34 ST-PENN STA-123ACE            165511.0
72 ST-2 AVE-Q                    159702.0
WALL ST-45                       158503.0
TIMES SQ-42 ST-1237ACENQRSW      157042.0
LEXINGTON AV/53-EM6              149508.0
Name: exit_diff, dtype: float64

In [59]:
### find the busiest stations in terms of entries during weekends for mornings 
grouped_entry2 = grouped_entry[(grouped_entry.DAY > 5) & (grouped_entry.TIME >= start_time ) & ((grouped_entry.TIME <= end_time ))]
grouped_entry2.groupby(grouped_entry2.STATLINE).entry_diff.sum().sort_values(ascending = False).head(15)

STATLINE
TWENTY THIRD ST-1               94846.0
JKSN HT-ROOSVLT-EFMR7           11009.0
34 ST-PENN STA-123ACE           10285.0
42 ST-PORT AUTH-ACENQRS1237W     9894.0
103 ST-CORONA-7                  6603.0
FLUSHING-MAIN-7                  6365.0
JAMAICA CENTER-EJZ               6332.0
GRD CNTRL-42 ST-4567S            6224.0
CROWN HTS-UTICA-34               6064.0
JUNCTION BLVD-7                  6008.0
96 ST-123                        5802.0
34 ST-HERALD SQ-BDFMNQRW         5771.0
EUCLID AV-AC                     5677.0
34 ST-PENN STA-ACE               5478.0
WOODHAVEN BLVD-MR                4959.0
Name: entry_diff, dtype: float64

In [60]:
### find the busiest stations in terms of entries during weekdays for mornings 
grouped_entry3 = grouped_entry[(grouped_entry.DAY >= 0) & (grouped_entry.DAY < 5) & (grouped_entry.TIME >= start_time ) & ((grouped_entry.TIME <= end_time ))]
grouped_entry3.groupby(grouped_entry3.STATLINE).entry_diff.sum().sort_values(ascending = False).head(15)

STATLINE
KINGSBRIDGE RD-BD               183937920.0
168 ST-AC1                         499914.0
34 ST-PENN STA-123ACE              389423.0
42 ST-PORT AUTH-ACENQRS1237W       378740.0
GRD CNTRL-42 ST-4567S              369553.0
FLUSHING-MAIN-7                    294568.0
GROVE STREET-1                     277859.0
34 ST-PENN STA-ACE                 253668.0
JAMAICA CENTER-EJZ                 238932.0
JKSN HT-ROOSVLT-EFMR7              219484.0
FOREST HILLS 71-EFMR               219051.0
TWENTY THIRD ST-1                  201083.0
JAMAICA 179 ST-F                   187957.0
96 ST-123                          184427.0
HARRISON-1                         180792.0
Name: entry_diff, dtype: float64

In [73]:
### define afternoon peakhours using start and end times 

e_start_time = datetime.datetime.strptime('19:00:00', '%H:%M:%S')
e_start_time.time
e_end_time = datetime.datetime.strptime('22:00:00', '%H:%M:%S')
e_end_time.time

<function datetime.time>

In [75]:
### find the busiest stations in terms of exits during weekends for evenings 
grouped_exit4 = grouped_exit[(grouped_exit.DAY > 5) & (grouped_exit.TIME >= e_start_time ) & ((grouped_exit.TIME <= e_end_time ))]
grouped_exit4.groupby(grouped_exit4.STATLINE).exit_diff.sum().sort_values(ascending = False).head(15)

STATLINE
TWENTY THIRD ST-1               1375125.0
34 ST-HERALD SQ-BDFMNQRW          57223.0
GRD CNTRL-42 ST-4567S             43003.0
42 ST-PORT AUTH-ACENQRS1237W      38983.0
34 ST-PENN STA-ACE                38790.0
TIMES SQ-42 ST-1237ACENQRSW       35018.0
JKSN HT-ROOSVLT-EFMR7             26215.0
14 ST-UNION SQ-LNQR456W           25832.0
W 4 ST-WASH SQ-ABCDEFM            25244.0
59 ST COLUMBUS-ABCD1              22565.0
96 ST-123                         22512.0
72 ST-123                         21052.0
FLUSHING-MAIN-7                   20783.0
86 ST-456                         19152.0
14 ST-UNION SQ-456LNQRW           18196.0
Name: exit_diff, dtype: float64

In [76]:
### find the busiest stations in terms of exits during weekdays for evenings 
grouped_exit5 = grouped_exit[(grouped_exit.DAY >= 0) & (grouped_exit.DAY < 5) & (grouped_exit.TIME >= e_start_time ) & ((grouped_exit.TIME <= e_end_time ))]
grouped_exit5.groupby(grouped_exit5.STATLINE).exit_diff.sum().sort_values(ascending = False).head(15)

STATLINE
BROOKLYN BRIDGE-456JZ           5420995.0
23 ST-CE                        1947511.0
TWENTY THIRD ST-1               1395549.0
34 ST-HERALD SQ-BDFMNQRW         669241.0
GRD CNTRL-42 ST-4567S            623364.0
34 ST-PENN STA-ACE               566835.0
42 ST-PORT AUTH-ACENQRS1237W     533403.0
FLUSHING-MAIN-7                  379117.0
TIMES SQ-42 ST-1237ACENQRSW      339800.0
34 ST-PENN STA-123               335420.0
JKSN HT-ROOSVLT-EFMR7            287059.0
14 ST-UNION SQ-LNQR456W          268993.0
86 ST-456                        254201.0
FULTON ST-2345ACJZ               253462.0
JAMAICA CENTER-EJZ               247972.0
Name: exit_diff, dtype: float64

In [77]:
### find the busiest stations in terms of entries during weekends for evenings  
grouped_entry4 = grouped_entry[(grouped_entry.DAY > 5) & (grouped_entry.TIME >= e_start_time ) & ((grouped_entry.TIME <= e_end_time ))]
grouped_entry4.groupby(grouped_entry4.STATLINE).entry_diff.sum().sort_values(ascending = False).head(15)

STATLINE
8 ST-NYU-NRW                    172696.0
TWENTY THIRD ST-1                97071.0
34 ST-HERALD SQ-BDFMNQRW         70427.0
GRD CNTRL-42 ST-4567S            53873.0
34 ST-PENN STA-ACE               50847.0
42 ST-PORT AUTH-ACENQRS1237W     46345.0
TIMES SQ-42 ST-1237ACENQRSW      43897.0
59 ST COLUMBUS-ABCD1             40442.0
14 ST-UNION SQ-LNQR456W          39927.0
CANAL ST-JNQRZ6W                 33552.0
W 4 ST-WASH SQ-ABCDEFM           30688.0
34 ST-PENN STA-123ACE            27983.0
72 ST-123                        25594.0
86 ST-456                        25488.0
47-50 STS ROCK-BDFM              24832.0
Name: entry_diff, dtype: float64

In [78]:
### find the busiest stations in terms of entries during weekdays for evenings 
grouped_entry5 = grouped_entry[(grouped_entry.DAY >= 0) & (grouped_entry.DAY < 5) & (grouped_entry.TIME >= e_start_time ) & ((grouped_entry.TIME <= e_end_time ))]
grouped_entry5.groupby(grouped_entry5.STATLINE).entry_diff.sum().sort_values(ascending = False).head(15)

STATLINE
23 ST-CE                       2282595.0
GRD CNTRL-42 ST-4567S          1256374.0
34 ST-HERALD SQ-BDFMNQRW        996694.0
47-50 STS ROCK-BDFM             757065.0
14 ST-UNION SQ-LNQR456W         594712.0
FULTON ST-2345ACJZ              560015.0
TIMES SQ-42 ST-1237ACENQRSW     546027.0
BROOKLYN BRIDGE-456JZ           498147.0
59 ST COLUMBUS-ABCD1            494248.0
PATH NEW WTC-1                  485907.0
34 ST-PENN STA-ACE              483550.0
LEXINGTON AV/53-EM6             450324.0
42 ST-BRYANT PK-BDFM7           417459.0
59 ST-456NQRW                   387592.0
CANAL ST-JNQRZ6W                363412.0
Name: entry_diff, dtype: float64

 ## Cross referencing other data sources

### Here's the tech company [map](https://therealdeal.com/2019/08/23/map-heres-a-look-at-all-the-big-tech-locations-in-nyc/)

In [61]:
### We identified the top stations that were closest to tech firms, but there were multiple stations with the same name 

names = ['23 ST', '28 ST', '14 ST', 'UNION', '42 ST']
for name in names:
   print('\n', name, ': ' , grouped[grouped.STATION.str.contains(name) ].STATLINE.unique())


 23 ST :  ['23 ST-1' '23 ST-6' '23 ST-CE' '23 ST-FM' '23 ST-NRW'
 'COURT SQ-23 ST-EMG']

 28 ST :  ['28 ST-1' '28 ST-6' '28 ST-NRW']

 14 ST :  ['14 ST-123FLM' '14 ST-ACEL' '14 ST-FLM123' '14 ST-UNION SQ-456LNQRW'
 '14 ST-UNION SQ-LNQR456W']

 UNION :  ['14 ST-UNION SQ-456LNQRW' '14 ST-UNION SQ-LNQR456W' 'UNION ST-R']

 42 ST :  ['42 ST-BRYANT PK-BDFM7' '42 ST-PORT AUTH-ACENGRS1237W'
 '42 ST-PORT AUTH-ACENQRS1237W' 'GRD CNTRL-42 ST-4567S'
 'TIMES SQ-42 ST-1237ACENQRS' 'TIMES SQ-42 ST-1237ACENQRSW'
 'TIMES SQ-42 ST-ACENQRS1237W']


In [62]:
### We identified the correct stations names by cross referencing the subway map and tech company map 
correct_names = ['23 ST-6', '28 ST-6', '14 ST-ACEL', '14 ST-UNION SQ-456LNQRW',
                 '14 ST-UNION SQ-LNQR456W' 'UNION ST-R',  'TIMES SQ-42 ST-1237ACENQRS',
                 'TIMES SQ-42 ST-1237ACENQRSW',  'TIMES SQ-42 ST-ACENQRS1237W']

In [83]:
exit_count = {}

for station in correct_names:
    exit_count[station]=grouped_exit3[grouped_exit3.STATLINE == station].exit_diff.sum()

print(sorted(exit_count.items(),key = lambda x: x[1], reverse = True), '\n')

[('TIMES SQ-42 ST-1237ACENQRSW', 157042.0), ('14 ST-UNION SQ-456LNQRW', 128589.0), ('23 ST-6', 115132.0), ('28 ST-6', 92242.0), ('14 ST-ACEL', 82465.0), ('TIMES SQ-42 ST-ACENQRS1237W', 38010.0), ('TIMES SQ-42 ST-1237ACENQRS', 21767.0), ('14 ST-UNION SQ-LNQR456WUNION ST-R', 0.0)] 



In [82]:
entry_count = {}

for station in correct_names:
    entry_count[station]=grouped_entry5 [grouped_entry5.STATLINE == station].entry_diff.sum()

print(sorted(entry_count.items(),key = lambda x: x[1], reverse = True), '\n')

[('TIMES SQ-42 ST-1237ACENQRSW', 546027.0), ('23 ST-6', 217057.0), ('14 ST-UNION SQ-456LNQRW', 188828.0), ('TIMES SQ-42 ST-ACENQRS1237W', 171127.0), ('14 ST-ACEL', 162558.0), ('28 ST-6', 147689.0), ('TIMES SQ-42 ST-1237ACENQRS', 123570.0), ('14 ST-UNION SQ-LNQR456WUNION ST-R', 0.0)] 

