In [1]:
import pandas as pd

Transportation Data and Examples

http://transitdatatoolkit.com/lessons/mapping-a-transit-system/

###  Turntile Data
Turntile data includes information regarding the entries and exits in each turntile at a MTA station around every 4 hours. Each turntile is distinguished by UNIT, SCP and STATION.  Each station is distinguished by the station name, line name, and division.

Major operations within the data include 1) get the per period entries and exits for each turntile (using groupby), 2) combine data within a station by the time (here the time will be checked by a larger time slice), 3) There are some reset record for entries and exits, how to handle these records (delete or incorporate?)

In [377]:
# save MTA turnstile data into a dataframe
# Source: http://web.mta.info/developers/turnstile.html
df_tt = pd.read_csv('turnstile_191102.txt')

df_tt.head()


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/26/2019,00:00:00,REGULAR,7247322,2455491
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/26/2019,04:00:00,REGULAR,7247336,2455499
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/26/2019,08:00:00,REGULAR,7247351,2455532
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/26/2019,12:00:00,REGULAR,7247463,2455623
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/26/2019,16:00:00,REGULAR,7247755,2455679


In [376]:
df_tt.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES',
       'EXITS                                                               '],
      dtype='object')

Notice that the last column name has a very long whitespace string. We need to remove these whitespaces.

In [378]:
print(f'Check the name: ("{df_tt.columns[-1]}")')

Check the name: ("EXITS                                                               ")


In [379]:
# rename the column name
df_tt.rename(columns={df_tt.columns[-1]:df_tt.columns[-1].strip(' ')}, inplace=True)
print(f'The new columns are {df_tt.columns}')

The new columns are Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')


In [160]:
df_tt.DESC.unique()


# df_tt[df_tt.DESC=='RECOVR AUD']

array(['REGULAR', 'RECOVR AUD'], dtype=object)

In [180]:
df_tt.DIVISION.unique()

array(['BMT', 'IND', 'PTH', 'IRT', 'SRT', 'RIT'], dtype=object)

In [111]:
df_tt['ENTRIES_DIFF']=df_tt['ENTRIES'].diff()
df_tt['EXITS_DIFF']=df_tt['EXITS'].diff()

df_tt.head()

#### We can group

In [170]:
df_grouped = df_tt.groupby(['UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION'])

In [174]:
df_tt['ENTRIES_DIFF']=df_grouped[['ENTRIES']].diff()
df_tt['EXITS_DIFF']=df_grouped[['EXITS']].diff()

In [178]:
df_tt[df_tt['ENTRIES_DIFF'].isnull()]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/26/2019,00:00:00,REGULAR,7247322,2455491,,
43,A002,R051,02-00-01,59 ST,NQR456W,BMT,10/26/2019,00:00:00,REGULAR,6444736,1447575,,
86,A002,R051,02-03-00,59 ST,NQR456W,BMT,10/26/2019,00:00:00,REGULAR,1304582,4917593,,
129,A002,R051,02-03-01,59 ST,NQR456W,BMT,10/26/2019,00:00:00,REGULAR,1298894,1980216,,
172,A002,R051,02-03-02,59 ST,NQR456W,BMT,10/26/2019,00:00:00,REGULAR,6385842,8541069,,
215,A002,R051,02-03-03,59 ST,NQR456W,BMT,10/26/2019,00:00:00,REGULAR,5914215,6737402,,
258,A002,R051,02-03-04,59 ST,NQR456W,BMT,10/26/2019,00:00:00,REGULAR,7309400,3764334,,
301,A002,R051,02-03-05,59 ST,NQR456W,BMT,10/26/2019,00:00:00,REGULAR,11700756,1419321,,
344,A002,R051,02-03-06,59 ST,NQR456W,BMT,10/26/2019,00:00:00,REGULAR,9157846,503364,,
387,A002,R051,02-05-00,59 ST,NQR456W,BMT,10/26/2019,00:00:00,REGULAR,176,0,,


In [32]:
# Check if the selected dataframe has no rows

df[df.STATION=='Broadway'].empty

True

### Station Data

Station data includes information of each station in MTA. It mainly provides the geolocation information, i.e., latitude and longitude of each station. The record is mainly distinguished by the station name, line name, division.

Major operations within this data include 1) select stations in Manhattan, 2)  



https://en.wikipedia.org/wiki/New_York_City_Subway_nomenclature

In [6]:
#df_geo = pd.read_csv('DOITT_SUBWAY_STATION_01_13SEPT2010.csv')
#df_geo.loc[0]['LINE'].split('-')

# df_station_entrances = pd.read_csv('NYC_Transit_Subway_Entrance_And_Exit_Data.csv')

In [381]:
df_stations = pd.read_csv('Stations.csv')

# http://web.mta.info/developers/data/nyct/subway/Stations.csv 
# in GTFS format

df_stations.head()

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,North Direction Label,South Direction Label
0,1,1,R01,BMT,Astoria,Astoria - Ditmars Blvd,Q,N W,Elevated,40.775036,-73.912034,,Manhattan
1,2,2,R03,BMT,Astoria,Astoria Blvd,Q,N W,Elevated,40.770258,-73.917843,Ditmars Blvd,Manhattan
2,3,3,R04,BMT,Astoria,30 Av,Q,N W,Elevated,40.766779,-73.921479,Astoria - Ditmars Blvd,Manhattan
3,4,4,R05,BMT,Astoria,Broadway,Q,N W,Elevated,40.76182,-73.925508,Astoria - Ditmars Blvd,Manhattan
4,5,5,R06,BMT,Astoria,36 Av,Q,N W,Elevated,40.756804,-73.929575,Astoria - Ditmars Blvd,Manhattan


In [33]:
df_stations.Borough.unique().tolist()

['Q', 'M', 'Bk', 'Bx', 'SI']

In [204]:
df_stations['Borough'].value_counts()

Bk    169
M     153
Q      83
Bx     70
SI     21
Name: Borough, dtype: int64

In [39]:
df_stations[df_stations.Borough=='M']

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,North Direction Label,South Direction Label
6,7,613,R11,BMT,Astoria,Lexington Av/59 St,M,N W R,Subway,40.762660,-73.967258,Queens,Downtown & Brooklyn
7,8,8,R13,BMT,Astoria,5 Av/59 St,M,N W R,Subway,40.764811,-73.973347,Queens,Downtown & Brooklyn
8,9,9,R14,BMT,Broadway - Brighton,57 St - 7 Av,M,N Q R W,Subway,40.764664,-73.980658,Uptown & Queens,Downtown & Brooklyn
9,10,10,R15,BMT,Broadway - Brighton,49 St,M,N R W,Subway,40.759901,-73.984139,Uptown & Queens,Downtown & Brooklyn
10,11,611,R16,BMT,Broadway - Brighton,Times Sq - 42 St,M,N Q R W,Subway,40.754672,-73.986754,Uptown & Queens,Downtown & Brooklyn
11,12,607,R17,BMT,Broadway - Brighton,34 St - Herald Sq,M,N Q R W,Subway,40.749567,-73.987950,Uptown & Queens,Downtown & Brooklyn
12,13,13,R18,BMT,Broadway - Brighton,28 St,M,R W,Subway,40.745494,-73.988691,Uptown & Queens,Downtown & Brooklyn
13,14,14,R19,BMT,Broadway - Brighton,23 St,M,R W,Subway,40.741303,-73.989344,Uptown & Queens,Downtown & Brooklyn
14,15,602,R20,BMT,Broadway - Brighton,14 St - Union Sq,M,N Q R W,Subway,40.735736,-73.990568,Uptown & Queens,Downtown & Brooklyn
15,16,16,R21,BMT,Broadway - Brighton,8 St - NYU,M,R W,Subway,40.730328,-73.992629,Uptown & Queens,Downtown & Brooklyn


In [57]:
df_stations[df_stations.Borough=='M']['Stop Name'].nunique()

99

In [179]:
df_stations[df_stations.Borough=='M']['Division'].unique()


array(['BMT', 'IND', 'IRT'], dtype=object)

In [126]:
df_stations[df_stations.Borough=='M']['Stop Name'].unique()

array(['Lexington Av/59 St', '5 Av/59 St', '57 St - 7 Av', '49 St',
       'Times Sq - 42 St', '34 St - Herald Sq', '28 St', '23 St',
       '14 St - Union Sq', '8 St - NYU', 'Prince St', 'Canal St',
       'City Hall', 'Cortlandt St', 'Rector St', 'Whitehall St',
       'Essex St', 'Bowery', 'Chambers St', 'Fulton St', 'Broad St',
       '8 Av', '6 Av', 'Union Sq - 14 St', '3 Av', '1 Av',
       'Inwood - 207 St', 'Dyckman St', '190 St', '181 St', '175 St',
       '168 St', '163 St - Amsterdam Av', '155 St', '145 St', '135 St',
       '125 St', '116 St', 'Cathedral Pkwy (110 St)', '103 St', '96 St',
       '86 St', '81 St - Museum of Natural History', '72 St',
       '59 St - Columbus Circle', '50 St',
       '42 St - Port Authority Bus Terminal', '34 St - Penn Station',
       '14 St', 'W 4 St', 'Spring St', 'World Trade Center',
       'Roosevelt Island', 'Lexington Av/63 St', '57 St',
       '47-50 Sts - Rockefeller Ctr', '42 St - Bryant Pk',
       'Broadway-Lafayette St', 'Grand 

In [119]:
df_stations[df_stations.Borough=='M']['Stop Name'].value_counts()

Canal St                        6
23 St                           5
Times Sq - 42 St                4
125 St                          4
Fulton St                       4
86 St                           4
96 St                           4
145 St                          4
Grand Central - 42 St           3
72 St                           3
28 St                           3
Chambers St                     3
14 St                           3
116 St                          3
103 St                          3
Rector St                       2
59 St - Columbus Circle         2
14 St - Union Sq                2
Wall St                         2
155 St                          2
Dyckman St                      2
34 St - Penn Station            2
135 St                          2
W 4 St                          2
34 St - Herald Sq               2
Spring St                       2
181 St                          2
50 St                           2
Broadway-Lafayette St           1
Cathedral Pkwy

In [139]:
# The stations in different Routes may have the same name. Their coordinates may differ from each other but in a vicinity. Since the turntile data 
# use the stop name for all routes, we will later select one typical 
df_stations[df_stations['Stop Name']=='Canal St']

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,North Direction Label,South Direction Label
17,18,623,R23,BMT,Broadway,Canal St,M,R W,Subway,40.719527,-74.001775,Uptown & Queens,Downtown & Brooklyn
18,19,623,Q01,BMT,Manhattan Bridge,Canal St,M,N Q,Subway,40.718383,-74.00046,Uptown & Queens,Downtown & Brooklyn
103,104,623,M20,BMT,Jamaica,Canal St,M,J Z,Subway,40.718092,-73.999892,Brooklyn,Broad St
169,169,169,A34,IND,8th Av - Fulton St,Canal St,M,A C E,Subway,40.720824,-74.005229,Uptown - Queens,Downtown & Brooklyn
325,325,325,135,IRT,Broadway - 7Av,Canal St,M,1,Subway,40.722854,-74.006277,Uptown & The Bronx,Downtown
410,410,623,639,IRT,Lexington Av,Canal St,M,6,Subway,40.718803,-74.000193,Uptown & The Bronx,Downtown


In [58]:
stationNYC = df_stations[df_stations.Borough=='M']['Stop Name'].unique().tolist()
lstStationNYC = list(map(lambda x : x.upper(), stationNYC))
len(lstStationNYC)

In [64]:
lstDataSta = df.STATION.unique().tolist()

In [197]:
# distance calculation between two points using their latitude and longitude information
# Use geopy module
# https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude

import geopy.distance

coords_1 = (40.576209, -73.967875)
coords_2 = (40.576507, -73.969445)

d = geopy.distance.distance(coords_1, coords_2).m
print(d)

136.99125557229425


In [202]:
df_stations['Stop Name'].nunique()

377

In [203]:
df_stations.columns

Index(['Station ID', 'Complex ID', 'GTFS Stop ID', 'Division', 'Line',
       'Stop Name', 'Borough', 'Daytime Routes', 'Structure', 'GTFS Latitude',
       'GTFS Longitude', 'North Direction Label', 'South Direction Label'],
      dtype='object')

In [206]:
df_stations[df_stations['Borough']=='M']['Stop Name'].unique()

array(['Lexington Av/59 St', '5 Av/59 St', '57 St - 7 Av', '49 St',
       'Times Sq - 42 St', '34 St - Herald Sq', '28 St', '23 St',
       '14 St - Union Sq', '8 St - NYU', 'Prince St', 'Canal St',
       'City Hall', 'Cortlandt St', 'Rector St', 'Whitehall St',
       'Essex St', 'Bowery', 'Chambers St', 'Fulton St', 'Broad St',
       '8 Av', '6 Av', 'Union Sq - 14 St', '3 Av', '1 Av',
       'Inwood - 207 St', 'Dyckman St', '190 St', '181 St', '175 St',
       '168 St', '163 St - Amsterdam Av', '155 St', '145 St', '135 St',
       '125 St', '116 St', 'Cathedral Pkwy (110 St)', '103 St', '96 St',
       '86 St', '81 St - Museum of Natural History', '72 St',
       '59 St - Columbus Circle', '50 St',
       '42 St - Port Authority Bus Terminal', '34 St - Penn Station',
       '14 St', 'W 4 St', 'Spring St', 'World Trade Center',
       'Roosevelt Island', 'Lexington Av/63 St', '57 St',
       '47-50 Sts - Rockefeller Ctr', '42 St - Bryant Pk',
       'Broadway-Lafayette St', 'Grand 

In [221]:
df_stations[df_stations['Borough']=='M']['Stop Name'].value_counts()

Canal St                        6
23 St                           5
Times Sq - 42 St                4
125 St                          4
Fulton St                       4
86 St                           4
96 St                           4
145 St                          4
Grand Central - 42 St           3
72 St                           3
28 St                           3
Chambers St                     3
14 St                           3
116 St                          3
103 St                          3
Rector St                       2
59 St - Columbus Circle         2
14 St - Union Sq                2
Wall St                         2
155 St                          2
Dyckman St                      2
34 St - Penn Station            2
135 St                          2
W 4 St                          2
34 St - Herald Sq               2
Spring St                       2
181 St                          2
50 St                           2
Broadway-Lafayette St           1
Cathedral Pkwy

In [384]:
df_stations[df_stations['Borough']=='M'][['Stop Name', 'Line', 'GTFS Latitude', 'GTFS Longitude']].head()

Unnamed: 0,Stop Name,Line,GTFS Latitude,GTFS Longitude
6,Lexington Av/59 St,Astoria,40.76266,-73.967258
7,5 Av/59 St,Astoria,40.764811,-73.973347
8,57 St - 7 Av,Broadway - Brighton,40.764664,-73.980658
9,49 St,Broadway - Brighton,40.759901,-73.984139
10,Times Sq - 42 St,Broadway - Brighton,40.754672,-73.986754


In [214]:
df_stations[df_stations['Borough']=='M']['Division'].unique()

array(['BMT', 'IND', 'IRT'], dtype=object)





### Station Mapping
The key of bridging these two datasets is the mapping between stations. 

One station may have multiple geolocations that serve different lines. Since we don't obtain the turntiles location information, we will take them as a whole. 

Due to the spelling rules in different datasets, we need the manual check and correction. The mapping information will be saved in a separate csv file for later use.





In [388]:
df_grouped = df_tt.groupby(['UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION'])
df_tt_grouped = df_tt.groupby(['STATION', 'LINENAME', 'DIVISION'])[['STATION', 'LINENAME', 'DIVISION']]

In [301]:
df_tt.DIVISION.unique()

df_tt[df_tt.DIVISION.isin(['BMT', 'IND', 'IRT'])].STATION.nunique()

df_tt['ID'] = df_tt[['STATION', 'LINENAME', 'DIVISION']].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1)


tt_list = df_tt['ID'].unique().tolist()
tt_list = list(map(lambda x: x.split(','), tt_list))

df_tt_list = pd.DataFrame(tt_list, columns=['Station', 'Routes', 'Division'])
df_tt_list.head()

def hyphen_adjust(x):
    if '-' in x:
        tmp = x.split('-')
        return ' - '.join(tmp)
    else:
        return x

df_tt_list['Station'] = df_tt_list['Station'].apply(hyphen_adjust)

df_tt_list['Routes'] = df_tt_list['Routes'].apply(lambda x: set(x))

df_tt_list.head()

df_tt_list.shape

In [256]:
indexNames = df_stations[ df_stations['Borough'] != 'M' ].index
 
# Delete these row indexes from dataFrame
df_stations.drop(indexNames , inplace=True)

df_stations['ID'] = df_stations[['Stop Name', 'Daytime Routes', 'Division']].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1)

In [312]:
sta_list = df_stations['ID'].unique().tolist()

sta_list = list(map(lambda x: x.upper(), sta_list))

sta_list = list(map(lambda x: x.split(','), sta_list))

df_station_list = pd.DataFrame(sta_list, columns=['Station', 'Routes', 'Division'])

df_station_list['Routes'] = df_station_list['Routes'].apply(lambda x: set(x.split(' ')))

df_station_list.head()

df_station_list.shape

In [325]:
list(df_tt_list.loc[0])

['59 ST', {'4', '5', '6', 'N', 'Q', 'R', 'W'}, 'BMT']

In [340]:
df_tt_list['geo_sta']=df_tt_list['Station'].apply(lambda x: [])

In [343]:
df_station_list['Matched']=df_station_list['Station'].apply(lambda x: False)

In [335]:
def station_match(tt, station):
    matched = False
    #print(f'tt: {tt}, station: {station}')
    if tt[2] == station[2]:
        #print('pos1')
        if tt[0] == station[0]:
            #print(f'Station: {station} matches with TT: {tt}')
            if station[1].issubset(tt[1]):
                matched = True
                print(f'Station: {station} matches with TT: {tt} in lines {station[1]}')
    return matched

In [356]:
def station_match(tt, station):
    matched = False
    #print(f'tt: {tt}, station: {station}')
    if tt[0] == station[0]:
        #print(f'Station: {station} matches with TT: {tt}')
        if station[1].issubset(tt[1]):
            matched = True
            print(f'Station: {station} matches with TT: {tt} in lines {station[1]}')
    return matched

In [357]:
for i in range(df_station_list.shape[0]):
    for j in range(df_tt_list.shape[0]):
        if station_match(list(df_tt_list.loc[j]), list(df_station_list.loc[i])):
            df_tt_list.loc[j]['geo_sta'].append(i)
            df_station_list.at[i, 'Matched'] = True

Station: ['5 AV/59 ST', {'R', 'N', 'W'}, 'BMT', True] matches with TT: ['5 AV/59 ST', {'R', 'Q', 'N', 'W'}, 'BMT', [1, 1, 1]] in lines {'R', 'N', 'W'}
Station: ['57 ST - 7 AV', {'R', 'Q', 'N', 'W'}, 'BMT', True] matches with TT: ['57 ST - 7 AV', {'R', 'Q', 'N', 'W'}, 'BMT', [2, 2]] in lines {'R', 'Q', 'N', 'W'}
Station: ['49 ST', {'R', 'N', 'W'}, 'BMT', True] matches with TT: ['49 ST', {'R', 'Q', 'N', 'W'}, 'BMT', [3, 3]] in lines {'R', 'N', 'W'}
Station: ['TIMES SQ - 42 ST', {'R', 'Q', 'N', 'W'}, 'BMT', True] matches with TT: ['TIMES SQ - 42 ST', {'1', 'W', 'S', '3', '2', 'C', 'R', '7', 'A', 'E', 'N', 'Q'}, 'BMT', [4, 4]] in lines {'R', 'Q', 'N', 'W'}
Station: ['TIMES SQ - 42 ST', {'R', 'Q', 'N', 'W'}, 'BMT', True] matches with TT: ['TIMES SQ - 42 ST', {'1', 'W', 'S', '3', '2', '7', 'C', 'R', 'A', 'E', 'N', 'Q'}, 'IRT', [98, 146, 147, 98, 146, 147]] in lines {'R', 'Q', 'N', 'W'}
Station: ['34 ST - HERALD SQ', {'R', 'Q', 'N', 'W'}, 'BMT', True] matches with TT: ['34 ST - HERALD SQ', {'

Station: ['42 ST - BRYANT PK', {'F', 'B', 'D', 'M'}, 'IND', True] matches with TT: ['42 ST - BRYANT PK', {'F', 'B', 'D', '7', 'M'}, 'IND', [65, 65]] in lines {'F', 'B', 'D', 'M'}
Station: ['34 ST - HERALD SQ', {'F', 'B', 'D', 'M'}, 'IND', True] matches with TT: ['34 ST - HERALD SQ', {'M', 'W', 'F', 'B', 'D', 'R', 'Q', 'N'}, 'BMT', [5, 5, 5]] in lines {'F', 'B', 'D', 'M'}
Station: ['34 ST - HERALD SQ', {'F', 'B', 'D', 'M'}, 'IND', True] matches with TT: ['34 ST - HERALD SQ', {'M', 'W', 'F', 'B', 'D', 'R', 'Q', 'N'}, 'IND', [66, 66, 5]] in lines {'F', 'B', 'D', 'M'}
Station: ['23 ST', {'F', 'M'}, 'IND', True] matches with TT: ['23 ST', {'F', 'M'}, 'IND', [67, 67]] in lines {'F', 'M'}
Station: ['14 ST', {'F', 'M'}, 'IND', True] matches with TT: ['14 ST', {'1', '3', 'F', '2', 'M', 'L'}, 'IND', [68, 68]] in lines {'F', 'M'}
Station: ['14 ST', {'F', 'M'}, 'IND', True] matches with TT: ['14 ST', {'1', '3', 'F', '2', 'M', 'L'}, 'IRT', [103, 103]] in lines {'F', 'M'}
Station: ['GRAND ST', {'B',

Station: ['SPRING ST', {'6'}, 'IRT', True] matches with TT: ['SPRING ST', {'6'}, 'IRT', [132, 132]] in lines {'6'}
Station: ['CANAL ST', {'6'}, 'IRT', False] matches with TT: ['CANAL ST', {'6', 'W', 'Z', 'J', 'R', 'Q', 'N'}, 'BMT', [11, 12, 19, 11, 12, 19, 11, 12, 19]] in lines {'6'}
Station: ['FULTON ST', {'5', '4'}, 'IRT', True] matches with TT: ['FULTON ST', {'4', 'Z', '5', '3', 'J', '2', 'C', 'A'}, 'BMT', [21, 21, 21, 59, 113]] in lines {'5', '4'}
Station: ['FULTON ST', {'5', '4'}, 'IRT', True] matches with TT: ['FULTON ST', {'4', 'Z', '5', '3', 'J', '2', 'C', 'A'}, 'IND', [59, 59, 21, 59, 113]] in lines {'5', '4'}
Station: ['FULTON ST', {'5', '4'}, 'IRT', True] matches with TT: ['FULTON ST', {'4', '5', 'Z', '3', 'J', '2', 'C', 'A'}, 'IND', [59, 59, 21, 59, 113]] in lines {'5', '4'}
Station: ['FULTON ST', {'5', '4'}, 'IRT', True] matches with TT: ['FULTON ST', {'4', '5', 'Z', '3', 'J', '2', 'C', 'A'}, 'IRT', [113, 135, 113, 135, 21, 59, 113]] in lines {'5', '4'}
Station: ['WALL ST'

In [358]:
df_station_list['Matched'].value_counts()

True     112
False     41
Name: Matched, dtype: int64

In [359]:
df_station_list[~df_station_list['Matched']]

Unnamed: 0,Station,Routes,Division,Matched
0,LEXINGTON AV/59 ST,"{R, N, W}",BMT,False
16,WHITEHALL ST,"{R, W}",BMT,False
17,ESSEX ST,"{Z, J, M}",BMT,False
25,UNION SQ - 14 ST,{L},BMT,False
34,163 ST - AMSTERDAM AV,{C},IND,False
41,CATHEDRAL PKWY (110 ST),"{B, C}",IND,False
45,81 ST - MUSEUM OF NATURAL HISTORY,"{B, C}",IND,False
47,59 ST - COLUMBUS CIRCLE,"{A, B, C, D}",IND,False
49,42 ST - PORT AUTHORITY BUS TERMINAL,"{A, E, C}",IND,False
50,34 ST - PENN STATION,"{A, E, C}",IND,False


In [390]:
# Check the unmatched station one after another and save the result into the manually input dictionary
df_tt_list[df_tt_list['Station'].str.contains('W 4 ST')]

Unnamed: 0,Station,Routes,Division,geo_sta
153,W 4 ST - WASH SQ,"{F, B, C, D, A, E, M}",IND,[]


In [399]:
# df_station_list[~df_station_list['Matched']]['Station'].tolist()

In [396]:
dict_man = {}
dict_man[1]=3

In [397]:
dict_man

{1: 3}

In [398]:
dict_man[1]

3