In [1]:
# Importing packages
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

# MTA Subway Graph Representation and Querying

## Data Preprocessing

Set `RIDES_FILENAME` in the cell below to the name of the MTA rides data file you downloaded from the Google Drive link.

In [6]:
# Name of the rides CSV inside ../data/ (the file downloaded from the Google
# Drive link). Change this value if your download has a different name.
RIDES_FILENAME = "mta_rides.csv"

Load in ride and station data.

In [7]:
# Ride records come from the configurable file name; the station reference
# table has a fixed name. Both live in ../data/.
rides = pd.read_csv('../data/{}'.format(RIDES_FILENAME))
stations = pd.read_csv('../data/mta_stations.csv')

`stations_borough` is a DataFrame with each unique station complex ID and its corresponding borough.

In [16]:
# Keep only the complex ID and borough columns, then collapse repeated
# (Complex ID, Borough) rows into one.
stations_borough = stations[['Complex ID', 'Borough']].drop_duplicates()


In [17]:
stations_borough

Unnamed: 0,Complex ID,Borough
0,1,Q
1,2,Q
2,3,Q
3,4,Q
4,5,Q
...,...,...
491,517,SI
492,518,SI
493,519,SI
494,522,SI


`rides_complete` is a DataFrame where each row is a trip defined by its origin and destination stations together with the day of week and hour of day. It contains trip information like the day of week, hour of day, origin and destination station complex data (ID, name, and borough), and estimated average ridership.

A full description of columns:
 - `Origin Borough`: borough of origin station (ex. Bk)
 - `Day of Week`: day of week trip occurs (ex. Monday)
 - `Hour of Day`: hour of day trip occurs (ex. 1 for 1:00AM, 18 for 6:00PM)
 - `Origin Station Complex ID`: unique complex ID of origin station (ex. 26)
 - `Origin Station Complex Name`: name of origin station complex (ex. Grand St (B, D))
 - `Destination Station Complex ID`: unique complex ID of destination station (ex. 71)
 - `Destination Station Complex Name`: name of destination station complex (ex. 8 Av (N))
 - `Estimated Average Ridership`: average number of people taking this trip (ex. 0.55)
 - `Destination Borough`: borough of destination station (ex. M)

In [13]:
# Step 1: attach the origin station's borough to every ride. The right join
# keeps every row of `rides`, including rides whose origin ID has no match.
origin_borough_rides = pd.merge(
    stations_borough,
    rides,
    how='right',
    left_on='Complex ID',
    right_on='Origin Station Complex ID',
)

# Step 2: attach the destination station's borough the same way (left join
# keeps every ride), then drop the join keys and the columns not used later.
# Both frames carry a 'Complex ID' column at this point, so the merge suffixes
# them as 'Complex ID_x' / 'Complex ID_y'.
rides_complete = pd.merge(
    origin_borough_rides.rename(columns={'Borough': 'Origin Borough'}),
    stations_borough,
    how='left',
    left_on='Destination Station Complex ID',
    right_on='Complex ID',
).rename(
    columns={'Borough': 'Destination Borough'}
).drop(
    columns=[
        'Complex ID_x', 'Complex ID_y',
        'Year', 'Month', 'Timestamp',
        'Origin Latitude', 'Origin Longitude',
        'Destination Latitude', 'Destination Longitude',
        'Origin Point', 'Destination Point',
    ]
)

In [14]:
rides_complete

Unnamed: 0,Origin Borough,Day of Week,Hour of Day,Origin Station Complex ID,Origin Station Complex Name,Destination Station Complex ID,Destination Station Complex Name,Estimated Average Ridership,Destination Borough
0,Bk,Monday,1,26,"DeKalb Av (B,Q,R)",355,"Winthrop St (2,5)",0.5556,Bk
1,M,Monday,1,231,"Grand St (B,D)",284,Nassau Av (G),0.3068,Bk
2,M,Monday,1,313,"72 St (1,2,3)",71,8 Av (N),0.3012,Bk
3,M,Monday,1,320,23 St (1),309,103 St (1),0.9000,M
4,M,Monday,1,399,68 St-Hunter College (6),618,"14 St (A,C,E)/8 Av (L)",0.2940,M
...,...,...,...,...,...,...,...,...,...
9169120,Bx,Sunday,12,426,"E 180 St (2,5)",379,Mosholu Pkwy (4),0.3322,Bx
9169121,Bx,Sunday,12,427,"West Farms Sq-E Tremont Av (2,5)",613,"Lexington Av (N,R,W)/59 St (4,5,6)",0.7305,M
9169122,Bk,Sunday,12,32,"36 St (D,N,R)",263,"63 Dr-Rego Park (M,R)",0.3130,Q
9169123,M,Sunday,12,324,Houston St (1),407,Astor Pl (6),0.8602,M


## Graph Representation

Initialize a directed multi-edge graph.

In [15]:
# Directed graph that allows several parallel edges between the same ordered
# pair of stations; the station nodes and ride edges are added in later cells.
G = nx.MultiDiGraph()

Initialize station nodes, each with the following features:
 - `id`: unique ID of node based on station complex ID
 - `name`: name of station complex
 - `borough`: borough station is located in

In [18]:
# Register every station that appears as an origin or as a destination.
# Rows are de-duplicated per station ID first, so add_node runs once per
# station instead of once per ride row. keep='last' mirrors the row-by-row
# behaviour, where a later row overwrote the node's attributes.
station_column_sets = (
    ('Origin Station Complex ID', 'Origin Station Complex Name', 'Origin Borough'),
    ('Destination Station Complex ID', 'Destination Station Complex Name', 'Destination Borough'),
)
for id_col, name_col, borough_col in station_column_sets:
    unique_stations = rides_complete[[id_col, name_col, borough_col]].drop_duplicates(
        subset=id_col, keep='last'
    )
    for station_id, name, borough in zip(
        unique_stations[id_col], unique_stations[name_col], unique_stations[borough_col]
    ):
        G.add_node(station_id, name=name, borough=borough)

Initialize ride edges between stations, each with the following features:
 - `ridership`: estimated ridership by average number of people
 - `day`: day of week
 - `hour`: hour of day

In [19]:
# Start from an edge-free graph so that re-running this cell does not stack a
# second copy of every ride on top of the existing parallel edges (which would
# double every ridership total). Nodes and their attributes are kept.
G.clear_edges()

origin = list(rides_complete['Origin Station Complex ID'])
dest = list(rides_complete['Destination Station Complex ID'])
ridership = list(rides_complete['Estimated Average Ridership'])
dow = list(rides_complete['Day of Week'])
hod = list(rides_complete['Hour of Day'])

# One parallel edge per ride row, tagged with its ridership, day and hour.
for origin_id, dest_id, riders, day, hour in zip(origin, dest, ridership, dow, hod):
    G.add_edge(origin_id, dest_id, ridership=riders, day=day, hour=hour)

## Querying

### Question 1

#### Part (a)

For each borough, find the five origin stations with the highest total estimated ridership.

In [6]:
# Question 1) a.
def q1a(G, top_n=5):
    """Rank the busiest origin stations within each borough.

    A station's score is the summed ``ridership`` of every edge leaving it,
    across all days and hours.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        Ride graph; nodes carry ``name`` and ``borough``, parallel edges
        carry ``ridership``.
    top_n : int, optional
        Number of stations to keep per borough (default 5).

    Returns
    -------
    list of (borough, list of str)
        One entry per borough, in order of first appearance among the nodes,
        with the names of its top stations from busiest to least busy.
        Equal totals are ordered by descending station ID.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # borough order is reproducible; iteration order of a set of strings can
    # differ between interpreter sessions.
    boroughs = list(dict.fromkeys(attrs['borough'] for _, attrs in G.nodes(data=True)))
    borough_top = []
    for borough in boroughs:
        station_ridership_list = []
        for station in G.nodes:
            if G.nodes[station]['borough'] != borough:
                continue
            total_ridership = 0
            # out_edges repeats a pair once per parallel edge; the set visits
            # each (origin, destination) pair exactly once.
            for origin, dest in set(G.out_edges(station)):
                # A single lookup returns every parallel edge of the pair.
                total_ridership += sum(attrs['ridership']
                                       for attrs in G.get_edge_data(origin, dest).values())
            station_ridership_list.append((total_ridership, station))
        # Highest total first; ties fall back to the larger station ID.
        station_ridership_list.sort(reverse=True)
        borough_top.append((borough, [G.nodes[station]['name']
                                      for _, station in station_ridership_list[:top_n]]))
    return borough_top

In [7]:
q1a(G)

[('M',
  ['Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)',
   'Grand Central-42 St (S,4,5,6,7)',
   '34 St-Herald Sq (B,D,F,M,N,Q,R,W)',
   '14 St-Union Sq (L,N,Q,R,W,4,5,6)',
   '34 St-Penn Station (A,C,E)']),
 ('Bk',
  ['Atlantic Av-Barclays Ctr (B,D,N,Q,R,2,3,4,5)',
   'Bedford Av (L)',
   'Jay St-MetroTech (A,C,F,R)',
   'Court St (R)/Borough Hall (2,3,4,5)',
   'Crown Hts-Utica Av (3,4)']),
 ('Bx',
  ['161 St-Yankee Stadium (B,D,4)',
   '3 Av-149 St (2,5)',
   'Parkchester (6)',
   'Fordham Rd (4)',
   'Hunts Point Av (6)']),
 ('Q',
  ['74-Broadway (7)/Jackson Hts-Roosevelt Av (E,F,M,R)',
   'Flushing-Main St (7)',
   '103 St-Corona Plaza (7)',
   'Sutphin Blvd-Archer Av-JFK Airport (E,J,Z)',
   'Junction Blvd (7)'])]

In [8]:
# Question 1) b.
def q1b(G, valid_days=('Monday', 'Tuesday', 'Wednesday'), top_n=5):
    """Return the names of the busiest origin stations on the selected days.

    A station's score is the summed ``ridership`` of its outgoing edges whose
    ``day`` is in ``valid_days``.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        Ride graph; nodes carry ``name``, parallel edges carry ``ridership``
        and ``day``.
    valid_days : iterable of str, optional
        Day names to include (default Monday, Tuesday, Wednesday).
    top_n : int, optional
        Number of stations to return (default 5).

    Returns
    -------
    list of str
        Station names from busiest to least busy. Equal totals are ordered by
        descending station ID; stations without matching rides score 0 and
        can still appear.
    """
    valid_days = frozenset(valid_days)
    station_ridership_list = []
    for station in G.nodes:
        total_ridership = 0
        # out_edges repeats a pair once per parallel edge; the set visits
        # each (origin, destination) pair exactly once.
        for origin, dest in set(G.out_edges(station)):
            # A single lookup returns every parallel edge of the pair.
            total_ridership += sum(attrs['ridership']
                                   for attrs in G.get_edge_data(origin, dest).values()
                                   if attrs['day'] in valid_days)
        station_ridership_list.append((total_ridership, station))
    # Highest total first; ties fall back to the larger station ID.
    station_ridership_list.sort(reverse=True)
    return [G.nodes[station]['name'] for _, station in station_ridership_list[:top_n]]

In [9]:
q1b(G)

['Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)',
 'Grand Central-42 St (S,4,5,6,7)',
 '34 St-Herald Sq (B,D,F,M,N,Q,R,W)',
 '14 St-Union Sq (L,N,Q,R,W,4,5,6)',
 'Fulton St (A,C,J,Z,2,3,4,5)']

In [10]:
# Question 1) c.
def q1c(G, valid_days=('Saturday', 'Sunday'), top_n=5):
    """Return the names of the busiest origin stations on the selected days.

    A station's score is the summed ``ridership`` of its outgoing edges whose
    ``day`` is in ``valid_days``.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        Ride graph; nodes carry ``name``, parallel edges carry ``ridership``
        and ``day``.
    valid_days : iterable of str, optional
        Day names to include (default Saturday and Sunday).
    top_n : int, optional
        Number of stations to return (default 5).

    Returns
    -------
    list of str
        Station names from busiest to least busy. Equal totals are ordered by
        descending station ID; stations without matching rides score 0 and
        can still appear.
    """
    valid_days = frozenset(valid_days)
    scored_stations = []
    for station in G.nodes:
        total_ridership = 0
        # De-duplicate the pairs: out_edges lists a pair once per parallel edge.
        for origin, dest in set(G.out_edges(station)):
            # One lookup yields all parallel edges between the two stations.
            total_ridership += sum(attrs['ridership']
                                   for attrs in G.get_edge_data(origin, dest).values()
                                   if attrs['day'] in valid_days)
        scored_stations.append((total_ridership, station))
    # Highest total first; ties fall back to the larger station ID.
    scored_stations.sort(reverse=True)
    return [G.nodes[station]['name'] for _, station in scored_stations[:top_n]]

In [11]:
q1c(G)

['Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)',
 '34 St-Herald Sq (B,D,F,M,N,Q,R,W)',
 'Grand Central-42 St (S,4,5,6,7)',
 '14 St-Union Sq (L,N,Q,R,W,4,5,6)',
 '34 St-Penn Station (A,C,E)']

In [12]:
# Question 1) d.
def q1d(G, valid_hours=range(1, 6), top_n=5):
    """Return the names of the busiest origin stations in an hour window.

    A station's score is the summed ``ridership`` of its outgoing edges whose
    ``hour`` is in ``valid_hours``.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        Ride graph; nodes carry ``name``, parallel edges carry ``ridership``
        and ``hour``.
    valid_hours : iterable of int, optional
        Hours of day to include (default 1 through 5).
    top_n : int, optional
        Number of stations to return (default 5).

    Returns
    -------
    list of str
        Station names from busiest to least busy. Equal totals are ordered by
        descending station ID; stations without matching rides score 0 and
        can still appear.
    """
    # A frozenset gives constant-time membership tests inside the edge loop.
    valid_hours = frozenset(valid_hours)
    station_ridership_list = []
    for station in G.nodes:
        total_ridership = 0
        # out_edges repeats a pair once per parallel edge; the set visits
        # each (origin, destination) pair exactly once.
        for origin, dest in set(G.out_edges(station)):
            # A single lookup returns every parallel edge of the pair.
            total_ridership += sum(attrs['ridership']
                                   for attrs in G.get_edge_data(origin, dest).values()
                                   if attrs['hour'] in valid_hours)
        station_ridership_list.append((total_ridership, station))
    # Highest total first; ties fall back to the larger station ID.
    station_ridership_list.sort(reverse=True)
    return [G.nodes[station]['name'] for _, station in station_ridership_list[:top_n]]

In [13]:
q1d(G)

['Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)',
 '74-Broadway (7)/Jackson Hts-Roosevelt Av (E,F,M,R)',
 'Flushing-Main St (7)',
 '103 St-Corona Plaza (7)',
 'Jamaica Center-Parsons/Archer (E,J,Z)']

In [14]:
# Question 1) e.
def q1e(G, valid_hours=range(6, 10), top_n=5):
    """Return the names of the busiest origin stations in an hour window.

    A station's score is the summed ``ridership`` of its outgoing edges whose
    ``hour`` is in ``valid_hours``.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        Ride graph; nodes carry ``name``, parallel edges carry ``ridership``
        and ``hour``.
    valid_hours : iterable of int, optional
        Hours of day to include (default 6 through 9).
    top_n : int, optional
        Number of stations to return (default 5).

    Returns
    -------
    list of str
        Station names from busiest to least busy. Equal totals are ordered by
        descending station ID; stations without matching rides score 0 and
        can still appear.
    """
    # A frozenset gives constant-time membership tests inside the edge loop.
    valid_hours = frozenset(valid_hours)
    scored_stations = []
    for station in G.nodes:
        total_ridership = 0
        # De-duplicate the pairs: out_edges lists a pair once per parallel edge.
        for origin, dest in set(G.out_edges(station)):
            # One lookup yields all parallel edges between the two stations.
            total_ridership += sum(attrs['ridership']
                                   for attrs in G.get_edge_data(origin, dest).values()
                                   if attrs['hour'] in valid_hours)
        scored_stations.append((total_ridership, station))
    # Highest total first; ties fall back to the larger station ID.
    scored_stations.sort(reverse=True)
    return [G.nodes[station]['name'] for _, station in scored_stations[:top_n]]

In [15]:
q1e(G)

['Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)',
 'Grand Central-42 St (S,4,5,6,7)',
 '74-Broadway (7)/Jackson Hts-Roosevelt Av (E,F,M,R)',
 '34 St-Penn Station (1,2,3)',
 'Flushing-Main St (7)']

### Question 2

In [16]:
# Question 2) a.
def q2a(G, top_n=5):
    """Rank the busiest destination stations within each borough.

    A station's score is the summed ``ridership`` of every edge arriving at
    it, across all days and hours.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        Ride graph; nodes carry ``name`` and ``borough``, parallel edges
        carry ``ridership``.
    top_n : int, optional
        Number of stations to keep per borough (default 5).

    Returns
    -------
    list of (borough, list of str)
        One entry per borough, in order of first appearance among the nodes,
        with the names of its top stations from busiest to least busy.
        Equal totals are ordered by descending station ID.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # borough order is reproducible; iteration order of a set of strings can
    # differ between interpreter sessions.
    boroughs = list(dict.fromkeys(attrs['borough'] for _, attrs in G.nodes(data=True)))
    borough_top = []
    for borough in boroughs:
        station_ridership_list = []
        for station in G.nodes:
            if G.nodes[station]['borough'] != borough:
                continue
            total_ridership = 0
            # in_edges repeats a pair once per parallel edge; the set visits
            # each (origin, destination) pair exactly once.
            for origin, dest in set(G.in_edges(station)):
                # A single lookup returns every parallel edge of the pair.
                total_ridership += sum(attrs['ridership']
                                       for attrs in G.get_edge_data(origin, dest).values())
            station_ridership_list.append((total_ridership, station))
        # Highest total first; ties fall back to the larger station ID.
        station_ridership_list.sort(reverse=True)
        borough_top.append((borough, [G.nodes[station]['name']
                                      for _, station in station_ridership_list[:top_n]]))
    return borough_top

In [17]:
q2a(G)

[('M',
  ['Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)',
   'Grand Central-42 St (S,4,5,6,7)',
   '34 St-Herald Sq (B,D,F,M,N,Q,R,W)',
   '14 St-Union Sq (L,N,Q,R,W,4,5,6)',
   'Fulton St (A,C,J,Z,2,3,4,5)']),
 ('Bk',
  ['Atlantic Av-Barclays Ctr (B,D,N,Q,R,2,3,4,5)',
   'Bedford Av (L)',
   'Jay St-MetroTech (A,C,F,R)',
   'Court St (R)/Borough Hall (2,3,4,5)',
   'Crown Hts-Utica Av (3,4)']),
 ('Bx',
  ['161 St-Yankee Stadium (B,D,4)',
   '3 Av-149 St (2,5)',
   'Parkchester (6)',
   '149 St-Grand Concourse (2,4,5)',
   'Fordham Rd (4)']),
 ('Q',
  ['74-Broadway (7)/Jackson Hts-Roosevelt Av (E,F,M,R)',
   'Flushing-Main St (7)',
   'Court Sq (E,G,M,7)',
   '103 St-Corona Plaza (7)',
   'Junction Blvd (7)'])]

In [18]:
# Question 2) b.
def q2b(G, valid_days=('Thursday', 'Friday'), top_n=5):
    """Return the names of the busiest destination stations on the selected days.

    A station's score is the summed ``ridership`` of its incoming edges whose
    ``day`` is in ``valid_days``.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        Ride graph; nodes carry ``name``, parallel edges carry ``ridership``
        and ``day``.
    valid_days : iterable of str, optional
        Day names to include (default Thursday and Friday).
    top_n : int, optional
        Number of stations to return (default 5).

    Returns
    -------
    list of str
        Station names from busiest to least busy. Equal totals are ordered by
        descending station ID; stations without matching rides score 0 and
        can still appear.
    """
    valid_days = frozenset(valid_days)
    station_ridership_list = []
    for station in G.nodes:
        total_ridership = 0
        # in_edges repeats a pair once per parallel edge; the set visits
        # each (origin, destination) pair exactly once.
        for origin, dest in set(G.in_edges(station)):
            # A single lookup returns every parallel edge of the pair.
            total_ridership += sum(attrs['ridership']
                                   for attrs in G.get_edge_data(origin, dest).values()
                                   if attrs['day'] in valid_days)
        station_ridership_list.append((total_ridership, station))
    # Highest total first; ties fall back to the larger station ID.
    station_ridership_list.sort(reverse=True)
    return [G.nodes[station]['name'] for _, station in station_ridership_list[:top_n]]

In [19]:
q2b(G)

['Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)',
 'Grand Central-42 St (S,4,5,6,7)',
 '34 St-Herald Sq (B,D,F,M,N,Q,R,W)',
 '14 St-Union Sq (L,N,Q,R,W,4,5,6)',
 'Fulton St (A,C,J,Z,2,3,4,5)']

In [20]:
# Question 2) c.
def q2c(G, valid_days=('Saturday',), top_n=5):
    """Return the names of the busiest destination stations on the selected days.

    A station's score is the summed ``ridership`` of its incoming edges whose
    ``day`` is in ``valid_days``.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        Ride graph; nodes carry ``name``, parallel edges carry ``ridership``
        and ``day``.
    valid_days : iterable of str, optional
        Day names to include (default Saturday only).
    top_n : int, optional
        Number of stations to return (default 5).

    Returns
    -------
    list of str
        Station names from busiest to least busy. Equal totals are ordered by
        descending station ID; stations without matching rides score 0 and
        can still appear.
    """
    valid_days = frozenset(valid_days)
    scored_stations = []
    for station in G.nodes:
        total_ridership = 0
        # De-duplicate the pairs: in_edges lists a pair once per parallel edge.
        for origin, dest in set(G.in_edges(station)):
            # One lookup yields all parallel edges between the two stations.
            total_ridership += sum(attrs['ridership']
                                   for attrs in G.get_edge_data(origin, dest).values()
                                   if attrs['day'] in valid_days)
        scored_stations.append((total_ridership, station))
    # Highest total first; ties fall back to the larger station ID.
    scored_stations.sort(reverse=True)
    return [G.nodes[station]['name'] for _, station in scored_stations[:top_n]]

In [21]:
q2c(G)

['Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)',
 '34 St-Herald Sq (B,D,F,M,N,Q,R,W)',
 '14 St-Union Sq (L,N,Q,R,W,4,5,6)',
 'Grand Central-42 St (S,4,5,6,7)',
 '34 St-Penn Station (A,C,E)']

In [22]:
# Question 2) d.
def q2d(G, valid_hours=range(0, 6), top_n=5):
    """Return the names of the busiest destination stations in an hour window.

    A station's score is the summed ``ridership`` of its incoming edges whose
    ``hour`` is in ``valid_hours``.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        Ride graph; nodes carry ``name``, parallel edges carry ``ridership``
        and ``hour``.
    valid_hours : iterable of int, optional
        Hours of day to include (default 0 through 5).
    top_n : int, optional
        Number of stations to return (default 5).

    Returns
    -------
    list of str
        Station names from busiest to least busy. Equal totals are ordered by
        descending station ID; stations without matching rides score 0 and
        can still appear.
    """
    # A frozenset gives constant-time membership tests inside the edge loop.
    valid_hours = frozenset(valid_hours)
    station_ridership_list = []
    for station in G.nodes:
        total_ridership = 0
        # in_edges repeats a pair once per parallel edge; the set visits
        # each (origin, destination) pair exactly once.
        for origin, dest in set(G.in_edges(station)):
            # A single lookup returns every parallel edge of the pair.
            total_ridership += sum(attrs['ridership']
                                   for attrs in G.get_edge_data(origin, dest).values()
                                   if attrs['hour'] in valid_hours)
        station_ridership_list.append((total_ridership, station))
    # Highest total first; ties fall back to the larger station ID.
    station_ridership_list.sort(reverse=True)
    return [G.nodes[station]['name'] for _, station in station_ridership_list[:top_n]]

In [23]:
q2d(G)

['Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)',
 'Grand Central-42 St (S,4,5,6,7)',
 '34 St-Herald Sq (B,D,F,M,N,Q,R,W)',
 '74-Broadway (7)/Jackson Hts-Roosevelt Av (E,F,M,R)',
 'Fulton St (A,C,J,Z,2,3,4,5)']

In [24]:
# Question 2) e.
def q2e(G, valid_hours=range(18, 22), top_n=5):
    """Return the names of the busiest destination stations in an hour window.

    A station's score is the summed ``ridership`` of its incoming edges whose
    ``hour`` is in ``valid_hours``.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        Ride graph; nodes carry ``name``, parallel edges carry ``ridership``
        and ``hour``.
    valid_hours : iterable of int, optional
        Hours of day to include (default 18 through 21).
    top_n : int, optional
        Number of stations to return (default 5).

    Returns
    -------
    list of str
        Station names from busiest to least busy. Equal totals are ordered by
        descending station ID; stations without matching rides score 0 and
        can still appear.
    """
    # A frozenset gives constant-time membership tests inside the edge loop.
    valid_hours = frozenset(valid_hours)
    scored_stations = []
    for station in G.nodes:
        total_ridership = 0
        # De-duplicate the pairs: in_edges lists a pair once per parallel edge.
        for origin, dest in set(G.in_edges(station)):
            # One lookup yields all parallel edges between the two stations.
            total_ridership += sum(attrs['ridership']
                                   for attrs in G.get_edge_data(origin, dest).values()
                                   if attrs['hour'] in valid_hours)
        scored_stations.append((total_ridership, station))
    # Highest total first; ties fall back to the larger station ID.
    scored_stations.sort(reverse=True)
    return [G.nodes[station]['name'] for _, station in scored_stations[:top_n]]

In [25]:
q2e(G)

['Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)',
 'Grand Central-42 St (S,4,5,6,7)',
 '34 St-Herald Sq (B,D,F,M,N,Q,R,W)',
 '74-Broadway (7)/Jackson Hts-Roosevelt Av (E,F,M,R)',
 '14 St-Union Sq (L,N,Q,R,W,4,5,6)']

### Question 3

In [26]:
# Question 3) a.
def q3a(G, valid_days=('Monday',), valid_hours=range(13, 15), top_n=10):
    """Return the busiest (origin, destination) station pairs in a time window.

    A pair's score is the summed ``ridership`` of its parallel edges whose
    ``day`` is in ``valid_days`` and whose ``hour`` is in ``valid_hours``.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        Ride graph; nodes carry ``name``, parallel edges carry ``ridership``,
        ``day`` and ``hour``.
    valid_days : iterable of str, optional
        Day names to include (default Monday only).
    valid_hours : iterable of int, optional
        Hours of day to include (default 13 and 14).
    top_n : int, optional
        Number of pairs to return (default 10).

    Returns
    -------
    list of (str, str)
        ``(origin name, destination name)`` tuples from busiest to least
        busy. Equal totals are ordered by descending (origin ID, destination
        ID); pairs without matching rides score 0 and can still appear.
    """
    valid_days = frozenset(valid_days)
    valid_hours = frozenset(valid_hours)
    station_pairs_ridership_list = []
    for station in G.nodes:
        # Pairs are taken from the outgoing edges, so `station` is the origin.
        # out_edges repeats a pair once per parallel edge; the set visits each
        # pair exactly once.
        for pair in set(G.out_edges(station)):
            # A single lookup returns every parallel edge of the pair.
            total_ridership = sum(attrs['ridership']
                                  for attrs in G.get_edge_data(*pair).values()
                                  if attrs['hour'] in valid_hours and attrs['day'] in valid_days)
            station_pairs_ridership_list.append((total_ridership, pair))
    # Highest total first; ties fall back to the larger (origin, destination) IDs.
    station_pairs_ridership_list.sort(reverse=True)
    return [(G.nodes[origin]['name'], G.nodes[dest]['name'])
            for _, (origin, dest) in station_pairs_ridership_list[:top_n]]

In [27]:
q3a(G)

[('Grand Central-42 St (S,4,5,6,7)',
  'Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)'),
 ('Flushing-Main St (7)', '103 St-Corona Plaza (7)'),
 ('Fulton St (A,C,J,Z,2,3,4,5)', 'Grand Central-42 St (S,4,5,6,7)'),
 ('Flushing-Main St (7)', 'Junction Blvd (7)'),
 ('Grand Central-42 St (S,4,5,6,7)', '14 St-Union Sq (L,N,Q,R,W,4,5,6)'),
 ('Flushing-Main St (7)',
  '74-Broadway (7)/Jackson Hts-Roosevelt Av (E,F,M,R)'),
 ('14 St-Union Sq (L,N,Q,R,W,4,5,6)', 'Grand Central-42 St (S,4,5,6,7)'),
 ('Junction Blvd (7)', 'Flushing-Main St (7)'),
 ('Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)',
  'Grand Central-42 St (S,4,5,6,7)'),
 ('Grand Central-42 St (S,4,5,6,7)', 'Fulton St (A,C,J,Z,2,3,4,5)')]

In [28]:
# NOTE(review): repeats the q3a(G) call from the previous cell and shows the same output; this cell can be removed.
q3a(G)

[('Grand Central-42 St (S,4,5,6,7)',
  'Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)'),
 ('Flushing-Main St (7)', '103 St-Corona Plaza (7)'),
 ('Fulton St (A,C,J,Z,2,3,4,5)', 'Grand Central-42 St (S,4,5,6,7)'),
 ('Flushing-Main St (7)', 'Junction Blvd (7)'),
 ('Grand Central-42 St (S,4,5,6,7)', '14 St-Union Sq (L,N,Q,R,W,4,5,6)'),
 ('Flushing-Main St (7)',
  '74-Broadway (7)/Jackson Hts-Roosevelt Av (E,F,M,R)'),
 ('14 St-Union Sq (L,N,Q,R,W,4,5,6)', 'Grand Central-42 St (S,4,5,6,7)'),
 ('Junction Blvd (7)', 'Flushing-Main St (7)'),
 ('Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)',
  'Grand Central-42 St (S,4,5,6,7)'),
 ('Grand Central-42 St (S,4,5,6,7)', 'Fulton St (A,C,J,Z,2,3,4,5)')]

In [29]:
# Question 3) b.
def q3b(G, valid_days=('Friday',), valid_hours=range(18, 22), valid_borough='Q', top_n=10):
    """Return the busiest station pairs inside one borough in a time window.

    Only pairs whose origin and destination are both in ``valid_borough`` are
    considered. A pair's score is the summed ``ridership`` of its parallel
    edges whose ``day`` is in ``valid_days`` and ``hour`` is in ``valid_hours``.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        Ride graph; nodes carry ``name`` and ``borough``, parallel edges carry
        ``ridership``, ``day`` and ``hour``.
    valid_days : iterable of str, optional
        Day names to include (default Friday only).
    valid_hours : iterable of int, optional
        Hours of day to include (default 18 through 21).
    valid_borough : str, optional
        Borough both stations must belong to (default ``'Q'``).
    top_n : int, optional
        Number of pairs to return (default 10).

    Returns
    -------
    list of (str, str)
        ``(origin name, destination name)`` tuples from busiest to least
        busy. Equal totals are ordered by descending (origin ID, destination
        ID); pairs without matching rides score 0 and can still appear.
    """
    valid_days = frozenset(valid_days)
    valid_hours = frozenset(valid_hours)
    station_pairs_ridership_list = []
    for station in G.nodes:
        # The origin of every outgoing edge is `station`, so its borough is
        # checked once here rather than once per edge.
        if G.nodes[station]['borough'] != valid_borough:
            continue
        # out_edges repeats a pair once per parallel edge; the set visits each
        # pair exactly once.
        for pair in set(G.out_edges(station)):
            if G.nodes[pair[1]]['borough'] != valid_borough:
                continue
            # A single lookup returns every parallel edge of the pair.
            total_ridership = sum(attrs['ridership']
                                  for attrs in G.get_edge_data(*pair).values()
                                  if attrs['hour'] in valid_hours and attrs['day'] in valid_days)
            station_pairs_ridership_list.append((total_ridership, pair))
    # Highest total first; ties fall back to the larger (origin, destination) IDs.
    station_pairs_ridership_list.sort(reverse=True)
    return [(G.nodes[origin]['name'], G.nodes[dest]['name'])
            for _, (origin, dest) in station_pairs_ridership_list[:top_n]]

In [30]:
q3b(G)

[('Flushing-Main St (7)',
  '74-Broadway (7)/Jackson Hts-Roosevelt Av (E,F,M,R)'),
 ('Flushing-Main St (7)', '103 St-Corona Plaza (7)'),
 ('Flushing-Main St (7)', 'Junction Blvd (7)'),
 ('Junction Blvd (7)', 'Flushing-Main St (7)'),
 ('Flushing-Main St (7)', '90 St-Elmhurst Av (7)'),
 ('74-Broadway (7)/Jackson Hts-Roosevelt Av (E,F,M,R)',
  'Flushing-Main St (7)'),
 ('103 St-Corona Plaza (7)', 'Flushing-Main St (7)'),
 ('Flushing-Main St (7)', '111 St (7)'),
 ('82 St-Jackson Hts (7)', 'Flushing-Main St (7)'),
 ('74-Broadway (7)/Jackson Hts-Roosevelt Av (E,F,M,R)', 'Jamaica-179 St (F)')]

In [31]:
# Question 3) c.
def q3c(G, valid_hours=range(1, 6), valid_borough='Bk', top_n=10):
    """Return the busiest station pairs inside one borough in an hour window.

    Only pairs whose origin and destination are both in ``valid_borough`` are
    considered. A pair's score is the summed ``ridership`` of its parallel
    edges whose ``hour`` is in ``valid_hours``, on any day.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        Ride graph; nodes carry ``name`` and ``borough``, parallel edges carry
        ``ridership`` and ``hour``.
    valid_hours : iterable of int, optional
        Hours of day to include (default 1 through 5).
    valid_borough : str, optional
        Borough both stations must belong to (default ``'Bk'``).
    top_n : int, optional
        Number of pairs to return (default 10).

    Returns
    -------
    list of (str, str)
        ``(origin name, destination name)`` tuples from busiest to least
        busy. Equal totals are ordered by descending (origin ID, destination
        ID); pairs without matching rides score 0 and can still appear.
    """
    valid_hours = frozenset(valid_hours)
    station_pairs_ridership_list = []
    for station in G.nodes:
        # The origin of every outgoing edge is `station`, so its borough is
        # checked once here rather than once per edge.
        if G.nodes[station]['borough'] != valid_borough:
            continue
        # out_edges repeats a pair once per parallel edge; the set visits each
        # pair exactly once.
        for pair in set(G.out_edges(station)):
            if G.nodes[pair[1]]['borough'] != valid_borough:
                continue
            # A single lookup returns every parallel edge of the pair.
            total_ridership = sum(attrs['ridership']
                                  for attrs in G.get_edge_data(*pair).values()
                                  if attrs['hour'] in valid_hours)
            station_pairs_ridership_list.append((total_ridership, pair))
    # Highest total first; ties fall back to the larger (origin, destination) IDs.
    station_pairs_ridership_list.sort(reverse=True)
    return [(G.nodes[origin]['name'], G.nodes[dest]['name'])
            for _, (origin, dest) in station_pairs_ridership_list[:top_n]]

In [32]:
q3c(G)

[('Crown Hts-Utica Av (3,4)', 'Atlantic Av-Barclays Ctr (B,D,N,Q,R,2,3,4,5)'),
 ('Flatbush Av-Brooklyn College (2,5)',
  'Atlantic Av-Barclays Ctr (B,D,N,Q,R,2,3,4,5)'),
 ('Bedford Av (L)', 'Myrtle-Wyckoff Avs (L,M)'),
 ('Crown Hts-Utica Av (3,4)', 'Court St (R)/Borough Hall (2,3,4,5)'),
 ('Myrtle-Wyckoff Avs (L,M)', 'Bedford Av (L)'),
 ('Crown Hts-Utica Av (3,4)', 'Nevins St (2,3,4,5)'),
 ('Euclid Av (A,C)', 'Jay St-MetroTech (A,C,F,R)'),
 ('Atlantic Av-Barclays Ctr (B,D,N,Q,R,2,3,4,5)', '36 St (D,N,R)'),
 ('Bedford Av (L)', 'DeKalb Av (L)'),
 ('Lorimer St (L)/Metropolitan Av (G)', 'Myrtle-Wyckoff Avs (L,M)')]

In [33]:
# Question 3) d.
def q3d(G, valid_days=('Monday', 'Tuesday', 'Wednesday', 'Thursday'),
        valid_hours=range(6, 8), origin_borough='Bk', dest_borough='M', top_n=10):
    """Return the busiest station pairs from one borough to another in a time window.

    Only pairs whose origin is in ``origin_borough`` and whose destination is
    in ``dest_borough`` are considered. A pair's score is the summed
    ``ridership`` of its parallel edges whose ``day`` is in ``valid_days`` and
    ``hour`` is in ``valid_hours``.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        Ride graph; nodes carry ``name`` and ``borough``, parallel edges carry
        ``ridership``, ``day`` and ``hour``.
    valid_days : iterable of str, optional
        Day names to include (default Monday through Thursday).
    valid_hours : iterable of int, optional
        Hours of day to include (default 6 and 7).
    origin_borough : str, optional
        Borough of the origin station (default ``'Bk'``).
    dest_borough : str, optional
        Borough of the destination station (default ``'M'``).
    top_n : int, optional
        Number of pairs to return (default 10).

    Returns
    -------
    list of (str, str)
        ``(origin name, destination name)`` tuples from busiest to least
        busy. Equal totals are ordered by descending (origin ID, destination
        ID); pairs without matching rides score 0 and can still appear.
    """
    valid_days = frozenset(valid_days)
    valid_hours = frozenset(valid_hours)
    station_pairs_ridership_list = []
    for station in G.nodes:
        # The origin of every outgoing edge is `station`, so its borough is
        # checked once here rather than once per edge.
        if G.nodes[station]['borough'] != origin_borough:
            continue
        # out_edges repeats a pair once per parallel edge; the set visits each
        # pair exactly once.
        for pair in set(G.out_edges(station)):
            if G.nodes[pair[1]]['borough'] != dest_borough:
                continue
            # A single lookup returns every parallel edge of the pair.
            total_ridership = sum(attrs['ridership']
                                  for attrs in G.get_edge_data(*pair).values()
                                  if attrs['hour'] in valid_hours and attrs['day'] in valid_days)
            station_pairs_ridership_list.append((total_ridership, pair))
    # Highest total first; ties fall back to the larger (origin, destination) IDs.
    station_pairs_ridership_list.sort(reverse=True)
    return [(G.nodes[origin]['name'], G.nodes[dest]['name'])
            for _, (origin, dest) in station_pairs_ridership_list[:top_n]]

In [34]:
q3d(G)

[('Atlantic Av-Barclays Ctr (B,D,N,Q,R,2,3,4,5)', 'Bowling Green (4,5)'),
 ('Crown Hts-Utica Av (3,4)', 'Grand Central-42 St (S,4,5,6,7)'),
 ('Kings Hwy (B,Q)', '34 St-Herald Sq (B,D,F,M,N,Q,R,W)'),
 ('Court St (R)/Borough Hall (2,3,4,5)', 'Grand Central-42 St (S,4,5,6,7)'),
 ('Flatbush Av-Brooklyn College (2,5)', 'Grand Central-42 St (S,4,5,6,7)'),
 ('Crown Hts-Utica Av (3,4)', 'Fulton St (A,C,J,Z,2,3,4,5)'),
 ('Flatbush Av-Brooklyn College (2,5)', 'Fulton St (A,C,J,Z,2,3,4,5)'),
 ('Bedford Av (L)', 'Grand Central-42 St (S,4,5,6,7)'),
 ('Kings Hwy (B,Q)', '47-50 Sts-Rockefeller Ctr (B,D,F,M)'),
 ('Sheepshead Bay (B,Q)', '47-50 Sts-Rockefeller Ctr (B,D,F,M)')]

In [35]:
# Question 3) e.
def q3e(G, valid_days=('Monday', 'Tuesday', 'Wednesday', 'Thursday'),
        valid_hours=range(6, 8), origin_borough='Bx', dest_borough='M', top_n=10):
    """Return the busiest station pairs from one borough to another in a time window.

    Only pairs whose origin is in ``origin_borough`` and whose destination is
    in ``dest_borough`` are considered. A pair's score is the summed
    ``ridership`` of its parallel edges whose ``day`` is in ``valid_days`` and
    ``hour`` is in ``valid_hours``.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        Ride graph; nodes carry ``name`` and ``borough``, parallel edges carry
        ``ridership``, ``day`` and ``hour``.
    valid_days : iterable of str, optional
        Day names to include (default Monday through Thursday).
    valid_hours : iterable of int, optional
        Hours of day to include (default 6 and 7).
    origin_borough : str, optional
        Borough of the origin station (default ``'Bx'``).
    dest_borough : str, optional
        Borough of the destination station (default ``'M'``).
    top_n : int, optional
        Number of pairs to return (default 10).

    Returns
    -------
    list of (str, str)
        ``(origin name, destination name)`` tuples from busiest to least
        busy. Equal totals are ordered by descending (origin ID, destination
        ID); pairs without matching rides score 0 and can still appear.
    """
    valid_days = frozenset(valid_days)
    valid_hours = frozenset(valid_hours)
    scored_pairs = []
    for station in G.nodes:
        # Every outgoing edge starts at `station`: test its borough only once.
        if G.nodes[station]['borough'] != origin_borough:
            continue
        # De-duplicate the pairs: out_edges lists a pair once per parallel edge.
        for pair in set(G.out_edges(station)):
            if G.nodes[pair[1]]['borough'] != dest_borough:
                continue
            # One lookup yields all parallel edges between the two stations.
            total_ridership = sum(attrs['ridership']
                                  for attrs in G.get_edge_data(*pair).values()
                                  if attrs['hour'] in valid_hours and attrs['day'] in valid_days)
            scored_pairs.append((total_ridership, pair))
    # Highest total first; ties fall back to the larger (origin, destination) IDs.
    scored_pairs.sort(reverse=True)
    return [(G.nodes[origin]['name'], G.nodes[dest]['name'])
            for _, (origin, dest) in scored_pairs[:top_n]]

In [36]:
q3e(G)

[('Parkchester (6)', 'Grand Central-42 St (S,4,5,6,7)'),
 ('Parkchester (6)', '14 St-Union Sq (L,N,Q,R,W,4,5,6)'),
 ('Parkchester (6)', '125 St (4,5,6)'),
 ('Parkchester (6)', '68 St-Hunter College (6)'),
 ('Parkchester (6)', '86 St (4,5,6)'),
 ('Parkchester (6)', 'Lexington Av-53 St (E,M)/51 St (6)'),
 ('Parkchester (6)', 'Fulton St (A,C,J,Z,2,3,4,5)'),
 ('Parkchester (6)', 'Brooklyn Bridge-City Hall (4,5,6)/Chambers St (J,Z)'),
 ('161 St-Yankee Stadium (B,D,4)', '59 St-Columbus Circle (A,B,C,D,1)'),
 ('Woodlawn (4)', '86 St (4,5,6)')]

In [37]:
# Question 3) f.
def q3f(G, valid_days=('Monday', 'Tuesday', 'Wednesday', 'Thursday'),
        valid_hours=range(6, 8), origin_borough='SI', dest_borough='M', top_n=10):
    """Return the busiest station pairs from one borough to another in a time window.

    Only pairs whose origin is in ``origin_borough`` and whose destination is
    in ``dest_borough`` are considered. A pair's score is the summed
    ``ridership`` of its parallel edges whose ``day`` is in ``valid_days`` and
    ``hour`` is in ``valid_hours``.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        Ride graph; nodes carry ``name`` and ``borough``, parallel edges carry
        ``ridership``, ``day`` and ``hour``.
    valid_days : iterable of str, optional
        Day names to include (default Monday through Thursday).
    valid_hours : iterable of int, optional
        Hours of day to include (default 6 and 7).
    origin_borough : str, optional
        Borough of the origin station (default ``'SI'``).
    dest_borough : str, optional
        Borough of the destination station (default ``'M'``).
    top_n : int, optional
        Number of pairs to return (default 10).

    Returns
    -------
    list of (str, str)
        ``(origin name, destination name)`` tuples from busiest to least
        busy; empty when the graph has no edge from ``origin_borough`` to
        ``dest_borough``. Equal totals are ordered by descending (origin ID,
        destination ID); pairs without matching rides score 0 and can still
        appear.
    """
    valid_days = frozenset(valid_days)
    valid_hours = frozenset(valid_hours)
    scored_pairs = []
    for station in G.nodes:
        # Every outgoing edge starts at `station`: test its borough only once.
        if G.nodes[station]['borough'] != origin_borough:
            continue
        # De-duplicate the pairs: out_edges lists a pair once per parallel edge.
        for pair in set(G.out_edges(station)):
            if G.nodes[pair[1]]['borough'] != dest_borough:
                continue
            # One lookup yields all parallel edges between the two stations.
            total_ridership = sum(attrs['ridership']
                                  for attrs in G.get_edge_data(*pair).values()
                                  if attrs['hour'] in valid_hours and attrs['day'] in valid_days)
            scored_pairs.append((total_ridership, pair))
    # Highest total first; ties fall back to the larger (origin, destination) IDs.
    scored_pairs.sort(reverse=True)
    return [(G.nodes[origin]['name'], G.nodes[dest]['name'])
            for _, (origin, dest) in scored_pairs[:top_n]]

In [38]:
q3f(G)

[]