In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from collections import Counter
import networkx as nx
from scipy import stats

### Reading already parsed datasets

In [24]:
# Load the three pre-parsed trip tables with identical parser options, in the
# order New York, Chicago, Boston.
NY_df, CHI_df, BOS_df = (
    pd.read_csv(dataset_path, sep=',', low_memory=False, header=0, encoding='utf8')
    for dataset_path in (
        'data/newyork_dataset',
        'data/chicago_dataset',
        'data/boston_dataset',
    )
)

Number of stations (nodes) in each dataset

In [25]:
# Report, per city, how many distinct station names appear as trip starts and
# as trip ends.
for _label, _trips in (
    ('NY: # start stations =', NY_df),
    ('CHI: # start stations =', CHI_df),
    ('BOS: # start stations =', BOS_df),
):
    _n_start = _trips['start station name'].nunique()
    _n_end = _trips['end station name'].nunique()
    print(_label, _n_start, '; # end stations =', _n_end)

NY: # start stations = 784 ; # end stations = 791
CHI: # start stations = 571 ; # end stations = 569
BOS: # start stations = 268 ; # end stations = 268


### Network creation

In [26]:
def network_creation(df):
    """Build a directed station graph from a table of trips.

    Each row of ``df`` is read as one trip and contributes a directed edge
    from its 'start station name' to its 'end station name'. ``nx.DiGraph``
    keeps at most one edge per ordered pair, so repeated trips between the
    same two stations collapse into a single edge.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'start station name' and
        'end station name'.

    Returns
    -------
    networkx.DiGraph
        Graph whose nodes are the station names that occur in at least one
        complete trip, inserted in order of first appearance in ``df``.

    Notes
    -----
    Prints a one-line summary of the graph (type, node count, edge count).
    """
    # A trip with a missing endpoint cannot form an edge. Dropping such rows
    # also avoids the phantom 'nan' string node that the previous
    # np.array(list(set(...))) conversion produced when a name was missing.
    trips = df[['start station name', 'end station name']].dropna()

    G = nx.DiGraph()
    # add_edges_from creates missing endpoint nodes itself, so a separate
    # add_nodes_from pass is unnecessary. Skipping the intermediate set also
    # makes the node order deterministic instead of hash-order dependent.
    G.add_edges_from(zip(trips['start station name'], trips['end station name']))

    # nx.info() was removed in NetworkX 3.0; str(G) yields the same
    # "DiGraph with N nodes and M edges" summary on NetworkX >= 2.6.
    print(G)
    return G

In [27]:
# Build one directed station graph per city, in the order New York, Chicago,
# Boston; network_creation prints a summary line for each graph as it is built.
NY_graph, CHI_graph, BOS_graph = (
    network_creation(city_trips) for city_trips in (NY_df, CHI_df, BOS_df)
)

DiGraph with 791 nodes and 187520 edges
DiGraph with 573 nodes and 43057 edges
DiGraph with 268 nodes and 23679 edges


### Node degrees

In [78]:
import operator

def nodes_degrees(G, top_n=5):
    """Annotate a directed graph with node degrees and show the top stations.

    The in-degree, out-degree and total degree of every node are stored on
    ``G`` as the node attributes 'in degree', 'out degree' and 'degree'.
    The same values are collected in a DataFrame, and the ``top_n`` rows
    with the largest value are displayed for each of the three measures.

    Parameters
    ----------
    G : networkx.DiGraph
        Graph to annotate. It is modified in place (node attributes are set).
    top_n : int, optional
        Number of stations to display per ranking. Defaults to 5.

    Returns
    -------
    G : networkx.DiGraph
        The same graph object, now carrying the degree attributes.
    df : pandas.DataFrame
        One row per node with the columns 'Nodes', 'degree', 'in degree'
        and 'out degree'.
    """
    # Materialise the node list once so the DataFrame columns are aligned
    # explicitly by node rather than by the iteration order of several views.
    nodes = list(G.nodes())
    in_degrees = dict(G.in_degree())
    out_degrees = dict(G.out_degree())
    degrees = dict(G.degree())

    nx.set_node_attributes(G, in_degrees, 'in degree')
    nx.set_node_attributes(G, out_degrees, 'out degree')
    nx.set_node_attributes(G, degrees, 'degree')

    df = pd.DataFrame({
        'Nodes': nodes,
        'degree': [degrees[node] for node in nodes],
        'in degree': [in_degrees[node] for node in nodes],
        'out degree': [out_degrees[node] for node in nodes],
    })

    # One ranking per measure; every heading after the first is preceded by
    # a blank line and a leading space, as in the original output.
    for position, column in enumerate(('degree', 'in degree', 'out degree')):
        prefix = '' if position == 0 else '\n '
        print(f'{prefix}Top {top_n} stations by {column}')
        display(df[['Nodes', column]].sort_values(column, ascending=False).head(top_n))
    return G, df

In [79]:
# Attach the degree attributes to the New York graph, show its top stations,
# and keep the per-station degree table.
NY_graph, NY_nodes = nodes_degrees(NY_graph)

Top 5 stations by degree


Unnamed: 0,Nodes,degree
154,E 17 St & Broadway,985
155,Pershing Square North,979
400,Broadway & E 22 St,968
502,Cleveland Pl & Spring St,881
252,Grand St & Elizabeth St,877



 Top 5 stations by in degree


Unnamed: 0,Nodes,in degree
154,E 17 St & Broadway,491
400,Broadway & E 22 St,478
155,Pershing Square North,466
47,1 Ave & E 68 St,456
502,Cleveland Pl & Spring St,450



 Top 5 stations by out degree


Unnamed: 0,Nodes,out degree
155,Pershing Square North,513
154,E 17 St & Broadway,494
400,Broadway & E 22 St,490
252,Grand St & Elizabeth St,444
480,Broadway & E 14 St,436


In [58]:
# Attach the degree attributes to the Chicago graph, show its top stations,
# and keep the per-station degree table.
# NOTE(review): the saved output of this cell (lists of tuples, In [58]) looks
# older than the nodes_degrees definition (In [78]) - re-run to refresh it.
CHI_graph, CHI_nodes = nodes_degrees(CHI_graph)

[('Streeter Dr & Grand Ave', 238), ('Damen Ave & Pierce Ave', 228), ('Daley Center Plaza', 227), ('Clinton St & Madison St', 224), ('Michigan Ave & Oak St', 214), ('Lake Shore Dr & Monroe St', 205), ('Clinton St & Washington Blvd', 205), ('Franklin St & Monroe St', 203), ('Ashland Ave & Division St', 201), ('Theater on the Lake', 200)]
[('Daley Center Plaza', 264), ('Clinton St & Madison St', 249), ('Canal St & Adams St', 246), ('Clinton St & Washington Blvd', 245), ('Franklin St & Monroe St', 242), ('Canal St & Madison St', 241), ('Orleans St & Merchandise Mart Plaza', 232), ('Dearborn St & Monroe St', 223), ('Streeter Dr & Grand Ave', 219), ('Michigan Ave & Washington St', 216)]
[('Daley Center Plaza', 491), ('Clinton St & Madison St', 473), ('Streeter Dr & Grand Ave', 457), ('Clinton St & Washington Blvd', 450), ('Franklin St & Monroe St', 445), ('Canal St & Adams St', 435), ('Damen Ave & Pierce Ave', 434), ('Canal St & Madison St', 417), ('Dearborn St & Monroe St', 416), ('Orleans 

In [59]:
# Attach the degree attributes to the Boston graph, show its top stations,
# and keep the per-station degree table.
# NOTE(review): the saved output of this cell (lists of tuples, In [59]) looks
# older than the nodes_degrees definition (In [78]) - re-run to refresh it.
BOS_graph, BOS_nodes = nodes_degrees(BOS_graph)

[('Harvard Square at Mass Ave/ Dunster', 190), ('Christian Science Plaza - Massachusetts Ave at Westland Ave', 187), ('South Station - 700 Atlantic Ave', 183), ('Back Bay T Stop - Dartmouth St at Stuart St', 180), ('Boylston St at Massachusetts Ave', 179), ('Copley Square - Dartmouth St at Boylston St', 179), ('MIT at Mass Ave / Amherst St', 178), ('Dartmouth St at Newbury St', 178), ('Central Square at Mass Ave / Essex St', 176), ('MIT Stata Center at Vassar St / Main St', 174)]
[('Christian Science Plaza - Massachusetts Ave at Westland Ave', 188), ('MIT at Mass Ave / Amherst St', 187), ('Back Bay T Stop - Dartmouth St at Stuart St', 186), ('Central Square at Mass Ave / Essex St', 182), ('Dartmouth St at Newbury St', 181), ('Boylston St at Massachusetts Ave', 178), ('Harvard Square at Mass Ave/ Dunster', 177), ('Copley Square - Dartmouth St at Boylston St', 176), ('MIT Stata Center at Vassar St / Main St', 176), ('South Station - 700 Atlantic Ave', 175)]
[('Christian Science Plaza - M