In [1]:
import networkx as nx
import pandas as pd
import numpy as np
from operator import itemgetter
import itertools
import os
import csv
import datetime

## STRUCTURE

- set sub nodes ( by line at each station )
- set master nodes ( by station )
- link of sub nodes by train
- link of sub nodes by transfer
- link of sub nodes and master nodes by waiting time

### set sub nodes

In [2]:
#Get the station data -
ndata = pd.read_csv("network_setup/sub_node")

In [3]:
#initialize the graph
#G.clear()
G = nx.MultiDiGraph(name='network_setup/NYC Subway Network')

In [4]:
# Register one sub node per row of ndata (a station served by one line),
# copying the row's descriptive columns onto the graph node.
for row in range(len(ndata)):
    sub_node_attrs = dict(
        name=ndata.stop_name[row],
        line=ndata['train'][row],
        lat=ndata.stop_lat[row],
        lng=ndata.stop_lon[row],
        geo=(ndata.stop_lon[row], ndata.stop_lat[row]),  # stored as (lon, lat)
        stop_id=ndata.stop_id[row],
        master_node='No',
        master_node_id=ndata.master_node_id[row],
    )
    G.add_node(ndata.node_id[row], **sub_node_attrs)

In [5]:
#show the nodes in G
G.nodes(data=True)[:3]

[('625_4',
  {'geo': (-73.951070000000001, 40.785671999999998),
   'lat': 40.785671999999998,
   'line': '4',
   'lng': -73.951070000000001,
   'master_node': 'No',
   'master_node_id': '96 St_1',
   'name': '96 St',
   'stop_id': '625'}),
 ('D25_Q',
  {'geo': (-73.972367000000006, 40.677050000000001),
   'lat': 40.677050000000001,
   'line': 'Q',
   'lng': -73.972367000000006,
   'master_node': 'No',
   'master_node_id': '7 Av_1',
   'name': '7 Av',
   'stop_id': 'D25'}),
 ('625_6',
  {'geo': (-73.951070000000001, 40.785671999999998),
   'lat': 40.785671999999998,
   'line': '6',
   'lng': -73.951070000000001,
   'master_node': 'No',
   'master_node_id': '96 St_1',
   'name': '96 St',
   'stop_id': '625'})]

### set master nodes

In [6]:
#get the master node data and attributes
mdata = pd.read_csv("network_setup/master_node")

In [7]:
mdata.head()

Unnamed: 0.1,Unnamed: 0,master_node_id,stop_name,stop_lat,stop_lon,stop_id
0,0,1 Av_0,1 Av,40.730953,-73.981628,L06
1,1,103 St - Corona Plaza_0,103 St - Corona Plaza,40.749865,-73.8627,706
2,2,103 St_0,103 St,40.799446,-73.968379,119
3,3,103 St_1,103 St,40.7906,-73.947478,624
4,4,103 St_2,103 St,40.796092,-73.961454,A18


In [8]:
# Register one master node per row of mdata; it carries only a name and a
# stop id and is flagged with master_node='Yes'.
for row in range(len(mdata)):
    master_attrs = {
        'name': mdata.stop_name[row],
        'master_node': 'Yes',
        'stop_id': mdata.stop_id[row],
    }
    G.add_node(mdata.master_node_id[row], **master_attrs)

In [9]:
G.nodes(data=True)[6:8]

[('138_5',
  {'geo': (-74.012187999999995, 40.711835000000001),
   'lat': 40.711835000000001,
   'line': '5',
   'lng': -74.012187999999995,
   'master_node': 'No',
   'master_node_id': 'Cortlandt St_0',
   'name': 'Cortlandt St',
   'stop_id': '138'}),
 ('Richmond Valley_0',
  {'master_node': 'Yes', 'name': 'Richmond Valley', 'stop_id': 'S13'})]

## link of sub nodes by train

In [10]:
## get edge data
edata = pd.read_csv('network_setup/duration_7-10am_weekday')

In [11]:
edata.head(3)

Unnamed: 0.1,Unnamed: 0,from_stop_id,to_stop_id,train,duration,from_stop_name,from_stop_lat,from_stop_lon,to_stop_name,to_stop_lat,to_stop_lon,from_node,to_node
0,0,101S,103S,1,90.0,Van Cortlandt Park - 242 St,40.889248,-73.898583,238 St,40.884667,-73.90087,101_1,103_1
1,1,103N,101N,1,90.0,238 St,40.884667,-73.90087,Van Cortlandt Park - 242 St,40.889248,-73.898583,103_1,101_1
2,2,103S,104S,1,90.0,238 St,40.884667,-73.90087,231 St,40.878856,-73.904834,103_1,104_1


In [12]:
# Add one directed 'train' edge per row of edata, weighted by its duration - method 1
for row in range(len(edata.from_node)):
    source, target = edata.from_node[row], edata.to_node[row]
    train_edge_attrs = {
        'weight': edata.duration[row],
        ## attributes
        'train': edata.train[row],
        'edge_type': 'train',
    }
    G.add_edge(source, target, **train_edge_attrs)
# # add weighted edges  - method 2
# e= zip(edata.from_node_id,edata.to_node_id,edata.duration)
# G.add_weighted_edges_from(e,train=edata.train)

In [13]:
G.edges(data= True)[:3]

[('136_1', '137_1', {'edge_type': 'train', 'train': '1', 'weight': 80.0}),
 ('136_1', '135_1', {'edge_type': 'train', 'train': '1', 'weight': 60.0}),
 ('138_1', '139_1', {'edge_type': 'train', 'train': '1', 'weight': 90.0})]

### link of sub nodes by transfer

In [14]:
#ADD TRANSFER WITHOUT SWIPE EDGES
tdata = pd.read_csv("network_setup/transfer")
tdata.head(3)

Unnamed: 0.1,Unnamed: 0,from_stop_id,to_stop_id,min_transfer_time,from_stop,to_stop,from_train,to_train,from_node_id,to_node_id
0,0,101,101,180,Van Cortlandt Park - 242 St,Van Cortlandt Park - 242 St,1,1,101_1,101_1
1,1,103,103,180,238 St,238 St,1,1,103_1,103_1
2,2,104,104,180,231 St,231 St,1,1,104_1,104_1


In [15]:
## Add one directed 'transfer' edge per row of tdata, weighted by min_transfer_time
for row in range(len(tdata)):
    transfer_attrs = {
        'weight': tdata.min_transfer_time[row],
        ## attributes
        'from_train': tdata.from_train[row],
        'to_train': tdata.to_train[row],
        'edge_type': 'transfer',
    }
    G.add_edge(tdata.from_node_id[row], tdata.to_node_id[row], **transfer_attrs)
    
# ## method 2- add self loop
# e2= zip(tdata.from_stop_id,tdata.to_stop_id,tdata.min_transfer_time)
# G.add_weighted_edges_from(e)

In [16]:
G.edges(data= True)[3:5]

[('136_1',
  '136_2',
  {'edge_type': 'transfer',
   'from_train': '1',
   'to_train': '2',
   'weight': 180}),
 ('136_1', '137_1', {'edge_type': 'train', 'train': '1', 'weight': 80.0})]

### link of sub nodes and master nodes by waiting time

In [17]:
# add waiting time
wdata = pd.read_csv("network_setup/waiting")
wdata.head(3)

Unnamed: 0.1,Unnamed: 0,stop_id,train,stop_name,stop_lat,stop_lon,node_id,master_node_id,train_num,interval,waiting
0,0,101,1,Van Cortlandt Park - 242 St,40.889248,-73.898583,101_1,Van Cortlandt Park - 242 St_0,29.0,372.413793,186.206897
1,1,103,1,238 St,40.884667,-73.90087,103_1,238 St_0,33.0,327.272727,163.636364
2,2,104,1,231 St,40.878856,-73.904834,104_1,231 St_0,33.0,327.272727,163.636364


In [18]:
## Entering the system: master node -> sub node, weighted by the 'waiting' column
for row in range(len(wdata)):
    waiting_attrs = {
        'weight': wdata.waiting[row],
        ## attributes
        'train': wdata.train[row],
        'stop_id': wdata.stop_id[row],
        'edge_type': 'waiting',
    }
    G.add_edge(wdata.master_node_id[row], wdata.node_id[row], **waiting_attrs)
    
# ## method 2- add self loop
# e2= zip(tdata.from_stop_id,tdata.to_stop_id,tdata.min_transfer_time)
# G.add_weighted_edges_from(e)

In [19]:
## Leaving the system: sub node -> master node, always at zero cost
for row in range(len(wdata)):
    exit_attrs = {
        'weight': 0,
        ## attributes
        'train': wdata.train[row],
        'stop_id': wdata.stop_id[row],
        'edge_type': 'exit',
    }
    G.add_edge(wdata.node_id[row], wdata.master_node_id[row], **exit_attrs)
    

## Network Info

In [20]:
print nx.info(G)

Name: network_setup/NYC Subway Network
Type: MultiDiGraph
Number of nodes: 1378
Number of edges: 5749
Average in degree:   4.1720
Average out degree:   4.1720


In [21]:
# Split the (node, attributes) pairs of G by their 'master_node' flag
node_records = G.nodes(data=True)
master = [(n, attr) for (n, attr) in node_records if attr['master_node'] == 'Yes']
sub = [(n, attr) for (n, attr) in node_records if attr['master_node'] == 'No']
print('number of master nodes: %d' % len(master))
print('number of sub nodes: %d' % len(sub))

number of master nodes: 465
number of sub nodes: 913


In [22]:
# Keep only the node id of every (node id, attributes) pair in master
master_nodes = [record[0] for record in master]

## Centrality

In [23]:
# #top 10 betweenness centrallity nodes
# sorted(nx.betweenness_centrality(G).items(),key=lambda x: x[1],reverse=True)[:10]

In [24]:
# ## top 10 betweenness centrallity nodes
# important_nodes = sorted(nx.betweenness_centrality(G).items(),key=lambda x: x[1],reverse=True)[:10]
# stop_id = map(lambda x: x[0],important_nodes)
# stop_name = ndata[ndata['node_id'].isin(stop_id)]['stop_name'].values
# print zip(stop_id,stop_name)

## REMOVE NODES FROM NETWORK

In [25]:
# di=nx.all_pairs_dijkstra_path_length(G,weight='weight')
# df= pd.DataFrame.from_dict(di)
# df= df.loc[master_nodes,master_nodes]
# df.index.name='to_master_id'
# df.to_csv('whole_network_duration')

In [26]:
# whole_network=pd.read_csv('whole_network_duration_shortest_path.csv',index_col=0)
# whole_network = whole_network.dropna().reset_index(drop= True)
# whole_network.head()

In [27]:
def find_sub_nodes(master_node_id):
    """
    Collect the sub nodes that belong to the given master nodes.

    master_node_id : list (or other iterable) of master node ids, e.g.
        ['Canal St_0', 'Times Sq - 42 St_0'].  A single id passed as a plain
        string is treated as a one-element list.

    Returns the set of node ids in G whose 'master_node_id' attribute is one
    of the requested ids.  Nodes that carry no 'master_node_id' attribute are
    never returned.
    """
    # A bare string would otherwise be matched by substring
    # ('7 Av_1' in '7 Av_10' is True) and silently select the wrong stations.
    if isinstance(master_node_id, (str, type(u''))):
        master_node_id = [master_node_id]
    wanted = set(master_node_id)

    return set(n for (n, attr) in G.nodes(data=True)
               if 'master_node_id' in attr and attr['master_node_id'] in wanted)

In [28]:
def find_affected_od_index(master_node_id):
    """
    Given a list of master node ids, return the row labels of whole_network
    whose comma-separated 'shortest_path' goes through at least one sub node
    of those master nodes.
    """
    blocked = find_sub_nodes(master_node_id)
    return [row for row in range(len(whole_network))
            if not blocked.isdisjoint(whole_network.loc[row, 'shortest_path'].split(','))]

## Remove 2 nodes from the network and calculate the durations

In [29]:
def calculate_duration_remove2nodes(removed_master_node):
    """
    Compute all-pairs shortest path lengths after removing the given stations.

    removed_master_node : list of master node ids to remove, e.g.
        ['Canal St_0', 'Times Sq - 42 St_0']

    Every sub node attached to those master nodes is deleted from a copy of G
    (G itself is left untouched), Dijkstra path lengths are computed on the
    'weight' edge attribute, and the master-to-master part of the result is
    written to 'remove-two-node/<ids joined by ",">'.

    Returns the full {source: {target: length}} mapping for all nodes of the
    reduced graph.
    """
    removed_sub_nodes = find_sub_nodes(removed_master_node)

    G_0 = G.copy()
    G_0.remove_nodes_from(removed_sub_nodes)
    # dict() keeps the result reusable (DataFrame below + return value) even if
    # the library hands back a one-shot iterator of (source, lengths) pairs.
    dict_duration = dict(nx.all_pairs_dijkstra_path_length(G_0, weight='weight'))
    df = pd.DataFrame.from_dict(dict_duration)
    df.index.name = 'to_master_id'

    # A '/' inside a station id (eg: 'Jamaica Center - Parsons/Archer_0') would be
    # read as a directory separator when saving, so it is replaced by '-'.
    filename = ','.join(removed_master_node).replace('/', '-')

    # Create the output folder on first use instead of failing inside to_csv.
    output_dir = 'remove-two-node'
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    df.loc[master_nodes, master_nodes].to_csv(output_dir + '/' + filename)
    return dict_duration


In [30]:
# Smoke test: time one two-station removal end to end
removed_master_node = ['Canal St_0', 'Times Sq - 42 St_0']
t1 = datetime.datetime.now()
calculate_duration_remove2nodes(removed_master_node)
t2 = datetime.datetime.now() - t1
print(t2)

0:00:29.847392


# ! If the elapsed time is printed above, the setup is working

## Find important nodes

#### Find the master nodes that are connected to the most other master nodes
e.g. the Jay St master node is connected to High St by both line A and line C, but that counts as only one degree. Jay St is also connected to York St by line F, which counts as another degree. So we count how many distinct master nodes each master node is connected to, and rank them.

In [None]:
# ## this step filter out all the master nodes pairs, which are next station to each other
# master_nodes_pair={}
# for i in itertools.combinations(master_nodes,2):
#     for path in nx.all_simple_paths(G, source=i[0], target=i[1],cutoff=3):
#         if path:
#             master_nodes_pair[i] = path

In [None]:
# master_nodes_pair= master_nodes_pair.keys()
# master_nodes_pair = list(np.array(master_nodes_pair).flatten())

In [None]:
# occurence = {}
# for i in set(master_nodes_pair):
#     occurence[i] = master_nodes_pair.count(i)

In [None]:
# master_nodes_connection = sorted(occurence.items(),key=lambda x:x[1],reverse=True)

In [None]:
# master_nodes_connection[:10]

In [None]:
# usually the station shall have at least two master nodes connected, -- previous stop and next stop
# station has at least 3
# Disabled like the neighbouring cells: master_nodes_connection is only built in
# commented-out code, so running this line raised a NameError.
# len(filter(lambda x:x[1]>=3, master_nodes_connection))

In [None]:
# usually the station shall have at least two master nodes connected, -- previous stop and next stop
# station has  at least 4
# Disabled like the neighbouring cells: master_nodes_connection is only built in
# commented-out code, so running this line raised a NameError.
# len(filter(lambda x:x[1]>=4, master_nodes_connection))

In [None]:
# #connection_4 is master nodes, which are connected to at least 4 other master nodes
# connection_4=filter(lambda x:x[1]>=4, master_nodes_connection)

In [None]:
# connection_4[:10]

#### Find the master nodes that have the most lines coming into the station

In [None]:
# degree = nx.degree(G)

In [None]:
# degree = filter(lambda (x,y): x in master_nodes,
#        sorted(degree.iteritems(), key=lambda (k,v): (v,k),reverse =True))

In [None]:
#  # if set threshold as at least 3 lines come to the station, we could get 61 stations.
# len(filter(lambda x:x[1]>=6 ,degree))

In [None]:
# degree_6 = filter(lambda x:x[1]>=6 ,degree)

In [None]:
# degree_6[:10]

In [None]:
# So if combining two definition of degree, will get 81 important stations
# len(set(map(lambda x:x[0],connection_4)+map(lambda x:x[0],degree_6)))

#### Find the master nodes with the top 50 criticality in the one-node removal results

In [None]:
# delay = pd.read_csv('delay_one_node_removal.csv',index_col=0)
# critical_nodes =delay.head(50)['station'].values

In [None]:
# So if combining 3 concepts of important stations together ,will get 109 nodes
# len(set(map(lambda x:x[0],connection_4)+map(lambda x:x[0],degree_6)+ list(critical_nodes)))

In [None]:
# important_nodes = list(set(map(lambda x:x[0],connection_4)+map(lambda x:x[0],degree_6)+ list(critical_nodes)))

## Get the additional pairs to run and split them into 5 files

In [None]:
# all_pairs=[]
# for pair in itertools.combinations(important_nodes,2):
#     all_pairs.append(pair)

In [None]:
# len(all_pairs)

In [None]:
# already_run_pairs=[]
# for pair in itertools.combinations(critical_nodes,2):
#     already_run_pairs.append(pair)

In [None]:
# len(already_run_pairs)

In [None]:
# need_run_pairs=[]
# for x,y in all_pairs:
#     if ((x,y) in already_run_pairs)or ((y,x) in already_run_pairs):
#         pass
#     else:
#         need_run_pairs.append((x,y))

In [None]:
## there are 4661 pairs needed to be run more
# len(need_run_pairs)

In [None]:
# pd.DataFrame(need_run_pairs[:930]).to_csv('pairs_0')

In [None]:
# i=1
# pd.DataFrame(need_run_pairs[930*i:930*(i+1)]).to_csv('pairs_'+str(i))

In [None]:
# i=2
# pd.DataFrame(need_run_pairs[930*i:930*(i+1)]).to_csv('pairs_'+str(i))

In [None]:
# i=3
# pd.DataFrame(need_run_pairs[930*i:930*(i+1)]).to_csv('pairs_'+str(i))

In [None]:
# i=4
# pd.DataFrame(need_run_pairs[930*i:]).to_csv('pairs_'+str(i))

## Finally, we can run the data now! Change the file name below and let it run overnight

In [None]:
df = pd.read_csv('pairs_0')  ## input your file
df = df.drop('Unnamed: 0', axis=1)
# Each remaining row holds one pair of master nodes to remove together
for first_station, second_station in zip(df.iloc[:, 0], df.iloc[:, 1]):
    removed_master_node = [first_station, second_station]
    calculate_duration_remove2nodes(removed_master_node)

## Calculate the duration difference compared to the whole network, file by file

In [38]:
def transform_duration_df(filename):
    """
    Reshape one 'remove-two-node/<filename>' duration matrix into long form.

    The file is read as a square table (465 rows * 465 columns in the full
    network) whose first column is 'to_master_id' and whose remaining column
    headers are the origin master ids.  The result has one row per O-D pair
    with different origin and destination (215760 rows for 465 stations) and
    the columns to_master_id, duration_when_nodes_removed, from_master_id.

    filename : name of the file inside the 'remove-two-node' folder
    """
    long_columns = ['to_master_id', 'duration_when_nodes_removed', 'from_master_id']
    df = pd.read_csv('remove-two-node/'+filename)

    # One two-column slice per origin, gathered in a list so that the
    # concatenation and the O-D filter happen once instead of once per column.
    pieces = []
    for origin in df.columns[1:]:
        piece = df.loc[:, ['to_master_id', origin]].rename(
            columns={origin: 'duration_when_nodes_removed'})
        piece['from_master_id'] = origin
        pieces.append(piece)

    if not pieces:
        # no origin columns in the file: nothing to reshape
        return pd.DataFrame(columns=long_columns)

    duration = pd.concat(pieces, ignore_index=True)
    different_od = duration['to_master_id'] != duration['from_master_id']
    return duration[different_od].reset_index(drop=True)

In [39]:
# Baseline O-D durations of the intact network, reduced to the three columns used below
column_aliases = {'from_master': 'from_master_id', 'to_master': 'to_master_id'}
baseline_columns = ['from_master_id', 'to_master_id', 'duration']
whole_network_duration = pd.read_csv('whole_network_duration_shortest_path.csv')
whole_network_duration = whole_network_duration.rename(columns=column_aliases)[baseline_columns]

In [40]:
def calculate_duration_diff(filename):
    """
    Return, per O-D pair, how much the duration stored in
    'remove-two-node/<filename>' exceeds the whole-network duration.
    The difference column is named after filename.
    """
    od_keys = ['from_master_id', 'to_master_id']
    merged = whole_network_duration.merge(transform_duration_df(filename), on=od_keys)
    merged = merged.assign(
        duration_diff=merged['duration_when_nodes_removed'] - merged['duration'])
    return merged.loc[:, od_keys + ['duration_diff']].rename(
        columns={'duration_diff': filename})

## duration difference of removing 2 node

In [41]:
# Raw text matrix of the baseline durations: row 0 holds the column headers and
# column 0 the row labels (both are indexed that way by the cells below).
# NOTE: this rebinds whole_network_duration from the DataFrame read from
# 'whole_network_duration_shortest_path.csv' to a numpy array, so
# calculate_duration_diff (which calls .merge on it) cannot be used after this cell.
# The with-block closes the file handle as soon as the rows are parsed.
with open("whole_network_duration", "rb") as baseline_file:
    whole_network_duration = np.array(list(csv.reader(baseline_file,
                                                      delimiter=",")))

In [42]:
# Every ordered pair of header labels (a label paired with itself included)
odpair = list(itertools.product(whole_network_duration[0,1:], repeat=2))

In [43]:
## x[0] is to_master_id, x[1] is from_master_id
odpair[:5]

[('Grant Av_0', 'Grant Av_0'),
 ('Grant Av_0', 'Atlantic Av - Barclays Ctr_0'),
 ('Grant Av_0', 'Richmond Valley_0'),
 ('Grant Av_0', '169 St_0'),
 ('Grant Av_0', 'Neptune Av_0')]

In [44]:
df_diff=pd.DataFrame()

In [47]:
'.DS_Store' in os.listdir('remove-two-node')

False

In [50]:
# Seed df_diff with the duration differences of the first file in 'remove-two-node'.
# Slicing with [:1] makes an empty folder a no-op instead of an IndexError.
for file_name in os.listdir('remove-two-node')[:1]:
    if file_name =='.DS_Store':
        continue
    # with-block: the handle is closed as soon as the rows are parsed
    with open("remove-two-node/"+file_name, "rb") as result_file:
        df = np.array(list(csv.reader(result_file, delimiter=",")))
    # Only subtract matrices whose row labels and column headers agree;
    # array_equal returns False (instead of failing) when the shapes differ.
    if (np.array_equal(whole_network_duration[:,0], df[:,0])
            and np.array_equal(whole_network_duration[0,:], df[0,:])):
        df_0 = whole_network_duration[1:,1:]
        df_0[df_0==''] = np.nan   # empty cell -> NaN before the float conversion
        df_0 = df_0.astype('float')

        df = df[1:,1:]
        df[df==''] = np.nan
        df = df.astype('float')
        # flatten() is row-major, the same order in which odpair was generated
        difference = (df-df_0).flatten()
        df_diff = pd.DataFrame.from_dict({file_name:dict(zip(odpair,difference))})

In [None]:
# Append one duration-difference column to df_diff per remaining result file.
# The baseline matrix is the same for every file, so it is converted only once.
df_0 = whole_network_duration[1:,1:]
df_0[df_0==''] = np.nan   # empty cell -> NaN before the float conversion
df_0 = df_0.astype('float')

for file_name in os.listdir('remove-two-node')[1:]:
    if file_name == '.DS_Store':
        # not a result file; the seeding cell skips it as well
        continue
    # with-block: the handle is closed as soon as the rows are parsed
    with open("remove-two-node/"+file_name, "rb") as result_file:
        df = np.array(list(csv.reader(result_file, delimiter=",")))
    # Only subtract matrices whose row labels and column headers agree;
    # array_equal returns False (instead of failing) when the shapes differ.
    if not (np.array_equal(whole_network_duration[:,0], df[:,0])
            and np.array_equal(whole_network_duration[0,:], df[0,:])):
        continue

    df = df[1:,1:]
    df[df==''] = np.nan
    df = df.astype('float')
    # flatten() is row-major, the same order in which odpair was generated
    difference = (df-df_0).flatten()
    df_diff_1 = pd.DataFrame.from_dict({file_name:dict(zip(odpair,difference))})
    df_diff = df_diff.join(df_diff_1)

In [53]:
df_diff.columns

Index([u'W 4 St_0,86 St_1', u'Grant Av_0,Saratoga Av_0',
       u'47-50 Sts - Rockefeller Ctr_0,Jay St - MetroTech_0',
       u'Kew Gardens - Union Tpke_0,14 St_0',
       u'Kew Gardens - Union Tpke_0,Jay St - MetroTech_0',
       u'47-50 Sts - Rockefeller Ctr_0,Rockaway Av_0', u'145 St_0,168 St_0',
       u'Kew Gardens - Union Tpke_0,25 St_0',
       u'Kew Gardens - Union Tpke_0,Borough Hall_0',
       u'Nostrand Av_1,Mets - Willets Point_0',
       ...
       u'59 St_0,72 St_0', u'Flushing - Main St_0,75 St_0',
       u'DeKalb Av_0,59 St_1', u'Flushing - Main St_0,Fulton St_1',
       u'Flushing - Main St_0,Queens Plaza_0', u'Flushing - Main St_0,72 St_0',
       u'25 St_0,125 St_2', u'Van Siclen Av_0,Union St_0',
       u'DeKalb Av_0,7 Av_0', u'Flushing - Main St_0,Van Siclen Av_0'],
      dtype='object', length=4661)

In [54]:
df_diff.to_csv('remove_two_nodes_duration_2.csv.gz',compression='gzip')

In [55]:
#check the size of the file
os.stat('remove_two_nodes_duration_2.csv.gz').st_size

169480979

## MERGE TWO RUNS TOGETHER

In [72]:
df= pd.read_csv('remove_two_nodes_duration_diff.csv.gz',compression='gzip')

In [76]:
df = df.rename(columns={'Unnamed: 0':'to_master_id',
                   'Unnamed: 1':'from_master_id'
                  })

In [79]:
df = df[df['to_master_id']!=df['from_master_id']]

In [81]:
# Combine this run's difference columns with the earlier run loaded into df,
# matching rows on the O-D pair.
# NOTE(review): the merge needs 'to_master_id' and 'from_master_id' as columns of
# df_diff, but df_diff is built with the O-D pair as its index and the
# reset_index/rename that creates those columns only appears in a later cell
# (the one that fills NaN with 3600) - confirm the intended execution order.
df_diff_agg = df_diff.merge(df,on=['to_master_id','from_master_id'])

In [85]:
df_diff=df_diff_agg

In [86]:
df_diff.shape

(215760, 5888)

## TWO NODES CRITICALITY

In [87]:
## NaN means the pair is not reachable; replace it with 1 hour (= 3600 seconds)
df_diff = (df_diff
           .fillna(3600)
           .reset_index()
           .rename(columns={'level_0': 'to_master_id',
                            'level_1': 'from_master_id'}))
# drop the rows whose origin and destination are the same station
not_self_pair = df_diff['from_master_id'] != df_diff['to_master_id']
df_diff = df_diff[not_self_pair]

In [95]:
df_diff.head()

Unnamed: 0,index,to_master_id,from_master_id,"W 4 St_0,86 St_1","Grant Av_0,Saratoga Av_0","47-50 Sts - Rockefeller Ctr_0,Jay St - MetroTech_0","Kew Gardens - Union Tpke_0,14 St_0","Kew Gardens - Union Tpke_0,Jay St - MetroTech_0","47-50 Sts - Rockefeller Ctr_0,Rockaway Av_0","145 St_0,168 St_0",...,"14 St - Union Sq_0,168 St - Washington Hts_0","Franklin Av_1,Broadway_1","42 St - Port Authority Bus Terminal_0,86 St_4","Franklin Av_1,Church Av_0","42 St - Port Authority Bus Terminal_0,E 180 St_0","42 St - Port Authority Bus Terminal_0,88 St_0","Franklin Av_1,Whitehall St_0","Atlantic Av - Barclays Ctr_0,30 Av_0","Franklin Av_1,88 St_0","36 St_0,Euclid Av_0"
0,0,1 Av_0,103 St - Corona Plaza_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,116.565421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1 Av_0,103 St_0,0.0,0.0,0.0,25.120482,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,1 Av_0,103 St_1,1242.30179,0.0,0.0,0.0,0.0,0.0,0.0,...,489.095745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,1 Av_0,103 St_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,235.120482,0.0,235.120482,235.120482,0.0,0.0,0.0,0.0
4,4,1 Av_0,104 St_0,0.0,3600.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3600.0,0.0,0.0,3600.0,3600.0


In [96]:
demand_0 = pd.read_csv('od_demand_distribution_v2.csv',index_col=0)

In [97]:
demand_0.head()

Unnamed: 0,from_master,to_master,demand_dist
0,1 Av_0,103 St_0,5.917962
1,1 Av_0,103 St_1,18.316424
2,1 Av_0,103 St_2,3.389157
3,1 Av_0,110 St_0,9.845766
4,1 Av_0,116 St - Columbia University_0,3.609837


In [98]:
demand_0= demand_0.rename(columns={'from_master':'from_master_id','to_master':'to_master_id'})

In [99]:
df_diff_0 = df_diff.merge(demand_0,on = ['to_master_id','from_master_id'])

In [102]:
np.array(df_diff_0.loc[:,pair])

array(['103 St - Corona Plaza_0', '103 St_0', '103 St_1', ...,
       'Woodhaven Blvd_1', 'Woodlawn_0', 'York St_0'], dtype=object)

In [105]:
# Demand-weighted total of every removal column: sum over rows of
# (duration difference * demand_dist). The demand vector is extracted once.
demand_weights = np.array(df_diff_0.loc[:,'demand_dist'])
delay = {}
for pair in df_diff_0.columns[3:-1]:
    pair_differences = np.array(df_diff_0.loc[:,pair])
    delay[pair] = sum(pair_differences * demand_weights)


In [107]:
delay = pd.DataFrame(sorted(delay.items(), key=lambda x: x[1],reverse=True),columns=['station','delay'])

In [108]:
delay.head()

Unnamed: 0,station,delay
0,"Jackson Hts - Roosevelt Av_0,Broadway Jct_0",931256000.0
1,"Jackson Hts - Roosevelt Av_0,Queensboro Plaza_0",919569300.0
2,"Broadway Jct_0,Forest Hills - 71 Av_0",787924100.0
3,"Broadway Jct_0,Franklin Av_1",725001900.0
4,"Broadway Jct_0,Kew Gardens - Union Tpke_0",716313200.0


In [109]:
delay.to_csv('duration_diff_two_nodes_removal_multiply_demand_agg.csv')