# ETL file to convert the geojson file into network files (modified by purin)

In [5]:
import json 
import csv
import networkx as nx
import os, sys

In [6]:
nodefile = "vertex/vertex.csv"
edgefile = "edge/edge.csv"
jsonfile = "geojson/multigraph.geojson"
graphmlfile = "graphml/multigraph.graphml"
gmlfile = "gml/multigraph.gml"

if not os.path.exists("edge"):
    os.makedirs("edge")
if not os.path.exists("vertex"):
    os.makedirs("vertex")
if not os.path.exists("gml"):
    os.makedirs("gml")

In [7]:
with open(jsonfile, 'r') as jsfile:
    with open(nodefile, 'w+') as nodefiled:
        with open(edgefile, 'w+') as edgefiled:
            node = csv.writer(nodefiled)
            edge = csv.writer(edgefiled)
            # Header
            node.writerow(["# NodeID", "Lat", " Lon", "Layer"])
            edge.writerow(["# EdgeID", "Source NodeID", "Target NodeID", "Direction", "Layer"])
            for line in jsfile:
                jsentry = json.loads(line)
                if jsentry['properties']['type'] == "node":
                    node.writerow([
                            jsentry['_id']['$oid'], 
                            jsentry['geometry']['coordinates'][0], 
                            jsentry['geometry']['coordinates'][1], 
                            jsentry['properties']['layer']])
                if jsentry['properties']['type'] == "edge":
                    if jsentry['properties']['name'].startswith("54"):
                        jsentry['properties']['name'] = "None"
                    if 'direction' in jsentry['properties']:
                        if jsentry['properties']['direction'] == "Double sens":
                            direction = "TwoWay"
                        elif (jsentry['properties']['direction'] == "Sens inverse" 
                              or jsentry['properties']['direction'] == "Sens unique"):
                            direction = "OneWay"
                            
                        edge.writerow([
                            jsentry['properties']['mongo_org_id'],
                            jsentry['properties']['mongo_dest_id'],
                            jsentry['_id']['$oid'], 
                            direction,
                            jsentry['properties']['layer'], 
                            jsentry['properties']['name']])
                    else:
                        edge.writerow([
                            jsentry['properties']['mongo_org_id'],
                            jsentry['properties']['mongo_dest_id'],
                            jsentry['_id']['$oid'],
                            "TwoWay",
                            jsentry['properties']['layer'], 
                            jsentry['properties']['name']])

In [8]:
G = nx.DiGraph()

with open(nodefile, 'r') as node:
    reader = csv.reader(node)
    next(reader)
    for row in reader:
        if len(row)!=0:
            lat = row[1]
            lon = row[2]
            ntype = row[3]
            G.add_node(row[0], lat=lat, lon=lon, type=ntype)

with open(edgefile, 'r') as node:
    reader = csv.reader(node)
    next(reader)
    for row in reader:
        if len(row)!=0:
            G.add_edge(row[1], row[2], type=row[4], name=row[5])
            if row[3] == 'TwoWay':
                G.add_edge(row[2], row[1], type=row[4], name=row[5])
        
nx.write_graphml(G, graphmlfile)
nx.write_graphml(G, gmlfile)

In [9]:
print(G.number_of_nodes())
print(G.number_of_edges())

42678
51931


# convert to df

In [10]:
import pandas as pd

In [11]:
# Convert the graph to a pandas DataFrame of edges
df_edges = nx.to_pandas_edgelist(G)

# Convert the graph nodes to a pandas DataFrame
df_nodes = pd.DataFrame.from_dict(dict(G.nodes(data=True)), orient='index')

In [12]:
df_edges.head()

Unnamed: 0,source,target,type,name
0,5453b63355474a3362317270,54b7bef755474a2bb2745109,road,D137
1,5453b63355474a3362317271,54b7bef855474a2bb27479db,road,
2,5453b63355474a3362317271,54b7bef855474a2bb27485d1,road,
3,5453b63355474a3362317272,54b7bef755474a2bb2745729,road,D125
4,5453b63355474a3362317272,54b7bef855474a2bb274a0f8,road,D124


In [13]:
df_nodes.head()

Unnamed: 0,lat,lon,type
5453b63355474a3362317270,3.5372355545105947,48.231939820695146,road
5453b63355474a3362317271,2.3955539609513337,49.04966337162509,road
5453b63355474a3362317272,1.6563044786850607,49.05187906283865,road
5453b63355474a3362317273,2.091532960687826,49.05588679019185,road
5453b63355474a3362317274,2.146181135828161,49.05648991168742,road


# function

In [14]:
''' =================================================================================================
function encode uniqie value inspecify col
================================================================================================= '''
def encode_col_value(df_data, col, new_col_name):
    df_data[new_col_name], uniques = pd.factorize(df_data[col])
    
    return df_data, uniques

In [15]:
''' =================================================================================================
function map node feature into edge df
================================================================================================= '''
def map_pos_on_data(df_edge, df_node):
    df_edge['node_lat'] = df_edge['source'].map(df_node.set_index('node_id')['lat'])
    df_edge['node_lon'] = df_edge['source'].map(df_node.set_index('node_id')['lon'])
    df_edge['node_tg_lat'] = df_edge['target'].map(df_node.set_index('node_id')['lat'])
    df_edge['node_tg_lon'] = df_edge['target'].map(df_node.set_index('node_id')['lon'])
    df_edge['node_type'] = df_edge['source'].map(df_node.set_index('node_id')['type'])
    df_edge['node_tg_type'] = df_edge['target'].map(df_node.set_index('node_id')['type'])
    
    return df_edge

# adjust data

In [16]:
# make node index to col
df_nodes2 = df_nodes.rename_axis('node_id').reset_index()

In [17]:
# create col to encode (assign unique name) of node into series number
df_nodes2,_ = encode_col_value(df_nodes2, col='node_id', new_col_name='node')

In [18]:
df_nodes2.head()

Unnamed: 0,node_id,lat,lon,type,node
0,5453b63355474a3362317270,3.5372355545105947,48.231939820695146,road,0
1,5453b63355474a3362317271,2.3955539609513337,49.04966337162509,road,1
2,5453b63355474a3362317272,1.6563044786850607,49.05187906283865,road,2
3,5453b63355474a3362317273,2.091532960687826,49.05588679019185,road,3
4,5453b63355474a3362317274,2.146181135828161,49.05648991168742,road,4


In [19]:
# map lat-lon into edge df
df_edge2 = map_pos_on_data(df_edges, df_nodes2)

In [29]:
df_edge2.head()

Unnamed: 0,source,target,type,name,node_lat,node_lon,node_tg_lat,node_tg_lon,node_type,node_tg_type
0,5453b63355474a3362317270,54b7bef755474a2bb2745109,road,D137,3.5372355545105947,48.231939820695146,,,road,
1,5453b63355474a3362317271,54b7bef855474a2bb27479db,road,,2.3955539609513337,49.04966337162509,,,road,
2,5453b63355474a3362317271,54b7bef855474a2bb27485d1,road,,2.3955539609513337,49.04966337162509,,,road,
3,5453b63355474a3362317272,54b7bef755474a2bb2745729,road,D125,1.6563044786850607,49.05187906283865,,,road,
4,5453b63355474a3362317272,54b7bef855474a2bb274a0f8,road,D124,1.6563044786850607,49.05187906283865,,,road,


In [25]:
df_edge2.tail()

Unnamed: 0,source,target,type,name,node_lat,node_lon,node_tg_lat,node_tg_lon,node_type,node_tg_type
51926,54b954ba55474a0e08bd3f33,5453b63555474a336231a8c5,crosslayer,,,,2.3304290066,48.8718083114,,train
51927,54b954ba55474a0e08bd3f34,5453b63455474a336231821b,crosslayer,,,,2.3143569,48.8939663,,train
51928,54b954ba55474a0e08bd3f35,5453b63455474a33623177e3,crosslayer,,,,2.3331828,48.8330932,,train
51929,54b954ba55474a0e08bd3f36,5453b63555474a336231a5ca,crosslayer,,,,2.345551,48.8534338,,train
51930,54b954ba55474a0e08bd3f37,5453b63455474a3362319697,crosslayer,,,,2.30026252375,48.8756403455,,train


In [22]:
df_nodes2[df_nodes2['node_id']=='5453b63455474a336231834a']

Unnamed: 0,node_id,lat,lon,type,node
4314,5453b63455474a336231834a,1.6252952641562517,48.56889242904061,road,4314


In [27]:
df_edge2[df_edge2['source']=='5453b63455474a3362317ac8']

Unnamed: 0,source,target,type,name,node_lat,node_lon,node_tg_lat,node_tg_lon,node_type,node_tg_type
3777,5453b63455474a3362317ac8,54b7bef955474a2bb274a6e0,metro,M1,2.3012613,48.8718025,,,metro,


In [26]:
df_edge2[df_edge2['target']=='54b954ba55474a0e08bd3f37']

Unnamed: 0,source,target,type,name,node_lat,node_lon,node_tg_lat,node_tg_lon,node_type,node_tg_type
16204,5453b63455474a3362319697,54b954ba55474a0e08bd3f37,crosslayer,,2.30026252375,48.8756403455,,,train,


In [28]:
df_edge2.describe()

Unnamed: 0,source,target,type,name,node_lat,node_lon,node_tg_lat,node_tg_lon,node_type,node_tg_type
count,51931,51931,51931,51931.0,27181.0,27181.0,24750.0,24750.0,27181,24750
unique,39626,41401,5,1437.0,14872.0,14872.0,14216.0,14216.0,4,4
top,5453b63455474a3362318d07,5453b63455474a3362318d07,road,,2.328360950855982,48.87294227162442,2.328360950855982,48.87294227162442,road,road
freq,10,10,42283,20988.0,10.0,10.0,10.0,10.0,26364,23933
