In [1]:
import neo4j

import csv

import math
import numpy as np
import pandas as pd

import psycopg2

In [2]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return the rows in a pandas dataframe.

    The query is executed on the module-level ``connection``.

    Parameters
    ----------
    query : str
        SQL text handed to ``pd.read_sql_query``.
    rollback_before_flag : bool
        When true, roll back the connection before running the query.
    rollback_after_flag : bool
        When true, roll back the connection after running the query.

    Returns
    -------
    pandas.DataFrame
        The query result. Every float64 column whose non-missing values are
        all finite whole numbers is converted to the nullable ``Int64`` dtype;
        other columns are returned unchanged.
    """

    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # fix the float columns that really should be integers
    for column in df.select_dtypes(include="float64").columns:

        present = df[column].dropna().to_numpy()

        # Infinite values cannot be represented as Int64, so a column that
        # contains one stays float instead of raising during the check.
        if np.isfinite(present).all() and (present == np.floor(present)).all():
            df[column] = df[column].astype("Int64")

    return df
    

In [3]:
# Open a driver for the Neo4j server at neo4j://neo4j:7687.
# NOTE(review): the user name and password are hardcoded here; consider
# reading them from the environment instead.
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))

In [4]:
# Module-level session on the "neo4j" database; the my_neo4j_* helper
# functions in this notebook run their queries through it.
session = driver.session(database="neo4j")

In [5]:
def my_neo4j_wipe_out_database():
    """Wipe out the database by deleting all nodes and relationships.

    Runs on the module-level ``session``. A single ``detach delete`` removes
    every node together with the relationships attached to it, so no separate
    pass over the relationships is needed.
    """

    query = "match (node) detach delete node"

    # Consume the result so the delete has fully run (and any server error is
    # raised) before this function returns.
    session.run(query).consume()

In [6]:
def my_neo4j_run_query_pandas(query, **kwargs):
    """Run a query on the module-level session and return the rows as a dataframe.

    Keyword arguments are forwarded to ``session.run`` as query parameters.
    The dataframe has one row per returned record and takes its column names
    from the result's keys.
    """

    records = session.run(query, **kwargs)

    # Materialise the rows first, then ask the result for its column names.
    rows = [record.values() for record in records]
    column_names = records.keys()

    return pd.DataFrame(rows, columns=column_names)

In [7]:
def my_neo4j_number_nodes_relationships():
    """Print the number of nodes and relationships in the database.

    Both totals are computed by the database with ``count()``, so only one row
    per query is transferred instead of every node and relationship.
    """

    query = """
        match (n)
        return count(n) as number_nodes
    """

    # count() always yields exactly one row, even when the graph is empty
    number_nodes = int(my_neo4j_run_query_pandas(query).iloc[0, 0])

    query = """
        match ()-[r]->()
        return count(r) as number_relationships
    """

    number_relationships = int(my_neo4j_run_query_pandas(query).iloc[0, 0])

    print("-------------------------")
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print("-------------------------")


In [8]:
def my_neo4j_create_node(station_name):
    """Create one node labelled Station whose name property is station_name.

    The statement runs on the module-level session with the name passed as a
    query parameter.
    """

    parameters = {"station_name": station_name}

    cypher = """
    
    CREATE (:Station {name: $station_name})
    
    """

    session.run(cypher, **parameters)
    

In [9]:
def my_neo4j_create_relationship_one_way(from_station, to_station, weight):
    """Create a single LINK relationship from one Station node to another.

    The two nodes are matched by their name property, and the relationship's
    weight property is set to ``weight``. Runs on the module-level session.
    """

    link_parameters = {
        "from_station": from_station,
        "to_station": to_station,
        "weight": weight,
    }

    cypher = """
    
    MATCH (from:Station), 
          (to:Station)
    WHERE from.name = $from_station and to.name = $to_station
    CREATE (from)-[:LINK {weight: $weight}]->(to)
    
    """

    session.run(cypher, **link_parameters)
    

In [10]:
def my_neo4j_create_relationship_two_way(from_station, to_station, weight):
    """Create a pair of LINK relationships, one in each direction, between two Station nodes.

    The two nodes are matched by their name property; both relationships get
    the same weight property. Runs on the module-level session.
    """

    link_parameters = {
        "from_station": from_station,
        "to_station": to_station,
        "weight": weight,
    }

    cypher = """
    
    MATCH (from:Station), 
          (to:Station)
    WHERE from.name = $from_station and to.name = $to_station
    CREATE (from)-[:LINK {weight: $weight}]->(to),
           (to)-[:LINK {weight: $weight}]->(from)
    
    """

    session.run(cypher, **link_parameters)
    

In [11]:
# Module-level Postgres connection; my_select_query_pandas and the
# graph-building cells read from and roll back this connection.
# NOTE(review): the user name and password are hardcoded here; consider
# reading them from the environment instead.
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [12]:
# Cursor on the Postgres connection, used by the graph-building cells below
# to execute their select queries.
cursor = connection.cursor()

In [13]:
# Start from an empty graph before loading the stations and lines.
my_neo4j_wipe_out_database() 

In [14]:
# Print the node and relationship counts right after the wipe.
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 0
  Relationships: 0
-------------------------


## 1. Create station nodes

In [15]:
# Read every station name from Postgres and create two graph nodes per
# station: "depart <station>" and "arrive <station>".
connection.rollback()

query = """

select station
from stations
order by station

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    station = row[0]

    # the depart node is created before the arrive node for each station
    for prefix in ('depart ', 'arrive '):
        my_neo4j_create_node(prefix + station)
    

## 2. Create connections between stations and lines

In [16]:
# For every (station, line) pair create a "<line> <station>" node and link it
# with zero-weight one-way relationships: depart -> line node -> arrive.
connection.rollback()

query = """

select station, line
from lines
order by station, line

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    station, line = row[0], row[1]

    line_station = line + ' ' + station
    depart = 'depart ' + station
    arrive = 'arrive ' + station

    my_neo4j_create_node(line_station)
    my_neo4j_create_relationship_one_way(depart, line_station, 0)
    my_neo4j_create_relationship_one_way(line_station, arrive, 0)
    

## 3. Create connections between transfers

In [17]:
# For every ordered pair of different lines that share a station, link the two
# "<line> <station>" nodes one way, weighted by the station's transfer_time.
connection.rollback()

query = """

select a.station, a.line as from_line, b.line as to_line, s.transfer_time
from lines a
     join lines b
       on a.station = b.station and a.line <> b.line 
     join stations s
       on a.station = s.station
order by 1, 2, 3

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    station, from_line, to_line = row[0], row[1], row[2]
    transfer_time = int(row[3])

    from_station = from_line + ' ' + station
    to_station = to_line + ' ' + station

    my_neo4j_create_relationship_one_way(from_station, to_station, transfer_time)
    

## 4. Create connections between stations on the same line

In [18]:
# For every pair of consecutive stations on a line (sequence and sequence + 1),
# link their "<line> <station>" nodes in both directions, weighted by the
# travel_time recorded for that station pair.
connection.rollback()

query = """

select a.line, a.station as from_station, b.station as to_station, t.travel_time
from lines a
  join lines b
    on a.line = b.line and b.sequence = (a.sequence + 1)
  join travel_times t
    on (a.station = t.station_1 and b.station = t.station_2)
        or (a.station = t.station_2 and b.station = t.station_1)
order by line, from_station, to_station

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    line = row[0]
    line_prefix = line + ' '

    from_station = line_prefix + row[1]
    to_station = line_prefix + row[2]
    travel_time = int(row[3])

    my_neo4j_create_relationship_two_way(from_station, to_station, travel_time)
    

In [19]:
# Print the node and relationship counts after loading the graph.
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 214
  Relationships: 652
-------------------------


## For a given store, find the distance to every station

In [37]:
def distance_to_stations(store_id):
    """Return the distance from one store to every station.

    Parameters
    ----------
    store_id : int
        Identifier of the store; it is coerced with ``int()`` before being
        placed in the SQL text, so a non-numeric value raises ``ValueError``
        instead of reaching the database.

    Returns
    -------
    pandas.DataFrame
        One row per station with the columns store_id, street, city, state,
        station and distance.
    """
    # Distance is in miles
    rollback_before_flag = True
    rollback_after_flag = True

    # my_select_query_pandas only accepts SQL text, so make sure the value
    # interpolated below is a plain integer.
    store_id = int(store_id)

    # "if not exists" keeps the statement valid when the extensions have
    # already been installed in the database.
    query = f"""

    create extension if not exists cube;
    create extension if not exists earthdistance;

    SELECT 
        store_id, 
        street, 
        city, 
        state, 
        station, 
        (point(stores.longitude,stores.latitude) <@> point(stations.longitude,stations.latitude)) as distance
    FROM stores, stations
    WHERE store_id={store_id}
    """

    df = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)
    return df

def closest_station(store_id):
    """Return the row of distance_to_stations(store_id) with the smallest distance."""
    distances = distance_to_stations(store_id)
    nearest_position = np.argmin(distances['distance'])
    return distances.iloc[nearest_position]

In [38]:
# Show the distance from store 1 to every station.
distance_to_stations(1)

Unnamed: 0,store_id,street,city,state,station,distance
0,1,3000 Telegraph Ave,Berkeley,CA,12th Street,3.640897
1,1,3000 Telegraph Ave,Berkeley,CA,16th Street Mission,10.731787
2,1,3000 Telegraph Ave,Berkeley,CA,19th Street,3.324118
3,1,3000 Telegraph Ave,Berkeley,CA,24th Street Mission,11.216962
4,1,3000 Telegraph Ave,Berkeley,CA,Antioch,27.756933
5,1,3000 Telegraph Ave,Berkeley,CA,Ashby,0.547784
6,1,3000 Telegraph Ave,Berkeley,CA,Balboa Park,13.779555
7,1,3000 Telegraph Ave,Berkeley,CA,Bay Fair,13.168268
8,1,3000 Telegraph Ave,Berkeley,CA,Berryessa,39.731729
9,1,3000 Telegraph Ave,Berkeley,CA,Castro Valley,15.210308


In [39]:
# Show the station row with the smallest distance to store 1.
closest_station(1)

store_id                     1
street      3000 Telegraph Ave
city                  Berkeley
state                       CA
station                  Ashby
distance              0.547784
Name: 5, dtype: object

## Looks like we are only really interested in the Ashby station, since it is the closest to the store

## 5. Create one node for store_id 1 and connect it to Ashby 

In [71]:
# Add a node for store 1 (created with the Station label, like every node made
# by my_neo4j_create_node) and link it one way to the Ashby departure node.
# NOTE(review): 0.547784 is the store-to-Ashby distance reported in miles
# above, whereas my_neo4j_shortest_path divides the total cost by 60 to print
# minutes - confirm whether this weight should be converted to a travel time.
my_neo4j_create_node('store_1')
my_neo4j_create_relationship_one_way('store_1', 'depart Ashby', 0.547784)


In [41]:
# Load the customers table and preview its first rows; the connection is
# rolled back both before and after the query.
rollback_before_flag, rollback_after_flag = True, True

query = """

SELECT * 
FROM customers 

"""

df = my_select_query_pandas(
    query,
    rollback_before_flag,
    rollback_after_flag,
)
df.head()

Unnamed: 0,customer_id,first_name,last_name,street,city,state,zip,closest_store_id,distance
0,1,Robb,Weaving,5 Ramsey Place,Oakland,CA,94609,1,1
1,2,Robby,Belliard,6 Londonderry Plaza,Oakland,CA,94609,1,1
2,3,Sadella,Caudrelier,548 Mcguire Parkway,Oakland,CA,94609,1,1
3,4,Holmes,Shimmings,99 Kennedy Court,Oakland,CA,94609,1,1
4,5,Beverley,Gubbin,51 Mcbride Drive,Oakland,CA,94609,1,1


## For a given customer, find the closest BART station

In [67]:
def customer_distance_to_stations(customer_id):
    """Return the distance from one customer's zip code location to every station.

    Parameters
    ----------
    customer_id : int
        Identifier of the customer; it is coerced with ``int()`` before being
        placed in the SQL text, so a non-numeric value raises ``ValueError``
        instead of reaching the database.

    Returns
    -------
    pandas.DataFrame
        One row per station with the columns customer_id, first_name,
        last_name, zip, closest_store_id, distance_to_store, station and
        distance. The distance is computed from the longitude/latitude of the
        customer's zip code, not of the customer's street address.
    """
    rollback_before_flag = True
    rollback_after_flag = True

    # my_select_query_pandas only accepts SQL text, so make sure the value
    # interpolated below is a plain integer.
    customer_id = int(customer_id)

    # "if not exists" keeps the statement valid when the extensions have
    # already been installed in the database.
    query = f"""
    create extension if not exists cube;
    create extension if not exists earthdistance;
    
    SELECT 
        customer_id, 
        first_name,
        last_name,
        customers.zip,
        closest_store_id, 
        customers.distance as distance_to_store, 
        station,
        (point(zip_codes.longitude,zip_codes.latitude) <@> point(stations.longitude,stations.latitude)) as distance
    FROM customers, stations, zip_codes 
    WHERE customer_id = {customer_id} AND customers.zip = zip_codes.zip
    """

    df = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)
    return df

def customer_closest_station(customer_id):
    """Return the row of customer_distance_to_stations(customer_id) with the smallest distance."""
    distances = customer_distance_to_stations(customer_id)
    nearest_position = np.argmin(distances['distance'])
    return distances.iloc[nearest_position]

In [66]:
# Show the distance from customer 1's zip code location to every station.
customer_distance_to_stations(1)

Unnamed: 0,customer_id,first_name,last_name,zip,closest_store_id,distance_to_store,station,distance
0,1,Robb,Weaving,94609,1,1,12th Street,2.161916
1,1,Robb,Weaving,94609,1,1,16th Street Mission,9.763365
2,1,Robb,Weaving,94609,1,1,19th Street,1.843979
3,1,Robb,Weaving,94609,1,1,24th Street Mission,10.168682
4,1,Robb,Weaving,94609,1,1,Antioch,28.502333
5,1,Robb,Weaving,94609,1,1,Ashby,1.332969
6,1,Robb,Weaving,94609,1,1,Balboa Park,12.674988
7,1,Robb,Weaving,94609,1,1,Bay Fair,12.109629
8,1,Robb,Weaving,94609,1,1,Berryessa,38.617901
9,1,Robb,Weaving,94609,1,1,Castro Valley,14.301211


In [68]:
# Show the station row with the smallest distance for customer 1.
customer_closest_station(1)

customer_id                  1
first_name                Robb
last_name              Weaving
zip                      94609
closest_store_id             1
distance_to_store            1
station              MacArthur
distance               0.44779
Name: 26, dtype: object

Note: if the distance from the customer to their nearest station plus the distance from Ashby to the nearest store (0.547784 mi) is greater than or equal to the customer's "distance_to_store", then there is no reason to use BART. Otherwise, proceed through the notebook.

## Find travel time from Ashby to the customer's nearest station

In [69]:
def my_neo4j_shortest_path(from_station, to_station):
    """Run a weighted Dijkstra shortest path between two Station nodes and print it.

    The 'ds_graph' projection is dropped and re-projected from the Station
    nodes and LINK relationships on every call. For each returned path the
    function prints the total cost, the total cost divided by 60 labelled as
    minutes, and one line per node: name, cost added at that node, running
    total. Each entry of ``costs`` is treated as the running total up to the
    matching node, and all costs are truncated to integers before printing.
    Nothing is returned.
    """

    # Rebuild the in-memory projection so it reflects the current graph.
    session.run("CALL gds.graph.drop('ds_graph', false) yield graphName")
    session.run("CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})")

    dijkstra_query = """

    MATCH (source:Station {name: $source}), (target:Station {name: $target})
    CALL gds.shortestPath.dijkstra.stream(
        'ds_graph', 
        { sourceNode: source, 
          targetNode: target, 
          relationshipWeightProperty: 'weight'
        }
    )
    YIELD index, sourceNode, targetNode, totalCost, nodeIds, costs, path
    RETURN
        gds.util.asNode(sourceNode).name AS from,
        gds.util.asNode(targetNode).name AS to,
        totalCost,
        [nodeId IN nodeIds | gds.util.asNode(nodeId).name] AS nodes,
        costs
    ORDER BY index

    """

    for record in session.run(dijkstra_query, source=from_station, target=to_station):

        total_cost = int(record['totalCost'])

        print("\n--------------------------------")
        print("   Total Cost: ", total_cost)
        print("   Minutes: ", round(total_cost / 60.0,1))
        print("--------------------------------")

        node_names = record['nodes']
        running_totals = record['costs']

        total_so_far = 0

        for node_name, running_total in zip(node_names, running_totals):

            total_here = int(running_total)

            # name, cost added by this hop, running total
            print(node_name + ", " + str(total_here - total_so_far)  + ", " + str(total_here))

            total_so_far = total_here
    

In [72]:
# Print the shortest path from the Ashby departure node to the MacArthur
# arrival node.
my_neo4j_shortest_path('depart Ashby', 'arrive MacArthur')


--------------------------------
   Total Cost:  240
   Minutes:  4.0
--------------------------------
depart Ashby, 0, 0
red Ashby, 0, 0
red MacArthur, 240, 240
arrive MacArthur, 0, 240
