Code and data are provided by the LinkedIn Learning course "Create a Data Project with Neo4j" (https://www.linkedin.com/learning/create-a-data-project-with-neo4j?u=76870426)

In [1]:
# %pip install --upgrade py2neo
import json
import os

import pandas as pd
from py2neo import Graph
from py2neo.bulk import create_nodes, create_relationships

## Load data

In [2]:
# Load sf_dataset.csv into a DataFrame and preview the first rows
sf_data = pd.read_csv("sf_dataset.csv")
sf_data.head()

Unnamed: 0,business_id,business_name,business_address,city,zip,latitude,longitude,user_name,deviceID,scan_timestamp
0,0190125-02-001,Greenline Publications Inc,68 Jordan Dr,San Francisco,94118,37.784632,-122.456876,Heather Baker,7874863620816,2022-01-03 23:19:25
1,1024823-04-151,Custom Corporate Catering,101 Spear St #A-21,San Francisco,94105,37.792534,-122.394067,Shelia Martin,5403628525158,2022-01-01 04:44:00
2,0311287-01-001,Club Donatello,501 Post St,San Francisco,94102,37.788083,-122.41008,Megan Montgomery,6411766564885,2022-01-03 20:22:08
3,0192864-00-000,Ly Ha,1500 Felton St,San Francisco,94134,37.726135,-122.419558,Dr. Diane Dyer,5612683382982,2022-01-03 17:06:04
4,0491034-01-001,Rabbee & Associates,5120 Diamond Heights Blvd #C,San Francisco,94131,37.74709,-122.440617,Nicholas Ward,6704867322407,2022-01-03 18:40:44


In [3]:
# Inspect the column dtypes as parsed by read_csv (deviceID comes in as int64)
sf_data.dtypes

business_id          object
business_name        object
business_address     object
city                 object
zip                   int64
latitude            float64
longitude           float64
user_name            object
deviceID              int64
scan_timestamp       object
dtype: object

In [4]:
# Cast the deviceID column from integer to string; all other columns keep their dtype
sf_data = sf_data.astype({"deviceID": str})

In [5]:
# Confirm that deviceID is no longer int64 after the cast
sf_data.dtypes

business_id          object
business_name        object
business_address     object
city                 object
zip                   int64
latitude            float64
longitude           float64
user_name            object
deviceID             object
scan_timestamp       object
dtype: object

## Identify the nodes and relations for the data
Nodes: Person, Business, Zip code<br>
Relationships: Person -[VISITED]-> Business (a person goes to a business), Business -[ISLOCATED]-> Zip code (a business is located in a zip code)

In [6]:
# Person nodes: keep only the person columns and a single row per deviceID
# (the last occurrence wins), so each node is created exactly once.
df_person = (
    sf_data
    .filter(items=["user_name", "deviceID"])
    .drop_duplicates(subset="deviceID", keep="last")
)
print(df_person.shape)

# Serialise to JSON and parse it back to obtain a list of record dictionaries
json_person = df_person.to_json(orient="records")
dict_person = json.loads(json_person)
print(len(dict_person))

(4998, 2)
4998


In [7]:
# Business nodes: keep only the business columns and a single row per business_id
# (the last occurrence wins), so each node is created exactly once.
df_business = (
    sf_data
    .filter(items=["business_id", "business_name", "business_address", "latitude", "longitude"])
    .drop_duplicates(subset="business_id", keep="last")
)
print(df_business.shape)

# Serialise to JSON and parse it back to obtain a list of record dictionaries
json_business = df_business.to_json(orient="records")
dict_business = json.loads(json_business)
print(len(dict_business))

(2500, 5)
2500


In [8]:
# Zip nodes: keep only the zip column and a single row per distinct zip value,
# so each node is created exactly once.
df_zip = (
    sf_data
    .filter(items=["zip"])
    .drop_duplicates(subset="zip", keep="last")
)
print(df_zip.shape)

# Serialise to JSON and parse it back to obtain a list of record dictionaries
json_zip = df_zip.to_json(orient="records")
dict_zip = json.loads(json_zip)
print(len(dict_zip))

(36, 1)
36


In [9]:
# Person -> Business relationship records: every row of the dataset is kept,
# carrying the scan timestamp alongside the two node keys.
person_bus_relation = sf_data.filter(items=["business_id", "deviceID", "scan_timestamp"])
json_pbr = person_bus_relation.to_json(orient="records")
dict_pbr = json.loads(json_pbr)
print(person_bus_relation.shape, len(dict_pbr))

# Business -> Zip relationship records: a single row per business_id (last occurrence wins).
bus_zip_relation = (
    sf_data
    .filter(items=["business_id", "zip"])
    .drop_duplicates(subset="business_id", keep="last")
)
json_bzr = bus_zip_relation.to_json(orient="records")
dict_bzr = json.loads(json_bzr)
print(bus_zip_relation.shape, len(dict_bzr))

(50000, 3) 50000
(2500, 2) 2500


## Upload to Neo4j

In [10]:
# Connect to Neo4j.
# The password is read from the NEO4J_PASSWORD environment variable so that it is
# never stored in the notebook; NEO4J_URI and NEO4J_USER may override the defaults.
# NOTE(review): a password was previously committed in this cell -- rotate it.
neo4j_uri = os.environ.get("NEO4J_URI", "neo4j+s://bd9aa907.databases.neo4j.io")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ["NEO4J_PASSWORD"]  # KeyError if unset: fail before touching the database
graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Clear the graph so that re-running the notebook does not add the same data twice.
graph.delete_all()

In [11]:
# Bulk-create the nodes for each label, then print how many nodes carry that label.
# Labels are processed in the order Business, Zip, Person.
for node_label, node_records in (
    ("Business", dict_business),
    ("Zip", dict_zip),
    ("Person", dict_person),
):
    create_nodes(graph.auto(), node_records, labels={node_label})
    print(graph.nodes.match(node_label).count())

2500
36
4998


In [12]:
# Build the "person visited business" relationship tuples.
# Each tuple is (start node key, relationship properties, end node key):
#   - start node key: the person's deviceID
#   - properties:     every remaining field of the record (here the scan timestamp)
#   - end node key:   the business_id
# The records in dict_pbr are left untouched (no pop), so this cell can be
# re-run without raising KeyError.
node_key_fields = ("deviceID", "business_id")
ex_people = [
    (
        visit["deviceID"],
        {field: value for field, value in visit.items() if field not in node_key_fields},
        visit["business_id"],
    )
    for visit in dict_pbr
]

print(len(ex_people))

50000


In [13]:
# Each entry is (start node key, relationship properties, end node key);
# the relationship properties must be a dictionary mapping attribute name to value
print(ex_people[0])

('7874863620816', {'scan_timestamp': '2022-01-03 23:19:25'}, '0190125-02-001')


In [14]:
# Upload the VISITED relationships: start nodes are matched on Person.deviceID,
# end nodes on Business.business_id
create_relationships(
    graph.auto(),
    ex_people,
    "VISITED",
    start_node_key=("Person", "deviceID"),
    end_node_key=("Business", "business_id"),
)

In [15]:
# Build the "business is located in zip" relationship tuples.
# Each tuple is (business_id, relationship properties, zip); every relationship
# carries the fixed property {"active": 1}.
# No variable named `zip` is assigned here, so the builtin zip() stays usable
# in the rest of the notebook.
ex_zip = [
    (location["business_id"], {"active": 1}, location["zip"])
    for location in dict_bzr
]

print(len(ex_zip))

2500


In [16]:
# Upload the ISLOCATED relationships: start nodes are matched on Business.business_id,
# end nodes on Zip.zip
create_relationships(
    graph.auto(),
    ex_zip,
    "ISLOCATED",
    start_node_key=("Business", "business_id"),
    end_node_key=("Zip", "zip"),
)

In [17]:
# Inspect one uploaded entry: (business_id, relationship properties dictionary, zip)
print(ex_zip[0])

('0190125-02-001', {'active': 1}, 94118)
