This notebook demonstrates a simple proof of concept for creating nodes and relationships in a Neo4j database from a Python client (py2neo).

In [1]:
!pip install py2neo -U

Requirement already up-to-date: py2neo in c:\users\jtotten\appdata\local\continuum\anaconda3\lib\site-packages (4.3.0)


In [2]:
# pandas is used to load and clean the movie CSV; py2neo is the Neo4j client.
import pandas as pd
import py2neo
# Print the installed py2neo version so the run records which client release was used.
print(py2neo.__version__)

4.3.0


In [3]:
# Load the raw movie metadata from a CSV hosted on GitHub.
# The full-dataset URL is kept (commented out) for reference; the smaller
# sample file is the one that is actually read.
# url = 'https://raw.githubusercontent.com/tottenjordan/neo4j/master/data/movie_metadata.csv' # full dataset
url = 'https://raw.githubusercontent.com/tottenjordan/neo4j/master/data/movies_sample.csv' # sample dataset
# index_col=0 makes the first CSV column the row index instead of a regular column.
# NOTE(review): in the rendered output the index is labelled "color", so that
# field is not available as a data column afterwards - confirm this is intended.
movies_raw = pd.read_csv(url, index_col=0)

# Report (rows, columns), then preview the first rows as the cell output.
print(movies_raw.shape)
movies_raw.head()

(50, 28)


Unnamed: 0_level_0,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,actors
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Color,James Cameron,723,178,0,855,Joel David Moore,1000,760505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,...,English,USA,PG-13,237000000,2009,936,7.9,1.78,33000,CCH Pounder|Joel David Moore|Wes Studi
Color,Gore Verbinski,302,169,563,1000,Orlando Bloom,40000,309404152,Action|Adventure|Fantasy,Johnny Depp,...,English,USA,PG-13,300000000,2007,5000,7.1,2.35,0,Johnny Depp|Orlando Bloom|Jack Davenport
Color,Sam Mendes,602,148,0,161,Rory Kinnear,11000,200074175,Action|Adventure|Thriller,Christoph Waltz,...,English,UK,PG-13,245000000,2015,393,6.8,2.35,85000,Christoph Waltz|Rory Kinnear|Stephanie Sigman
Color,Christopher Nolan,813,164,22000,23000,Christian Bale,27000,448130642,Action|Thriller,Tom Hardy,...,English,USA,PG-13,250000000,2012,23000,8.5,2.35,164000,Tom Hardy|Christian Bale|Joseph Gordon-Levitt
Color,Andrew Stanton,462,132,475,530,Samantha Morton,640,73058679,Action|Adventure|Sci-Fi,Daryl Sabara,...,English,USA,PG-13,263700000,2012,632,6.6,2.35,24000,Daryl Sabara|Samantha Morton|Polly Walker


For this exercise:
* Drop rows that contain any null values
* Combine the three actor name columns into a single pipe-delimited `actors` column

In [4]:
# Derive the working frame from the raw data in one chain:
#   1. keep only rows that have no missing value in any column;
#   2. set `actors` to the three actor-name columns joined with "|".
# `movies_raw` itself is left untouched.
movies_df = (
    movies_raw
    .dropna(axis=0, how='any')
    .assign(
        actors=lambda rows: rows['actor_1_name'] + "|" + rows['actor_2_name'] + "|" + rows['actor_3_name']
    )
)

# Report (rows, columns), then preview the first rows as the cell output.
print(movies_df.shape)
movies_df.head()

(50, 28)


Unnamed: 0_level_0,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,actors
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Color,James Cameron,723,178,0,855,Joel David Moore,1000,760505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,...,English,USA,PG-13,237000000,2009,936,7.9,1.78,33000,CCH Pounder|Joel David Moore|Wes Studi
Color,Gore Verbinski,302,169,563,1000,Orlando Bloom,40000,309404152,Action|Adventure|Fantasy,Johnny Depp,...,English,USA,PG-13,300000000,2007,5000,7.1,2.35,0,Johnny Depp|Orlando Bloom|Jack Davenport
Color,Sam Mendes,602,148,0,161,Rory Kinnear,11000,200074175,Action|Adventure|Thriller,Christoph Waltz,...,English,UK,PG-13,245000000,2015,393,6.8,2.35,85000,Christoph Waltz|Rory Kinnear|Stephanie Sigman
Color,Christopher Nolan,813,164,22000,23000,Christian Bale,27000,448130642,Action|Thriller,Tom Hardy,...,English,USA,PG-13,250000000,2012,23000,8.5,2.35,164000,Tom Hardy|Christian Bale|Joseph Gordon-Levitt
Color,Andrew Stanton,462,132,475,530,Samantha Morton,640,73058679,Action|Adventure|Sci-Fi,Daryl Sabara,...,English,USA,PG-13,263700000,2012,632,6.6,2.35,24000,Daryl Sabara|Samantha Morton|Polly Walker


## Simple Update Ops
* Test the ability to load data into the graph data model
* Create nodes
* Create node properties
* Create relationships between nodes

In [6]:
import os

from py2neo import Graph

# Connect to Neo graph instance
# Connection settings are read from the environment so that real credentials
# never have to be committed with the notebook:
#   NEO4J_URI      - bolt address of the database
#   NEO4J_USER     - user name
#   NEO4J_PASSWORD - password
# The fallbacks are the values this notebook originally hardcoded, so a local
# default-configured instance keeps working without any setup.
graph = Graph(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "password"),
)

### Create Graph: Nodes

In [10]:
from py2neo import Node

# create movies nodes
# Writes one "Movie" node per row of movies_df.
# Numeric columns are converted to built-in int/float before being stored
# (presumably so py2neo receives plain Python numbers rather than NumPy
# scalars - TODO confirm).
# NOTE: every run builds fresh Node objects and creates them, so re-running
# this cell adds another set of Movie nodes to the graph.
for _, movie in movies_df.iterrows():
    node = Node(
        "Movie",
        title=movie['movie_title'],
        duration=int(movie['duration']),
        gross=int(movie['gross']),
        language=movie['language'],
        budget=int(movie['budget']),
        usersCount=int(movie['num_user_for_reviews']),
        imdbScore=float(movie['imdb_score']),
        # Property key was previously misspelled as "moveiFbLikes"; graphs
        # populated by the old cell still carry the misspelled key.
        movieFbLikes=int(movie['movie_facebook_likes']),
    )
    graph.create(node)

print('Movie nodes successfully created!')

Movie nodes successfully created!


In [12]:
# create plot nodes
# Each movie's plot_keywords value is a "|"-separated string; collect the
# distinct keywords across all movies, then write one "Plot" node per keyword.
all_plots = {
    keyword
    for keyword_field in movies_df['plot_keywords']
    for keyword in keyword_field.split("|")
}

for keyword in all_plots:
    graph.create(Node("Plot", name=keyword))

print("Plot nodes successfully created!")

Plot nodes successfully created!


In [13]:
# create director nodes
# A director can appear on several rows; reduce the column to its distinct
# names, then write one "Director" node per name.
all_directors = set(movies_df['director_name'])

for director_name in all_directors:
    graph.create(Node("Director", name=director_name))

print("Director nodes successfully created!")

Director nodes successfully created!


In [14]:
# create genre nodes
# Each movie's genres value is a "|"-separated string; accumulate the distinct
# genre names across all movies, then write one "Genre" node per name.
all_genres = set()
for genre_field in movies_df['genres']:
    all_genres.update(genre_field.split("|"))

for genre_name in all_genres:
    graph.create(Node("Genre", name=genre_name))

print("Genre nodes successfully created!")

Genre nodes successfully created!


In [15]:
# create actors nodes
# The combined `actors` column holds "|"-separated names; collect the distinct
# names across all movies, then write one "Actor" node per name.
all_actors = {
    actor_name
    for actor_field in movies_df['actors']
    for actor_name in actor_field.split("|")
}

for actor_name in all_actors:
    graph.create(Node("Actor", name=actor_name))

print("Actor nodes successfully created!")

Actor nodes successfully created!


### Create Graph: Relationships

In [16]:
from py2neo import NodeMatcher, Relationship

# Actors
# For every movie row, link each of its actors to the movie with an
# (Actor)-[:ACTED_IN]->(Movie) relationship. Nodes are looked up by
# title / name, so the Movie and Actor nodes must already exist in the graph.

# The matcher does not depend on the row, so build it once.
matcher = NodeMatcher(graph)

for _, row in movies_df.iterrows():
    title = row['movie_title']
    movie_node = matcher.match("Movie", title=title).first()
    # .first() yields None when nothing matches; stop with a clear message
    # instead of handing None to Relationship.
    if movie_node is None:
        raise LookupError("No Movie node found with title {!r}".format(title))

    for actor in row['actors'].split("|"):
        actor_node = matcher.match("Actor", name=actor).first()
        if actor_node is None:
            raise LookupError("No Actor node found with name {!r}".format(actor))
        graph.create(Relationship(actor_node, "ACTED_IN", movie_node))

print("Relationships successfully created!")

Relationships successfully created!


In [17]:
# Genres
# For every movie row, link the movie to each of its genres with a
# (Movie)-[:IN_GENRE]->(Genre) relationship. Nodes are looked up by
# title / name, so the Movie and Genre nodes must already exist in the graph.

# The matcher does not depend on the row, so build it once.
matcher = NodeMatcher(graph)

for _, row in movies_df.iterrows():
    title = row['movie_title']
    movie_node = matcher.match("Movie", title=title).first()
    # .first() yields None when nothing matches; stop with a clear message
    # instead of handing None to Relationship.
    if movie_node is None:
        raise LookupError("No Movie node found with title {!r}".format(title))

    for genre in row['genres'].split("|"):
        genre_node = matcher.match("Genre", name=genre).first()
        if genre_node is None:
            raise LookupError("No Genre node found with name {!r}".format(genre))
        graph.create(Relationship(movie_node, "IN_GENRE", genre_node))

print("Relationships created successfully!")

Relationships created successfully!


In [18]:
# Plots
# For every movie row, link the movie to each of its plot keywords with a
# (Movie)-[:HAS_PLOT]->(Plot) relationship. Nodes are looked up by
# title / name, so the Movie and Plot nodes must already exist in the graph.

# The matcher does not depend on the row, so build it once.
matcher = NodeMatcher(graph)

for _, row in movies_df.iterrows():
    title = row['movie_title']
    movie_node = matcher.match("Movie", title=title).first()
    # .first() yields None when nothing matches; stop with a clear message
    # instead of handing None to Relationship.
    if movie_node is None:
        raise LookupError("No Movie node found with title {!r}".format(title))

    for plot in row['plot_keywords'].split("|"):
        plot_node = matcher.match("Plot", name=plot).first()
        if plot_node is None:
            raise LookupError("No Plot node found with name {!r}".format(plot))
        graph.create(Relationship(movie_node, "HAS_PLOT", plot_node))

print("Relationships created successfully!")

Relationships created successfully!


In [19]:
# Directors
# For every movie row, link its director to the movie with a
# (Director)-[:DIRECTED]->(Movie) relationship. Nodes are looked up by
# title / name, so the Movie and Director nodes must already exist in the graph.

# The matcher does not depend on the row, so build it once.
matcher = NodeMatcher(graph)

for _, row in movies_df.iterrows():
    title = row['movie_title']
    movie_node = matcher.match("Movie", title=title).first()
    # .first() yields None when nothing matches; stop with a clear message
    # instead of handing None to Relationship.
    if movie_node is None:
        raise LookupError("No Movie node found with title {!r}".format(title))

    director = row['director_name']
    director_node = matcher.match("Director", name=director).first()
    if director_node is None:
        raise LookupError("No Director node found with name {!r}".format(director))
    graph.create(Relationship(director_node, "DIRECTED", movie_node))


print("Relationships created successfully!")

Relationships created successfully!
