## IMPORTING DATA INTO NEO4J


In [11]:
import csv
from py2neo import Graph, Node

In [12]:
N_MOVIES = 9742
N_RATINGS = 100836
# N_TAGS = 9742
N_LINKS = 3683

In [13]:
N_MOVIES = 1000
N_RATINGS = 1000
N_LINKS = 1000

In [14]:
PORT = 7687
USER = "neo4j"
PASS = "12345678"
graph = Graph("bolt://" + ":7687", auth=(USER, PASS))


In [15]:
# def createGenreNodes():
#     allGenres = ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
#                  "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
#                  "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]

#     for genre in allGenres:
#         gen = Node("Genre", name=genre)
#         graph.create(gen)

In [16]:
def loadMovies():
    with open('data/movies.csv', encoding='utf8') as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        next(readCSV, None)  # skip header
        for i, row in enumerate(readCSV):

            createMovieNodes(row)
            # createGenreMovieRelationships(row)

            if (i % 1000 == 0):
                print(f"{i}/{N_MOVIES} Movie nodes created")

            # break after N_MOVIES movies

            if i >= N_MOVIES:
                break

def createMovieNodes(row):
    movieData = parseRowMovie(row)
    id = movieData[0]
    title = movieData[1]
    year = movieData[2]
    genres = movieData[3]
    mov = Node("Movie", id=id, title=title, year=year, genres=genres)
    graph.create(mov)

def parseRowMovie(row):
        id = row[0]
        year = row[1][-5:-1]
        title = row[1][:-7]
        genres = row[2]

        return (id, title, year, genres)

# def createGenreMovieRelationships(row):
#     movieId = row[0]
#     movieGenres = row[2].split("|")

#     for movieGenre in movieGenres:
#         graph.run('MATCH (g:Genre {name: $genre}), (m:Movie {id: $movieId}) CREATE (g)-[:IS_GENRE_OF]->(m)',
#             genre=movieGenre, movieId=movieId)
        
# def parseRowGenreMovieRelationships(row):
#     movieId = row[0]
#     movieGenres = row[2].split("|")

#     return (movieId, movieGenres)

In [17]:
def loadRatings():
    with open('data/ratings.csv') as csvfile:
         readCSV = csv.reader(csvfile, delimiter=',')
         next(readCSV, None) #skip header
         for i,row in enumerate(readCSV):
             createUserNodes(row)
             createRatingRelationship(row)

             if (i % 100 == 0):
                 print(f"{i}/{N_RATINGS} Rating relationships created")

             if (i >= N_RATINGS):
                 break
def createUserNodes(row):
    user = Node("User", id="user" + row[0])
    graph.merge(user, "User", "id")

def createRatingRelationship(row):
    ratingData = parseRowRatingRelationships(row)

    graph.run(
        'MATCH (u:User {id: $userId}), (m:Movie {id: $movieId}) CREATE (u)-[:RATED { rating: $rating, timestamp: $timestamp }]->(m)',
        userId=ratingData[0], movieId=ratingData[1], rating=ratingData[2], timestamp=ratingData[3])

def parseRowRatingRelationships(row):
    userId = "user" + row[0]
    movieId = row[1]
    rating = float(row[2])
    timestamp = row[3]

    return (userId, movieId, rating, timestamp)

In [18]:
# def loadTags():
#     with open('data/tags.csv', encoding='utf8') as csvfile:
#          readCSV = csv.reader(csvfile, delimiter=',')
#          next(readCSV, None) #skip header
#          for i,row in enumerate(readCSV):
#              createTagRelationship(row)

#              if (i % 100 == 0):
#                  print(f"{i}/{N_TAGS} Tag relationships created")

#              if (i >= N_TAGS):
#                  break

# def createTagRelationship(row):
#     tagData = parseRowTagRelationships(row)

#     graph.run(
#         'MATCH (u:User {id: $userId}), (m:Movie {id: $movieId}) CREATE (u)-[:TAGGED { tag: $tag, timestamp: $timestamp }]->(m)',
#         userId=tagData[0], movieId=tagData[1], tag=tagData[2], timestamp=tagData[3])

# def parseRowTagRelationships(row):
#     userId = "user" + row[0]
#     movieId = row[1]
#     tag = row[2]
#     timestamp = row[3]

#     return (userId, movieId, tag, timestamp)

In [19]:
def loadLinks():
    with open('data/links.csv') as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        next(readCSV, None)  # skip header
        for i, row in enumerate(readCSV):

            updateMovieNodeWithLinks(row)

            if (i % 100 == 0):
                print(f"{i}/{N_LINKS} Movie nodes updated with links")

            # break after N_LINKS movies

            if i >= N_LINKS:
                break

def updateMovieNodeWithLinks(row):
    linkData = parseRowLinks(row)

    graph.run(
        'MATCH (m:Movie {id: $movieId}) SET m += { imdbId: $imdbId , tmdbId: $tmdbId }',
        movieId=linkData[0], imdbId=linkData[1], tmdbId=linkData[2])

def parseRowLinks(row):
    movieId = row[0]
    imdbId = row[1]
    tmdbId = row[2]

    return (movieId, imdbId, tmdbId)

### Start importing data into Neo4j

In [20]:
print("Step 1 out of 3: loading movie nodes")
loadMovies()

print("Step 2 out of 3: loading rating relationships")
loadRatings()

print("Step 3 out of 3: updating links to movie nodes")
loadLinks()

Step 1 out of 3: loading movie nodes
0/1000 Movie nodes created
1000/1000 Movie nodes created
Step 2 out of 3: loading rating relationships
0/1000 Rating relationships created
100/1000 Rating relationships created
200/1000 Rating relationships created
300/1000 Rating relationships created
400/1000 Rating relationships created
500/1000 Rating relationships created
600/1000 Rating relationships created
700/1000 Rating relationships created
800/1000 Rating relationships created
900/1000 Rating relationships created
1000/1000 Rating relationships created
Step 3 out of 3: updating links to movie nodes
0/1000 Movie nodes updated with links
100/1000 Movie nodes updated with links
200/1000 Movie nodes updated with links
300/1000 Movie nodes updated with links
400/1000 Movie nodes updated with links
500/1000 Movie nodes updated with links
600/1000 Movie nodes updated with links
700/1000 Movie nodes updated with links
800/1000 Movie nodes updated with links
900/1000 Movie nodes updated with link

### GET SOME DATA AND VISUALIZE IT

In [21]:
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget

In [22]:
uri = "bolt://localhost:7687" 
user = "neo4j"  
password = "12345678"

driver = GraphDatabase.driver(uri=uri,auth=(user,password),database='neo4j')
session = driver.session(database='neo4j')

Failed to write data to connection IPv4Address(('localhost', 7687)) (IPv4Address(('127.0.0.1', 7687)))


In [23]:

result = session.run("MATCH (m)-[r]->(t) RETURN m,r,t LIMIT 100")

w = GraphWidget(graph = result.graph())

w.show()

GraphWidget(layout=Layout(height='500px', width='100%'))