In [2]:
import os

from neo4j import GraphDatabase

# Connection settings for the Neo4j instance. Each value can be overridden
# through an environment variable so that real credentials do not have to be
# stored in the notebook; the fallbacks are the original local defaults.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "pleaseletmein")

# Shared driver used by the query helpers defined further down.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [3]:
def run_query(query, parameters=None):
    """Execute a Cypher statement for its side effects and discard any rows.

    Args:
        query: Cypher text to run through ``session.run``.
        parameters: Optional dict of query parameters forwarded to the
            driver; omit it for statements without parameters.

    Returns:
        None.
    """
    with driver.session() as session:
        # Drain the result explicitly so the statement has fully run, and any
        # error it produced is raised here, before the session is closed.
        session.run(query, parameters).consume()

## Constraints

In [4]:
# Uniqueness constraints on the `id` property of Movie and MovieTag nodes.
# IF NOT EXISTS keeps this cell re-runnable: without it, running the cell a
# second time fails because the constraint is already present.
movie_constraint_query = (
    "CREATE CONSTRAINT IF NOT EXISTS FOR (m:Movie) REQUIRE m.id IS UNIQUE;"
)
movie_tag_constraint_query = (
    "CREATE CONSTRAINT IF NOT EXISTS FOR (m:MovieTag) REQUIRE m.id IS UNIQUE;"
)
run_query(movie_constraint_query)
run_query(movie_tag_constraint_query)

## Import data

In [5]:
# Cypher import of the movie metadata file.
# - Reads file:///movie_titles_metadata.tsv without headers, so columns are
#   addressed by position (row[0] .. row[5]). The "\t" below is an ordinary
#   Python escape, so the query text sent to Neo4j contains a real tab
#   character as the field terminator.
# - MERGEs one Movie node per row[0] and sets title, release_year,
#   imdb_rating and no_votes from row[1] .. row[4].
# - row[5] has its spaces replaced by commas and is then parsed with
#   apoc.convert.fromJsonList; each resulting value is MERGEd as a MovieTag
#   node and linked to the movie with a HAS_TAG relationship.
#   NOTE(review): presumably row[5] is a bracketed, space-separated list of
#   quoted tags -- confirm against the data file.
import_query = """

LOAD CSV FROM "file:///movie_titles_metadata.tsv" as row FIELDTERMINATOR "\t"
MERGE (m:Movie{id:row[0]})
SET m.title = row[1],
    m.release_year = toInteger(row[2]),
    m.imdb_rating = toFloat(row[3]),
    m.no_votes = toInteger(row[4])
WITH m, apoc.convert.fromJsonList(
          replace(row[5]," ",",")) as tags
UNWIND tags as tag
MERGE (mt:MovieTag{id:tag})
MERGE (m)-[:HAS_TAG]->(mt)

"""

In [6]:
# Execute the import statement; run_query returns nothing, so the cell has no output.
run_query(import_query)

## Preprocess attributes

In [7]:
# Adds string copies of the numeric attributes to every Movie node:
# - string_rating: imdb_rating multiplied by 10, converted to an integer and
#   then to a string (e.g. a rating of 6.9 is stored as "69").
# - string_release_year: release_year converted to a string.
# Both properties are later listed in the MovieIndex full-text index.
to_string_preprocessing_query = """
MATCH (m:Movie)
SET m.string_rating = toString(toInteger(m.imdb_rating * 10)),
    m.string_release_year = toString(m.release_year)
"""

In [8]:
# Write the string_rating / string_release_year properties onto the Movie nodes.
run_query(to_string_preprocessing_query)

In [13]:
# Builds a fixed-width version of the rating for every Movie that has an
# imdb_rating: the rating times 10 (as an integer string) is left-padded with
# "0" characters up to total_length = 7 and stored in m.range_rating
# (e.g. "69" becomes "0000069").
# The padding is produced by joining one "0" per missing character with
# apoc.text.join; when nothing is missing the joined list is empty.
# NOTE(review): the fixed width is presumably what lets textual range queries
# on range_rating order values like numbers -- see the link next to the cell
# that runs this query.
to_range_preprocessing_query = """

WITH 7 as total_length
MATCH (m:Movie)
WHERE m.imdb_rating IS NOT NULL
WITH m, total_length, 
        toString(toInteger(m.imdb_rating * 10)) as string_rating
WITH m, total_length - size(string_rating) as zeros, string_rating
WITH m, apoc.text.join([x in range(1,zeros) | "0"],"") +    
                                  string_rating as final_rating
SET m.range_rating = final_rating

"""

In [14]:
# Store the zero-padded range_rating property on the Movie nodes.
# Background on padding numbers for Lucene range searches:
# https://brettscott.wordpress.com/2011/11/19/lucene-number-range-search-integers-floats/
run_query(to_range_preprocessing_query)

# Full-text index

## Create index

In [15]:
# Creates the full-text index "MovieIndex" over four Movie properties:
# title, string_rating, range_rating and string_release_year.
# IF NOT EXISTS makes the statement safe to run again when the index is
# already there.
create_fts_index_query = """
CREATE FULLTEXT INDEX MovieIndex IF NOT EXISTS
FOR (m:Movie) ON EACH [m.title, m.string_rating, m.range_rating, m.string_release_year]
"""

In [16]:
# Create the MovieIndex full-text index used by all Lucene queries below.
run_query(create_fts_index_query)

# Lucene queries

In [17]:
import pandas as pd

def read_query(query):
    """Run a Cypher query and hand back its rows as a pandas DataFrame.

    Each record of the driver result becomes one row; the column labels are
    the keys reported by the result.
    """
    with driver.session() as session:
        result = session.run(query)
        # Materialise every record before asking for the keys, then build the
        # frame while the session is still open.
        rows = []
        for record in result:
            rows.append(record.values())
        frame = pd.DataFrame(rows, columns=result.keys())
    return frame
    

In [18]:
# Basic term search: look up the term "dream" in the title field of
# MovieIndex and return each matching title with its relevance score.
basic_query = """
CALL db.index.fulltext.queryNodes("MovieIndex", "title:dream") YIELD node, score
RETURN node.title as title, score
"""

In [19]:
# Run the basic search; the returned DataFrame is displayed as the cell output.
read_query(basic_query)

Unnamed: 0,title,score
0,a nightmare on elm street: the dream child,1.270704
1,a nightmare on elm street 3: dream warriors,1.270704
2,a nightmare on elm street 4: the dream master,1.171073


In [20]:
# Logical operator: movies released in 1999 OR 2000.
# Lucene boolean operators must be written in upper case (a lower-case "or"
# is parsed as just another search term), and the parentheses bind both years
# to string_release_year instead of letting the second one fall back to the
# index's default fields.
or_query = """
CALL db.index.fulltext.queryNodes("MovieIndex",
     "string_release_year:(1999 OR 2000)") YIELD node, score
RETURN node.title as title, score
LIMIT 5
"""

In [21]:
# Run the logical-operator search and display the first five matches.
read_query(or_query)

Unnamed: 0,title,score
0,american psycho,1.226042
1,bamboozled,1.226042
2,little nicky,1.226042
3,wonder boys,1.226042
4,scream 3,1.226042


In [22]:
# Wildcard, single character: "?" stands for exactly one character, so
# "th?" matches three-letter title terms starting with "th" (such as "the").
wildcard_single_query = """
CALL db.index.fulltext.queryNodes("MovieIndex", "title:th?") YIELD node, score
RETURN node.title as title, score
"""

In [23]:
# Run the single-character wildcard search; no LIMIT is set, so all matches are returned.
read_query(wildcard_single_query)

Unnamed: 0,title,score
0,the fifth element,1.0
1,the world is not enough,1.0
2,the wizard of oz,1.0
3,the witching hour,1.0
4,wag the dog,1.0
...,...,...
153,the avengers,1.0
154,airplane ii: the sequel,1.0
155,the atomic submarine,1.0
156,a nightmare on elm street: the dream child,1.0


In [24]:
# Wildcard, multiple characters: "*" stands for any number of characters, so
# "drea*" matches title terms that start with "drea" (dream, dreams, ...).
wildcard_multi_query = """
CALL db.index.fulltext.queryNodes("MovieIndex", "title:drea*") YIELD node, score
RETURN node.title as title, score
"""

In [25]:
# Run the multi-character wildcard search and display every match.
read_query(wildcard_multi_query)

Unnamed: 0,title,score
0,a nightmare on elm street 4: the dream master,1.0
1,a nightmare on elm street 3: dream warriors,1.0
2,my mother dreams the satan's disciples in new ...,1.0
3,a nightmare on elm street: the dream child,1.0


In [26]:
# Fuzzy search: the trailing "~" asks for title terms that are similar to
# "dream" rather than identical to it; only the five best-scoring rows are kept.
fuzzy_search_query = """
CALL db.index.fulltext.queryNodes("MovieIndex", "title:dream~") YIELD node, score
RETURN node.title as title, score
LIMIT 5
"""

In [27]:
# Run the fuzzy search and display the top five matches.
read_query(fuzzy_search_query)

Unnamed: 0,title,score
0,scream,1.885012
1,bull durham,1.812557
2,point break,1.812557
3,scream 3,1.55742
4,scream 2,1.55742


In [28]:
# Range query on string_rating (the rating times 10, stored as text).
# "[" makes the lower bound 50 inclusive and "}" makes the upper bound 99
# exclusive.
# NOTE(review): string_rating is text, so the range is presumably compared as
# strings rather than as numbers -- confirm before relying on values whose
# length differs from two characters.
range_query = """
CALL db.index.fulltext.queryNodes("MovieIndex", "string_rating:[50 TO 99}") YIELD node, score
RETURN node.title as title,score
LIMIT 5
"""

In [29]:
# Run the range search and display the first five matches.
read_query(range_query)

Unnamed: 0,title,score
0,10 things i hate about you,1.0
1,zulu dawn,1.0
2,young frankenstein,1.0
3,x-men,1.0
4,xxx,1.0


In [30]:
# Range query on the zero-padded range_rating property: both bounds are
# inclusive ("[" and "]") and are written with the same 7-character width as
# the stored values, i.e. 0000050 .. 0000150 stands for rating*10 from 50 to 150.
long_range_query = """
CALL db.index.fulltext.queryNodes("MovieIndex", "range_rating:[0000050 TO 0000150]") YIELD node, score
RETURN node.title as title,score
LIMIT 5
"""

In [31]:
# Run the padded-range search and display the first five matches.
read_query(long_range_query)

Unnamed: 0,title,score
0,10 things i hate about you,1.0
1,zulu dawn,1.0
2,young frankenstein,1.0
3,x-men,1.0
4,xxx,1.0


In [32]:
# Boosting: two clauses are combined -- a title match on "dream" and a rating
# range on string_rating. The "^2" suffix doubles the weight of the rating
# clause in the relevance score.
boosting_query = """
CALL db.index.fulltext.queryNodes("MovieIndex", "title:dream string_rating:[50 TO 99]^2") YIELD node, score
RETURN node.title as title,score
LIMIT 5
"""

In [33]:
# Run the boosted search and display the top five matches.
read_query(boosting_query)

Unnamed: 0,title,score
0,a nightmare on elm street 3: dream warriors,3.270704
1,a nightmare on elm street 4: the dream master,3.171073
2,young frankenstein,2.0
3,x-men,2.0
4,xxx,2.0


In [34]:
# Time decay: favour recent releases.
# The first WITH builds a Lucene fragment with one boosted clause per year,
# "string_release_year:<year>^<boost>", for the current year and the ten years
# before it (Cypher's range(0,10) is inclusive); the boost falls from 10 for
# the current year to 0 for the oldest. The fragment is appended to the title
# search.
# The clauses must name string_release_year, the property that is actually
# written to the nodes and listed in MovieIndex; a field that is not in the
# index can never match, so the decay would silently have no effect.
time_decay_query = """
WITH apoc.text.join([x in range(0,10) |
"string_release_year:" + toString((date().year - x)) + "^" +
  toString(10-x)]," ") as time_decay
CALL db.index.fulltext.queryNodes("MovieIndex", "title:dream " + time_decay) YIELD node, score
RETURN node.title as title,score
LIMIT 5
"""

In [35]:
# Run the time-decay search and display the top matches.
read_query(time_decay_query)

Unnamed: 0,title,score
0,a nightmare on elm street: the dream child,1.270704
1,a nightmare on elm street 3: dream warriors,1.270704
2,a nightmare on elm street 4: the dream master,1.171073


In [36]:
# All together: title search + boosted rating range + per-year time decay,
# followed by a graph filter that keeps only movies tagged 'thriller'.
# The time-decay clauses must name string_release_year, the property that is
# actually written to the nodes and listed in MovieIndex; a field that is not
# in the index can never match.
all_together = """
WITH apoc.text.join([x in range(0,10) |
"string_release_year:" + toString((date().year - x)) + "^" +
  toString(10-x)]," ") as time_decay
CALL db.index.fulltext.queryNodes("MovieIndex", "title:dream string_rating:[50 TO 99]^2 "+ time_decay) YIELD node, score
// filter only thrillers
MATCH (node)-[:HAS_TAG]->(:MovieTag{id:'thriller'})
RETURN node.title as title,score
LIMIT 5
"""

In [37]:
# Run the combined search and display the top five thriller matches.
read_query(all_together)

Unnamed: 0,title,score
0,a nightmare on elm street 3: dream warriors,3.270704
1,a nightmare on elm street 4: the dream master,3.171073
2,watchmen,2.0
3,the world is not enough,2.0
4,witness,2.0
