In [1]:
from neo4j.v1 import GraphDatabase
driver = GraphDatabase.driver('bolt://localhost:7687',auth=('neo4j', 'burek123'))

In [2]:
def run_query(query):
    with driver.session() as session:
        session.run(query)

## Constraints

In [3]:
movie_constraint_query = "CREATE CONSTRAINT ON (m:Movie) ASSERT m.id IS UNIQUE;"
movie_tag_constraint_query = "CREATE CONSTRAINT ON (m:MovieTag) ASSERT m.id IS UNIQUE;"
run_query(movie_constraint_query)
run_query(movie_tag_constraint_query)

## Import data

In [4]:
import_query = """

LOAD CSV FROM "file:///movie_titles_metadata.tsv" as row FIELDTERMINATOR "\t"
MERGE (m:Movie{id:row[0]})
SET m.title = row[1],
    m.release_year = toInteger(row[2]),
    m.imdb_rating = toFloat(row[3]),
    m.no_votes = toInteger(row[4])
WITH m, apoc.convert.fromJsonList(
          replace(row[5]," ",",")) as tags
UNWIND tags as tag
MERGE (mt:MovieTag{id:tag})
MERGE (m)-[:HAS_TAG]->(mt)

"""

In [5]:
run_query(import_query)

## Preprocess attributes

In [6]:
to_string_preprocessing_query = """
MATCH (m:Movie)
SET m.string_rating = toString(toInteger(m.imdb_rating * 10)),
    m.string_release_year = toString(m.release_year)
"""

In [7]:
run_query(to_string_preprocessing_query)

In [8]:
to_range_preprocessing_query = """

WITH 7 as total_length
MATCH (m:Movie)
WHERE exists (m.imdb_rating)
WITH m, total_length, 
        toString(toInteger(m.imdb_rating * 10)) as string_rating
WITH m, total_length - length(string_rating) as zeros, string_rating
WITH m, apoc.text.join([x in range(1,zeros) | "0"],"") +    
                                  string_rating as final_rating
SET m.range_rating = final_rating

"""

In [9]:
# look at https://brettscott.wordpress.com/2011/11/19/lucene-number-range-search-integers-floats/
run_query(to_range_preprocessing_query)

# Full-text index

## Create index

In [10]:
create_fts_index_query = """
CALL db.index.fulltext.createNodeIndex(
"MovieIndex",["Movie"],["title","string_rating","range_rating","string_release_year"])
"""

In [11]:
run_query(create_fts_index_query)

# Lucene queries

In [12]:
import pandas as pd

def read_query(query):
    with driver.session() as session:
        result = session.run(query)
        return pd.DataFrame([r.values() for r in result], columns=result.keys())
    

In [13]:
# basic
basic_query = """
CALL db.index.fulltext.queryNodes("MovieIndex", "title:dream") YIELD node, score
RETURN node.title as title, score
"""

In [14]:
read_query(basic_query)

Unnamed: 0,title,score
0,a nightmare on elm street: the dream child,2.641167
1,a nightmare on elm street 4: the dream master,2.263857
2,a nightmare on elm street 3: dream warriors,2.263857


In [15]:
# Logical operator
or_query = """
CALL db.index.fulltext.queryNodes("MovieIndex", 
     "string_release_year:1999 or 2000") YIELD node, score
RETURN node.title as title, score
LIMIT 5
"""

In [16]:
read_query(or_query)

Unnamed: 0,title,score
0,american psycho,0.495955
1,bamboozled,0.495955
2,cast away,0.495955
3,george washington,0.495955
4,gladiator,0.495955


In [17]:
# Wildcard single-character
wildcard_single_query = """
CALL db.index.fulltext.queryNodes("MovieIndex", "title:th?") YIELD node, score
RETURN node.title as title, score
"""

In [18]:
read_query(wildcard_single_query)

Unnamed: 0,title,score
0,thx 1138,1.0


In [19]:
# Wildcard multi-character
wildcard_multi_query = """
CALL db.index.fulltext.queryNodes("MovieIndex", "title:drea*") YIELD node, score
RETURN node.title as title, score
"""

In [20]:
read_query(wildcard_multi_query)

Unnamed: 0,title,score
0,a nightmare on elm street 4: the dream master,1.0
1,a nightmare on elm street: the dream child,1.0
2,my mother dreams the satan's disciples in new ...,1.0
3,a nightmare on elm street 3: dream warriors,1.0


In [21]:
# Fuzzy search
fuzzy_search_query = """
CALL db.index.fulltext.queryNodes("MovieIndex", "title:dream~") YIELD node, score
RETURN node.title as title, score
LIMIT 5
"""

In [22]:
read_query(fuzzy_search_query)

Unnamed: 0,title,score
0,scream,1.83144
1,a nightmare on elm street: the dream child,1.335425
2,a nightmare on elm street 4: the dream master,1.14465
3,bull durham,1.14465
4,a nightmare on elm street 3: dream warriors,1.14465


In [23]:
# Range query
range_query = """
CALL db.index.fulltext.queryNodes("MovieIndex", "string_rating:[50 TO 99}") YIELD node, score
RETURN node.title as title,score
LIMIT 5
"""

In [24]:
read_query(range_query)

Unnamed: 0,title,score
0,1492: conquest of paradise,1.0
1,15 minutes,1.0
2,2001: a space odyssey,1.0
3,48 hrs.,1.0
4,the fifth element,1.0


In [25]:
# Long range query
long_range_query = """
CALL db.index.fulltext.queryNodes("MovieIndex", "range_rating:[0000050 TO 0000150]") YIELD node, score
RETURN node.title as title,score
LIMIT 5
"""

In [26]:
read_query(long_range_query)

Unnamed: 0,title,score
0,1492: conquest of paradise,1.0
1,15 minutes,1.0
2,2001: a space odyssey,1.0
3,48 hrs.,1.0
4,the fifth element,1.0


In [27]:
# Boosting score
boosting_query = """
CALL db.index.fulltext.queryNodes("MovieIndex", "title:dream string_rating:[50 TO 99]^2") YIELD node, score
RETURN node.title as title,score
LIMIT 5
"""

In [28]:
read_query(boosting_query)

Unnamed: 0,title,score
0,a nightmare on elm street 4: the dream master,2.46348
1,a nightmare on elm street 3: dream warriors,2.46348
2,a nightmare on elm street: the dream child,1.253581
3,1492: conquest of paradise,0.157242
4,15 minutes,0.157242


In [29]:
# Time decay
time_decay_query = """
WITH apoc.text.join([x in range(0,10) | 
"string_release_date:" + toString((date().year - x)) + "^" +   
  toString(10-x)]," ") as time_decay
CALL db.index.fulltext.queryNodes("MovieIndex", "title:dream " + time_decay) YIELD node, score
RETURN node.title as title,score
LIMIT 5
"""

In [30]:
read_query(time_decay_query)

Unnamed: 0,title,score
0,a nightmare on elm street: the dream child,0.009115
1,a nightmare on elm street 4: the dream master,0.007812
2,a nightmare on elm street 3: dream warriors,0.007812


In [31]:
# All together
all_together = """
WITH apoc.text.join([x in range(0,10) | 
"string_release_date:" + toString((date().year - x)) + "^" +   
  toString(10-x)]," ") as time_decay
CALL db.index.fulltext.queryNodes("MovieIndex", "title:dream string_rating:[50 TO 99]^2 "+ time_decay) YIELD node, score
// filter only thrillers
MATCH (node)-[:HAS_TAG]->(:MovieTag{id:'thriller'})
RETURN node.title as title,score
LIMIT 5
"""

In [32]:
read_query(all_together)

Unnamed: 0,title,score
0,a nightmare on elm street 4: the dream master,0.016532
1,a nightmare on elm street 3: dream warriors,0.016532
2,a nightmare on elm street: the dream child,0.008413
3,15 minutes,0.001055
4,48 hrs.,0.001055
