In [1]:
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, DATETIME
from whoosh.index import create_in
from whoosh.analysis import StemmingAnalyzer
from whoosh.qparser import MultifieldParser
from whoosh import scoring
import whoosh
import csv

from gensim.models import KeyedVectors
import numpy as np

In [2]:
imdb_dataset_path = "./data/IMDB-Movie-Data.csv"
index_path = "./whoosh_index"
word2vec_model_path = 'E:/Users/Lucas xD/Downloads/GoogleNews-vectors-negative300.bin'

In [18]:
try: # nich mehrmals in Speicher laden... sind 3gb
    model
except NameError:
    model = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)

In [4]:
def read_in_csv(csv_file):
    with open(csv_file, 'r', encoding='utf-8') as fp:
        reader = csv.reader(fp, delimiter=',', quotechar='"')
        data_read = [row for row in reader]
    return data_read

csv_data = read_in_csv(imdb_dataset_path)


In [5]:
schema = Schema(movie_id=ID(stored=True),
                title=TEXT(analyzer=StemmingAnalyzer()),
                description=TEXT(analyzer=StemmingAnalyzer()),
                genre=KEYWORD,
                director=TEXT,
                actors=TEXT,
                year=DATETIME)

In [6]:
create_new_index = True
if(create_new_index):
    index = create_in(index_path, schema)
else:    
    index = whoosh.index.open_dir(index_path)

writer = index.writer()

In [7]:
for row in csv_data[1:]:
    movie_id = row[0]
    title = row[1]
    genre = row[2]
    description = row[3]
    director = row[4]
    actors = row[5]
    year = row[6]
    writer.add_document(movie_id=movie_id, title=title, description=description, genre=genre, director=director, actors=actors, year=year)
writer.commit()

In [8]:

use_synonyms = True

search_term = "superman"

if(use_synonyms):
    similarity_list = model.most_similar(search_term, topn=3)
    similar_words = [sim_tuple[0] for sim_tuple in similarity_list]
    keywords = " OR ".join([search_term] + similar_words)
else:
    keywords  = search_term

results = []
print("Results with Word2Vec:")
print(f"Similar words used: {similar_words}")
with index.searcher(weighting=scoring.TF_IDF()) as searcher:
    query = MultifieldParser(["title", "description"], index.schema).parse(keywords)
    results = searcher.search(query)
    for docnum, score in results.items():
        print(docnum+1, score)
    print(results)


print("________________________\n")
print("Results without Word2Vec:")
with index.searcher(weighting=scoring.TF_IDF()) as searcher:
    query = MultifieldParser(["title", "description"], index.schema).parse(search_term)
    results = searcher.search(query)
    for docnum, score in results.items():
        print(docnum+1, score)
    print(results)

Results with Word2Vec:
Similar words used: ['Superman', 'superhero', 'mere_mortal']
61 13.618285980628055
925 13.618285980628055
148 5.961845129926823
220 5.961845129926823
600 5.961845129926823
788 5.961845129926823
790 5.961845129926823
938 5.961845129926823
<Top 8 Results for Or([Term('title', 'superman'), Term('description', 'superman'), Term('title', 'superhero'), Term('description', 'superhero'), Term('title', 'mere_mort'), Term('description', 'mere_mort')]) runtime=0.0010769000000010465>
________________________

Results without Word2Vec:
61 13.618285980628055
925 13.618285980628055
<Top 2 Results for Or([Term('title', 'superman'), Term('description', 'superman')]) runtime=0.0002825999999984674>
