In [10]:
import json
import os

import numpy as np
import whoosh
from gensim.models import KeyedVectors
from whoosh import scoring
from whoosh.analysis import StemmingAnalyzer, NgramFilter, StopFilter
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, DATETIME
from whoosh.index import create_in
from whoosh.qparser import MultifieldParser

In [2]:
# Input files: the product catalogue (JSON) and the pre-trained word2vec
# vectors (loaded in binary format).
# NOTE(review): both are absolute paths on one machine; adjust them before
# running the notebook anywhere else.
dataset_path = "E:/Users/Lucas xD/Downloads/Products_Q_US_edited.json"
word2vec_model_path = "E:/Users/Lucas xD/Downloads/GoogleNews-vectors-negative300.bin"

# Directory in which the Whoosh index is created / opened.
index_path = "./hb_index"

In [3]:
# Load the word2vec vectors only if this kernel has not loaded them yet:
# the file is about 3 GB, so it should not be read into memory repeatedly
# when the cell is re-run.
if "model" not in globals():
    model = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)

In [4]:
# Read the product catalogue into memory. The context manager closes the file
# handle as soon as parsing is done instead of leaving it open for the life of
# the kernel. JSON is UTF-8 by specification, so decode it explicitly rather
# than with the platform's default encoding.
with open(dataset_path, "r", encoding="utf-8") as dataset_file:
    json_data = json.load(dataset_file)


In [12]:
def _ngram_stem_analyzer():
    """Return a new analyzer chain: StemmingAnalyzer -> NgramFilter(2..4) -> StopFilter.

    A fresh chain is built on every call so that each field gets its own
    analyzer instance.
    """
    return StemmingAnalyzer() | NgramFilter(minsize=2, maxsize=4) | StopFilter()


# Index schema for the product catalogue. The name and the two description
# fields use the n-gram analyzer chain; `name` is additionally boosted.
# Only `id` is stored, so search hits expose just that field.
schema = Schema(
    brand=TEXT(),
    colors=TEXT(),
    gender=TEXT(),
    longDescription=TEXT(analyzer=_ngram_stem_analyzer()),
    name=TEXT(analyzer=_ngram_stem_analyzer(), field_boost=3.0),
    productId=TEXT(),
    shortDescription=TEXT(analyzer=_ngram_stem_analyzer()),
    sizes=TEXT(),
    styleName=TEXT(),
    variants=TEXT(),
    image=TEXT(),
    id=ID(stored=True),
)

In [13]:
# Set to False to open the index that already exists at `index_path`
# instead of building a new one.
# NOTE(review): the indexing cell adds every document from `json_data` to the
# writer, so re-opening an existing index and running it again will presumably
# add the documents a second time - confirm before using the False branch.
create_new_index = True

if create_new_index:
    # create_in() needs the target directory to exist already (the Whoosh
    # quickstart creates it by hand), so make sure it is there on a fresh
    # checkout.
    os.makedirs(index_path, exist_ok=True)
    index = create_in(index_path, schema)
else:
    index = whoosh.index.open_dir(index_path)

# The writer is filled and committed by the indexing cell.
writer = index.writer()

In [14]:
# Fields copied from every product record into the index; each name must exist
# both as a key of the record and as a field of the schema.
INDEXED_FIELDS = (
    "brand",
    "colors",
    "gender",
    "longDescription",
    "name",
    "productId",
    "shortDescription",
    "sizes",
    "styleName",
    "variants",
    "image",
    "id",
)

# Using the writer as a context manager commits when the loop finishes and
# cancels the writer if any record raises (e.g. a missing key), so a failed run
# does not leave the index locked by a half-finished writer.
with writer:
    for doc in json_data:
        # Every value is indexed as its string representation.
        writer.add_document(**{field: str(doc[field]) for field in INDEXED_FIELDS})

In [17]:

# Fields the query is matched against.
SEARCH_FIELDS = ["name", "longDescription", "shortDescription"]


def find_similar_words(term, topn=1):
    """Return the `topn` words the word2vec model ranks as most similar to `term`.

    Returns an empty list when the model lookup raises KeyError (for example
    because `term` is not part of the model's vocabulary).
    """
    try:
        neighbours = model.most_similar(term, topn=topn)
    except KeyError:
        return []
    return [word for word, _similarity in neighbours]


def print_search_results(query_text, list_hits=False):
    """Run `query_text` against SEARCH_FIELDS with TF-IDF scoring and print the result.

    Prints one "docnum score" line per result. Afterwards it prints either every
    hit (`list_hits=True`) or the summary representation of the result object.
    """
    parser = MultifieldParser(SEARCH_FIELDS, index.schema)
    with index.searcher(weighting=scoring.TF_IDF()) as searcher:
        results = searcher.search(parser.parse(query_text))
        for docnum, score in results.items():
            print(docnum, score)
        if list_hits:
            for hit in results:
                print(hit)
        else:
            print(results)


use_synonyms = True

search_term = "dre"

# Without synonym expansion the query is just the raw search term.
keywords = search_term

if use_synonyms:
    # Expand the query with the nearest word2vec neighbours, OR-ed together.
    similar_words = find_similar_words(search_term)
    keywords = " OR ".join([search_term] + similar_words)

    print("Results with Word2Vec:")
    print(f"Similar words used: {similar_words}")
    print_search_results(keywords)

# The plain search is always run so both result lists can be compared.
print("________________________\n")
print("Results without Word2Vec:")
print_search_results(search_term, list_hits=True)

Results with Word2Vec:
Similar words used: ['duine']
519 24.138011707535227
536 24.138011707535227
992 24.138011707535227
1679 24.138011707535227
2 20.302635421035188
6 20.302635421035188
17 20.302635421035188
215 20.302635421035188
255 20.302635421035188
278 20.302635421035188
<Top 10 Results for Or([Term('name', 'dre'), Term('longDescription', 'dre'), Term('shortDescription', 'dre'), Term('name', 'duin'), Term('longDescription', 'duin'), Term('shortDescription', 'duin')]) runtime=0.002712100000053397>
________________________

Results without Word2Vec:
519 24.138011707535227
536 24.138011707535227
992 24.138011707535227
1679 24.138011707535227
2 20.302635421035188
6 20.302635421035188
17 20.302635421035188
215 20.302635421035188
255 20.302635421035188
278 20.302635421035188
<Hit {'id': '519.0'}>
<Hit {'id': '536.0'}>
<Hit {'id': '992.0'}>
<Hit {'id': '1679.0'}>
<Hit {'id': '2.0'}>
<Hit {'id': '6.0'}>
<Hit {'id': '17.0'}>
<Hit {'id': '215.0'}>
<Hit {'id': '255.0'}>
<Hit {'id': '278.0'

In [27]:
# Stop words handed to the analyzer below (sorted for readability).
DEMO_STOP_WORDS = frozenset([
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can', 'for', 'from',
    'have', 'if', 'in', 'is', 'it', 'may', 'not', 'of', 'on', 'or', 'tbd',
    'that', 'the', 'this', 'to', 'us', 'we', 'when', 'will', 'with', 'yet',
    'you', 'your',
])

# Stemming analyzer with an explicit stop list and minsize=3, applied to a
# sample sentence to inspect which tokens it produces.
stem = StemmingAnalyzer(stoplist=DEMO_STOP_WORDS, minsize=3)
sample_sentence = "the summer dress for the winter"
output = [token.text for token in stem(sample_sentence)]
output

['summer', 'dress', 'winter']