In [1]:
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, DATETIME
from whoosh.index import create_in
from whoosh.analysis import StemmingAnalyzer, NgramFilter, StopFilter
from whoosh.qparser import MultifieldParser
from whoosh import scoring
import whoosh
import json

from gensim.models import KeyedVectors
import numpy as np

In [2]:
dataset_path = "E:/Users/Lucas xD/Downloads/Products_Q_US_edited.json"
index_path = "./hb_index"
word2vec_model_path = 'E:/Users/Lucas xD/Downloads/GoogleNews-vectors-negative300.bin'

In [3]:
try: # nich mehrmals in Speicher laden... sind 3gb
    model
except NameError:
    model = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)

In [3]:
json_data = json.load(open(dataset_path,'r'))


In [4]:
schema = Schema(brand=TEXT(),
                colors=TEXT(),
                gender=TEXT(),
                longDescription=TEXT(analyzer=StemmingAnalyzer() | NgramFilter(minsize=2, maxsize=4) | StopFilter()),
                name=TEXT(analyzer=StemmingAnalyzer() | NgramFilter(minsize=2, maxsize=4) | StopFilter(), field_boost=3.0),
                productId=TEXT(),
                shortDescription=TEXT(analyzer=StemmingAnalyzer() | NgramFilter(minsize=2, maxsize=4) | StopFilter()),
                sizes=TEXT(),
                styleName=TEXT(),
                variants=TEXT(),
                image=TEXT(),
                id=ID(stored=True)
                )

In [13]:
create_new_index = True
if(create_new_index):
    index = create_in(index_path, schema)
else:    
    index = whoosh.index.open_dir(index_path)

writer = index.writer()

In [14]:
for doc in json_data:
    writer.add_document(
            brand=str(doc['brand']),
            colors=str(doc['colors']),
            gender=str(doc['gender']),
            longDescription=str(doc['longDescription']),
            name=str(doc['name']),
            productId=str(doc['productId']),
            shortDescription=str(doc['shortDescription']),
            sizes=str(doc['sizes']),
            styleName=str(doc['styleName']),
            variants=str(doc['variants']),
            image=str(doc['image']),
            id=str(doc['id'])
    )
         
writer.commit()

In [17]:

use_synonyms = True

search_term = "dre"

if(use_synonyms):
    try:
        similarity_list = model.most_similar(search_term, topn=1)
        similar_words = [sim_tuple[0] for sim_tuple in similarity_list]
    except KeyError:
        similar_words = []
    keywords = " OR ".join([search_term] + similar_words)

    print("Results with Word2Vec:")
    print(f"Similar words used: {similar_words}")
    with index.searcher(weighting=scoring.TF_IDF()) as searcher:
        query = MultifieldParser(["name", "longDescription", 'shortDescription'], index.schema).parse(keywords)
        results = searcher.search(query)
        for docnum, score in results.items():
            print(docnum, score)
        print(results)
else:
    keywords  = search_term

# results = []



print("________________________\n")
print("Results without Word2Vec:")
with index.searcher(weighting=scoring.TF_IDF()) as searcher:
    query = MultifieldParser(["name", "longDescription", "shortDescription"], index.schema).parse(search_term)
    results = searcher.search(query)
    for docnum, score in results.items():
        print(docnum, score)
    for doc in results:
        print(doc)

Results with Word2Vec:
Similar words used: ['duine']
519 24.138011707535227
536 24.138011707535227
992 24.138011707535227
1679 24.138011707535227
2 20.302635421035188
6 20.302635421035188
17 20.302635421035188
215 20.302635421035188
255 20.302635421035188
278 20.302635421035188
<Top 10 Results for Or([Term('name', 'dre'), Term('longDescription', 'dre'), Term('shortDescription', 'dre'), Term('name', 'duin'), Term('longDescription', 'duin'), Term('shortDescription', 'duin')]) runtime=0.002712100000053397>
________________________

Results without Word2Vec:
519 24.138011707535227
536 24.138011707535227
992 24.138011707535227
1679 24.138011707535227
2 20.302635421035188
6 20.302635421035188
17 20.302635421035188
215 20.302635421035188
255 20.302635421035188
278 20.302635421035188
<Hit {'id': '519.0'}>
<Hit {'id': '536.0'}>
<Hit {'id': '992.0'}>
<Hit {'id': '1679.0'}>
<Hit {'id': '2.0'}>
<Hit {'id': '6.0'}>
<Hit {'id': '17.0'}>
<Hit {'id': '215.0'}>
<Hit {'id': '255.0'}>
<Hit {'id': '278.0'

In [7]:
def preprocess_query(search_term):
        stem = StemmingAnalyzer(stoplist=frozenset([
            'and', 'is', 'it', 'an', 'as', 'at', 'have', 'in', 'yet', 'if',
            'from', 'for', 'when', 'by', 'to', 'you', 'be', 'we', 'that',
            'may', 'not', 'with', 'tbd', 'a', 'on', 'your', 'this', 'of', 'us',
            'will', 'can', 'the', 'or', 'are'
        ]),
                                minsize=3)
        return [token.text for token in stem(search_term)]

In [32]:
from whoosh import query
from whoosh.lang.wordnet import Thesaurus
import numpy as np


query = preprocess_query('summer parfume')


sny_file = open("E:/Users/Lucas xD/Downloads/prolog/wn_s.pl")
tesaurus = Thesaurus.from_file(sny_file)
synonyms_arr = []
for token in query:
    all_synonyms = tesaurus.synonyms(token)
    if len(all_synonyms) > 0:
        synonyms_arr.append(all_synonyms[0])
        
    print(synonyms_arr, token)
    # if len(synonyms_arr) > 0: 
        # synonyms_arr.append(synonyms_arr) # only keep best match
        # synonym_string = synonym_string = " OR ".join(synonyms_arr)
    # synonyms_arr.append(token)
    # print('1', synonyms_arr)
    # synonyms_arr.append(" OR ".join(synonyms_arr))
    # print('2', synonyms_arr)


# synonym_string = " ".join(synonyms_arr)
print(query)
fullQuery = np.concatenate((query, synonyms_arr))
synonym_string = " OR ".join(fullQuery)
# synonym_string = " OR ".join(synonyms_arr)
synonym_string

['summertime'] summer
['summertime'] parfum
['summer', 'parfum']


'summer OR parfum OR summertime'

In [52]:
from whoosh import query
from nltk.corpus import wordnet
import numpy as np

import nltk
nltk.download('wordnet')

query = preprocess_query('casual fit')

def get_synonyms(word):
    synonyms = []
    synsets = wordnet.synsets(word)
    if (len(synsets) == 0):
        return []
    synset = synsets[0]
    lemma_names = synset.lemma_names()
    for lemma_name in lemma_names:
        lemma_name = lemma_name.lower().replace('_', ' ')
        if (lemma_name != word and lemma_name not in synonyms):
            synonyms.append(lemma_name)
    return synonyms


synonyms_arr = []
for token in query:
    synonyms = get_synonyms(token)
    if len(synonyms) > 0: synonyms_arr.append(synonyms[0]) # only keep best match


# synonym_string = " ".join(synonyms_arr)

fullQuery = np.concatenate((query, synonyms_arr))
synonym_string = " OR ".join(fullQuery)
# synonym_string = " OR ".join(synonyms_arr)
synonym_string

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'casual OR fit OR insouciant OR tantrum'

In [70]:
synsets1 = wordnet.synsets('casual')[0]
synsets2 = wordnet.synsets('insouciant')[0]

print(synsets1.wup_similarity(synsets2))
print(synsets1.path_similarity(synsets2))
print(synsets1.lch_similarity(synsets2))

1.0
1.0
0.6931471805599453


In [130]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

def tag(sentence):
 words = word_tokenize(sentence)
 words = pos_tag(words)
 return words

def paraphraseable(tag):
 return tag.startswith('NN') or tag == 'VB' or tag.startswith('JJ')

def pos(tag):
 if tag.startswith('NN'):
  return wn.NOUN
 elif tag.startswith('V'):
  return wn.VERB

def synonyms(word, tag):
    lemma_lists = [ss.lemmas() for ss in wn.synsets(word, pos(tag))]
    lemmas = [lemma.name() for lemma in sum(lemma_lists, []) if lemma.name() != word]
    return set(lemmas)

def synonymIfExists(sentence):
 for (word, t) in tag(sentence):
   if paraphraseable(t):
    syns = synonyms(word, t)
    if syns:
     if len(syns) >= 1:
      yield [word, list(syns)[0]] # keep only one
      continue
   yield [word, []]

def get_synonyms(sentence):
 return [word for word in synonymIfExists(sentence)]


print(get_synonyms("casual winter cloth"))
print(get_synonyms("business necktie"))
print(get_synonyms("summer T-shirt"))
print(get_synonyms("blue vest"))


[['casual', 'nonchalant'], ['winter', 'wintertime'], ['cloth', 'fabric']]
[['business', 'occupation'], ['necktie', 'tie']]
[['summer', 'summertime'], ['T-shirt', 'tee_shirt']]
[['blue', 'dispirited'], ['vest', 'singlet']]


In [102]:
"wintertime" == "winter"

False