# Imports

In [24]:
# index creation
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, DATETIME
from whoosh.index import create_in
from whoosh.analysis import StemmingAnalyzer, NgramFilter, StopFilter
from whoosh.qparser import MultifieldParser
from whoosh import scoring
import whoosh

# data import
import json

from gensim.models import KeyedVectors
import numpy as np

# preprocessing
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn

# Synonyms generation with a dictionary (WordNet)

In [122]:
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

def tag(sentence):
  words = word_tokenize(sentence)
  words = pos_tag(words)
  return words

def paraphraseable(tag):
 return tag.startswith('NN') or tag == 'VB' or tag.startswith('JJ')

def pos(tag):
 if tag.startswith('NN'):
  return wn.NOUN
 elif tag.startswith('V'):
  return wn.VERB

def synonyms(word, tag):
    lemma_lists = [ss.lemmas() for ss in wn.synsets(word, pos(tag))]
    lemmas = [lemma.name() for lemma in sum(lemma_lists, []) if lemma.name() != word]
    return set(lemmas)

def synonymIfExists(sentence):
 for (word, t) in tag(sentence):
   if paraphraseable(t):
    syns = synonyms(word, t)
    if syns:
     if len(syns) >= 1:
      yield [word, list(syns)[0]] # keep only one
      continue
   yield [word]

def get_synonyms(sentence):
  words = [word for word in synonymIfExists(sentence)]
  single_list = [item for sublist in words for item in sublist]
  str_list = ','.join(single_list)
  return str_list


print(get_synonyms("casual winter cloth"))
print(get_synonyms("fitting necktie for a casual meeting"))
print(get_synonyms("summer T-shirt"))
print(get_synonyms("elastic trouser"))

casual,occasional,winter,wintertime,cloth,textile
fitting,necktie,tie,for,a,casual,occasional,meeting,coming_together
summer,summertime,T-shirt,jersey
elastic,pliant,trouser,pant


In [113]:
def preprocess_query(search_term):
        stem = StemmingAnalyzer(stoplist=frozenset([
            'and', 'is', 'it', 'an', 'as', 'at', 'have', 'in', 'yet', 'if',
            'from', 'for', 'when', 'by', 'to', 'you', 'be', 'we', 'that',
            'may', 'not', 'with', 'tbd', 'a', 'on', 'your', 'this', 'of', 'us',
            'will', 'can', 'the', 'or', 'are'
        ]),
                                minsize=3)
        return [token.text for token in stem(search_term)]

In [123]:
print(','.join(preprocess_query('fitting,necktie,tie,for,a,casual,occasional,meeting,coming_together')))
print(','.join(preprocess_query('elastic,pliant,trouser,pant')))

fitt,neckti,tie,casual,occasion,meet,coming_togeth
elast,pliant,trouser,pant


# Training word embeddings with product data (Tensorflow, Keras)

In [26]:
import io
import re
import string
import tensorflow as tf
import tqdm

from tensorflow.keras import Model
from tensorflow.keras.layers import Dot, Embedding, Flatten
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import numpy as np

In [27]:
sentence = "The wide road shimmered in the hot sun"
tokens = list(sentence.lower().split())
print(len(tokens))

8


In [28]:
vocab, index = {}, 1  # start indexing from 1
vocab['<pad>'] = 0  # add a padding token
for token in tokens:
  if token not in vocab:
    vocab[token] = index
    index += 1
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}


In [29]:
inverse_vocab = {index: token for token, index in vocab.items()}
print(inverse_vocab)

{0: '<pad>', 1: 'the', 2: 'wide', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}


In [30]:
example_sequence = [vocab[word] for word in tokens]
print(example_sequence)

[1, 2, 3, 4, 5, 1, 6, 7]


In [31]:
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)
print(len(positive_skip_grams))

26


In [32]:
for target, context in positive_skip_grams[:5]:
  print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

(3, 1): (road, the)
(1, 4): (the, shimmered)
(7, 1): (sun, the)
(4, 5): (shimmered, in)
(2, 4): (wide, shimmered)


In [33]:
# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]

# Set the number of negative samples per positive context.
num_ns = 4

context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # class that should be sampled as 'positive'
    num_true=1,  # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # pick index of the samples from [0, vocab_size]
    seed=4,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])

tf.Tensor([3 7 0 4], shape=(4,), dtype=int64)
['road', 'sun', '<pad>', 'shimmered']


In [34]:
# Add a dimension so you can use concatenation (on the next step).
negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)

# Concat positive context word with negative sampled words.
context = tf.concat([context_class, negative_sampling_candidates], 0)

# Label first context word as 1 (positive) followed by num_ns 0s (negative).
label = tf.constant([1] + [0]*num_ns, dtype="int64")

# Reshape target to shape (1,) and context and label to (num_ns+1,).
target = tf.squeeze(target_word)
context = tf.squeeze(context)
label = tf.squeeze(label)

In [35]:
print(f"target_index    : {target}")
print(f"target_word     : {inverse_vocab[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {label}")

print("target  :", target)
print("context :", context)
print("label   :", label)

target_index    : 3
target_word     : road
context_indices : [1 3 7 0 4]
context_words   : ['the', 'road', 'sun', '<pad>', 'shimmered']
label           : [1 0 0 0 0]
target  : tf.Tensor(3, shape=(), dtype=int32)
context : tf.Tensor([1 3 7 0 4], shape=(5,), dtype=int64)
label   : tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)


## Preprocessing in one function

In [36]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for vocab_size tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      negative_sampling_candidates = tf.expand_dims(
          negative_sampling_candidates, 1)

      context = tf.concat([context_class, negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels


In [37]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [38]:
with open(path_to_file) as f: 
  lines = f.read().splitlines()
for line in lines[:20]:
  print(line)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.


In [39]:
text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x), bool))

In [40]:
# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')


# Define the vocabulary size and number of words in a sequence.
vocab_size = 4096
sequence_length = 10

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Set output_sequence_length length to pad all samples to same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [41]:
vectorize_layer.adapt(text_ds.batch(1024))

In [42]:
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'the', 'and', 'to', 'i', 'of', 'you', 'my', 'a', 'that', 'in', 'is', 'not', 'for', 'with', 'me', 'it', 'be', 'your']


In [43]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(tf.data.AUTOTUNE).map(vectorize_layer).unbatch()

In [44]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

32777


In [45]:
for seq in sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[138  36 982 144 673 125  16 106   0   0] => ['before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '', '']
[34  0  0  0  0  0  0  0  0  0] => ['all', '', '', '', '', '', '', '', '', '']
[106 106   0   0   0   0   0   0   0   0] => ['speak', 'speak', '', '', '', '', '', '', '', '']
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']


In [46]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=4)
print(len(targets), len(contexts), len(labels))

100%|██████████| 32777/32777 [00:07<00:00, 4498.63it/s]

65011 65011 65011





In [47]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset shapes: (((1024,), (1024, 5, 1)), (1024, 5)), types: ((tf.int32, tf.int64), tf.int64)>


In [48]:
dataset = dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
print(dataset)

<PrefetchDataset shapes: (((1024,), (1024, 5, 1)), (1024, 5)), types: ((tf.int32, tf.int64), tf.int64)>


In [53]:
class Word2Vec(Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=num_ns+1)
    self.dots = Dot(axes=(3, 2))
    self.flatten = Flatten()

  def call(self, pair):
    target, context = pair
    word_emb = self.target_embedding(target)
    context_emb = self.context_embedding(context)
    dots = self.dots([context_emb, word_emb])
    return self.flatten(dots)

In [54]:
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

In [55]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [56]:
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x28516d8eca0>

In [57]:
%tensorboard --logdir logs

UsageError: Line magic function `%tensorboard` not found.


In [58]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [59]:
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip <PAD>.
  vec = weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

In [60]:
index2word = dict(enumerate(list(inverse_vocab)))
word2index = {v: k for k, v in index2word.items()}

text_vector_ds

<_UnbatchDataset shapes: (10,), types: tf.int64>

In [61]:
def find_closest(embeds, word, n=1): # n is for "n closest words"
  n = n + 1 # This is becuse the most similar word is definatly that word itself. like the most similar word for "apple" is "apple". so we should look for top n+1 words
  main_vec = embeds(word2index[word])

  similarities = -tf.keras.losses.cosine_similarity(embeds.embeddings, main_vec)
  top_n = tf.math.top_k(similarities, n).indices
  words = [index2word[i] for i in top_n.numpy()]

  return words[1:], tf.math.top_k(similarities, 3)[0][1]

In [67]:
target = word2vec.target_embedding
res, sim = find_closest(target, 'summer')
print('closest: ', res)
print('top similarity: ', sim.numpy())

closest:  ['rebellion']
top similarity:  0.5394914


In [63]:
word2vec.summary()

Model: "word2_vec"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
w2v_embedding (Embedding)    multiple                  524288    
_________________________________________________________________
embedding (Embedding)        multiple                  524288    
_________________________________________________________________
dot (Dot)                    multiple                  0         
_________________________________________________________________
flatten (Flatten)            multiple                  0         
Total params: 1,048,576
Trainable params: 1,048,576
Non-trainable params: 0
_________________________________________________________________


# Import pretrained word embeddings (Google's Word2Vec)

In [68]:
word2vec_model_path = 'E:/Users/Lucas xD/Downloads/GoogleNews-vectors-negative300.bin'

In [69]:
try: # nich mehrmals in Speicher laden... sind 3gb
    model
except NameError:
    model = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)

In [81]:
model.most_similar('HUGO')

[('MARGO', 0.5448382496833801),
 ('ESPINOSA', 0.540989875793457),
 ('ALFREDO', 0.5401614904403687),
 ('FELIX', 0.5358700156211853),
 ('JEFFREYS', 0.5335772037506104),
 ('PABLO', 0.5334649682044983),
 ('ALBERTO', 0.5314393043518066),
 ('LINDGREN', 0.5300692915916443),
 ('CLARA', 0.5292963981628418),
 ('RICARDO', 0.5291734337806702)]

# Create Schema with analyzers & filters (Whoosh)

In [70]:
dataset_path = "E:/Users/Lucas xD/Downloads/Products_Q_US_edited.json"
index_path = "./hb_index"

json_data = json.load(open(dataset_path,'r'))

In [71]:
schema = Schema(brand=TEXT(),
                colors=TEXT(),
                gender=TEXT(),
                longDescription=TEXT(analyzer=StemmingAnalyzer() | NgramFilter(minsize=2, maxsize=4) | StopFilter()),
                name=TEXT(analyzer=StemmingAnalyzer() | NgramFilter(minsize=2, maxsize=4) | StopFilter(), field_boost=3.0),
                productId=TEXT(),
                shortDescription=TEXT(analyzer=StemmingAnalyzer() | NgramFilter(minsize=2, maxsize=4) | StopFilter()),
                sizes=TEXT(),
                styleName=TEXT(),
                variants=TEXT(),
                image=TEXT(),
                id=ID(stored=True)
                )

# Create the index

In [72]:
create_new_index = True
if(create_new_index):
    index = create_in(index_path, schema)
else:    
    index = whoosh.index.open_dir(index_path)

writer = index.writer()

In [73]:
for doc in json_data:
    writer.add_document(
            brand=str(doc['brand']),
            colors=str(doc['colors']),
            gender=str(doc['gender']),
            longDescription=str(doc['longDescription']),
            name=str(doc['name']),
            productId=str(doc['productId']),
            shortDescription=str(doc['shortDescription']),
            sizes=str(doc['sizes']),
            styleName=str(doc['styleName']),
            variants=str(doc['variants']),
            image=str(doc['image']),
            id=str(doc['id'])
    )
         
writer.commit()

# Testing the index

In [74]:

use_synonyms = True

search_term = "dress"

if(use_synonyms):
    try:
        similarity_list = model.most_similar(search_term, topn=1)
        similar_words = [sim_tuple[0] for sim_tuple in similarity_list]
    except KeyError:
        similar_words = []
    keywords = " OR ".join([search_term] + similar_words)

    print("Results with Word2Vec:")
    print(f"Similar words used: {similar_words}")
    with index.searcher(weighting=scoring.TF_IDF()) as searcher:
        query = MultifieldParser(["name", "longDescription", 'shortDescription'], index.schema).parse(keywords)
        results = searcher.search(query)
        for docnum, score in results.items():
            print(docnum, score)
        print(results)
else:
    keywords  = search_term

# results = []



print("________________________\n")
print("Results without Word2Vec:")
with index.searcher(weighting=scoring.TF_IDF()) as searcher:
    query = MultifieldParser(["name", "longDescription", "shortDescription"], index.schema).parse(search_term)
    results = searcher.search(query)
    for docnum, score in results.items():
        print(docnum, score)
    for doc in results:
        print(doc)

Results with Word2Vec:
Similar words used: ['dresses']
519 46.841051088653984
536 46.841051088653984
992 46.841051088653984
1679 46.841051088653984
2 39.648702347024916
6 39.648702347024916
17 39.648702347024916
215 39.648702347024916
255 39.648702347024916
278 39.648702347024916
<Top 10 Results for Or([And([Term('name', 'dres'), Term('name', 'ress')]), And([Term('longDescription', 'dres'), Term('longDescription', 'ress')]), And([Term('shortDescription', 'dres'), Term('shortDescription', 'ress')])]) runtime=0.012073500001861248>
________________________

Results without Word2Vec:
519 46.841051088653984
536 46.841051088653984
992 46.841051088653984
1679 46.841051088653984
2 39.648702347024916
6 39.648702347024916
17 39.648702347024916
215 39.648702347024916
255 39.648702347024916
278 39.648702347024916
<Hit {'id': '519.0'}>
<Hit {'id': '536.0'}>
<Hit {'id': '992.0'}>
<Hit {'id': '1679.0'}>
<Hit {'id': '2.0'}>
<Hit {'id': '6.0'}>
<Hit {'id': '17.0'}>
<Hit {'id': '215.0'}>
<Hit {'id': '25