# Gensim - Easy

In [3]:
# Import a test data set provided in gensim to train a model.

from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# Build the model by selecting the parameters.
our_model=Word2Vec(sentences=common_texts, vector_size=10, window=5, min_count=1, workers=4)

# Save the model to disk.
our_model.save("tempmodel.w2v")

# Inspect the model by looking for the most similar words for a test word.
print(our_model.wv.most_similar('computer',topn=5))

# Let us see what the 10-dimensional vector for 'computer' looks like.
# NOTE(review): vectors are read through `.wv`, as in the most_similar call
# above; indexing the model object directly is presumably unsupported in the
# gensim version used here — TODO confirm.
#print(our_model.wv['computer'])

[('eps', 0.2914133071899414), ('trees', 0.05541801080107689), ('minors', 0.04264770820736885), ('survey', -0.02176349051296711), ('interface', -0.1523357331752777)]


# Tensorflow, Keras - Advanced

In [4]:
import io
import re
import string
import tensorflow as tf
import tqdm

from tensorflow.keras import Model
from tensorflow.keras.layers import Dot, Embedding, Flatten
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import numpy as np

import matplotlib
import matplotlib.pyplot as plt

In [5]:
# Load the TensorBoard notebook extension
%load_ext tensorboard


In [6]:
sentence = "The wide road shimmered in the hot sun"
tokens = list(sentence.lower().split())
print(len(tokens))


8


In [7]:
vocab, index = {}, 1  # start indexing from 1
vocab['<pad>'] = 0  # add a padding token
for token in tokens:
  if token not in vocab:
    vocab[token] = index
    index += 1
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}


In [8]:
inverse_vocab = {index: token for token, index in vocab.items()}
print(inverse_vocab)

{0: '<pad>', 1: 'the', 2: 'wide', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}


In [9]:
example_sequence = [vocab[word] for word in tokens]
print(example_sequence)


[1, 2, 3, 4, 5, 1, 6, 7]


In [10]:
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)
print(len(positive_skip_grams))


26


In [11]:
for target, context in positive_skip_grams[:5]:
  print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")


(2, 3): (wide, road)
(6, 5): (hot, in)
(6, 7): (hot, sun)
(5, 3): (in, road)
(7, 6): (sun, hot)


In [12]:
# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]

# Set the number of negative samples per positive context.
num_ns = 4

# The sampler expects the true class as a (1, 1) int64 tensor.
context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
# NOTE(review): the sampler does not appear to exclude the true class — the
# captured output of this cell contains the positive context index among the
# sampled "negatives". Confirm whether that is acceptable for training.
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # class that should be sampled as 'positive'
    num_true=1,  # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # sample indices from the half-open range [0, vocab_size)
    seed=4,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])


tf.Tensor([3 7 0 4], shape=(4,), dtype=int64)
['road', 'sun', '<pad>', 'shimmered']


In [13]:
# Add a dimension so you can use concatenation (on the next step).
# NOTE: this rebinds `negative_sampling_candidates`, so re-running this cell
# without re-running the sampling cell adds yet another dimension.
negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)

# Concat positive context word with negative sampled words.
context = tf.concat([context_class, negative_sampling_candidates], 0)

# Label first context word as 1 (positive) followed by num_ns 0s (negative).
label = tf.constant([1] + [0]*num_ns, dtype="int64")

# Drop all size-1 dimensions: target becomes a scalar tensor (shape ()),
# while context and label end up with shape (num_ns+1,).
target = tf.squeeze(target_word)
context = tf.squeeze(context)
label = tf.squeeze(label)


In [14]:
print(f"target_index    : {target}")
print(f"target_word     : {inverse_vocab[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {label}")


target_index    : 2
target_word     : wide
context_indices : [3 3 7 0 4]
context_words   : ['road', 'road', 'sun', '<pad>', 'shimmered']
label           : [1 0 0 0 0]


In [15]:
print("target  :", target)
print("context :", context)
print("label   :", label)


target  : tf.Tensor(2, shape=(), dtype=int32)
context : tf.Tensor([3 3 7 0 4], shape=(5,), dtype=int64)
label   : tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)


In Keras gibt's ne Funktion die genau das alles tut:

In [16]:
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)


[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558 ]


In [17]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  """Build skip-gram training examples with negative sampling.

  For every sequence, positive (target, context) pairs are produced with
  `tf.keras.preprocessing.sequence.skipgrams`, using a sampling table built
  for `vocab_size` tokens. Each positive pair is then extended with `num_ns`
  candidates drawn by `tf.random.log_uniform_candidate_sampler`.

  Args:
    sequences: iterable of int-encoded sentences.
    window_size: skip-gram window size passed to `skipgrams`.
    num_ns: number of candidates sampled per positive pair.
    vocab_size: vocabulary size; also used as `range_max` of the sampler.
    seed: seed handed to the candidate sampler.

  Returns:
    Three parallel lists `(targets, contexts, labels)`. Each entry of
    `contexts` is a tensor holding the positive context followed by the
    sampled candidates (concatenated along axis 0); each entry of `labels`
    is the int64 tensor `[1, 0, ..., 0]` of length `num_ns + 1`.
  """
  targets, contexts, labels = [], [], []

  # One sampling table, shared by every sequence.
  table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  for encoded_sentence in tqdm.tqdm(sequences):
    # Only positive pairs are requested here; candidates are drawn below.
    pairs, _ = tf.keras.preprocessing.sequence.skipgrams(
        encoded_sentence,
        vocabulary_size=vocab_size,
        sampling_table=table,
        window_size=window_size,
        negative_samples=0)

    for target_word, context_word in pairs:
      # Positive context as a (1, 1) int64 tensor, as the sampler expects.
      positive = tf.expand_dims(tf.constant([context_word], dtype="int64"), 1)

      # First element of the sampler's result holds the sampled candidates.
      sampled = tf.random.log_uniform_candidate_sampler(
          true_classes=positive,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")[0]

      # Give the candidates a trailing axis so they stack under the positive.
      negatives = tf.expand_dims(sampled, 1)

      targets.append(target_word)
      contexts.append(tf.concat([positive, negatives], 0))
      labels.append(tf.constant([1] + [0]*num_ns, dtype="int64"))

  return targets, contexts, labels


## Erstellen eines Word-Embeddings-Models

### Preprocessing für Word Embeddings:

In [18]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')


In [19]:
# Read the downloaded corpus and preview its first 20 lines.
# The encoding is given explicitly so the result does not depend on the
# platform's default codec.
with open(path_to_file, encoding="utf-8") as f:
  lines = f.read().splitlines()
for line in lines[:20]:
  print(line)


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.


In [20]:
text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x), bool))


In [21]:
def custom_standardization(input_data):
  """Lowercase the input strings and delete every punctuation character.

  "Punctuation" means the characters listed in `string.punctuation`; they are
  removed (replaced by the empty string), not replaced by spaces.
  """
  # Regex character class matching any single punctuation character.
  punctuation_class = '[%s]' % re.escape(string.punctuation)
  lowered = tf.strings.lower(input_data)
  stripped = tf.strings.regex_replace(lowered, punctuation_class, '')
  return stripped


# Define the vocabulary size and number of words in a sequence.
vocab_size = 4096
sequence_length = 10

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Set output_sequence_length length to pad all samples to same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)


In [22]:
vectorize_layer.adapt(text_ds.batch(1024))

In [23]:
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])


['', '[UNK]', 'the', 'and', 'to', 'i', 'of', 'you', 'my', 'a', 'that', 'in', 'is', 'not', 'for', 'with', 'me', 'it', 'be', 'your']


In [24]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(tf.data.AUTOTUNE).map(vectorize_layer).unbatch()


In [25]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

32777


In [26]:
for seq in sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[138  36 982 144 673 125  16 106   0   0] => ['before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '', '']
[34  0  0  0  0  0  0  0  0  0] => ['all', '', '', '', '', '', '', '', '', '']
[106 106   0   0   0   0   0   0   0   0] => ['speak', 'speak', '', '', '', '', '', '', '', '']
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']


In [27]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=4)
print(len(targets), len(contexts), len(labels))


100%|██████████| 32777/32777 [00:08<00:00, 4057.85it/s]65292 65292 65292



In [28]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset shapes: (((1024,), (1024, 5, 1)), (1024, 5)), types: ((tf.int32, tf.int64), tf.int64)>


In [29]:
dataset = dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
print(dataset)

<PrefetchDataset shapes: (((1024,), (1024, 5, 1)), (1024, 5)), types: ((tf.int32, tf.int64), tf.int64)>


In [30]:
class Word2Vec(Model):
  """Skip-gram model scoring a target word against candidate context words.

  Two separate embedding tables are kept: one for target words (layer name
  "w2v_embedding") and one for context words. The output is the flattened dot
  product between each context embedding and the target embedding.
  """

  def __init__(self, vocab_size, embedding_dim):
    """Create the two embedding tables plus the dot/flatten layers.

    Note: `num_ns` is not a parameter; it is read from the surrounding
    (notebook) scope at construction time and must already be defined.
    """
    super().__init__()
    self.target_embedding = Embedding(
        vocab_size, embedding_dim, input_length=1, name="w2v_embedding")
    self.context_embedding = Embedding(
        vocab_size, embedding_dim, input_length=num_ns+1)
    self.dots = Dot(axes=(3, 2))
    self.flatten = Flatten()

  def call(self, pair):
    """Return the flattened context/target dot products for `pair`.

    `pair` is unpacked as `(target, context)`.
    """
    target, context = pair
    target_vectors = self.target_embedding(target)
    context_vectors = self.context_embedding(context)
    scores = self.dots([context_vectors, target_vectors])
    return self.flatten(scores)

In [31]:
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])


In [32]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")


In [33]:
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x21f0aff9580>

In [34]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 2232), started 19:56:47 ago. (Use '!kill 2232' to kill it.)

In [35]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()


In [36]:
# Write the learned vectors and their words to two parallel TSV files
# (one line per vocabulary entry, same order in both files).
# The context managers guarantee that both files are flushed and closed even
# if a write fails part-way through.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip <PAD>.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")


In [37]:
index2word = dict(enumerate(list(inverse_vocab)))
word2index = {v: k for k, v in index2word.items()}

text_vector_ds

<_UnbatchDataset shapes: (10,), types: tf.int64>

In [38]:
def euclidean_dist(vec1, vec2):
    """Return the Euclidean (L2) distance between `vec1` and `vec2`."""
    delta = vec1 - vec2
    sum_of_squares = np.sum(delta ** 2)
    return np.sqrt(sum_of_squares)

def find_closest_neighbor(word_index, vectors):
    """Return the word whose vector is nearest (Euclidean) to a query word.

    Args:
        word_index: row index of the query word in `vectors`.
        vectors: 2-D array-like with one embedding vector per row.

    Returns:
        `index2word[i]` for the row `i` with the smallest Euclidean distance
        to the query row. Rows whose vector is element-wise identical to the
        query vector (including the query itself) are never returned. On ties
        the lowest row index wins.

    Raises:
        KeyError: if no row differs from the query vector, or if the winning
            index is missing from the notebook-level `index2word` mapping.
    """
    vectors = np.asarray(vectors)
    query_vector = vectors[word_index]

    # All distances in one vectorised pass instead of two calls per row.
    distances = np.sqrt(np.sum((vectors - query_vector) ** 2, axis=1))

    # Use a real infinity as the "not a candidate" marker. A finite sentinel
    # silently rejects every neighbour that lies further away than it.
    excluded = np.all(vectors == query_vector, axis=1) | np.isnan(distances)
    distances = np.where(excluded, np.inf, distances)

    min_index = int(np.argmin(distances))  # first minimum, i.e. lowest index
    if not np.isfinite(distances[min_index]):
        raise KeyError("no vector differs from the query vector")
    return index2word[min_index]

In [39]:
find_closest_neighbor(word2index['king'], word2vec.get_layer('w2v_embedding').get_weights()[0])


'xi'

In [40]:
def find_closest(embeds, word, n=1): # n is for "n closest words"
  """Return the `n` words most cosine-similar to `word`, plus the top score.

  Args:
    embeds: embedding layer; it is called with the word's index and its
      `.embeddings` table is compared against the resulting vector.
    word: query word; looked up in the notebook-level `word2index` mapping.
    n: number of neighbours to return.

  Returns:
    A tuple `(words, similarity)`: the `n` neighbour words (the best hit is
    dropped because it is expected to be the query word itself) and the
    cosine similarity of the second-best hit, i.e. of the closest neighbour.
  """
  # The most similar word is the query word itself, so ask for one extra hit.
  k = n + 1
  main_vec = embeds(word2index[word])

  # Keras' cosine_similarity is a loss (negated similarity); flip the sign.
  similarities = -tf.keras.losses.cosine_similarity(embeds.embeddings, main_vec)

  # A single top-k query serves both results. At least two hits are needed so
  # that the second-best score exists even when n == 0.
  top = tf.math.top_k(similarities, max(k, 2))
  words = [index2word[i] for i in top.indices.numpy()[:k]]

  return words[1:], top.values[1]

In [41]:
target = word2vec.target_embedding
res, sim = find_closest(target, 'country')
print('closest: ', res)
print('top similarity: ', sim.numpy())

closest:  ['measure']
top similarity:  0.51464057


In [42]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
print("length of all word2vec weights: (dimension) ",weights.shape, type(weights),"\n",weights)

length of all word2vec weights: (dimension)  (4096, 128) <class 'numpy.ndarray'> 
 [[-0.03170488  0.04458102  0.02082629 ... -0.01657663  0.0432646
   0.03816791]
 [-0.24896912  0.25002706  0.25575274 ...  0.06890602 -0.03906033
  -0.00669807]
 [ 0.08956954 -0.13569097  0.46888766 ... -0.06426004  0.01015592
  -0.17689146]
 ...
 [ 0.09986273  0.04257649 -0.04497757 ...  0.09604951 -0.06974712
   0.02613098]
 [-0.30789903  0.00470538  0.20721006 ... -0.08699082  0.10174327
   0.14791062]
 [-0.06157974  0.05344606 -0.11591852 ...  0.2676837  -0.21459925
  -0.15642247]]


In [43]:
# Fetch the context-embedding table through the model attribute rather than
# by layer name. That layer is created without an explicit name, so Keras
# assigns an automatic one ('embedding', 'embedding_1', ...), and a lookup by
# the literal 'embedding' breaks as soon as the model class is instantiated
# a second time in the same session.
layer2 = word2vec.context_embedding.get_weights()[0]
print("layer2 weights: (dimension) ",layer2.shape, type(layer2),"\n",layer2)

vocab = vectorize_layer.get_vocabulary()
print("vocabulary: (length/words)",len(vocab))

layer2 weights: (dimension)  (4096, 128) <class 'numpy.ndarray'> 
 [[ 0.6983603  -0.7845663  -0.7150691  ...  0.70170385 -0.6963884
   0.6602721 ]
 [ 0.3122846   0.0951447   0.21395807 ... -0.48337275  0.5315396
  -0.3748534 ]
 [ 0.25121936  0.2995198   0.1888315  ...  0.76729554 -0.5334912
   0.45754376]
 ...
 [ 0.22649363 -0.21330819  0.12587321 ... -0.23354936  0.06174172
  -0.13220982]
 [-0.06439     0.00131847 -0.08911397 ...  0.30978835 -0.34610048
  -0.01553277]
 [-0.05639622 -0.11941099 -0.07163206 ...  0.20494175 -0.17191157
  -0.02553475]]
vocabulary: (length/words) 4096


In [44]:
word2vec.summary()

Model: "word2_vec"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
w2v_embedding (Embedding)    multiple                  524288    
_________________________________________________________________
embedding (Embedding)        multiple                  524288    
_________________________________________________________________
dot (Dot)                    multiple                  0         
_________________________________________________________________
flatten (Flatten)            multiple                  0         
Total params: 1,048,576
Trainable params: 1,048,576
Non-trainable params: 0
_________________________________________________________________


In [45]:
#from keras.layers import dot
#similarity = dot([target, context], axes=1, normalize=True)

TypeError: 'NoneType' object is not subscriptable

### Pretrained vs Custom

In [None]:
# from gensim.models import Word2Vec
# # from gensim.test.utils import datapath
# from scipy.linalg import orthogonal_procrustes
# from sklearn.metrics.pairwise import cosine_similarity

# # Corpus file path
# file_path = "" # .txt path

# model = Word2Vec(corpus_file=file_path,
#                  min_count=100,
#                  window=5,
#                  size=300,
#                  sample=1e-5,
#                  negative=15,
#                  alpha=0.025,
#                  ns_exponent=0.75,
#                  workers=8,
#                  sg=1)

# model2 = Word2Vec(corpus_file=file_path,
#                  min_count=100,
#                  window=15,
#                  size=300,
#                  sample=1e-8,
#                  negative=5,
#                  alpha=0.5,
#                  ns_exponent=1,
#                  workers=8,
#                  sg=1)

# # Get the vocab for each model
# vocab_model = set(model.wv.vocab.keys())
# vocab_model2 = set(model2.wv.vocab.keys())


# # Find the common vocabulary
# common_vocab = list(vocab_model2 & vocab_model)

# # Make the orthogonal_procrustes alignment
# model_matrix = model.wv.__getitem__(common_vocab)
# model2_matrix = model2.wv.__getitem__(common_vocab)
# M, _ = orthogonal_procrustes(model2_matrix, model_matrix)
# model2_matrix_aligned = model2.wv.__getitem__(common_vocab).dot(M)

# # Compute the cosine
# cos_mat = cosine_similarity(model_matrix, model2_matrix_aligned)
# vec_cosine = cos_mat.diagonal()
# mean_cosine = vec_cosine.mean()
# sd_cosine = vec_cosine.std()

# print("Mean cosine = {0}, Std cosine = {1}".format(mean_cosine, sd_cosine))

# Versuch n°2 mit Gensim

In [46]:
import re  # preprocessing
import spacy  # preprocessing
import pandas as pd  # data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency


import json # For import training data

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [47]:
# Load the product catalogue (JSON) used as the training corpus below.
# NOTE(review): this is an absolute, user-specific path, so the cell only
# runs on the machine where this exact file exists. Consider resolving it
# relative to a configurable data directory.
with open(r"E:\Users\Lucas xD\Downloads\Products_Q_US_edited.json", encoding="utf8") as json_file:
    data = json.load(json_file)

In [48]:
df = pd.json_normalize(data)
df.head()

Unnamed: 0,brand,colors,gender,longDescription,name,productId,shortDescription,sizes,styleName,variants,image,id
0,HUGO,"[{'code': '001', 'name': 'Black', 'link': 'htt...",Men,V-neck Solid Small embossed tonal logo detail ...,Stretch Cotton V-Neck T-Shirt | Dredosos,hbna50261022,This t-shirt by HUGO is crafted from cotton wi...,"[XS, L, M, S, XL, XXL]",,"[{'careInstructionCodes': ['000087', 'B1', 'C0...",https://images.hugoboss.com/is/image/boss/hbna...,0.0
1,BOSS,"[{'code': '001', 'name': 'Black', 'link': 'htt...",Men,Cow skin Basket weave printed texture Tonal sq...,Leather belt with embossed detail,hbna50262032,Upgrade your everyday collection with this tim...,"[34, 38, 40, 42, 44, 30, 32, 36]",,"[{'careInstructionCodes': [], 'colorCode': '00...",https://images.hugoboss.com/is/image/boss/hbna...,1.0
2,BOSS,"[{'code': '001', 'name': 'Black', 'link': 'htt...",Men,Lace-up Derby Italian leather upper Leather li...,Italian Leather Derby Dress Shoe | Prindo,hbna50263062,Crafted from fine Italian calfskin with a prin...,"[6.5, 9, 11, 11.5, 12, 7, 7.5, 8, 8.5, 9.5, 10...",DressShoes,"[{'careInstructionCodes': [], 'colorCode': '00...",https://images.hugoboss.com/is/image/boss/hbna...,2.0
3,BOSS,"[{'code': '001', 'name': 'Black', 'link': 'htt...",Men,A regular fit tuxedo in virgin wool. <b>Jacke...,"Virgin Wool Tuxedo, Regular Fit | Stars/Glamour",hbna50194045,This regular fit tuxedo by BOSS is crafted in ...,"[34R, 42L, 44L, 46L, 48L, 36R, 36S, 38R, 38S, ...",Tuxedos,"[{'careInstructionCodes': ['00', 'B1', 'C0', '...",https://images.hugoboss.com/is/image/boss/hbna...,3.0
4,BOSS,"[{'code': '410', 'name': 'Dark Blue', 'link': ...",Men,A slim-fit suit made from Italian Super 110s v...,"Italian Virgin Wool Suit, Slim Fit | Huge/Genius",hbna50263213,Our best-selling suit just got better with an ...,"[44S, 48L, 42L, 44L, 34R, 36R, 38R, 40R, 40S, ...",Professional,"[{'careInstructionCodes': ['00', 'B1', 'C0', '...",https://images.hugoboss.com/is/image/boss/hbna...,4.0


In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2777 entries, 0 to 2776
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   brand             2777 non-null   object 
 1   colors            2777 non-null   object 
 2   gender            2777 non-null   object 
 3   longDescription   2777 non-null   object 
 4   name              2777 non-null   object 
 5   productId         2777 non-null   object 
 6   shortDescription  2777 non-null   object 
 7   sizes             2777 non-null   object 
 8   styleName         2777 non-null   object 
 9   variants          2777 non-null   object 
 10  image             2777 non-null   object 
 11  id                2777 non-null   float64
dtypes: float64(1), object(11)
memory usage: 260.5+ KB


In [50]:
df = df.drop(columns=['brand', 'colors', 'gender', 'productId', 'sizes', 'styleName', 'variants', 'image', 'id'])
df.head()

Unnamed: 0,longDescription,name,shortDescription
0,V-neck Solid Small embossed tonal logo detail ...,Stretch Cotton V-Neck T-Shirt | Dredosos,This t-shirt by HUGO is crafted from cotton wi...
1,Cow skin Basket weave printed texture Tonal sq...,Leather belt with embossed detail,Upgrade your everyday collection with this tim...
2,Lace-up Derby Italian leather upper Leather li...,Italian Leather Derby Dress Shoe | Prindo,Crafted from fine Italian calfskin with a prin...
3,A regular fit tuxedo in virgin wool. <b>Jacke...,"Virgin Wool Tuxedo, Regular Fit | Stars/Glamour",This regular fit tuxedo by BOSS is crafted in ...
4,A slim-fit suit made from Italian Super 110s v...,"Italian Virgin Wool Suit, Slim Fit | Huge/Genius",Our best-selling suit just got better with an ...


In [51]:
df = df.stack().reset_index() # all in one column


In [52]:
df = df.drop(columns=['level_0', 'level_1'])
df.columns = ['sentences']
df.head()

Unnamed: 0,sentences
0,V-neck Solid Small embossed tonal logo detail ...
1,Stretch Cotton V-Neck T-Shirt | Dredosos
2,This t-shirt by HUGO is crafted from cotton wi...
3,Cow skin Basket weave printed texture Tonal sq...
4,Leather belt with embossed detail


In [53]:
nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser']) # disabling Named Entity Recognition and the parser for speed

In [54]:
def cleaning(doc):
    """Lemmatize a spaCy Doc and drop its stop words.

    Returns the remaining lemmas joined by single spaces, or None when two or
    fewer lemmas remain. Such short sentences give Word2Vec almost no context
    to learn from, so they are discarded.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [55]:
# Replace every run of characters other than ASCII letters and apostrophes
# with a single space, then lowercase the row.
# Built as a list rather than a generator: a generator is exhausted after its
# first pass, so re-running the cell that consumes it would silently process
# nothing.
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['sentences']]

In [56]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.19 mins


In [57]:
txt[:5]

['v neck solid small emboss tonal logo detail hem stretch cotton',
 'stretch cotton v neck t shirt dredosos',
 't shirt hugo craft cotton hint stretch comfort small emboss tonal logo detail hem complete v neck style',
 'cow skin basket weave print texture tonal square pin buckle logo print buckle width cm',
 'leather belt emboss detail']

In [58]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(6723, 1)

In [59]:
from gensim.models.phrases import Phrases, Phraser # Phrases package to automatically detect common phrases (bigrams) from a list of sentences.

In [60]:
sent = [row.split() for row in df_clean['clean']]

In [61]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 10:47:41: collecting all words and their counts
INFO - 10:47:41: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 10:47:41: collected 40667 token types (unigram + bigrams) from a corpus of 130173 words and 6723 sentences
INFO - 10:47:41: merged Phrases<40667 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 10:47:41: Phrases lifecycle event {'msg': 'built Phrases<40667 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.13s', 'datetime': '2021-07-08T10:47:41.520830', 'gensim': '4.0.1', 'python': '3.9.1 (tags/v3.9.1:1e5d33e, Dec  7 2020, 17:08:21) [MSC v.1927 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [62]:
bigram = Phraser(phrases)

INFO - 10:48:32: exporting phrases from Phrases<40667 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 10:48:32: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<144 phrases, min_count=30, threshold=10.0> from Phrases<40667 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.06s', 'datetime': '2021-07-08T10:48:32.890021', 'gensim': '4.0.1', 'python': '3.9.1 (tags/v3.9.1:1e5d33e, Dec  7 2020, 17:08:21) [MSC v.1927 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [63]:
sentences = bigram[sent]

#### Most frequent words:

In [65]:
# Count how often each token (single words and detected bigrams) occurs.
# The loop variables deliberately avoid the name `sent`: that name holds the
# tokenised corpus, and rebinding it here would break any later re-run of the
# cell that builds `sentences` from it.
word_freq = defaultdict(int)
for sentence_tokens in sentences:
    for token in sentence_tokens:
        word_freq[token] += 1
len(word_freq)

3141

In [66]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['fit',
 'pocket',
 'cotton',
 'boss',
 'regular',
 'craft',
 'logo',
 'feature',
 'design',
 'slim']

In [86]:
import multiprocessing

from gensim.models import Word2Vec

In [87]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
# Leave one core free for the rest of the system, but never ask for fewer
# than one worker thread (cpu_count() can be 1).
w2v_model = Word2Vec(min_count=10,
                     window=2,
                     vector_size=100,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=max(1, cores-1))

INFO - 11:06:06: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=100, alpha=0.03)', 'datetime': '2021-07-08T11:06:06.817138', 'gensim': '4.0.1', 'python': '3.9.1 (tags/v3.9.1:1e5d33e, Dec  7 2020, 17:08:21) [MSC v.1927 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


Vocab...

In [103]:
w2v_model.build_vocab(sentences, progress_per=10000)

INFO - 11:08:53: collecting all words and their counts
INFO - 11:08:53: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 11:08:53: collected 3141 word types from a corpus of 115658 raw words and 6723 sentences
INFO - 11:08:53: Creating a fresh vocabulary
INFO - 11:08:53: Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 1206 unique words (38.39541547277937%% of original 3141, drops 1935)', 'datetime': '2021-07-08T11:08:53.819342', 'gensim': '4.0.1', 'python': '3.9.1 (tags/v3.9.1:1e5d33e, Dec  7 2020, 17:08:21) [MSC v.1927 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
INFO - 11:08:53: Word2Vec lifecycle event {'msg': 'effective_min_count=10 leaves 109974 word corpus (95.0855107299106%% of original 115658, drops 5684)', 'datetime': '2021-07-08T11:08:53.824342', 'gensim': '4.0.1', 'python': '3.9.1 (tags/v3.9.1:1e5d33e, Dec  7 2020, 17:08:21) [MSC v.1927 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 

Training:

In [107]:
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=20, report_delay=1)

INFO - 11:09:13: Word2Vec lifecycle event {'msg': 'training model with 3 workers on 1206 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2', 'datetime': '2021-07-08T11:09:13.156139', 'gensim': '4.0.1', 'python': '3.9.1 (tags/v3.9.1:1e5d33e, Dec  7 2020, 17:08:21) [MSC v.1927 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
INFO - 11:09:13: worker thread finished; awaiting finish of 2 more threads
INFO - 11:09:13: worker thread finished; awaiting finish of 1 more threads
INFO - 11:09:13: worker thread finished; awaiting finish of 0 more threads
INFO - 11:09:13: EPOCH - 1 : training on 115658 raw words (31021 effective words) took 0.1s, 253167 effective words/s
INFO - 11:09:13: worker thread finished; awaiting finish of 2 more threads
INFO - 11:09:13: worker thread finished; awaiting finish of 1 more threads
INFO - 11:09:13: worker thread finished; awaiting finish of 0 more threads
INFO - 11:09:13: EPOCH - 2 : training on 115658 ra

(621062, 2313160)

In [108]:
w2v_model.wv.most_similar(positive=["boss"])

[('elevate', 0.5828986763954163),
 ('impeccable', 0.5804067254066467),
 ('occasion', 0.5768791437149048),
 ('elegant', 0.5364821553230286),
 ('pare', 0.5162294507026672),
 ('business', 0.49626004695892334),
 ('formal', 0.49442365765571594),
 ('workwear', 0.4815402626991272),
 ('effortless', 0.471557080745697), 
 ('italy', 0.4412034749984741)]

In [109]:
w2v_model.wv.most_similar(positive=["hugo"])

[('smart_casual', 0.653635573387146),
 ('bold', 0.5222806334495544),
 ('red', 0.48366621136665344),
 ('iconic', 0.4434213638305664),
 ('urban', 0.43985819816589355),
 ('contemporary', 0.43972058215141296),
 ('chunky', 0.43887126445770264),
 ('trend', 0.4271756708621979),
 ('black', 0.4056681990623474),
 ('weekend', 0.42688432335853577)]

In [112]:
w2v_model.wv.most_similar(positive=["professional"])

[('crisp', 0.5842987895011902),
 ('elevate', 0.5828986763954163),
 ('impeccable', 0.5804067254066467),
 ('occasion', 0.5768791437149048),
 ('elegant', 0.5364821553230286),
 ('pare', 0.5162294507026672),
 ('business', 0.49626004695892334),
 ('formal', 0.49442365765571594),
 ('workwear', 0.4815402626991272),
 ('effortless', 0.471557080745697)]

In [113]:
w2v_model.wv.most_similar(positive=["casual"])

[('duty', 0.5042375326156616),
 ('foundation', 0.48831605911254883),
 ('give', 0.4623507857322693),
 ('smart_casual', 0.453635573387146),
 ('contemporary', 0.45182058215141296),
 ('lay', 0.4511401355266571),
 ('versatile', 0.4445495903491974),
 ('new_season', 0.44348201155662537),
 ('crew_neckline', 0.4307824969291687),
 ('weekend', 0.42688432335853577)]

In [1]:
w2v_model.wv.most_similar(positive=["casual"])

NameError: name 'w2v_model' is not defined