In [1]:
from gensim.models import Word2Vec

In [6]:
"""Train Word2Vec"""

def word2vec_model(sentences, size=100, min_count=5, window=5, negative=5,
                   cbow=True, iterations=5, seed=0, workers=1):
    """Build and train a gensim Word2Vec model.

    Parameters:
        sentences: the sentences to train on (e.g. lists of word tokens).
        size: dimensionality of the embedding vectors.
        min_count: minimum number of occurrences a word needs in order
            to be used in training.
        window: maximum distance between the current and the predicted
            word within a sentence.
        negative: size of negative sampling.
        cbow: training type selector; truthy trains CBOW, falsy trains
            Skip-gram.
        iterations: number of training iterations, forwarded as `epochs`.
        seed: seed for the random number generator.
        workers: number of worker threads used for training.

    Returns:
        The trained Word2Vec model.
    """
    # The function's friendly argument names are mapped onto the keyword
    # names expected by the Word2Vec constructor.
    training_options = {
        "vector_size": size,
        "min_count": min_count,
        "window": window,
        "negative": negative,
        # gensim's `sg` flag is the inverse of `cbow`: 0 = CBOW, 1 = Skip-gram.
        "sg": int(not cbow),
        "epochs": iterations,
        "seed": seed,
        "workers": workers,
    }

    # Handing the corpus to the constructor is the only training step here;
    # no separate build_vocab()/train() call is made.
    return Word2Vec(sentences, **training_options)

In [7]:
from gensim.test.utils import common_texts
# Disabled variant that would load the function from a '2-word2vec' module
# instead of using the definition from the cell above:
# word2vec_model = __import__('2-word2vec').word2vec_model

# Show the first two sentences of the sample corpus.
print(common_texts[:2])
# min_count=1 (instead of the function's default of 5) so that words
# occurring only once are still used in training.
w2v = word2vec_model(common_texts, min_count=1)
# Print the learned vector for the word "computer".
print(w2v.wv["computer"])

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time']]
[-9.17425146e-04  4.23241127e-03  5.63164940e-03  6.88221911e-03
 -6.18189573e-03  3.55597492e-03 -4.59551578e-03 -2.62356992e-03
 -2.58884183e-03  1.51444075e-03  1.76495546e-03  1.26824854e-03
 -8.70202854e-03  8.73132143e-03  7.04515446e-03 -2.24651699e-03
  1.43263815e-03 -6.70434721e-03  2.69516581e-03  7.53865717e-03
  8.56675580e-03  7.89457001e-03 -8.89756717e-03 -9.03468858e-03
  4.73744608e-03 -6.03551976e-03 -6.21854421e-03  2.72567268e-03
 -8.80681351e-03  5.77690266e-03 -6.42151944e-03  2.13384978e-03
  2.60995259e-03 -6.16821647e-03 -1.97864044e-03 -7.64716882e-03
  9.61878430e-03  1.19452474e-04 -7.03770155e-03  6.31020777e-03
  4.79384791e-03 -5.65865776e-03 -3.22094793e-03 -8.49734619e-03
  1.13402959e-03  1.02089881e-03 -8.94187670e-03 -6.16365811e-03
 -9.08331887e-04 -8.65152571e-03  4.83665941e-03  5.46529191e-03
  4.19504056e-03  6.42453181e-03  6.02116482e-03 -2.0332

In [9]:
"""Extract Word2Vec"""

from keras.layers import Embedding
import numpy as np


def gensim_to_keras(model):
    """Convert a trained gensim word2vec model into a keras Embedding layer.

    Parameters:
        model: a trained gensim word2vec model; its `wv.key_to_index`
            mapping, `wv[word]` lookups and `vector_size` are used.

    Returns:
        A trainable keras Embedding whose initial weights are the
        model's word vectors, each placed at the row given by the word's
        gensim index.

    NOTE(review): the layer is created with one more row than there are
    vocabulary entries. Assuming gensim indices run from 0 to
    len(vocab) - 1, that leaves a final all-zero row -- confirm whether
    it is meant as a padding / out-of-vocabulary slot.
    """
    keyed_vectors = model.wv
    n_rows = len(keyed_vectors.key_to_index) + 1
    n_cols = model.vector_size

    # Start from zeros so any row not assigned below stays a zero vector.
    embedding_weights = np.zeros((n_rows, n_cols))
    for token, row in keyed_vectors.key_to_index.items():
        embedding_weights[row] = keyed_vectors[token]

    return Embedding(input_dim=n_rows,
                     output_dim=n_cols,
                     weights=[embedding_weights],
                     trainable=True)


In [10]:
from gensim.test.utils import common_texts
# Disabled variants that would load both functions from their
# '2-word2vec' / '3-gensim_to_keras' modules instead of using the
# definitions from the cells above:
# word2vec_model = __import__('2-word2vec').word2vec_model
# gensim_to_keras = __import__('3-gensim_to_keras').gensim_to_keras

# Show the first two sentences of the sample corpus.
print(common_texts[:2])
# Retrain with min_count=1 so words occurring only once are still used.
w2v = word2vec_model(common_texts, min_count=1)
# Printing the converted layer shows its object repr, not its weights.
print(gensim_to_keras(w2v))


[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time']]
<keras.src.layers.core.embedding.Embedding object at 0x7dbfcc4a6500>


In [13]:
"""FastText function"""

from gensim.models import FastText


def fasttext_model(sentences, size=100, min_count=5,
                   negative=5, window=5, cbow=True, iterations=5,
                   seed=0, workers=1):
    """Build and train a gensim FastText model.

    Parameters:
        sentences: the sentences to train on (e.g. lists of word tokens).
        size: dimensionality of the embedding vectors.
        min_count: minimum number of occurrences a word needs in order
            to be used in training.
        negative: size of negative sampling.
        window: maximum distance between the current and the predicted
            word within a sentence.
        cbow: training type selector; truthy trains CBOW, falsy trains
            Skip-gram.
        iterations: number of training iterations, forwarded as `epochs`.
        seed: seed for the random number generator.
        workers: number of worker threads used for training.

    Returns:
        The trained FastText model.
    """
    # gensim expects the inverse flag: sg=1 for Skip-gram, sg=0 for CBOW.
    if cbow:
        skip_gram_flag = 0
    else:
        skip_gram_flag = 1

    # The corpus goes straight to the constructor; no separate
    # build_vocab()/train() call is made here.
    trained = FastText(
        sentences,
        vector_size=size,
        min_count=min_count,
        window=window,
        negative=negative,
        sg=skip_gram_flag,
        epochs=iterations,
        seed=seed,
        workers=workers,
    )
    return trained


In [14]:
from gensim.test.utils import common_texts
# Disabled variant that would load the function from a '4-fasttext' module
# instead of using the definition from the cell above:
# fasttext_model = __import__('4-fasttext').fasttext_model

# Show the first two sentences of the sample corpus.
print(common_texts[:2])
# min_count=1 (instead of the function's default of 5) so that words
# occurring only once are still used in training.
ft = fasttext_model(common_texts, min_count=1)
# Print the learned vector for the word "computer".
print(ft.wv["computer"])


[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time']]
[-4.4518875e-04  1.9057443e-04  7.1344204e-04  1.5088863e-04
  7.3785416e-04  2.0828047e-03 -1.4264339e-03 -6.6978252e-04
 -3.9446630e-04  6.1643129e-04  3.7035978e-04 -1.7527672e-03
  2.0829479e-05  1.0929988e-03 -6.6954875e-04  7.9767447e-04
 -9.0742309e-04  1.9187949e-03 -6.9725298e-04  3.7622583e-04
 -5.0849823e-05  1.6160590e-04 -8.3575735e-04 -1.4309353e-03
  1.8365250e-04 -1.1365860e-03 -2.1796341e-03  3.3816829e-04
 -1.0266158e-03  1.9360909e-03  9.3765622e-05 -1.2577525e-03
  1.7052694e-04 -1.0470246e-03  9.1582153e-04 -1.1945128e-03
  1.2874184e-03 -3.1551000e-04 -1.1084992e-03  2.2345960e-04
  5.9021922e-04 -5.7232735e-04  1.6017178e-04 -1.0333696e-03
 -2.6842864e-04 -1.2489735e-03 -3.4248878e-05  2.0717620e-03
  1.0997808e-03  4.9419136e-04 -4.3252495e-04  7.6816598e-04
  3.0231036e-04  6.4548600e-04  2.5580439e-03 -1.2883682e-04
 -3.8391326e-04 -2.1800243e-04  6.5950496e-04 -