In [96]:
import os
import re
import warnings

import gensim
import nltk
import numpy as np
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.fasttext import FastText
from gensim.models.word2vec import Word2Vec

## Training

In [3]:
# Build the folder name with os.path.join so the trailing separator matches the
# current platform instead of being a hard-coded Windows backslash.
DATA_FOLDER = os.path.join("data", "")
train_path = os.path.join(DATA_FOLDER, "train.txt")

# Read the whole training corpus into memory as one string.
# An explicit encoding keeps the result independent of the platform default.
# NOTE(review): assumes train.txt is UTF-8 encoded - confirm against the data source.
with open(train_path, encoding="utf-8") as file:
    text = file.read()

'data\\train.txt'

In [12]:
# Split the corpus into sentences, lower-case each one, replace every run of
# characters other than a-z with a single space, then split it into tokens.
NON_LETTERS_RE = re.compile(r"[^a-z]+")
data = [
    nltk.word_tokenize(NON_LETTERS_RE.sub(" ", sentence.lower()))
    for sentence in nltk.sent_tokenize(text)
]

### FastText

In [16]:
fast_text_model = FastText(size=300, workers=12)

# build the vocabulary
fast_text_model.build_vocab(data)

# train the model
# total_examples is read from the model whose vocabulary was just built; the
# previous `model.corpus_count` referred to a name this notebook never defines.
fast_text_model.train(data, total_examples=fast_text_model.corpus_count, epochs=20)

In [26]:
# Persist the trained FastText model; the testing section loads it from this same path.
fasttext_model_path = 'models\\fasttext200k\\fasttext200k.model'
fast_text_model.save(fasttext_model_path)

In [107]:
def predict_output_word(model, context_words_list, topn=10):
    """Get the probability distribution of the center word given context words.

    The in-vocabulary context vectors are summed (and averaged when
    ``model.cbow_mean`` is truthy), projected through ``model.trainables.syn1neg``
    and normalised with a softmax.

    Parameters
    ----------
    model : gensim.models.base_any2vec.BaseWordEmbeddingsModel
        Model that predicts the output. It must expose ``negative``,
        ``cbow_mean``, ``wv.vectors``, ``wv.vocab``, ``wv.index2word`` and
        ``trainables.syn1neg``.
    context_words_list : list of str
        List of context words. Words missing from ``model.wv.vocab`` are ignored.
    topn : int, optional
        Return `topn` words and their probabilities.

    Returns
    -------
    list of (str, float) or None
        `topn` length list of tuples of (word, probability), most probable
        first. ``None`` (after emitting a warning) when none of the context
        words is in the vocabulary.

    Raises
    ------
    RuntimeError
        If the model was not trained with negative sampling, or if the
        required weight matrices are missing.

    """
    if not model.negative:
        raise RuntimeError(
            "We have currently only implemented predict_output_word for the negative sampling scheme, "
            "so you need to have run word2vec with negative > 0 for this to work."
        )

    if not hasattr(model.wv, 'vectors') or not hasattr(model.trainables, 'syn1neg'):
        raise RuntimeError("Parameters required for predicting the output words not found.")

    vocab = model.wv.vocab
    context_indices = [vocab[word].index for word in context_words_list if word in vocab]
    if not context_indices:
        warnings.warn("All the input context words are out-of-vocabulary for the current model.")
        return None

    # Hidden layer: sum of the context vectors, averaged when the model uses the CBOW mean.
    hidden = np.sum(model.wv.vectors[context_indices], axis=0)
    if model.cbow_mean:
        hidden /= len(context_indices)

    # propagate hidden -> output and take softmax to get probabilities.
    # Subtracting the maximum logit leaves the softmax unchanged mathematically
    # but keeps np.exp from overflowing to inf (which would turn into NaN).
    logits = np.dot(hidden, model.trainables.syn1neg.T)
    logits = logits - np.max(logits)
    prob_values = np.exp(logits)
    prob_values /= np.sum(prob_values)

    # argsort is ascending, so take the last `topn` indices and reverse them.
    top_indices = np.argsort(prob_values)[-topn:][::-1]
    # returning the most probable output words with their probabilities
    return [(model.wv.index2word[index], prob_values[index]) for index in top_indices]

### Word2Vec

In [12]:
# Create an untrained Word2Vec model with the same hyperparameters as the
# FastText model, then build its vocabulary from the tokenized sentences.
word2vec_params = {"size": 300, "workers": 12}
word2vec_model = Word2Vec(**word2vec_params)
word2vec_model.build_vocab(data)

In [7]:
class EpochLogger(CallbackAny2Vec):
    """Training callback that prints a line when each epoch starts and ends."""

    def __init__(self):
        # Zero-based number of the epoch currently being reported.
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Print the start marker for the current epoch."""
        message = "Epoch #{} start".format(self.epoch)
        print(message)

    def on_epoch_end(self, model):
        """Print the end marker for the current epoch and advance the counter."""
        message = "Epoch #{} end".format(self.epoch)
        print(message)
        self.epoch = self.epoch + 1


# Logger instance passed to training through the `callbacks` argument.
epoch_logger = EpochLogger()

In [20]:
# total_examples is read from the model whose vocabulary was built above; the
# previous `model.corpus_count` referred to a name this notebook never defines.
word2vec_model.train(
    data,
    total_examples=word2vec_model.corpus_count,
    epochs=20,
    callbacks=[epoch_logger],
)

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
Epoch #5 start
Epoch #5 end
Epoch #6 start
Epoch #6 end
Epoch #7 start
Epoch #7 end
Epoch #8 start
Epoch #8 end
Epoch #9 start
Epoch #9 end
Epoch #10 start
Epoch #10 end
Epoch #11 start
Epoch #11 end
Epoch #12 start
Epoch #12 end
Epoch #13 start
Epoch #13 end
Epoch #14 start
Epoch #14 end
Epoch #15 start
Epoch #15 end
Epoch #16 start
Epoch #16 end
Epoch #17 start
Epoch #17 end
Epoch #18 start
Epoch #18 end
Epoch #19 start
Epoch #19 end


(744806832, 973533000)

In [21]:
# Persist the trained Word2Vec model; the testing section loads it from this same path.
word2vec_model_path = "models\\word2vec200k\\word2vec200k.model"
word2vec_model.save(word2vec_model_path)

## Testing

In [108]:
# Example sentence, pre-processed exactly like the training corpus:
# lower-cased, runs of non a-z characters collapsed to a space, then tokenized.
test_sent = "The level of postoperative pain and nausea was scored in the postanesthesia care unit ( PACU ) during the first postoperative hour , as well as at 2 , 4 , 8 and 24h postoperatively ."
normalized_test_sent = re.sub(r"[^a-z]+", " ", test_sent.lower())
words = nltk.word_tokenize(normalized_test_sent)
words

['the',
 'level',
 'of',
 'postoperative',
 'pain',
 'and',
 'nausea',
 'was',
 'scored',
 'in',
 'the',
 'postanesthesia',
 'care',
 'unit',
 'pacu',
 'during',
 'the',
 'first',
 'postoperative',
 'hour',
 'as',
 'well',
 'as',
 'at',
 'and',
 'h',
 'postoperatively']

In [101]:
# Reload both trained models from the paths they were saved to, so the testing
# section does not depend on the models still being in memory.
fasttext_model_path = "models\\fasttext200k\\fasttext200k.model"
word2vec_model_path = "models\\word2vec200k\\word2vec200k.model"
fast_text_model = FastText.load(fasttext_model_path)
word2vec_model = Word2Vec.load(word2vec_model_path)

In [109]:
# Word2Vec: most probable words given the first six tokens of the test sentence.
predict_output_word(model=word2vec_model, context_words_list=words[:6])

[('relief', 0.005527567),
 ('intensity', 0.0034905642),
 ('shoulder', 0.0011541344),
 ('discomfort', 0.0009982137),
 ('analgesia', 0.00083989766),
 ('preoperative', 0.00077001157),
 ('severity', 0.0006258007),
 ('hypoesthesia', 0.00059391954),
 ('pain', 0.00056159665),
 ('back', 0.0005227166)]

In [110]:
# FastText: most probable words given the first six tokens of the test sentence.
predict_output_word(model=fast_text_model, context_words_list=words[:6])

[('relief', 0.36741358),
 ('intensity', 0.0859091),
 ('alleviation', 0.0096091805),
 ('chest', 0.003944416),
 ('complained', 0.0035093569),
 ('neuropathic', 0.0034618857),
 ('back', 0.0027384593),
 ('severity', 0.0026988513),
 ('perception', 0.0019876296),
 ('shoulder', 0.001774376)]

In [106]:
# Show the direct base class of the loaded FastText model's class.
fast_text_model.__class__.__base__

gensim.models.base_any2vec.BaseWordEmbeddingsModel

In [116]:
def check(words):
    """Compare both models' predictions for the last word of a token window.

    Prints `words`, then uses every token except the last one as context.

    Parameters
    ----------
    words : list of str
        Token window; the final token is left out of the context.

    Returns
    -------
    tuple
        ``(word2vec_predictions, fasttext_predictions)``, each being the
        result of ``predict_output_word`` for the respective model.
    """
    print(words)
    word2vec_predictions = predict_output_word(word2vec_model, words[:-1])
    fasttext_predictions = predict_output_word(fast_text_model, words[:-1])
    return (word2vec_predictions, fasttext_predictions)

In [117]:
# Tokens 10-14 of the test sentence; the last one is the word to be predicted.
token_window = words[10:15]
check(token_window)

['the', 'postanesthesia', 'care', 'unit', 'pacu']


([('unit', 0.695053),
  ('pacu', 0.057000123),
  ('postanesthesia', 0.03609574),
  ('care', 0.014875768),
  ('arrival', 0.008621463),
  ('stay', 0.0078104693),
  ('intensive', 0.007062314),
  ('postanaesthesia', 0.005120231),
  ('ward', 0.0020113364),
  ('postanesthetic', 0.0015265796)],
 [('intensive', 0.6312368),
  ('postanesthesia', 0.15752365),
  ('unit', 0.103116766),
  ('usual', 0.06519048),
  ('pacu', 0.00407928),
  ('inpatient', 0.002983677),
  ('orthogeriatric', 0.0024374798),
  ('maternity', 0.0023734183),
  ('cpu', 0.0020048202),
  ('stay', 0.0018834673)])