In [1]:
import logging
import os
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk.tokenize import sent_tokenize

# Show INFO-and-above log records, each prefixed with a timestamp and its
# level, so that the training run below reports its progress in the notebook.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)


class MySentences(object):
    """Restartable stream of tokenised sentences read from a directory of text files.

    Each call to ``iter()`` re-reads the files from disk, so one instance can
    be iterated several times (for example once to build a vocabulary and
    again for every training epoch) without holding the corpus in memory.

    Args:
        dirname: Directory whose regular files make up the corpus.
        encoding: Text encoding used to read every file. Defaults to UTF-8 so
            the result does not depend on the machine's locale.

    Attributes:
        sentence_count: Number of sentences yielded by the most recent pass
            (or yielded so far, while a pass is still running).
    """

    def __init__(self, dirname, encoding='utf-8'):
        self.dirname = dirname
        self.encoding = encoding
        self.sentence_count = 0

    def __iter__(self):
        """Yield one list of lower-cased tokens per non-empty sentence.

        Files are visited in sorted name order; anything in the directory that
        is not a regular file is skipped.
        """
        # Count per pass: without this reset the total would keep growing
        # every time the corpus is iterated again.
        self.sentence_count = 0

        # os.listdir returns names in arbitrary order; sorting keeps the
        # corpus order identical on every pass and every machine.
        for fname in sorted(os.listdir(self.dirname)):
            path = os.path.join(self.dirname, fname)
            if not os.path.isfile(path):
                # Sub-directories (e.g. ".ipynb_checkpoints") cannot be opened
                # as text and would raise IsADirectoryError.
                continue

            with open(path, encoding=self.encoding) as f_input:
                corpus = f_input.read()

            for sentence in sent_tokenize(corpus):
                if sentence:
                    self.sentence_count += 1
                    # Tokenisation, lower-casing, etc.; returns a list of tokens.
                    yield simple_preprocess(sentence)


# Corpus: every file under /data, streamed sentence by sentence.
sentences = MySentences('/data')

# Create an untrained model first; the vocabulary scan and the training run
# are separate passes over the corpus. min_count is not passed, so the
# library default applies.
# NOTE(review): `size` is the gensim 3.x keyword for the vector
# dimensionality; later gensim releases renamed it - confirm against the
# installed version before upgrading.
model = Word2Vec(
    sg=1,       # 1 selects skip-gram; any other value selects CBOW
    size=300,   # number of features per word vector
    window=5,
    workers=4,
)

# Pass 1: collect the vocabulary.
model.build_vocab(sentences)

# Remaining passes: train for the model's configured number of epochs.
model.train(
    sentences=sentences,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

# Persist the trained model for later use.
model.save('GOT-vectors.w2v')

print(model.wv.most_similar('daenerys', topn=5))


2019-10-22 20:44:31,330 : INFO : collecting all words and their counts
2019-10-22 20:44:31,819 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-10-22 20:44:31,974 : INFO : PROGRESS: at sentence #10000, processed 133456 words, keeping 9186 word types
2019-10-22 20:44:32,125 : INFO : PROGRESS: at sentence #20000, processed 265279 words, keeping 11983 word types
2019-10-22 20:44:32,917 : INFO : PROGRESS: at sentence #30000, processed 398618 words, keeping 14031 word types
2019-10-22 20:44:33,071 : INFO : PROGRESS: at sentence #40000, processed 528012 words, keeping 15432 word types
2019-10-22 20:44:33,215 : INFO : PROGRESS: at sentence #50000, processed 650964 words, keeping 16557 word types
2019-10-22 20:44:33,968 : INFO : PROGRESS: at sentence #60000, processed 785700 words, keeping 17777 word types
2019-10-22 20:44:34,125 : INFO : PROGRESS: at sentence #70000, processed 924133 words, keeping 18883 word types
2019-10-22 20:44:34,279 : INFO : PROGRESS: at s

2019-10-22 20:45:07,420 : INFO : saved GOT-vectors.w2v
2019-10-22 20:45:07,420 : INFO : precomputing L2-norms of word weight vectors


[('stormborn', 0.7817783355712891), ('unburnt', 0.7196938991546631), ('targaryen', 0.6913876533508301), ('kneel', 0.6648521423339844), ('rhaella', 0.6315736770629883)]


In [2]:
# Similarity score between the learned vectors for 'jon' and 'ygritte'
# (presumably cosine similarity - confirm in the gensim KeyedVectors docs).
model.wv.similarity('jon', 'ygritte')

0.6076681

In [3]:
# Distances from 'arryn' to other words. No comparison words are passed, so
# this presumably covers the whole vocabulary - confirm in the gensim docs.
print(model.wv.distances('arryn'))

[0.70651907 0.7496346  0.66152644 ... 0.57311463 0.6294986  0.67315924]


In [5]:
# The 7 terms most similar to 'lannister'
print(model.wv.most_similar('lannister', topn=7))

# OR: the same query through similar_by_word; with no topn given it
# returns 10 results.
print(model.wv.similar_by_word('lannister'))

[('pays', 0.7023950219154358), ('tywin', 0.6980235576629639), ('kingslayer', 0.6655067205429077), ('debts', 0.6647920608520508), ('kevan', 0.651489794254303), ('jaime', 0.6432561874389648), ('stafford', 0.624941349029541)]
[('pays', 0.7023950219154358), ('tywin', 0.6980235576629639), ('kingslayer', 0.6655067205429077), ('debts', 0.6647920608520508), ('kevan', 0.651489794254303), ('jaime', 0.6432561874389648), ('stafford', 0.624941349029541), ('cersei', 0.620202362537384), ('casterly', 0.6180166006088257), ('imp', 0.5923709869384766)]


In [6]:
# Vector addition / subtraction: 'stark' + 'winterfell' - 'dragons',
# keeping only the single best match.
model.wv.most_similar(
    positive=['stark', 'winterfell'],
    negative=['dragons'],
    topn=1,
)

[('eddard', 0.6230649948120117)]

In [7]:
# Ask the model which of the three words does not fit with the other two.
model.wv.doesnt_match(['winterfell', 'riverrun', 'jaime'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'jaime'