In [13]:
import gensim
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models import Phrases, TfidfModel
from gensim.models.phrases import Phraser
from gensim.test.utils import datapath 
from gensim.models.word2vec import Word2Vec, Text8Corpus
import logging

In [9]:
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)

In [37]:
path = '/Users/stefanoraimondousai/Documents/ReadingCourse/wiki_slim.txt'
sentences = Text8Corpus(datapath(path))

In [38]:
ngram=Phrases(sentences)

2020-03-13 01:03:57,561: INFO: collecting all words and their counts
2020-03-13 01:03:57,569: INFO: PROGRESS: at sentence #0, processed 0 words and 0 word types
2020-03-13 01:04:55,827: INFO: collected 10704833 word types from a corpus of 30436010 words (unigram + bigrams) and 3044 sentences
2020-03-13 01:04:55,833: INFO: using 10704833 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>


In [39]:
phraser = Phraser(ngram)

2020-03-13 01:04:55,838: INFO: source_vocab length 10704833
2020-03-13 01:05:04,055: INFO: Phraser added 50000 phrasegrams
2020-03-13 01:05:25,486: INFO: Phraser added 100000 phrasegrams
2020-03-13 01:06:31,435: INFO: Phraser built with 119327 phrasegrams


In [40]:
ngram = Phrases(phraser[sentences])

2020-03-13 01:06:31,444: INFO: collecting all words and their counts
2020-03-13 01:06:31,468: INFO: PROGRESS: at sentence #0, processed 0 words and 0 word types
2020-03-13 01:08:25,545: INFO: collected 12701062 word types from a corpus of 26580194 words (unigram + bigrams) and 3044 sentences
2020-03-13 01:08:25,554: INFO: using 12701062 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>


In [41]:
phraser=Phraser(ngram)

2020-03-13 01:08:26,768: INFO: source_vocab length 12701062
2020-03-13 01:08:30,319: INFO: Phraser added 50000 phrasegrams
2020-03-13 01:08:34,511: INFO: Phraser added 100000 phrasegrams
2020-03-13 01:08:40,584: INFO: Phraser added 150000 phrasegrams
2020-03-13 01:08:49,855: INFO: Phraser added 200000 phrasegrams
2020-03-13 01:09:06,946: INFO: Phraser added 250000 phrasegrams
2020-03-13 01:10:43,044: INFO: Phraser built with 298254 phrasegrams


In [42]:
import re
with open('trigramWikiSlim.txt','w') as outfile:
    for sent in sentences:
        tokens_ = phraser[sent]
        outfile.write("%s\n" % re.sub("[\'\[\]\,]", "",str(tokens_)))

In [None]:
model = Word2Vec(phraser[sentences], 
                 min_count=3,   # Ignore words that appear less than this
                 size=300,      # Dimensionality of word embeddings
                 workers=4,     # Number of processors (parallelisation)
                 window=5,      # Context window for words during training
                 iter=5,
                 sg=1)        # Number of epochs training over corpus

In [36]:
del sentences
del ngram
del phraser

In [25]:
model.save('WikiModelItaSlim.model')

2020-03-13 00:43:51,056: INFO: saving Word2Vec object under WikiModelItaSlim.model, separately None
2020-03-13 00:43:51,057: INFO: storing np array 'vectors' to WikiModelItaSlim.model.wv.vectors.npy
2020-03-13 00:43:52,297: INFO: not storing attribute vectors_norm
2020-03-13 00:43:52,298: INFO: storing np array 'syn1neg' to WikiModelItaSlim.model.trainables.syn1neg.npy
2020-03-13 00:43:53,532: INFO: not storing attribute cum_table
2020-03-13 00:43:54,425: INFO: saved WikiModelItaSlim.model


In [26]:
model.most_similar('ciao')

  """Entry point for launching an IPython kernel.
2020-03-13 00:44:06,626: INFO: precomputing L2-norms of word weight vectors


[(':)', 0.9392555952072144),
 ('grazie.', 0.9346376657485962),
 (':-)', 0.933768630027771),
 ('ciao,', 0.9324568510055542),
 ('ciao!', 0.9109267592430115),
 ('2005_(cest)', 0.9018099904060364),
 ('ciao.', 0.8992548584938049),
 ('2004_(utc)', 0.8964660167694092),
 ('(utc)', 0.8938655853271484),
 ('ok,', 0.8921703100204468)]

In [29]:
model.predict_output_word(["l'italia",'è','la','capitale'])

[('più_popolosa', 0.0048553995),
 ('del_messico', 0.002791965),
 ('cina', 0.0025693856),
 ('capitale', 0.0020230694),
 ('più_grande', 0.0018823635),
 ('moldavia', 0.0016764032),
 ('repubblica', 0.0015376823),
 ('divisa_tra', 0.0013559225),
 ('germania.', 0.00094312476),
 ('svizzera,', 0.000838515)]

In [34]:
model.predict_output_word(['papa'])

[('papa_clemente', 0.10156507),
 ('papa_alessandro', 0.08318469),
 ('papa_paolo', 0.080565795),
 ('papa_innocenzo', 0.071060896),
 ('scomunica', 0.060936455),
 ('papa_leone', 0.052851405),
 ('papa_urbano', 0.039037954),
 ('papa_pio', 0.035492465),
 ('pontefice,', 0.02253196),
 ('patriarca', 0.021794304)]