# Generating Word Vectors for the Latin Library Corpus

In [51]:
import json
import logging 
import multiprocessing
from datetime import datetime

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [52]:
# Configure root logging so INFO-level progress messages are displayed,
# then create a named logger for this notebook's own messages.
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s : %(message)s',
)
LOG = logging.getLogger('make_word_vec')

## Parameter suggestions brought to you by:
* Hyperparameters matter

In [53]:
# Select which preprocessed corpus file to train on. The label is embedded in
# the names of the files written by later cells.
# To train on the non-lemmatized text instead, swap in this pair:
# corpus_characteristics, corpus_filename = 'non_lemmatized', 'latin_library.preprocessed.cor'
corpus_characteristics, corpus_filename = (
    'lemmatized',
    'latin_library.lemmatized.preprocessed.cor',
)


In [54]:
# Hyperparameters for the gensim Word2Vec run. They are kept in a dict so the
# exact settings can be logged here and written out again by a later cell.
keyword_params = {
    'size': 600,
    'iter': 300,
    'min_count': 2,  # Ignores all words with total frequency lower than this.
    'max_vocab_size': None,
    'ns_exponent': 0.75,  # the default, optimal for linguistic tasks; also try -0.5 for recommenders
    'alpha': 0.025,
    'min_alpha': 0.004,
    'sg': 1,  # skip gram
    'window': 10,  # number of surrounding words to consider
    # Leave one core free, but never request fewer than one worker thread:
    # on a single-core machine cpu_count() - 1 is 0, which would give gensim
    # no training threads at all.
    'workers': max(1, multiprocessing.cpu_count() - 1),
    'negative': 15,  # 15 may be best
    # Alternative value tried: 0.00001
    #   sample=1e-05 downsamples 4158 most-common words
    #   sample=0.001 downsamples 32 most-common words
    'sample': 0.001,
}
LOG.info('Creating vector with parameters: %s', json.dumps(keyword_params))
# Passing corpus_file to the constructor scans the vocabulary and starts
# training immediately, so this statement is the long-running step.
latin_lib_vec = Word2Vec(corpus_file=corpus_filename, **keyword_params)

INFO : Creating vector with parameters: {"size": 600, "iter": 300, "min_count": 2, "max_vocab_size": null, "ns_exponent": 0.75, "alpha": 0.025, "min_alpha": 0.004, "sg": 1, "window": 10, "workers": 7, "negative": 15, "sample": 512}
INFO : collecting all words and their counts
INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO : PROGRESS: at sentence #10000, processed 175859 words, keeping 5879 word types
INFO : PROGRESS: at sentence #20000, processed 342816 words, keeping 8001 word types
INFO : PROGRESS: at sentence #30000, processed 550570 words, keeping 9757 word types
INFO : PROGRESS: at sentence #40000, processed 753358 words, keeping 11368 word types
INFO : PROGRESS: at sentence #50000, processed 969282 words, keeping 12523 word types
INFO : PROGRESS: at sentence #60000, processed 1161725 words, keeping 13634 word types
INFO : PROGRESS: at sentence #70000, processed 1344406 words, keeping 16515 word types
INFO : PROGRESS: at sentence #80000, processed 15

KeyboardInterrupt: 

In [21]:
# Persist the full trained model; the file name records the corpus variant
# and today's date.
LOG.info('Saving word2vec for latin library corpus')
latin_lib_vec.save(
    'latin_library.{}.{}.vec'.format(
        corpus_characteristics,
        datetime.now().strftime('%Y.%m.%d'),
    )
)

INFO : Saving word2vec for latin library corpus
INFO : saving Word2Vec object under latin_library.non_lemmatized.2019.03.08.vec, separately None
INFO : storing np array 'vectors' to latin_library.non_lemmatized.2019.03.08.vec.wv.vectors.npy
INFO : not storing attribute vectors_norm
INFO : storing np array 'syn1neg' to latin_library.non_lemmatized.2019.03.08.vec.trainables.syn1neg.npy
INFO : not storing attribute cum_table
INFO : saved latin_library.non_lemmatized.2019.03.08.vec


In [22]:
# Write the training hyperparameters as JSON to a file named after the corpus
# variant and today's date.
with open(
    'latin_library.vec.{}.{}.params'.format(
        corpus_characteristics,
        datetime.now().strftime('%Y.%m.%d'),
    ),
    'wt',
) as writer:
    writer.write(json.dumps(keyword_params))

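### Persist the word vectors to disk
The saved vectors should be loadable across platforms and languages. Note that `KeyedVectors.save` writes gensim's native format (a pickle plus `.npy` arrays), which only gensim can read; use `save_word2vec_format` when other tools or languages need to load the vectors.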

In [23]:
# Save only the word vectors (without the rest of the training state) under a
# name built from the corpus variant and today's date.
the_filename = 'latin_library.{}.{}.kv'.format(
    corpus_characteristics,
    datetime.now().strftime('%Y.%m.%d'),
)
word_vectors = latin_lib_vec.wv
word_vectors.save(the_filename)

INFO : saving Word2VecKeyedVectors object under latin_library.non_lemmatized.2019.03.08.kv, separately None
INFO : storing np array 'vectors' to latin_library.non_lemmatized.2019.03.08.kv.vectors.npy
INFO : not storing attribute vectors_norm
INFO : saved latin_library.non_lemmatized.2019.03.08.kv


## Some QA

In [24]:
# QA probe: the ten words the trained model ranks closest to 'puella'.
latin_lib_vec.wv.most_similar(positive=['puella'], topn=10)

[('cacata', 0.40804749727249146),
 ('euellerat', 0.35726672410964966),
 ('scitula', 0.34938153624534607),
 ('trahetur', 0.3437250852584839),
 ('plebea', 0.3397197723388672),
 ('instabatur', 0.3371580243110657),
 ('succenses', 0.33677178621292114),
 ('sauio', 0.33424675464630127),
 ('concupiueris', 0.33303868770599365),
 ('ligustro', 0.33187466859817505)]

In [25]:
# QA probe: the ten words the trained model ranks closest to 'uir'.
latin_lib_vec.wv.most_similar(positive=['uir'], topn=10)

[('et', 0.3993789255619049),
 ('fabrilem', 0.3965851962566376),
 ('conloquetur', 0.38349807262420654),
 ('dominus', 0.3774290084838867),
 ('dixit', 0.37603938579559326),
 ('mancupium', 0.37392544746398926),
 ('incolet', 0.3695604205131531),
 ('non', 0.36870378255844116),
 ('uociferabitur', 0.36849337816238403),
 ('atramentarium', 0.36640453338623047)]

In [26]:
# QA probe: the ten words the trained model ranks closest to 'uiolenter'.
latin_lib_vec.wv.most_similar(positive=['uiolenter'], topn=10)

[('flauom', 0.372098833322525),
 ('aperuerant', 0.37147045135498047),
 ('infligebant', 0.35610195994377136),
 ('inlisi', 0.35295096039772034),
 ('accusarat', 0.3477999269962311),
 ('obtruncauerunt', 0.3422698974609375),
 ('nudabat', 0.33932244777679443),
 ('diripiebat', 0.339178204536438),
 ('imperitans', 0.3288525342941284),
 ('occupauimus', 0.32780519127845764)]

In [29]:
# Reload the saved vectors read-only via memory mapping.
# NOTE(review): the path is rebuilt from today's date, so this only finds the
# file if the save cell ran on the same calendar day - confirm that is intended.
the_filename = 'latin_library.{}.{}.kv'.format(
    corpus_characteristics,
    datetime.now().strftime('%Y.%m.%d'),
)
latin_word_vectors = KeyedVectors.load(the_filename, mmap='r')

INFO : loading Word2VecKeyedVectors object from latin_library.non_lemmatized.2019.03.08.kv
INFO : loading vectors from latin_library.non_lemmatized.2019.03.08.kv.vectors.npy with mmap=r
INFO : setting ignored attribute vectors_norm to None
INFO : loaded latin_library.non_lemmatized.2019.03.08.kv


In [30]:
# Round-trip check: query the reloaded vectors for the ten nearest words to 'uir'.
latin_word_vectors.most_similar(positive=['uir'], topn=10)

INFO : precomputing L2-norms of word weight vectors


[('et', 0.3993789255619049),
 ('fabrilem', 0.3965851962566376),
 ('conloquetur', 0.38349807262420654),
 ('dominus', 0.3774290084838867),
 ('dixit', 0.37603938579559326),
 ('mancupium', 0.37392544746398926),
 ('incolet', 0.3695604205131531),
 ('non', 0.36870378255844116),
 ('uociferabitur', 0.36849337816238403),
 ('atramentarium', 0.36640453338623047)]

In [12]:
# QA probe: the ten words the trained model ranks closest to 'homo'.
latin_lib_vec.wv.most_similar(positive=['homo'], topn=10)

[('exprimebatur', 0.400768518447876),
 ('inuerecundus', 0.3843785226345062),
 ('letetur', 0.37862467765808105),
 ('adamatur', 0.3700706660747528),
 ('plantauerat', 0.36723676323890686),
 ('uir', 0.36512863636016846),
 ('factus', 0.3648669719696045),
 ('proselytis', 0.35976335406303406),
 ('choicus', 0.3569141626358032),
 ('nabla', 0.3554146885871887)]

In [13]:
# QA probe: the ten words the trained model ranks closest to 'canere'.
latin_lib_vec.wv.most_similar(positive=['canere'], topn=10)

[('sonatur', 0.5754474401473999),
 ('bucinator', 0.3679332137107849),
 ('responsurium', 0.36166948080062866),
 ('consueuisset', 0.35109448432922363),
 ('momoriter', 0.34004682302474976),
 ('elimosinam', 0.3307771384716034),
 ('monochordo', 0.32640892267227173),
 ('conmoratus', 0.3229045569896698),
 ('frondiferas', 0.3085840344429016),
 ('cessarit', 0.3073081076145172)]

In [14]:
# QA probe: the ten words the trained model ranks closest to 'piger'.
latin_lib_vec.wv.most_similar(positive=['piger'], topn=10)

[('consurges', 0.3656068742275238),
 ('comesor', 0.35631316900253296),
 ('reticuisse', 0.35073623061180115),
 ('mendicabit', 0.33155322074890137),
 ('mergetur', 0.32906609773635864),
 ('ephippia', 0.32282310724258423),
 ('holitori', 0.31779569387435913),
 ('desidendo', 0.3161380887031555),
 ('debentibus', 0.30631572008132935),
 ('inpinguabitur', 0.29799818992614746)]

In [15]:
# QA probe: the ten words the trained model ranks closest to 'scandere'.
latin_lib_vec.wv.most_similar(positive=['scandere'], topn=10)

[('gliscis', 0.3265058398246765),
 ('pulsantis', 0.2862904667854309),
 ('fruticeta', 0.2798694968223572),
 ('iactastis', 0.2630001902580261),
 ('eximendorum', 0.25384703278541565),
 ('lapsantibus', 0.25376540422439575),
 ('uibrent', 0.2533043920993805),
 ('erraretis', 0.24881130456924438),
 ('assulatim', 0.24873502552509308),
 ('frangentia', 0.24708648025989532)]

In [16]:
# QA probe: the ten words the trained model ranks closest to 'praelucere'.
latin_lib_vec.wv.most_similar(positive=['praelucere'], topn=10)

[('tenderemus', 0.487804114818573),
 ('gradienti', 0.48268240690231323),
 ('laeuorsus', 0.37227192521095276),
 ('austra', 0.31538620591163635),
 ('montanos', 0.3103235960006714),
 ('effecero', 0.3040349781513214),
 ('ferinas', 0.30135902762413025),
 ('remistheo', 0.3008953034877777),
 ('exorabant', 0.2989898920059204),
 ('despicimus', 0.29503586888313293)]

In [17]:
# QA probe: the ten words the trained model ranks closest to 'mentula'.
latin_lib_vec.wv.most_similar(positive=['mentula'], topn=10)

[('hirundini', 0.3500634729862213),
 ('arrigis', 0.3463703989982605),
 ('uerpa', 0.32565271854400635),
 ('ducenties', 0.32386505603790283),
 ('luctaris', 0.2973496615886688),
 ('pedico', 0.28964224457740784),
 ('pedicaris', 0.28842228651046753),
 ('sesquipedalis', 0.2832435369491577),
 ('meiere', 0.2773906886577606),
 ('cunnus', 0.2743661403656006)]

In [18]:
# QA probe: the ten words the trained model ranks closest to 'ciuis'.
latin_lib_vec.wv.most_similar(positive=['ciuis'], topn=10)

[('effregisti', 0.4355214238166809),
 ('serenanti', 0.4327419400215149),
 ('bacchium', 0.36393025517463684),
 ('flectendus', 0.3540099561214447),
 ('romanus', 0.3519400954246521),
 ('bibliopola', 0.3463898301124573),
 ('multiscius', 0.34309887886047363),
 ('fumea', 0.3318007290363312),
 ('aequaeuus', 0.3301989734172821),
 ('architectatus', 0.3297162652015686)]

In [35]:
# Load a previously saved set of vectors (fixed file name from an earlier run)
# read-only via memory mapping, for comparison with the model trained above.
the_lemmatized_filename = 'latin_library.2019.03.07.kv'
lem_lat_wordvec = KeyedVectors.load(
    the_lemmatized_filename,
    mmap='r',
)

INFO : loading Word2VecKeyedVectors object from latin_library.2019.03.07.kv
INFO : setting ignored attribute vectors_norm to None
INFO : loaded latin_library.2019.03.07.kv


In [40]:
# Same 'puella' probe, this time against the vectors loaded from the_lemmatized_filename.
lem_lat_wordvec.most_similar(positive=['puella'], topn=10)

[('puer', 0.5749707818031311),
 ('iuuenis', 0.5151010751724243),
 ('uirgo', 0.49944934248924255),
 ('soror', 0.4523782730102539),
 ('mater', 0.45129919052124023),
 ('amare', 0.4469846189022064),
 ('uxor', 0.44040897488594055),
 ('maritus', 0.43844184279441833),
 ('at', 0.4366375207901001),
 ('coniunx', 0.4324739873409271)]

In [42]:
# Ten nearest words to 'puer' in the vectors loaded from the_lemmatized_filename.
lem_lat_wordvec.most_similar(positive=['puer'], topn=10)

[('mater', 0.6095702052116394),
 ('iuuenis', 0.5838768482208252),
 ('puella', 0.5749707818031311),
 ('ille', 0.5479326248168945),
 ('ludere', 0.535244345664978),
 ('senex', 0.519166111946106),
 ('uirgo', 0.5188031196594238),
 ('at', 0.5050873756408691),
 ('ferre', 0.504031240940094),
 ('parare', 0.5006879568099976)]

In [44]:
! head unglossed.fixed.txt

quum,142
pol,117
siet,96
coss,86
analogia,78
que,77
abi,74
ite,64
eccum,62
hactenus,62


In [49]:
# Check that 'eccum' (one of the entries listed from unglossed.fixed.txt) is
# present in the loaded vectors before querying its neighbours.
'eccum' in lem_lat_wordvec

True

In [50]:
# Ten nearest words to 'eccum' in the vectors loaded from the_lemmatized_filename.
lem_lat_wordvec.most_similar(positive=['eccum'], topn=10)

[('eccam', 0.525143027305603),
 ('attat', 0.49384814500808716),
 ('popli', 0.44641977548599243),
 ('scibo', 0.414834201335907),
 ('surrupta', 0.41333162784576416),
 ('sycophantiam', 0.3958692252635956),
 ('quoia', 0.39479494094848633),
 ('uidulo', 0.39022475481033325),
 ('optume', 0.38850533962249756),
 ('erus', 0.38483574986457825)]