In [1]:
from gensim.models import word2vec
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.corpora.mmcorpus import MmCorpus
import os
import re
import codecs
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize,sent_tokenize
from collections import Counter, OrderedDict
# Root directory holding the per-language corpus sub-folders ('nl', 'en', 'es')
# that the cells below join onto this path.
# NOTE(review): absolute, machine-specific location — adjust before running elsewhere.
path = "/Users/stijnvoss/Documents/uni/capita-selecta-ai/datasets/"

In order to train a word2vec model we have to split our text into sentences and words. This is not trivial for the Voynich manuscript, though: https://stephenbax.net/?p=940

In [28]:
def remove_punctation(sentence):
    """
    Drop punctuation-only tokens from a tokenized sentence and lowercase the rest.

    A token is kept when it contains at least one word character (a letter,
    a digit or an underscore); tokens such as "," or "..." are removed.

    :param sentence: iterable of string tokens
    :return: new list with the kept tokens, lowercased, in their original order
    """
    # re.UNICODE makes \w match accented / non-ASCII letters in unicode tokens on
    # Python 2 as well (it is already the default for str patterns on Python 3),
    # so a token like u"\xe9" is no longer mistaken for punctuation.
    return [t.lower() for t in sentence if re.search(r"\w", t, re.UNICODE) is not None]

def prepare_sentences(folder):
    """
    Lazily yield every sentence found in the ``.txt`` files of ``folder``.

    Each matching file is read as UTF-8, split into sentences with
    ``sent_tokenize``, and every sentence is yielded as the token list
    returned by ``word_tokenize``. Only entries directly inside ``folder``
    are considered; sub-directories are not walked.

    :param folder: path of the directory holding the corpus files
    :return: generator of token lists, one list per sentence
    """
    # os.listdir returns entries in arbitrary order; sort them so the corpus
    # order (and anything trained on it) is the same on every run and machine.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".txt"):
            continue
        # Read the whole file first so the handle is closed before we start
        # yielding; a suspended generator would otherwise keep it open.
        with codecs.open(os.path.join(folder, filename), 'r', encoding='utf8') as handle:
            text = handle.read()
        for sentence in sent_tokenize(text):
            yield word_tokenize(sentence)

def plot_statistics(folder,index):
    """
    Plot two corpus statistics for the text files in ``folder``.

    The top panel shows how many sentences exist for each sentence length
    (in tokens); the bottom panel shows token frequencies sorted from most
    to least frequent. The sentence-length histogram is also printed.

    :param folder: directory passed on to ``prepare_sentences``
    :param index: matplotlib figure number to draw into
    """
    # Gather both statistics in a single pass so the corpus is read and
    # tokenized only once.
    length_counter = Counter()
    token_counter = Counter()
    for sentence in prepare_sentences(folder):
        length_counter[len(sentence)] += 1
        token_counter.update(sentence)

    # Sentence length -> number of sentences, ordered by length for plotting.
    counter = OrderedDict(sorted(length_counter.items()))
    print(counter)

    plt.figure(index)
    plt.subplot(211)
    plt.title("Number of words per sentence")
    plt.xlabel("Words in sentence")
    plt.ylabel("Number of sentences")
    # list(...) so the call also works where keys()/values() return views.
    plt.plot(list(counter.keys()), list(counter.values()))

    plt.subplot(212)
    plt.title("Word frequency")
    plt.xlabel("Word rank")
    plt.ylabel("Occurrences")
    plt.plot(sorted(token_counter.values(),reverse=True))
    # Keep the titles and axis labels of the stacked panels from overlapping.
    plt.tight_layout()
    plt.show()



## Our own dataset (with properties similar to the Voynich manuscript)

In [44]:
# Dutch corpus: tokenize every sentence under <path>/nl, then drop
# punctuation-only tokens and lowercase the rest via remove_punctation.
sentences = [remove_punctation(s) for s in prepare_sentences(os.path.join(path,'nl'))]
# 20-dimensional vectors, context window of 5, words seen fewer than 2 times ignored.
# Per the gensim docs sg=1 selects skip-gram and hs=1 hierarchical softmax.
# NOTE(review): alpha=0.00025 is far below gensim's documented default of 0.025 —
# confirm this is intentional; it may contribute to the weak neighbours printed below.
nl = word2vec.Word2Vec(sentences, size=20, window=5, min_count=2, workers=4,sg=1,hs=1, alpha=0.00025,iter=50)
# English corpus: tokenized sentences under <path>/en.
# NOTE(review): unlike the Dutch corpus these are NOT passed through
# remove_punctation, so punctuation tokens and original casing are kept — confirm.
sentences = [s for s in prepare_sentences(os.path.join(path,'en'))]
# 25-dimensional vectors; all other training options are left at gensim's defaults.
en = word2vec.Word2Vec(sentences, size=25, window=5, min_count=2, workers=4)
# Sanity checks: nearest neighbours of a Dutch word and one English word-pair similarity.
print nl.most_similar('spinazie')
print en.similarity('the','mint')

[(u'stengels', 0.8808669447898865), (u'hiervoor', 0.8438396453857422), (u'hem', 0.8056163787841797), (u'\xe9n', 0.8026626110076904), (u'gaf', 0.7889652848243713), (u'bepaald', 0.7864552140235901), (u'frito', 0.7809202075004578), (u'werkzame', 0.7780563235282898), (u'con', 0.7766792178153992), (u'centrale', 0.7713658213615417)]
0.99971769872


The word2vec model seems to have difficulty learning good word vectors, probably due to the limited number of words in the corpus.

## Wikipedia

Gensim also supports learning word2vec directly from Wikipedia, which may lead to more usable word vectors because of the much larger vocabulary at hand.

In [None]:
# Directory of the Spanish data set (<path>/es).
es = os.path.join(path,'es')
# Wrap the bz2-compressed Spanish Wikipedia articles dump in a gensim WikiCorpus.
# NOTE(review): this is presumably slow on a full dump — consider guarding the cell
# so it is skipped when wiki.mm already exists.
wiki = WikiCorpus(os.path.join(es,'eswiki-20161120-pages-articles1.xml-p000000005p000229076.bz2'))
# Write the corpus next to the dump as wiki.mm using MmCorpus.serialize.
MmCorpus.serialize(os.path.join(es,'wiki.mm'), wiki)