In [1]:
from gensim.models import word2vec
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.corpora.mmcorpus import MmCorpus
import os
import re
import codecs
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize,sent_tokenize
from collections import Counter, OrderedDict
# Root folder of the datasets; the cells below read the 'es', 'nl' and 'en'
# sub-folders from it. Set the VOYNICH_DATASETS_DIR environment variable to run
# the notebook on another machine; the original location remains the default.
path = os.environ.get("VOYNICH_DATASETS_DIR", "/Users/stijnvoss/Documents/uni/capita-selecta-ai/datasets/")

In order to train a word2vec model we have to split our text into sentences and words. This is not trivial for the Voynich manuscript, though: https://stephenbax.net/?p=940

In [2]:
def remove_punctation(sentence):
    """
    Lowercase a tokenized sentence and drop its punctuation-only tokens.

    A token is kept when it contains at least one word character (a letter,
    a digit or an underscore). Tokens made only of punctuation or whitespace,
    and empty strings, are dropped.

    :param sentence: iterable of string tokens.
    :return: new list with the kept tokens, lowercased, in their original order.
    """
    # Raw string: a backslash-w inside a normal literal is an invalid escape
    # sequence. re.UNICODE makes the word-character class match accented
    # letters under Python 2 as well, so a token consisting only of such
    # letters is not mistaken for punctuation.
    return [t.lower() for t in sentence if re.search(r"\w", t, re.UNICODE) is not None]

def prepare_sentences(folder):
    """
    Lazily yield every sentence found in the ``.txt`` files of ``folder``.

    Each file is read as UTF-8 and split into sentences with ``sent_tokenize``;
    every sentence is yielded as the token list returned by ``word_tokenize``.
    Only the entries directly inside ``folder`` are considered, and names that
    do not end in ``.txt`` are skipped.

    :param folder: directory containing the text files.
    :return: generator of token lists, one per sentence.
    """
    # os.listdir returns names in arbitrary order; sort them so the sentences
    # come out in the same order on every run and on every machine.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".txt"):
            continue
        # Read the whole file first so the handle is closed before the
        # generator is suspended at a yield.
        with codecs.open(os.path.join(folder, filename), 'r', encoding='utf8') as handle:
            text = handle.read()
        for sentence in sent_tokenize(text):
            yield word_tokenize(sentence)

def plot_statistics(folder,index):
    """
    Plot sentence-length and word-frequency statistics for the text in ``folder``.

    The top panel shows, for every sentence length (in tokens) that occurs,
    how many sentences have that length. The bottom panel shows the token
    counts sorted from most to least frequent. The sentence-length table is
    also printed.

    :param folder: directory passed on to ``prepare_sentences``.
    :param index: matplotlib figure number to draw into.
    """
    # Tokenize the corpus once; both panels are derived from the same sentences.
    sentences = list(prepare_sentences(folder))

    # sentence length -> number of sentences with that length, ordered by length
    length_counts = Counter(len(sentence) for sentence in sentences)
    counter = OrderedDict(sorted(length_counts.items()))
    print(counter)

    plt.figure(index)
    plt.subplot(211)
    plt.title("Number of words per sentence")
    # list(...) so that Python 3 dict views are turned into plain sequences
    plt.plot(list(counter.keys()), list(counter.values()))

    token_counter = Counter(token for sentence in sentences for token in sentence)
    plt.subplot(212)
    plt.title("Word frequency")
    plt.plot(sorted(token_counter.values(), reverse=True))
    plt.show()

## Our own dataset (with properties similar to the Voynich manuscript)
First, let's investigate how word2vec could be used on the Voynich manuscript by using our own dataset. I had to fine-tune the word2vec parameters a bit, partly through best-practice research and partly through experimentation.

In [42]:
def _load_sentences(language):
    """Return the lowercased, punctuation-free sentences of the ``language`` sub-folder of ``path``."""
    return [remove_punctation(s) for s in prepare_sentences(os.path.join(path, language))]


def _train_word2vec(sentences):
    """
    Train a word2vec model on ``sentences`` with the settings shared by all languages.

    sg=1 selects skip-gram and hs=1 hierarchical softmax (see the gensim
    Word2Vec documentation); the values are the hand-tuned ones used for
    every language in this notebook.
    """
    return word2vec.Word2Vec(sentences, size=20, window=5, min_count=2, workers=4,sg=1,hs=1, alpha=0.00025,iter=50)


def _report(label, model, query_words, word_pairs):
    """
    Print a short qualitative report for one trained model.

    :param label: heading printed first (e.g. "ES").
    :param model: trained model offering ``most_similar`` and ``similarity``.
    :param query_words: words whose nearest neighbours are printed, one line each.
    :param word_pairs: (word, word) tuples whose similarities are printed on one line.
    """
    print(label)
    for word in query_words:
        print(model.most_similar(word))
    # Space-separated on a single line, the same layout `print a, b, c` gives.
    print(" ".join(str(model.similarity(first, second)) for first, second in word_pairs))
    print("---")


sentences = _load_sentences('es')
es = _train_word2vec(sentences)
_report("ES", es, ['la', 'menta'], [('menta','el'), ('menta','romero'), ('dios','la')])

sentences = _load_sentences('nl')
nl = _train_word2vec(sentences)
_report("NL", nl, ['de', 'mint'], [('mint','hem'), ('mint','anijs'), ('god','de')])

sentences = _load_sentences('en')
en = _train_word2vec(sentences)
_report("EN", en, ['the', 'mint'], [('mint','him'), ('mint','rosemary'), ('god','the')])

ES
[(u'que', 0.9946640729904175), (u'a', 0.994613766670227), (u'se', 0.9935480356216431), (u'y', 0.9924092888832092), (u'el', 0.9922371506690979), (u'de', 0.9912117719650269), (u'en', 0.9903433322906494), (u'los', 0.9897137880325317), (u'como', 0.9878660440444946), (u'con', 0.9876834154129028)]
[(u'una', 0.8490885496139526), (u'arroz', 0.8478215336799622), (u'petroselinum', 0.8423986434936523), (u'm\xe1s', 0.8422155380249023), (u'sus', 0.8415376543998718), (u'tambi\xe9n', 0.8353066444396973), (u'son', 0.8339482545852661), (u'lo', 0.8319666981697083), (u'sistema', 0.8316299915313721), (u'planta', 0.822829008102417)]
0.818549321368 0.678316980796 0.942374482885
---
NL
[(u'van', 0.9911580085754395), (u'het', 0.9809150695800781), (u'een', 0.9787498712539673), (u'in', 0.977186381816864), (u'worden', 0.9732418656349182), (u'en', 0.9711448550224304), (u'is', 0.9703822135925293), (u'op', 0.964158296585083), (u'voor', 0.9587825536727905), (u'kan', 0.9569529891014099)]
[(u'zaaien', 0.65428519248

The word2vec models are far from perfect, as could be expected from the small amount of data we have. Still, we see some useful patterns: articles are pretty close together in the vector space, and some herbs/plants are grouped together.

## Wikipedia

Gensim also supports learning word2vec directly from Wikipedia, which may lead to more usable word vectors because of the much larger vocabulary at hand.

In [None]:
# Read the Spanish Wikipedia dump with WikiCorpus and write it out with
# MmCorpus.serialize as 'wiki.mm' in the same folder.
# NOTE(review): this rebinds `es`, which the training cell bound to the Spanish
# Word2Vec model, to a folder path; confirm no later cell still expects the model.
es = os.path.join(path, 'es')
wiki = WikiCorpus(
    os.path.join(es, 'eswiki-20161120-pages-articles1.xml-p000000005p000229076.bz2')
)
MmCorpus.serialize(
    os.path.join(es, 'wiki.mm'),
    wiki,
)