In [1]:
import dhlab as dh
import pandas as pd
from dhlab.nbtokenizer import tokenize
import jsonlines
dh.css()

In [95]:
def generate_word_sequences_from_corpus(corpus, samples, limit, query="og OR i OR på OR av OR eller"):
    """Fetch concordance snippets from a random sample of a corpus.

    Draws ``samples`` items from ``corpus`` with ``corpus.sample``, runs a
    ``dh.Concordance`` search for ``query`` over that sample, and returns the
    hit texts with the ``<b>`` / ``</b>`` markers stripped out.

    Parameters
    ----------
    corpus : object exposing ``.sample(n)``
        Sampled and then handed to ``dh.Concordance`` (presumably a dhlab
        corpus -- confirm against the caller).
    samples : int
        Number of items requested from ``corpus.sample``.
    limit : int
        Used both as the ``limit`` of the concordance search and as the
        number of rows requested from ``show``.
    query : str, optional
        Search expression passed to ``dh.Concordance``. Defaults to the
        disjunction of Norwegian function words that was previously
        hard-coded, so existing three-argument calls behave as before.

    Returns
    -------
    The ``.values`` of the cleaned ``concordance`` column (presumably a NumPy
    array of strings -- confirm against the dhlab return type).
    """
    concordance = dh.Concordance(corpus.sample(samples), query=query, limit=limit)
    # style=False is forwarded unchanged; the <b> tags are stripped by hand below.
    hits = concordance.show(style=False, n=limit).concordance
    return hits.apply(lambda text: text.replace('<b>', '').replace('</b>', '')).values

In [96]:
def convert(sentence):
    """Split a string into lower-cased, purely alphabetic tokens.

    The sentence is passed through ``tokenize``. Tokens for which
    ``str.isalpha`` is false (punctuation, numbers, mixed tokens) are
    dropped; the remaining tokens are lower-cased and returned as a list in
    their original order.
    """
    words = []
    for token in tokenize(sentence):
        # The alphabetic check is made on the token as produced by the tokenizer.
        if not token.isalpha():
            continue
        words.append(token.lower())
    return words

## Generate JSON Lines

In [None]:
# Build the sentence file in 1000 rounds. Each round asks for concordances
# from a fresh 100-item sample (limit 1000), tokenizes every hit with
# `convert`, and appends one JSON line per hit to 'sentences.jsonlines'.
# NOTE(review): `corpus` is not defined in any cell visible here -- it must
# already exist in the kernel, so this cell fails on a fresh Restart & Run All.
for i in range(1000):
    # Progress marker every 100 rounds.
    if i%100 == 0:
        print(i)
    a = generate_word_sequences_from_corpus(corpus, 100, 1000)
    # The file is reopened in append mode each round, so every batch is
    # closed out before the next fetch; re-running the cell adds to the
    # existing file rather than replacing it.
    with jsonlines.open('sentences.jsonlines',mode='a') as wr:
        for sent in map(convert, a):
            wr.write(sent)
    

0
100
200
300
400
500
600


# Create models

In [43]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.test.utils import datapath

In [149]:
# Path of the text file the Word2Vec training cell reads.
# NOTE(review): this is not the file the generation loop appends to
# ('sentences.jsonlines'); whatever produced this file is not shown in the
# notebook -- confirm where it comes from and its format.
sentence_file = 'sentences_varied_corpus.textlines'

In [150]:
%%time
# Train a Word2Vec model on `sentence_file` and save it to disk.
# LineSentence wraps the file as the sentence source (per the gensim docs it
# expects one sentence per line with whitespace-separated tokens -- confirm
# the file matches that format).
sentences = LineSentence(sentence_file)
# 40-dimensional vectors, context window 5, min_count 5, 20 worker threads.
# No seed is passed, so a re-run is not expected to reproduce the exact
# numbers shown in the outputs below.
model = Word2Vec(sentences= sentences, vector_size=40, window=5, min_count=5, workers=20)
# Saved under a fixed name; running the cell again replaces that file.
model.save("third_varied.model")

CPU times: user 21min 1s, sys: 14.6 s, total: 21min 16s
Wall time: 8min 11s


In [128]:
# Analogy vector: 'konge' (king) - 'mann' (man) + 'kvinne' (woman).
v1 = model.wv['konge'] - model.wv['mann'] + model.wv['kvinne']

In [124]:
# Cosine similarity between v1 and every row of model.wv.vectors. The result
# is a bare float array with no words attached; something like
# model.wv.similar_by_vector(v1) would be needed to see the nearest words.
# NOTE(review): the execution counts (this cell 124, v1 128, training 150)
# indicate the displayed output predates the current model -- re-run to confirm.
model.wv.cosine_similarities(v1, model.wv.vectors)

array([-0.03368857, -0.13378242, -0.07588252, ...,  0.37701008,
        0.33858377,  0.17295977], dtype=float32)

In [151]:
# Sanity check: 10 nearest neighbours of 'kvinne' (woman).
model.wv.most_similar('kvinne', topn=10)

[('mann', 0.9317436814308167),
 ('gutt', 0.8670259714126587),
 ('pike', 0.8615455627441406),
 ('dame', 0.836986243724823),
 ('jente', 0.8322846293449402),
 ('person', 0.7917787432670593),
 ('soldat', 0.7851689457893372),
 ('ung', 0.7745891809463501),
 ('hund', 0.7726834416389465),
 ('tenåring', 0.7620474100112915)]

In [152]:
# Sanity check: 10 nearest neighbours of 'bil' (car).
model.wv.most_similar('bil', topn=10)

[('lastebil', 0.8768933415412903),
 ('motorsykkel', 0.8584362864494324),
 ('buss', 0.8576847910881042),
 ('sykkel', 0.8522830605506897),
 ('båt', 0.844989538192749),
 ('traktor', 0.8176421523094177),
 ('bilen', 0.8040301203727722),
 ('hest', 0.788760244846344),
 ('butikk', 0.7867204546928406),
 ('vogn', 0.7843781113624573)]

In [153]:
# Sanity check: 10 nearest neighbours of 'torsk' (cod).
model.wv.most_similar('torsk', topn=10)

[('sild', 0.9531148076057434),
 ('kveite', 0.9240300059318542),
 ('rogn', 0.9166420698165894),
 ('ørret', 0.9137999415397644),
 ('gjedde', 0.9101670384407043),
 ('makrell', 0.9085878729820251),
 ('klippfisk', 0.9074795842170715),
 ('laks', 0.9063968062400818),
 ('sjøørret', 0.8915678262710571),
 ('hyse', 0.8878176808357239)]

In [154]:
# Sanity check: 10 nearest neighbours of 'blå' (blue).
model.wv.most_similar('blå', topn=10)

[('grå', 0.9053907990455627),
 ('gule', 0.9034755825996399),
 ('lilla', 0.8953226804733276),
 ('lyseblå', 0.8931620717048645),
 ('oransje', 0.8785077333450317),
 ('rosa', 0.8751715421676636),
 ('hvite', 0.8727355599403381),
 ('sorte', 0.8705081939697266),
 ('fiolette', 0.8588444590568542),
 ('blodrøde', 0.8502930998802185)]

In [155]:
# Sanity check: 10 nearest neighbours of 'statsminister' (prime minister).
model.wv.most_similar('statsminister', topn=10)

[('utenriksminister', 0.9169834852218628),
 ('generalsekretær', 0.8974413871765137),
 ('minister', 0.8844966292381287),
 ('ambassadør', 0.8800327181816101),
 ('president', 0.8783475160598755),
 ('forsvarsminister', 0.8723247051239014),
 ('visepresident', 0.868618369102478),
 ('guvernør', 0.8630712032318115),
 ('gerhardsen', 0.85364830493927),
 ('gorbatsjov', 0.8531057834625244)]

In [197]:
# Sanity check: 20 nearest neighbours of 'dronning' (queen).
model.wv.most_similar('dronning', topn=20)

[('prins', 0.928143322467804),
 ('prinsesse', 0.8946334719657898),
 ('datter', 0.8366270661354065),
 ('ridder', 0.8334811329841614),
 ('munk', 0.8287879228591919),
 ('nevø', 0.8172115087509155),
 ('sønnesønn', 0.8165740370750427),
 ('jomfru', 0.8157027363777161),
 ('høvding', 0.8143293857574463),
 ('sønn', 0.8133881092071533),
 ('fetter', 0.8112004995346069),
 ('jesabel', 0.809544026851654),
 ('gudinne', 0.8078281283378601),
 ('borg', 0.8072675466537476),
 ('teresa', 0.805499792098999),
 ('kusine', 0.8035560250282288),
 ('nonne', 0.7997184991836548),
 ('martyr', 0.7988517880439758),
 ('broder', 0.7984464168548584),
 ('skytshelgen', 0.7950207591056824)]

In [204]:
# NOTE(review): the query string is empty, yet the recorded output below lists
# pronoun-like neighbours ('ham', 'oss', 'dere', ...). The word actually
# queried appears to have been lost -- restore it before re-running, since an
# empty key is unlikely to be in the vocabulary.
model.wv.most_similar('', topn=20)

[('ham', 0.8746190667152405),
 ('oss', 0.8503870964050293),
 ('dere', 0.8433657288551331),
 ('meg', 0.8245152235031128),
 ('henne', 0.8128118515014648),
 ('hverandre', 0.7852757573127747),
 ('menneskene', 0.7778470516204834),
 ('deg', 0.7773881554603577),
 ('gud', 0.740097165107727),
 ('jesus', 0.738551676273346),
 ('tingene', 0.7332742810249329),
 ('tilhørerne', 0.728418231010437),
 ('englene', 0.7246710062026978),
 ('barna', 0.7230252623558044),
 ('folk', 0.7198902368545532),
 ('ting', 0.7193999886512756),
 ('mennesker', 0.7179787755012512),
 ('herren', 0.7163693308830261),
 ('gjestene', 0.7044335603713989),
 ('mennene', 0.6965382695198059)]

In [193]:
# Raw embedding for 'deltidsstilling' (part-time position): 40 floats,
# matching vector_size=40 in the training cell.
model.wv['deltidsstilling']

array([-0.08971797,  0.07842822,  0.16596247, -0.14734945,  0.04119027,
       -0.15446575,  0.44952488,  0.04890817, -0.19546679,  0.07589255,
        0.05008372, -0.00709923,  0.05072048, -0.20595767,  0.12802891,
        0.08077952, -0.41787663,  0.36620158, -0.20778832, -0.50555795,
       -0.12203322, -0.3404704 ,  0.20983867, -0.5469011 ,  0.27939728,
        0.12360768,  0.00980221,  0.37181962,  0.197234  ,  0.0953685 ,
        0.431384  ,  0.21606198,  0.39121142, -0.950384  ,  0.17196113,
       -0.16799541,  0.7429617 ,  0.09146556, -0.6701796 , -0.2992059 ],
      dtype=float32)