In [2]:
index_file = "/home/szymon/lingwy/nkjp/nkjp_index.txt"
nkjp_path = "/home/szymon/lingwy/nkjp/pełny/"
vecs_path = "/home/szymon/lingwy/nkjp/wektory/nkjp+wiki-lemmas-all-100-skipg-ns.txt/"
vecs_dim = 100
window_size = 4 # how many words to condider on both sides of the target
batch_size = window_size * 2
corp_runs = 2
learning_rate = 0.3
reg_rate = 0.005
points_per_neg_sample = 1

In [3]:
fragms = []

In [4]:
# Read the NKJP fragments
from lxml import etree
import os.path

unique_words = set()
words_count = 0

with open(index_file) as index:
    for fragm_id in index:
        filepath = nkjp_path+fragm_id.strip()+'/ann_words.xml'
        if not os.path.isfile(filepath): # there are two versions of name of these files
            filepath = nkjp_path+fragm_id.strip()+'/ann_named.xml'
        if not os.path.isfile(filepath):
            print('Note: cannot access {}'.format(fragm_id.strip()))
            continue
        fragms += [[]]
        
        tree = etree.parse(filepath)
         # tag is namespaced, .// for finding anywhere in the tree
        for elem in tree.iterfind('.//{http://www.tei-c.org/ns/1.0}f[@name]'):
            if elem.attrib['name'] == 'base':
                fragms[-1].append(elem[0].text) # first child <string>
                words_count += 1
                unique_words.add(elem[0].text)

In [5]:
print(fragms[0])

['zatrzasnąć', 'drzwi', 'od', 'mieszkanie', ',', 'dwa', 'raz', 'przekręcić', 'klucz', ',', 'nacisnąć', 'klamka', ',', 'by', 'sprawdzić', ',', 'czy', 'dobrze', 'zamknąć', ',', 'zbiec', 'po', 'schody', ',', 'minąć', 'furtka', ',', 'także', 'on', 'zamknąć', ',', 'i', 'znaleźć się', 'na', 'wąski', 'uliczka', 'między', 'ogródek', ',', 'gdzie', 'drzemać', 'w', 'majowy', 'słońce', 'trójkątny', 'ciemnozielony', 'świerk', ',', 'jaki', 'być', 'w pobliżu', 'on', 'dom', '.', 'bohater', 'powieść', 'Paźniewski', 'być', 'miasto', ',', 'Krzemieniec', '.', 'jak', 'za', 'czas', 'Słowacki', 'funkcjonować', 'liceum', 'i', 'płynąć', 'Ikwa', '.', 'Krzemieniec', 'powieściowy', 'być', 'tamten', 'Krzemieniec', ',', 'ale', 'być', 'także', 'miasto', 'wywołać', 'z', 'osobisty', 'pamięć', 'Paźniewski', '.', 'swój', 'droga', 'do', 'ten', 'miasto', 'autor', '„', 'krótki', 'dzień', '”', 'zacząć', 'z daleka', 'bardzo', '.', '„', 'nigdy', 'być', 'w', 'ten', 'dom', ',', 'a', 'przecież', 'wszystko', 'pamiętać', 'doskonal

In [6]:
import numpy as np

In [7]:
first_line = True
word_n = 0
word_idx = {}

# we'll read those from the data file
vecs_count = 0
vecs_dim = 100

with open(vecs_path+"data") as vecs_file:
    for line in vecs_file:
        if first_line:
            # Read metadata.
            vecs_count = int(line.split(' ')[0])
            vecs_dim = int(line.split(' ')[1])
            first_line = False
            continue
        # Read lemma base forms.
        word_idx[line.split(' ')[0]] = word_n
        word_n += 1

In [8]:
word_vecs = np.loadtxt(vecs_path+"data", encoding="utf-8",
                       dtype=np.float32, # tensorflow's requirement
                       skiprows=1, usecols=tuple(range(1, vecs_dim+1)))

In [9]:
# Add the dummy boundary/unknown marker.
word_vecs = np.vstack([word_vecs, np.zeros((1,vecs_dim), dtype=np.float32)])
vecs_count += 1

In [10]:
def word_id(word):
    return word_idx[word] if word in word_idx else vecs_count-1

In [11]:
word_id('ffggf')

1549322

In [12]:
vecs_count, vecs_dim

(1549323, 100)

# Keras

In [13]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, Embedding
from keras.optimizers import SGD

Using TensorFlow backend.
  return f(*args, **kwds)


In [14]:
#
# Training corpus preparation.
#

from random import randint
from math import floor

# We need a special token for cases when the target word is near the start or end of sentence.
bound_token_id = vecs_count - 1 # the zero additional vector

sample_n = 0

train = np.zeros(((words_count + words_count // points_per_neg_sample) * corp_runs,
                  window_size * 2 + 1), dtype='int')
labels = np.ones(((words_count + words_count // points_per_neg_sample) * corp_runs,),
                dtype='int')

for run_n in range(corp_runs):
    fragm_n = 0
    word_n = 0
        
    while fragm_n < len(fragms) and sample_n < train.shape[0]:
        
        # The positive sample.
        train[sample_n, window_size] = word_id(fragms[fragm_n][word_n])
        
        for j in range(window_size):
            train[sample_n, j] = (word_id(fragms[fragm_n][word_n-j-1]) if word_n-j-1 >= 0
                                  else bound_token_id)
            train[sample_n, window_size+j+1] = (word_id(fragms[fragm_n][word_n+j+1])
                                                if word_n+j+1 < len(fragms[fragm_n])
                                                else bound_token_id)
        
        # (Maybe) a negative sample.
        if word_n % points_per_neg_sample == 0:
            sample_n += 1
            
            neg_sample = []
            while len(neg_sample) < 1 + (2 * window_size):
                neg_sample.append(randint(0, vecs_count-1))
                
            train[sample_n,] = np.asarray(neg_sample, dtype='int')
            labels[sample_n] = 0.0
                
        sample_n += 1
        word_n += 1
        try:
            while word_n == len(fragms[fragm_n]):
                word_n = 0
                fragm_n += 1
        except IndexError: # happens on the end of the corpus
            break

In [16]:
model = Sequential()                                                                                               
model.add(Embedding(vecs_count,
                    vecs_dim,
                    weights=[word_vecs],
                    input_length=window_size * 2 + 1,
                    trainable=False))                                                                              
model.add(LSTM(96))
model.add(Dense(1))
model.add(Activation('sigmoid'))

opt = SGD(lr=learning_rate, decay=reg_rate)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [17]:
model.fit(train, labels)

Epoch 1/1


<keras.callbacks.History at 0x7f464956ed68>

In [16]:
model

<keras.models.Sequential at 0x7fc56c618860>

In [28]:
X = np.asarray([word_id(w) for w in
                ['Niemcy', 'znienacka', 'wkroczyć', 'do', 'Francja', 'w', 'maj', 'kolejny', 'rok']],
               dtype='int')
X

(1, 9)

In [62]:
Y = np.asarray([word_id(w) for w in
                ['kot', 'zimny', 'okrągły', 'start', 'do', 'w', 'Czechy', 'wykres', 'klub']],
               dtype='int')
Y
Y = np.asarray([word_id(w) for w in
                ['w', 'zimny', 'miesiąc', 'zakładać', 'but', 'do', 'szkoła', 'każdy', 'dzień']],
               dtype='int')
Y

array([   0, 3133,  272, 1251, 2252,    8,  102,   96,   57])

In [26]:
train.shape

(2646184, 9)

In [63]:
model.predict(np.atleast_2d(Y), batch_size=1)

array([[0.9999851]], dtype=float32)

# Torch

In [13]:
import torch
from torch.autograd import Variable

In [15]:
from random import randint
from math import floor

fragms_step = 1 # set to higher values if we want to skip some proportion of fragments
# We need a special token for cases when the target word is near the start or end of sentence.
bound_token_id = vecs_count - 1 # the zero additional vector

def skipgram_batches():
    for run_n in range(corp_runs):
        sent_n = 0
        word_n = 0
        
        target_n = 0 # relative to the current batch
        
        batch = np.ndarray(shape=(batch_size), dtype=np.int32)
        labels = np.ndarray(shape=(batch_size), dtype=np.int32)
        
        while sent_n < len(fragms):
            for j in range(window_size):
                batch[target_n*window_size+j] = word_id(fragms[sent_n][word_n])
            # "Good" examples - words near the target (we will let TensorFlow randomize the "bad" ones)
            for j in range(window_size // 2):
                labels[target_n*window_size+j*2] = (word_id(fragms[sent_n][word_n-j-1]) if word_n-j-1 >= 0
                                                       else bound_token_id)
                labels[target_n*window_size+j*2+1] = (word_id(fragms[sent_n][word_n+j+1])
                                                         if word_n+j+1 < len(fragms[sent_n])
                                                         else bound_token_id)
                
            target_n += 1
            if target_n == (batch_size // window_size):
                yield batch, labels
                batch = np.ndarray(shape=(batch_size), dtype=np.int32)
                labels = np.ndarray(shape=(batch_size), dtype=np.int32)
                target_n = 0
                
            word_n += 1
            try:
                while word_n == len(fragms[sent_n]):
                    word_n = 0
                    sent_n += fragms_step
                    if (floor(sent_n / len(fragms) * 10)
                        > floor((sent_n-fragms_step) / len(fragms) * 10)):
                        print('{}0%'.format(floor(sent_n / len(fragms) * 10)), end=' ')
            except IndexError: # happens on the end of the corpus
                break
                
        batch[target_n:] = 0.0
        labels[target_n:, :] = 0.0
        yield batch, labels#, (run_n == corp_runs - 1)

In [36]:
classif_W1 = Variable(torch.randn(vecs_dim*2, vecs_dim*2), requires_grad=True)
classif_b1 = Variable(torch.randn(1, vecs_dim*2), requires_grad=True)
classif_W2 = Variable(torch.randn(vecs_dim*2, 1), requires_grad=True)
classif_b2 = Variable(torch.randn(1, 1), requires_grad=True)

In [38]:
import datetime

batch_n = 0

print('Training start:', datetime.datetime.now())
for batch, labels in skipgram_batches():
    # We get word indices, convert them to vectors.
    batch = word_vecs[batch] # main words
    pos_examples = word_vecs[labels]
    neg_examples = word_vecs[np.random.randint(vecs_count, size=pos_examples.shape)]
    
    pos_batch = Variable(torch.Tensor(np.hstack((batch, pos_examples))), requires_grad=False)
    pos_preds = ((pos_batch.mm(classif_W1) + classif_b1).sigmoid().mm(classif_W2)
                      + classif_b2).sigmoid()
    neg_batch = Variable(torch.Tensor(np.hstack((batch, pos_examples))), requires_grad=False)
    neg_preds = (((neg_batch.mm(classif_W1) + classif_b1).sigmoid()).mm(classif_W2)
                      + classif_b2).sigmoid()
    
    loss1 = ((- pos_preds).sum() / batch_size +
             # regularization:
                (classif_W1.abs().sum() + classif_b1.abs().sum() +
                 classif_W2.abs().sum() + classif_b2.abs().sum())
                * reg_rate)
    loss1.backward()
    loss2 = (neg_preds.sum() / (batch_size * 10) +
             # regularization:
                (classif_W1.abs().sum() + classif_b1.abs().sum() +
                 classif_W2.abs().sum() + classif_b2.abs().sum())
                * reg_rate)
    loss2.backward()
    if batch_n % 25000 == 0:
        print("Loss: {}".format(loss1 + loss2))
        #print(pos_preds)
        #print(neg_preds)
    
    classif_W1.data = learning_rate * classif_W1.grad.data
    classif_b1.data = learning_rate * classif_b1.grad.data
    classif_W2.data = learning_rate * classif_W2.grad.data
    classif_b2.data = learning_rate * classif_b2.grad.data
    classif_W1.grad.data.zero_()
    classif_b1.grad.data.zero_()
    classif_W2.grad.data.zero_()
    classif_b2.grad.data.zero_()
    
    batch_n += 1
print('Training end:', datetime.datetime.now())

Training start: 2018-02-21 22:48:45.389630
Loss: Variable containing:
 319.7979
[torch.FloatTensor of size 1]

Loss: Variable containing:
 1.2369
[torch.FloatTensor of size 1]

Loss: Variable containing:
 1.2369
[torch.FloatTensor of size 1]

Loss: Variable containing:
 1.2370
[torch.FloatTensor of size 1]

Loss: Variable containing:
 1.2371
[torch.FloatTensor of size 1]

Loss: Variable containing:
 1.2371
[torch.FloatTensor of size 1]

Loss: Variable containing:
 1.2368
[torch.FloatTensor of size 1]

10% Loss: Variable containing:
 1.2368
[torch.FloatTensor of size 1]

20% 30% 40% Loss: Variable containing:
 1.2369
[torch.FloatTensor of size 1]

50% Loss: Variable containing:
 1.2370
[torch.FloatTensor of size 1]

Loss: Variable containing:
 1.2369
[torch.FloatTensor of size 1]

Loss: Variable containing:
 1.2369
[torch.FloatTensor of size 1]

60% Loss: Variable containing:
 1.2370
[torch.FloatTensor of size 1]

Loss: Variable containing:
 1.2367
[torch.FloatTensor of size 1]

Loss: V

IndexError: too many indices for array

In [43]:
word = "być"
softmax = torch.nn.Softmax()
vec = Variable(torch.Tensor(
            np.hstack((np.broadcast_to(word_vecs[word_id(word), :], (vecs_count, vecs_dim)),
                       word_vecs))
         ), requires_grad=False)
pred = softmax((((vec.mm(classif_W1) + classif_b1).sigmoid()).mm(classif_W2)
                      + classif_b2))

  


In [41]:
pred.min()

Variable containing:
 0.3104
[torch.FloatTensor of size 1]

# Tensorflow (currently unused)

In [17]:
import tensorflow as tf
import math

  return f(*args, **kwds)


In [113]:
tf.reset_default_graph()

with tf.device('/cpu:0'):
    # Model parameters: word embeddings and model weights & biases for each word.
    embeddings = tf.Variable(tf.random_uniform([vecs_count, vecs_dim], -1.0, 1.0))
    nce_weights = tf.Variable(tf.truncated_normal([vecs_count, vecs_dim],
                                                  stddev=1.0 / math.sqrt(vecs_dim)))
    nce_biases = tf.Variable(tf.zeros([vecs_count]))
    
    # The computation graph.
    inputs = tf.placeholder(tf.int32, shape=[batch_size])
    labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    embedding_layer = tf.nn.embedding_lookup(embeddings, inputs)
    # Note that word2vec has no "real" hidden layers apart from the embedding.
    
    # Number of random words to sample apart from the true target; the model should learn to
    # assign low probability to them given the context.
    negative_samples_n = batch_size
    
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                         biases=nce_biases,
                                         labels=labels,
                                         inputs=embedding_layer,
                                         num_sampled=negative_samples_n,
                                         num_classes=vecs_count))
    # Vanilla SGD seems to work here better - since we train practically a different word vector
    # each time, decaying momentum hinders training of later vectors before they can even be shown
    # to the net, especially in the case of Adagrad's vanishing updates.
    # (NOTE!) here we DO NOT touch the embeddings, we want to only learn nce_weights and biases!
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

In [114]:
import datetime

# we want to use those later:
trained_nce_weights = []
trained_nce_biases = []

with tf.Session() as sess:
    print('Training start:', datetime.datetime.now())
    tf.global_variables_initializer().run()
    i = 0
    for batch_inputs, batch_labels, is_last in skipgram_batches():
        if is_last:
            _, loss_val, trained_nce_weights, trained_nce_biases = sess.run(
                [optimizer, loss, nce_weights, nce_biases],
                feed_dict={inputs: batch_inputs, labels: batch_labels})
            print('Final loss:', loss_val)
            print('Training end:', datetime.datetime.now())
        else:
            _, loss_val = sess.run([optimizer, loss], feed_dict={inputs: batch_inputs,
                                                             labels: batch_labels})
            if (i % 250000 == 0):
                print('(loss: {})'.format(loss_val), end=' ')
        i += 1

Training start: 2018-02-07 19:02:51.118178
(loss: 675.9548950195312) 10% 20% 30% 40% 50% 60% 70% 80% 90% 100% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100% 10% 20% 30% 40% 50% 60% 70% 80% (loss: 2.3737263679504395) 90% 100% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100% 10% 20% 30% 40% 50% (loss: 1.877175211906433) 60% 70% 80% 90% 100% Final loss: 2.2043426
Training end: 2018-02-07 19:25:10.739346


In [115]:
trained_nce_weights.shape, trained_nce_biases.shape

((55766, 100), (55766,))

In [116]:
pred_words = [rev_word_idx[i] if i < vecs_count-1 else ' ' for i in range(idxs.shape[1])][::-1] # reverse

In [117]:
word = "być"
vec = np.reshape(word_vecs[word_id(word), :], (1, vecs_dim))
prediction = np.dot(vec, np.transpose(trained_nce_weights))
prediction = np.add(prediction, trained_nce_biases)

In [118]:
word_id(word)

31166

In [119]:
idxs = np.argsort(prediction)

In [120]:
prediction.shape

(1, 55766)

In [122]:
pred_words[:100]

[' ',
 'kazirodczy',
 'teori',
 'nscrossen',
 'Bierówka',
 'hat-tricka',
 'bazgrać',
 'rozgościć',
 'niezmącony',
 'Valle',
 'łajać',
 '7-13',
 'Brett',
 'Silesii',
 'zaliczkowy',
 'Wojcieszów',
 'Sław',
 'odwiecznie',
 '²',
 'kwalifikator',
 'stulejka',
 'Eugen',
 'avi',
 'abażur',
 'wydeptywać',
 'Dederko',
 'płatowiec',
 'niedosłuch',
 'Kornasiewicz',
 'Krysiewicza',
 'Besbir',
 'separować',
 'Rumsfeld',
 'Kurowo',
 'obstrzał',
 'Galapagos',
 'odmownie',
 'Dłutów',
 'wolnomularski',
 'Liptak',
 'zbutwiały',
 'Gliwa',
 'Johana',
 'tia',
 'Cieślikowski',
 'doktrynerstwo',
 'nieokrzesany',
 '1440',
 'EG',
 'Lokia',
 'Elo',
 'budzetu',
 'UFK',
 '70-300',
 'wszystkowiedzący',
 'Burger',
 'ankiete',
 'Małastowskiej',
 'Kubiś',
 'rozpustny',
 'Galos',
 '75-300',
 'JKK',
 '637-12-23',
 'płaszczak',
 'wine',
 'ats',
 'żupan',
 'zauwazyl',
 'wykusz',
 "There's",
 'Solar',
 'skamleć',
 'szczotkować',
 'Karamazow',
 'gromnica',
 'dysgrafia',
 'łyżworolka',
 'Zarzycka',
 'Muskat',
 'choleryk',
 