In [1]:
import json
from itertools import chain
from pprint import pprint
from time import time

import numpy as np

from gensim.models import Word2Vec
from gensim.corpora.dictionary import Dictionary

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout
from keras.layers.wrappers import TimeDistributed
np.random.seed(1337)

Using Theano backend.


In [2]:
def indices_to_one_hot_encodings(index, vector_length):
    return [1 if i == index else 0 for i in xrange(vector_length)]

In [3]:
# Load and process treebank data

treebank_file1 = open('json/OPTA-treebank-0.1.json')
treebank_file2 = open('skladnica_output.json')
treebank = chain(list(json.load(treebank_file1)), list(json.load(treebank_file2)))

X = []
y = []
for entry in treebank:
    tree = entry['parsedSent']
    words = []
    sentiment = None
    for index, node in enumerate(tree):
        word = node.split('\t')[1].lower()
        words.append(word)
        if node.split('\t')[10] == 'S':
            sentiment = index
    if sentiment:
        X.append(words)
        y.append(indices_to_one_hot_encodings(sentiment, len(words)))

dataset_length = len(X)
slicing_point = int(dataset_length*0.9)

X_train_raw = X[:slicing_point]
y_train_raw = y[:slicing_point]
X_test_raw = X[slicing_point+1:]
y_test_raw = y[slicing_point+1:]

treebank_vocabulary = set(chain(*X))
print len(treebank_vocabulary)

3906


In [4]:
print X_train_raw[2]
print y_train_raw[2]

[u'raczej', u'nie', u'dla', u'm\u0142odych', u'ch\u0142opc\xf3w', u'.']
[0, 0, 0, 1, 0, 0]


In [5]:
w2v_model = Word2Vec.load('w2v_allwiki_nkjp300_200.model')

In [6]:
# Import w2v's dictionary to a bag-of-words model
w2v_vocabulary = Dictionary()
w2v_vocabulary.doc2bow(w2v_model.vocab.keys(), allow_update=True)
print w2v_vocabulary.items()[:10]

[(0, u'zapachnie'), (1, u'PORADNI'), (2, u'Fitelberga'), (3, u'komedianta'), (4, u'Zaprzesta\u0107'), (5, u'Nampo'), (6, u'Schloendorff'), (7, u'zn\u0119kanym'), (8, u'synkopy'), (9, u'unifikacji')]


In [7]:
# Initialize dicts for representing w2v's dictionary as indices and 200-dim vectors
w2indx = {v: k+1 for k, v in w2v_vocabulary.items()}
w2vec = {word: w2v_model[word] for word in w2indx.keys()}

In [8]:
w2v_vocabulary_size = len(w2indx) + 1
w2v_vocabulary_dimension = len(w2vec.values()[0])

In [9]:
def map_treebank_words_to_w2v_indices(treebank_data, w2indx):
    treebank_data_vec = []
    for sentence in treebank_data:
        vectorized_sentence = []
        for word in sentence:
            try:
                vectorized_sentence.append(w2indx[word])
            except KeyError:  # words absent in w2v model will be indexed as 0s
                vectorized_sentence.append(0)
        treebank_data_vec.append(vectorized_sentence)
    return treebank_data_vec 

X_train = map_treebank_words_to_w2v_indices(X_train_raw, w2indx)
X_test = map_treebank_words_to_w2v_indices(X_test_raw, w2indx)

print X_test[4]

[51615, 277138, 416148, 422622, 318134, 584324, 176240, 503788, 0]


In [10]:
# Define numpy weights matrix for embedding layer
embedding_weights = np.zeros((w2v_vocabulary_size , w2v_vocabulary_dimension))
for word, index in w2indx.items():
    embedding_weights[index, :] = w2vec[word]

In [11]:
# max sentence length
max(
    len(max(X_train, key=lambda sentence: len(sentence))),
    len(max(X_test, key=lambda sentence: len(sentence)))
)

39

In [14]:
# Normalize sequences length to 40 (will be extended with 0s)
input_length = 40
X_train = sequence.pad_sequences(X_train, maxlen=input_length)
X_test = sequence.pad_sequences(X_test, maxlen=input_length)

y_train = sequence.pad_sequences(y_train_raw, maxlen=input_length)
y_test = sequence.pad_sequences(y_test_raw, maxlen=input_length)

print X_train[2]
print y_train[2]

[     0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0 580109 431241 193758 639684 453311      0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0]


In [15]:
model = Sequential()

model.add(Embedding(input_dim=w2v_vocabulary_size,  # input: a sequence of word indices
                    output_dim=w2v_vocabulary_dimension,  # output: a sequence of word vectors (200-dim)
                    mask_zero=True,
                    weights=[embedding_weights])  #initialize weights with pre-trained embeddings
         )

model.add(LSTM(input_dim=w2v_vocabulary_dimension,  # input: a sequence of word vectors (200-dim)
               output_dim=w2v_vocabulary_dimension,  # output: a sequence of 200-dim vectors
               return_sequences=True))

# model.add(Dropout(0.3))
# preserves data dimensions

model.add(TimeDistributed(Dense(1, activation='sigmoid')))
# apply Dense to each vector in a sequence

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [16]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                       Output Shape        Param #     Connected to                     
embedding_2 (Embedding)            (None, None, 200)   141387200   embedding_input_2[0][0]          
____________________________________________________________________________________________________
lstm_2 (LSTM)                      (None, None, 200)   320800      embedding_2[0][0]                
____________________________________________________________________________________________________
timedistributed_2 (TimeDistributed)(None, None, 1)     201         lstm_2[0][0]                     
Total params: 141708201
____________________________________________________________________________________________________


In [18]:
batch_size = 32
n_epoch = 2


hist = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=n_epoch, 
                 validation_data=(X_test, y_test), verbose=2)

Exception: A target array with shape (1288, 40) was passed for an output of shape (None, None, 1) while using as loss `binary_crossentropy`. This loss expects targets to have the same shape as the output.