In [229]:
import json
from itertools import chain
from pprint import pprint
from time import time

import numpy as np

from gensim.models import Word2Vec
from gensim.corpora.dictionary import Dictionary

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout
from keras.layers.wrappers import TimeDistributed
np.random.seed(1337)

In [89]:
def index_to_one_hot_encoding(index, vector_length):
    return [1 if i == index else 0 for i in xrange(vector_length)]

In [166]:
treebank_file1 = open('json/OPTA-treebank-0.1.json')
treebank_file2 = open('skladnica_output.json')
treebank = chain(list(json.load(treebank_file1)), list(json.load(treebank_file2)))

X = []
y = []
for entry in treebank:
    tree = entry['parsedSent']
    words = []
    sentiment = None
    for index, node in enumerate(tree):
        word = node.split('\t')[1].lower()
        words.append(word)
        if node.split('\t')[10] == 'S':
            sentiment = index
    if sentiment:
        X.append(words)
        y.append(index_to_one_hot_encoding(sentiment, len(words)))

# for a, b in zip(X, y):
#     print ' '.join(a)
#     print b

dataset_length = len(X)
slicing_point = int(dataset_length*0.9)

X_train = X[:slicing_point]
y_train = y[:slicing_point]
X_test = X[slicing_point+1:]
y_test = y[slicing_point+1:]

treebank_vocabulary = set(chain(*X))
print len(treebank_vocabulary)

3906


In [128]:

w2v_model = Word2Vec.load('w2v_allwiki_nkjp300_200.model')

In [131]:
w2v_vocabulary = Dictionary()
w2v_vocabulary.doc2bow(w2v_model.vocab.keys(), allow_update=True)


706935

In [143]:
w2indx = {v: k+1 for k, v in w2v_vocabulary.items()}
w2vec = {word: model[word] for word in w2indx.keys()}

In [165]:
w2v_vocabulary_size = len(w2indx) + 1
w2v_vocabulary_dimension = len(w2vec.values()[0])

In [172]:
# our_dict = {}
# missing_count = 0

# for word in treebank_vocabulary:
#     if word in w2v_model:
#         our_dict[word] = w2v_model[word]
#     else:
#         missing_count += 1
# #         print("missing key %s" %(word)) 
#         our_dict[word] = np.zeros(w2v_vocabulary_dimension)
# print missing_count

241


In [219]:
def map_treebank_words_to_w2v_indices(treebank_data, w2indx):
    treebank_data_vec = []
    for sentence in treebank_data:
        vectorized_sentence = []
        for word in sentence:
            try:
                vectorized_sentence.append(w2indx[word])
            except KeyError:
                vectorized_sentence.append(0)
        treebank_data_vec.append(vectorized_sentence)
    return treebank_data_vec 

X_train_vec = map_treebank_words_to_w2v_indices(X_train, w2indx)
X_test_vec = map_treebank_words_to_w2v_indices(X_test, w2indx)


In [195]:
embedding_weights = np.zeros((w2v_vocabulary_size , w2v_vocabulary_dimension))
for word, index in w2indx.items():
    embedding_weights[index, :] = w2vec[word]

In [232]:
input_length = 100
X_train = sequence.pad_sequences(X_train_vec, maxlen=input_length)
X_test = sequence.pad_sequences(X_test_vec, maxlen=input_length)

model = Sequential()

model.add(Embedding(output_dim=w2v_vocabulary_dimension,
                    input_dim=w2v_vocabulary_size,
                    mask_zero=True,
                    weights=[embedding_weights],
                    input_length=input_length))
# input: a sequence of word indices
# output: a sequence of word vectors (200-dim)

model.add(LSTM(output_dim=w2v_vocabulary_dimension, return_sequences=True))
# input: a sequence of word vectors (200-dim)
# output: a sequence of 200-dim vectors

model.add(Dropout(0.3))
# preserves data dimensions

model.add(TimeDistributed(Dense(1, activation='sigmoid')))
# apply Dense to each vector in a sequence, i.e. apply 

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              class_mode='binary',
              metrics=['accuracy'])

In [233]:
batch_size = 32
n_epoch = 2

start_time = time()
hist = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=n_epoch, 
                 validation_data=(X_test, y_test), verbose=2)
end_time = time()

print("Training took %f secs" %(end_time - start_time))

Exception: You are passing a target array of shape (1288, 1) while using as loss `categorical_crossentropy`. `categorical_crossentropy` expects targets to be binary matrices (1s and 0s) of shape (samples, classes). If your targets are integer classes, you can convert them to the expected format via:
```
from keras.utils.np_utils import to_categorical
y_binary = to_categorical(y_int)
```

Alternatively, you can use the loss function `sparse_categorical_crossentropy` instead, which does expect integer targets.