In [80]:
from multiprocessing import cpu_count
import os
import sys
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from sklearn.model_selection import train_test_split

# Let OpenMP-backed libraries use every available core.
os.environ['OMP_NUM_THREADS'] = str(cpu_count())
# Resolve the notebook's working directory in pure Python instead of shelling
# out with `!pwd`, so the cell does not depend on a POSIX shell being present.
# Kept as a one-element list so existing `notebook_dir[0]` lookups still work.
notebook_dir = [os.getcwd()]
# The notebook is run from <repo>/notebooks; strip that trailing component to
# obtain the repository root so the `geiger` package becomes importable.
REPO_PATH = notebook_dir[0].rsplit(os.sep + "notebooks", 1)[0]
# Guard the append so re-running this cell does not pile up duplicate entries.
if REPO_PATH not in sys.path:
    sys.path.append(REPO_PATH)
# Import Geiger modules
from geiger.utils import load_coling_data, load_word_vectors
from geiger import coling, transform, models, evaluate, stores
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [107]:
# Read the train/dev splits from the repository's "datasets" directory.
datasets_dir = os.path.join(REPO_PATH, "datasets")
x_train, x_dev, y_train, y_dev = load_coling_data(datasets_dir)

# Settings consumed by the transformer, the model builder and model.fit below.
n_classes, max_features, maxlen = 3, 30000, 100
embed_size, batch_size, epochs = 300, 16, 2

In [108]:
# Replace each class tag with its one-hot vector (one row per example).
# NOTE: this overwrites y_train / y_dev, so the cell must be run exactly once
# after the data-loading cell.
y_train = np.asarray(list(map(coling.one_hot_encode, y_train)))
y_dev = np.asarray(list(map(coling.one_hot_encode, y_dev)))
# Report label-array shapes followed by the number of texts in each split.
for summary in (y_train.shape, y_dev.shape, len(x_train), len(x_dev)):
    print(summary)

(11999, 3)
(3001, 3)
11999
3001


In [36]:
# Load the embedding lookup. This takes a while (the recorded output shows it
# reading two word-vector files), so avoid re-executing this cell needlessly.
embed_lookup = stores.MultiLangVectorStore()

reading word vectors from /Users/thiago/code/tgalery/geiger/resources/wiki-news-300d-1M-subword.vec
reading word vectors from /Users/thiago/code/tgalery/geiger/resources/wiki.hi.vec


In [105]:
# Build the transformer over the train and dev texts combined.
all_texts = [*x_train, *x_dev]
transformer = transform.KerasTransformer(all_texts, max_features, maxlen)
# Build the embedding matrix from the pre-loaded vector lookup.
embed_matrix = transformer.generate_embedding_matrix(
    embed_lookup,
    embed_size,
)

100%|██████████| 22014/22014 [00:00<00:00, 97667.31it/s]

6183 words were out of vocabulary.





In [83]:
# Print the embedding matrix dimensions as a quick sanity check.
print(embed_matrix.shape)

(22015, 300)


In [109]:
# Instantiate the model with the project's pooled-GRU builder, passing the
# class count, the transformer's `rel_features`, `maxlen`, and the embedding
# matrix together with `embed_size`.
# NOTE(review): `transformer.rel_features` is passed here instead of
# `max_features`; presumably it is the feature count the transformer actually
# kept — confirm against geiger.transform.
model = models.build_pooled_gru(n_classes, transformer.rel_features, maxlen, embed_matrix, embed_size)

In [110]:
# Run both text splits through the transformer, train first and then dev.
# Note that X_test holds the transformed *dev* texts.
X_train, X_test = map(transformer.texts_to_seq, (x_train, x_dev))

In [111]:
# The ROC-AUC callback and model.fit's own validation share the same dev split.
dev_data = (X_test, y_dev)
RocAuc = evaluate.RocAucEvaluation(validation_data=dev_data, interval=1)
# Fit the model; the callback runs alongside training.
hist = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=dev_data,
    callbacks=[RocAuc],
)

Train on 11999 samples, validate on 3001 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.763263 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.751034 

