In [1]:
from multiprocessing import cpu_count
import os
import sys
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

os.environ['OMP_NUM_THREADS'] = str(cpu_count())
notebook_dir = !pwd
# Some issues with jupyter, so had to add these
REPO_PATH = notebook_dir[0].rsplit("/notebooks", 1)[0]
sys.path.append(REPO_PATH)
# Import Geiger modules
from geiger.utils import load_word_vectors
from geiger import coling, transform, models, evaluate, stores
%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


### After loading the modules, we load the data and define some constants for modelling the sequence classification

In [2]:
# Load the train/dev splits from the English COLING dataset directory
coling_english_dir = os.path.join(REPO_PATH, "datasets/coling/english")
x_train, x_dev, y_train, y_dev = coling.load_coling_data(coling_english_dir)

In [3]:
# Modelling constants used by the cells below

# Number of target classes: {non-aggression, covert aggression, overt aggression}
n_classes = 3
# Max number of tokens
max_features = 30000
# Max sequence length
maxlen = 100
# fastText embedding size
embed_size = 300
# Training batch size
batch_size = 32
# Number of training epochs
epochs = 1

### Just look into the datasets to see if it's all legit

In [4]:
# Print the y_* shapes first, then the number of items in each x_* split
for split_info in (y_train.shape, y_dev.shape, len(x_train), len(x_dev)):
    print(split_info)

(11999, 3)
(3001, 3)
11999
3001


### Experiment conditions

The cell below is crucial as it defines the experimental conditions. We have created an abstraction for loading the embeddings, which takes a list of languages (`langs`) and also an `apply_transform` boolean value. With this, we can define two types of embedding lookups:
- single language (only english vectors are loaded)
- multi language with transform (english and hindi vectors are loaded and the vectors are aligned to a common space using [these transformation matrices](https://github.com/Babylonpartners/fastText_multilingual))

In [5]:
# Load the embedding lookups. This takes a while, so avoid re-executing this
# cell over and over again.
single_lang_embed_lookup = stores.MultiLangVectorStore(
    langs=["en"],
)
multi_lang_embed_lookup = stores.MultiLangVectorStore(
    langs=["en", "hi"],
    apply_transform=True,
)

reading word vectors from /Users/thiago/code/tgalery/geiger/resources/wiki-news-300d-1M-subword.vec
reading word vectors from /Users/thiago/code/tgalery/geiger/resources/wiki-news-300d-1M-subword.vec
reading word vectors from /Users/thiago/code/tgalery/geiger/resources/wiki.hi.vec


### Transformer and Embedding Matrix
For this stage, we create a sequence transformer and embedding matrices for testing the conditions we defined.

In [12]:
# Build the transformer from the train and dev texts combined
all_texts = list(x_train) + list(x_dev)
transformer = transform.KerasTransformer(all_texts, max_features, maxlen)

In [13]:
# Build the embedding matrix from the single-language lookup and show its shape
single_lang_embed_matrix, _ = transformer.generate_embedding_matrix(
    single_lang_embed_lookup,
    embed_size,
)
print(single_lang_embed_matrix.shape)

100%|██████████| 23800/23800 [00:00<00:00, 63056.03it/s]

6001 words were out of vocabulary.
(23801, 300)





In [14]:
# Build the embedding matrix from the multi-language lookup and show its shape
multi_lang_embed_matrix, _ = transformer.generate_embedding_matrix(
    multi_lang_embed_lookup,
    embed_size,
)
print(multi_lang_embed_matrix.shape)

100%|██████████| 23800/23800 [00:00<00:00, 63019.16it/s]

6001 words were out of vocabulary.
(23801, 300)





### Model evaluation
We can now create the models and evaluate whether there's any improvement in incorporating OOV (out-of-vocabulary) vectors from multiple languages.

In [15]:
# Pad sequences (train split first, then dev)
x_train_pad, x_dev_pad = [
    transformer.texts_to_seq(texts) for texts in (x_train, x_dev)
]
# Define the summary callback, evaluated against the dev split
SumEval = evaluate.SummaryEvaluation(
    validation_data=(x_dev_pad, y_dev),
    interval=1,
)

In [16]:
# Build the pooled GRU model on top of the single-language embedding matrix
single_lang_model = models.build_pooled_gru(
    n_classes,
    transformer.rel_features,
    maxlen,
    single_lang_embed_matrix,
    embed_size,
)

In [17]:
# Fit the single-language model, validating on the dev split
single_lang_hist = single_lang_model.fit(
    x_train_pad,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_dev_pad, y_dev),
    callbacks=[SumEval],
)

Train on 11999 samples, validate on 3001 samples
Epoch 1/1

 F1Score - epoch: 1 - score:
             precision    recall  f1-score   support

          0       0.66      0.68      0.67      1233
          1       0.50      0.57      0.53      1057
          2       0.57      0.41      0.48       711

avg / total       0.58      0.58      0.58      3001
 



In [18]:
# Build the pooled GRU model on top of the multi-language embedding matrix
multi_lang_model = models.build_pooled_gru(
    n_classes,
    transformer.rel_features,
    maxlen,
    multi_lang_embed_matrix,
    embed_size,
)

In [19]:
# Fit the multi-language model, validating on the dev split
multi_lang_hist = multi_lang_model.fit(
    x_train_pad,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_dev_pad, y_dev),
    callbacks=[SumEval],
)

Train on 11999 samples, validate on 3001 samples
Epoch 1/1

 F1Score - epoch: 1 - score:
             precision    recall  f1-score   support

          0       0.60      0.78      0.68      1233
          1       0.48      0.48      0.48      1057
          2       0.61      0.27      0.37       711

avg / total       0.56      0.55      0.53      3001
 

