<a href="https://colab.research.google.com/github/vondersam/sdgs_text_classifier/blob/master/experiments/word_embeddings_elmo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#!pip install -I keras
#!pip install iterative-stratification
#!pip install -U sacremoses

In [0]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, hamming_loss
from sklearn.metrics import roc_auc_score
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Module-level label binarizer: the data-preparation cells below use it to
# turn each document's list of integer label ids into a binary indicator row.
mlb = MultiLabelBinarizer()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.initializers import Constant

import tensorflow_hub as hub
import os
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer

from sacremoses import MosesTokenizer

In [0]:
from google.colab import drive

# Mount Google Drive inside the Colab VM (remounting if it is already mounted).
drive.mount('/content/gdrive', force_remount=True)

# Absolute project root under the mount point. The former relative form
# ("gdrive/My Drive/...") only resolved while the kernel's working directory
# happened to be /content, so any chdir broke every later read and write.
base_dir = "/content/gdrive/My Drive/fastai-v3/sdgs/"

Mounted at /content/gdrive


In [0]:
# Input CSV and ELMo artefact directory, both rooted at `base_dir`
# (which is expected to end with a path separator).
TEXT_DATA_DIR = base_dir + "dataset/cleanup_labelled.csv"
ELMO_DIR = base_dir + "embeddings/elmo/"

# Tunable constants. NOTE(review): none of these four are referenced by the
# cells visible in this notebook -- confirm whether they are still needed.
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 50
VALIDATION_SPLIT = 0.2

# Integer ids of the 17 target classes (1..17 inclusive).
labels_index = list(range(1, 18))

In [0]:
class ElmoEmbeddingLayer(Layer):
    """Keras layer that wraps the TF-Hub ELMo v2 module.

    The layer expects a string tensor whose axis 1 has size one (one text per
    row) and returns the module's ``'default'`` output, which
    ``compute_output_shape`` declares as ``self.dimensions`` (1024) values
    per input row.
    """

    def __init__(self, **kwargs):
        """Set the output width and mark the layer trainable, then defer to Keras."""
        self.dimensions = 1024
        self.trainable=True
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Instantiate the hub module and register its trainable variables."""
        module_name = "{}_module".format(self.name)
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable,
                               name=module_name)

        # Expose the module's variables to Keras so the optimizer updates them.
        # Use the imported `tf` directly: `K.tf` is only an incidental attribute
        # of the Keras backend module and is missing in later Keras releases.
        self.trainable_weights += tf.trainable_variables(scope="^{}/.*".format(module_name))
        super(ElmoEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        """Embed a batch of texts.

        ``x`` is cast to ``tf.string`` and its size-one axis 1 is squeezed away
        before being passed to the module's ``'default'`` signature; the
        ``'default'`` entry of the returned dict is the layer output.
        """
        texts = K.squeeze(K.cast(x, tf.string), axis=1)
        outputs = self.elmo(texts, as_dict=True, signature='default')
        return outputs['default']

    def compute_mask(self, inputs, mask=None):
        """Return an elementwise mask that is False where the input equals '--PAD--'."""
        return K.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
        """Output shape is (batch, self.dimensions)."""
        return (input_shape[0], self.dimensions)

In [0]:
def build_model(num_labels=17, hidden_units=256):
  """Build and compile the ELMo-based multi-label classifier.

  Architecture: string input of shape (1,) -> ElmoEmbeddingLayer ->
  Dense(hidden_units, relu) -> Dense(num_labels, sigmoid). The model is
  compiled with binary cross-entropy and Adam, and its summary is printed.

  Args:
    num_labels: number of independent sigmoid outputs; must match the number
      of columns in the binarized label matrix. Defaults to 17, the value
      that was previously hard-coded.
    hidden_units: width of the hidden dense layer. Defaults to 256, the
      value that was previously hard-coded.

  Returns:
    The compiled Keras ``Model``.
  """
  input_text = layers.Input(shape=(1,), dtype="string")
  embedding = ElmoEmbeddingLayer()(input_text)
  dense = layers.Dense(hidden_units, activation='relu')(embedding)
  # Sigmoid (not softmax): each label is predicted independently.
  pred = layers.Dense(num_labels, activation='sigmoid')(dense)

  model = Model(inputs=[input_text], outputs=pred)

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()

  return model

In [0]:
print('Processing text dataset')
df = pd.read_csv(TEXT_DATA_DIR)
# The labels column holds '|'-separated integer ids; parse it into lists of ints.
df.labels = df.labels.str.split('|').apply(lambda x: [int(i) for i in x])

# Positional hold-out split: first 3000 rows for training, the rest for testing.
# The test slice starts at 3000 (not 3001) so that row 3000 is not silently
# dropped from both sets.
train_df, test_df = df.iloc[:3000], df.iloc[3000:]

# Keep only the first 150 whitespace-separated tokens of each document and
# shape the texts as an (n, 1) object array, matching the model's (1,) string input.
train_text = train_df['text'].tolist()
train_text = [' '.join(t.split()[0:150]) for t in train_text]
train_text = np.array(train_text, dtype=object)[:, np.newaxis]
train_label = mlb.fit_transform(train_df.labels)

test_text = test_df['text'].tolist()
test_text = [' '.join(t.split()[0:150]) for t in test_text]
test_text = np.array(test_text, dtype=object)[:, np.newaxis]
# Reuse the label-to-column mapping learned from the training labels.
# Re-fitting on the test labels could yield a different column set/order,
# making test targets incomparable with the model's outputs.
test_label = mlb.transform(test_df.labels)

Processing text dataset


In [0]:
# Build a fresh classifier and train it for a single pass over the training split.
model = build_model()
model.fit(x=train_text, y=train_label, batch_size=32, epochs=1)

In [0]:
df = pd.read_csv(TEXT_DATA_DIR)
# The labels column holds '|'-separated integer ids; parse it into lists of ints.
df.labels = df.labels.str.split('|').apply(lambda x: [int(i) for i in x])

# Positional hold-out split: first 3000 rows for training, the rest for testing.
# The test slice starts at 3000 (not 3001) so that row 3000 is not silently
# dropped from both sets.
train_df, test_df = df.iloc[:3000], df.iloc[3000:]

# Keep only the first 150 whitespace-separated tokens of each document and
# shape the texts as an (n, 1) object array, matching the model's (1,) string input.
train_text = train_df['text'].tolist()
train_text = [' '.join(t.split()[0:150]) for t in train_text]
train_text = np.array(train_text, dtype=object)[:, np.newaxis]
train_label = mlb.fit_transform(train_df.labels)

test_text = test_df['text'].tolist()
test_text = [' '.join(t.split()[0:150]) for t in test_text]
test_text = np.array(test_text, dtype=object)[:, np.newaxis]
# Reuse the label-to-column mapping learned from the training labels.
# Re-fitting on the test labels could yield a different column set/order,
# making test targets incomparable with the model's outputs.
test_label = mlb.transform(test_df.labels)

In [0]:
# second, prepare text samples and their labels
print('Processing text dataset')
df = pd.read_csv(TEXT_DATA_DIR)
# The labels column holds '|'-separated integer ids; parse it into lists of ints.
df.labels = df.labels.str.split('|').apply(lambda x: [int(i) for i in x])

print('Tokenizing data')
# `mt` was used here without ever being created, which raises NameError on a
# fresh kernel; instantiate the tokenizer imported at the top of the notebook.
mt = MosesTokenizer()
# Keep the first 150 Moses tokens of each document and join them back into a
# single string, since the ELMo layer consumes raw strings. Storing strings
# in a 1-D object array (instead of variable-length token lists) also avoids
# building a ragged ndarray, which recent NumPy versions refuse to create.
data = np.array(
    [' '.join(mt.tokenize(t, escape=False)[:150]) for t in df.text],
    dtype=object,
)

mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(df.labels)

# Cross-validation: split the data into a training set and a test set.
# shuffle=True is required for random_state to be accepted by newer
# scikit-learn, which rejects a random_state when shuffling is disabled.
mskf = MultilabelStratifiedKFold(n_splits=2, shuffle=True, random_state=0)

# NOTE: every iteration overwrites the split variables, so after the loop
# train_text/test_text/train_label/test_label hold the final fold only.
for count, (train_index, test_index) in enumerate(mskf.split(data, labels), start=1):
    print(f"Fold no. {count}")

    # Shape (n, 1) to match the model's (1,) string input.
    train_text = data[train_index][:, np.newaxis]
    test_text = data[test_index][:, np.newaxis]

    train_label, test_label = labels[train_index], labels[test_index]

Processing text dataset
Tokenizing data
Fold no. 1
Fold no. 2


In [0]:
# Build a fresh classifier and train it for two passes over the current training split.
model = build_model()
model.fit(x=train_text, y=train_label, batch_size=32, epochs=2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
elmo_embedding_layer_8 (Elmo (None, 1024)              4         
_________________________________________________________________
dense_15 (Dense)             (None, 256)               262400    
_________________________________________________________________
dense_16 (Dense)             (None, 17)                4369      
Total params: 266,773
Trainable params: 266,773
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fba0c5bb1d0>

In [0]:
def metrics_avg(models_testx_testy, labels_, thres=0.3):
  """Average evaluation metrics over one or more evaluation runs.

  Args:
    models_testx_testy: non-empty sequence of ``(model, test_x, test_y)``
      tuples; each ``model`` must provide ``predict(test_x)`` returning
      per-label scores comparable against ``thres``.
    labels_: class names passed to ``classification_report`` as
      ``target_names``.
    thres: a label is predicted positive when its score is greater than
      this value.

  Returns:
    A tuple ``(report_df, hamming, roc_auc)``: the element-wise mean of the
    per-run classification-report DataFrames, the mean Hamming loss and the
    mean micro-averaged ROC AUC.
  """
  def calc(model, test_x, test_y):
    """Score a single (model, test set) pair."""
    scores = model.predict(test_x)
    predictions = scores > thres
    metrics = classification_report(test_y, predictions, target_names=labels_, output_dict=True)
    metrics_df = pd.DataFrame.from_dict(metrics)
    h = hamming_loss(test_y, predictions)
    # ROC AUC measures ranking quality, so it must be given the raw scores;
    # passing the thresholded booleans collapses the curve to a single point.
    roc = roc_auc_score(test_y, scores, average='micro')
    return metrics_df, h, roc

  results = [calc(model, test_x, test_y) for model, test_x, test_y in models_testx_testy]
  n = len(results)

  # Seed the running totals with the first run, then add the remaining ones.
  metrics_agg, ham, roc = results[0]
  for metrics, h, r in results[1:]:
    metrics_agg = metrics_agg + metrics
    ham += h
    roc += r

  return metrics_agg/n, ham/n, roc/n

In [0]:
# Write the current `model` to an HDF5 file inside the ELMo directory.
model_path = ELMO_DIR + "2_epoch-elmo.h5"
model.save(model_path)

In [0]:
# Class names "1".."17" used as row/column names in the classification report.
# NOTE: this rebinds `labels`, which the cross-validation cell uses for the
# binarized label matrix.
labels = list(map(str, range(1, 18)))

# A single evaluation run: the current model against the current test split.
evaluation_runs = [(model, test_text, test_label)]
averaged_results = metrics_avg(evaluation_runs, labels)