<a href="https://colab.research.google.com/github/vondersam/sdgs_text_classifier/blob/master/experiments/glove.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, roc_auc_score, hamming_loss, accuracy_score
from keras import optimizers
import os
import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, Bidirectional
from keras.models import Model, Sequential
from keras.initializers import Constant
from keras.optimizers import Adam, RMSprop

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Conv1D, GlobalMaxPooling1D, CuDNNLSTM
from keras.layers import Embedding
from keras.models import load_model


#from tensorflow.keras.backend import set_session
#sess = tf.Session()
#set_session(sess)
#sess.run(tf.global_variables_initializer())

In [0]:
from google.colab import drive

# Google Drive is mounted at an absolute location; base_dir is derived from
# that same location so every path below resolves regardless of the
# notebook's current working directory.
MOUNT_POINT = '/content/gdrive'
drive.mount(MOUNT_POINT, force_remount=True)
base_dir = f"{MOUNT_POINT}/My Drive/fastai-v3/sdgs/"

In [0]:
# --- Locations on the mounted drive (base_dir already ends with "/") ---
# CSV holding the labelled texts
TEXT_DATA_DIR = base_dir + "dataset/cleanup_labelled.csv"
# One sub-directory per cross-validation fold (train/val/test index files)
CROSS_FOLDS = base_dir + "dataset/cross_validation/"
# Where trained models and result files for this experiment are kept
EMBEDDINGS_DIR = base_dir + "embeddings/glove/"
# Directory containing the pretrained GloVe 6B vectors
GLOVE_DIR = EMBEDDINGS_DIR + "glove.6B/"

# --- Hyper-parameters ---
MAX_SEQUENCE_LENGTH = 500   # sequences are padded/truncated to this length
MAX_NUM_WORDS = 20000       # vocabulary cap handed to the Tokenizer
EMBEDDING_DIM = 300         # must match the GloVe file that is loaded
NUM_EPOCHS = 20
BATCH_SIZE = 128

# Label names "1" .. "17", used as target names in the classification report
labels_index = list(map(str, range(1, 18)))

In [0]:
# Load the pretrained GloVe embeddings into a dict mapping each word of the
# embeddings file to its vector (a float32 numpy array).
print('Indexing word vectors.')

embeddings_index = {}
# The GloVe text files are UTF-8; state it explicitly so decoding does not
# depend on the platform's default encoding.
with open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ... <vN>"
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print(f"Found {len(embeddings_index)} word vectors.")

In [0]:
is_mask = ""
df = pd.read_csv(TEXT_DATA_DIR)

###### MASK LABELS
# Explicit references such as "Goal 3", "SDGs 1, 2 and 5", "target 17.19" or
# "indicator 3.b.1" are replaced by the placeholder token SDGLABEL.
#
# The earlier pattern used character classes where groups were intended:
#   * `[and\s\b\d{1,2}\b]*` matched ANY run of the characters a/n/d, digits,
#     whitespace, braces and commas, so "goal 3 and nutrition" was turned
#     into "SDGLABEL utrition" (it ate into the following words);
#   * `[\d+a-d]` matched a single character, so multi-digit targets such as
#     "target 17.19" were only partially masked.
# The alternatives below use real groups instead.
pattern = (
    # indicator <goal>.<target>.<n>, e.g. "indicator 3.b.1", "indicator 17.19.2"
    r"indicator\s+\d+\.(?:\d+|[a-d])\.\d+"
    # target <goal>.<target>, e.g. "target 3.1", "target 17.19", "target 3.b"
    r"|target\s+\d+\.(?:\d+|[a-d])"
    # sdg/sdgs/goal/goals followed by a one- or two-digit number ...
    r"|(?:sdgs?|goals?)\W*\s+,?\s*\b\d{1,2}\b"
    # ... optionally continued as a list: "1, 2 and 3" / "1, 2, and 3"
    r"(?:(?:\s*,\s*(?:and\s+)?|\s+and\s+)\d{1,2}\b)*"
)
masked_text = df.text.str.replace(pattern, ' SDGLABEL ', regex=True, flags=re.IGNORECASE)
# Collapse every run of whitespace left behind by the substitution
# (the previous '  ' -> ' ' replacement only handled exact pairs).
masked_text = masked_text.str.replace(r'\s{2,}', ' ', regex=True)
# Kept as a one-column DataFrame (column "text") as before
masked_df = masked_text.to_frame()


# Masked sequences for training; the vocabulary (word index) is fitted on the
# masked text only
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(masked_df.text)
word_index = tokenizer.word_index
masked_sequences = tokenizer.texts_to_sequences(masked_df.text)
masked_data = pad_sequences(masked_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Non masked sequences for testing, encoded with the same tokenizer
non_masked_sequences = tokenizer.texts_to_sequences(df.text)
non_masked_data = pad_sequences(non_masked_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Labels: "3|5|17" -> [3, 5, 17] -> multi-hot row
mlb = MultiLabelBinarizer()
df["labels"] = df["labels"].str.split('|').apply(lambda parts: [int(p) for p in parts])
labels = mlb.fit_transform(df["labels"])

In [0]:
models = []
arch = 'Conv1D_glorot_uniform'

# Prepare the embedding matrix once: it depends only on the tokenizer
# vocabulary and the GloVe index, not on the fold, so it is built outside
# the cross-validation loop. Row i holds the GloVe vector of the word whose
# tokenizer index is i; words missing from GloVe stay all-zeros.
print('Preparing embedding matrix.')
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # Ignore the word if it is not among the most common ones (its index
    # would fall outside the matrix)
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


def build_model(arch, embedding_matrix):
    """Build and compile a fresh multi-label classifier.

    Args:
        arch: one of "Bidirectional_LSTM", "convnet",
            "Conv1D_glorot_uniform" or "convolution1d".
        embedding_matrix: (vocabulary size, EMBEDDING_DIM) array used to
            initialise the frozen embedding layer of the functional models.

    Returns:
        A compiled Keras model with one sigmoid output per entry of
        ``labels_index``.

    Raises:
        ValueError: if *arch* is not a known architecture. Previously an
            unknown name silently reused the model of the previous fold or
            failed with a NameError.
    """
    vocab_size = embedding_matrix.shape[0]
    num_labels = len(labels_index)

    if arch == "convolution1d":
        # Adapted from
        # https://github.com/keras-team/keras/blob/master/examples/imdb_cnn.py
        # This variant learns its own embeddings instead of using GloVe.
        # Hyper-parameters taken from that example (they were undefined here).
        filters, kernel_size, hidden_dims = 250, 3, 250

        model = Sequential()
        # Embedding layer mapping vocab indices into EMBEDDING_DIM dimensions
        model.add(Embedding(vocab_size,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH))
        model.add(Dropout(0.2))
        # Convolution1D learning word-group filters of size kernel_size
        model.add(Conv1D(filters,
                         kernel_size,
                         padding='valid',
                         activation='relu',
                         strides=1))
        model.add(GlobalMaxPooling1D())
        # Vanilla hidden layer
        model.add(Dense(hidden_dims))
        model.add(Dropout(0.2))
        model.add(Activation('relu'))
        # One sigmoid unit per label (was `model.add(len(labels_index))`,
        # which passes an int instead of a layer)
        model.add(Dense(num_labels))
        model.add(Activation('sigmoid'))
        # The model was never compiled before; same loss as the other
        # architectures, optimizer as in the referenced example
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        return model

    if arch not in ("Bidirectional_LSTM", "convnet", "Conv1D_glorot_uniform"):
        raise ValueError(f"Unknown architecture: {arch!r}")

    # Load the pre-trained word embeddings into an Embedding layer;
    # trainable=False keeps the embeddings fixed.
    embedding_layer = Embedding(vocab_size,
                                EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    if arch == "Bidirectional_LSTM":
        x = Bidirectional(LSTM(25, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedded_sequences)
        x = GlobalMaxPooling1D()(x)
        x = Dense(50, activation="relu")(x)
        x = Dropout(0.1)(x)
        optimizer = 'adam'
    elif arch == "convnet":
        # Author's note: 0.179 with 10 epochs, 300 dimensions
        # 1D convnet with global maxpooling
        x = Conv1D(128, 5, activation='relu')(embedded_sequences)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = GlobalMaxPooling1D()(x)
        x = Dense(128, activation='relu')(x)
        optimizer = 'rmsprop'
    else:  # "Conv1D_glorot_uniform"
        # Author's note: 0.131 with 20 epochs, 300 dimensions
        x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(embedded_sequences)
        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)
        x = concatenate([avg_pool, max_pool])
        optimizer = RMSprop(lr=0.001)

    # Shared output head: one sigmoid unit per label
    preds = Dense(num_labels, activation='sigmoid')(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model


# Cross-validation: each fold directory provides train/val/test row indices.
# sorted() makes the fold order deterministic (os.listdir order is arbitrary).
for fold in sorted(os.listdir(CROSS_FOLDS)):
    print(fold)
    train_index = np.load(f"{CROSS_FOLDS}{fold}/train.npy")
    val_index = np.load(f"{CROSS_FOLDS}{fold}/val.npy")
    test_index = np.load(f"{CROSS_FOLDS}{fold}/test.npy")

    # Masked for training, and non_masked for testing
    x_train, x_val, x_test = masked_data[train_index], masked_data[val_index], non_masked_data[test_index]
    y_train, y_val, y_test = labels[train_index], labels[val_index], labels[test_index]

    # A fresh model per fold so no weights carry over between folds
    model = build_model(arch, embedding_matrix)

    model.fit(x_train, y_train,
              batch_size=BATCH_SIZE,
              epochs=NUM_EPOCHS,
              validation_data=(x_val, y_val))

    models.append((model, x_test, y_test))
    # Uncomment to persist the fold's model:
    #model.save(EMBEDDINGS_DIR + f"{is_mask}{arch}_{NUM_EPOCHS}epochs_{EMBEDDING_DIM}D_batchsize{BATCH_SIZE}_5fold-cross-val_{fold}.h5")

# Load and evaluate folds on test

In [0]:
def metrics_avg(models_testx_testy, labels_, thres=0.3):
    """Average test metrics over several (model, test_x, test_y) triples.

    Args:
        models_testx_testy: non-empty sequence of ``(model, test_x, test_y)``
            triples; each model must expose ``predict(test_x)`` returning
            per-label scores.
        labels_: label names passed to ``classification_report`` as
            ``target_names``.
        thres: a label is predicted as present when its score is strictly
            greater than this threshold.

    Returns:
        A tuple ``(report, hamming, roc_auc)`` holding the element-wise mean
        of the classification-report DataFrames, the mean Hamming loss and
        the mean micro-averaged ROC-AUC.

    Raises:
        ValueError: if ``models_testx_testy`` is empty.
    """
    if not models_testx_testy:
        raise ValueError(
            "metrics_avg needs at least one (model, test_x, test_y) triple")

    def calc(model, test_x, test_y):
        scores = model.predict(test_x)
        predictions = scores > thres
        report = classification_report(
            test_y, predictions, target_names=labels_, output_dict=True)
        report_df = pd.DataFrame.from_dict(report)
        ham = hamming_loss(test_y, predictions)
        # ROC-AUC is a ranking metric: it must be computed on the raw scores.
        # Feeding it the thresholded booleans (as done before) collapses the
        # curve to a single operating point and makes it depend on `thres`.
        roc = roc_auc_score(test_y, scores, average='micro')
        return report_df, ham, roc

    per_fold = [calc(model, test_x, test_y)
                for model, test_x, test_y in models_testx_testy]
    n = len(per_fold)
    reports, hams, rocs = zip(*per_fold)

    # Sum the report frames element-wise, starting from the first one
    report_sum = sum(reports[1:], reports[0])
    return report_sum / n, sum(hams) / n, sum(rocs) / n

In [0]:
# Reload the models saved by the training cell, one per cross-validation
# fold, and pair each with that fold's test split.
loaded_arch = 'Conv1D_glorot_uniform'
loaded_models = []
# sorted() makes the fold order deterministic (os.listdir order is arbitrary)
for fold in sorted(os.listdir(CROSS_FOLDS)):
    print(f"Loading {fold}...")
    test_index = np.load(f"{CROSS_FOLDS}{fold}/test.npy")

    # Test on the non-masked sequences, exactly as the training cell does
    # (the previous `data[test_index]` referenced a name that is never defined).
    x_test = non_masked_data[test_index]
    y_test = labels[test_index]

    # Same file-name pattern as the model.save call in the training cell,
    # including the `is_mask` prefix.
    load_dir = EMBEDDINGS_DIR + f"{is_mask}{loaded_arch}_{NUM_EPOCHS}epochs_{EMBEDDING_DIM}D_batchsize{BATCH_SIZE}_5fold-cross-val_{fold}.h5"
    loaded_model = load_model(load_dir)

    loaded_models.append((loaded_model, x_test, y_test))
print(f"Finished loading the {loaded_arch} models.")

In [0]:
# Average the test metrics of the models trained above over all folds,
# then display the averaged classification report as the cell output.
avg_results = metrics_avg(models, labels_index)
avg_results[0]

In [0]:
#avg_results[0].to_csv(EMBEDDINGS_DIR + f'masked_results_{arch}.csv', sep=';')

In [0]:
# Round the fold-averaged Hamming loss and ROC-AUC to four decimals and
# print them as one semicolon-separated summary line.
hl, roc_auc = (round(score, 4) for score in avg_results[1:])
print(f"hl;{hl};;roc-auc;{roc_auc}")