<a href="https://colab.research.google.com/github/vondersam/sdgs_text_classifier/blob/master/experiments/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, roc_auc_score, hamming_loss, accuracy_score
from keras import optimizers
import os

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, Flatten
from keras.models import Model, Sequential
from keras.initializers import Constant
# Conv
from keras.layers import Conv1D, MaxPooling1D, Embedding
# LSTM
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, SpatialDropout1D, Bidirectional, GRU, LSTM
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam

import gensim
from gensim.models import Word2Vec

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from collections import Counter
from keras.models import load_model

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
base_dir = "gdrive/My Drive/fastai-v3/sdgs/"

Mounted at /content/gdrive


In [0]:
TEXT_DATA_DIR = f"{base_dir}dataset/cleanup_labelled.csv"
EMBEDDINGS_DIR = f"{base_dir}embeddings/word2vec/"
CROSS_FOLDS = f"{base_dir}dataset/cross_validation/"

MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 300
NUM_EPOCHS = 15
BATCH_SIZE = 128
labels_index = [str(i) for i in range(1,18)]

In [0]:
df = pd.read_csv(TEXT_DATA_DIR)
df.labels = df.labels.str.split('|').apply(lambda x: [int(i) for i in x])

vocab = Counter()
texts = [word_tokenize(t.lower()) for t in df.text]

for text in texts:
    vocab.update(text)    

model = Word2Vec(texts, size=EMBEDDING_DIM, window=5, min_count=5, workers=16, sg=0, negative=5)

In [0]:
word_vectors = model.wv

In [0]:
word_index = {t[0]: i+1 for i,t in enumerate(vocab.most_common(MAX_NUM_WORDS))}
sequences = np.array([[word_index.get(t, 0) for t in text]
             for text in texts])

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
mlb = MultiLabelBinarizer()
labels = np.array(mlb.fit_transform(df.labels))

In [35]:
results = []
arch = 'Conv1D_glorot_uniform'

for fold in os.listdir(CROSS_FOLDS):
    train_index = np.load(f"{CROSS_FOLDS}{fold}/train.npy")
    val_index = np.load(f"{CROSS_FOLDS}{fold}/val.npy")
    #test_index = np.load(f"{CROSS_FOLDS}{fold}/test.npy")

    x_train, x_val = data[train_index], data[val_index]
    y_train, y_val = labels[train_index], labels[val_index]
    
        
    print(F"Training {fold}")

    print('Preparing embedding matrix.')
    # prepare embedding matrix
    num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    
    for word, i in word_index.items():
        if i > MAX_NUM_WORDS:
            continue
        try:
            embedding_vector = word_vectors[word]
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
        except:
            pass   
    
    # load pre-trained word embeddings into an Embedding layer
    # note that we set trainable = False so as to keep the embeddings fixed
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    print('Training model.')
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # 0.22
    if arch == 'conv': 
        # 1D convnet with global maxpooling
        x = Conv1D(128, 5, activation='relu')(embedded_sequences)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = GlobalMaxPooling1D()(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(len(labels_index), activation='sigmoid')(x)
        model = Model(sequence_input, preds)
        model.compile(loss='binary_crossentropy', 
                    optimizer=Adam(lr=0.01), 
                    metrics=['accuracy'])
    
    
    # 0.16, 8 epochs without Bidirectional
    # 0.15, 8 epochs with Bidirectional
    # 0.13, 10 epochs with Bidirectional
    if arch == "bidirectionalGRU":
        x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1,recurrent_dropout=0.1))(embedded_sequences)
        x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)
        x = concatenate([avg_pool, max_pool])
        preds = Dense(17, activation="sigmoid")(x)
        model = Model(sequence_input, preds)
        model.summary() 
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    
    # around .21, 10 epochs with Bidirectional
    if arch == "Bidirectional_LSTM":
        x = Bidirectional(LSTM(25, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedded_sequences)
        x = GlobalMaxPooling1D()(x)
        x = Dense(50, activation="relu")(x)
        x = Dropout(0.1)(x)
        x = Dense(17, activation="sigmoid")(x)
        model = Model(inputs=sequence_input, outputs=x)
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        
        
    if arch == "Conv1D_glorot_uniform":
        x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(embedded_sequences)
        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)
        x = concatenate([avg_pool, max_pool])
        preds = Dense(len(labels_index), activation='sigmoid')(x)
        model = Model(sequence_input, preds)
        model.compile(loss='binary_crossentropy', 
                optimizer='rmsprop',
                metrics=['accuracy'])
    
    model.fit(x_train, y_train,
            batch_size=BATCH_SIZE,
            epochs=NUM_EPOCHS,
            validation_data=(x_val, y_val))
    
    model.save(EMBEDDINGS_DIR + f"{arch}_{NUM_EPOCHS}epochs_{EMBEDDING_DIM}D_batchsize{BATCH_SIZE}_5fold-cross-val_{fold}.h5")

Training fold_1
Preparing embedding matrix.
Training model.
Train on 4173 samples, validate on 516 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Training fold_2
Preparing embedding matrix.
Training model.
Train on 4151 samples, validate on 514 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Training fold_3
Preparing embedding matrix.
Training model.
Train on 4142 samples, validate on 513 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Training fold_4
Preparing embedding matrix.
Training model.
Train on 4140 samples, validate on 512 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoc

# Load and evaluate folds on test

In [0]:
def metrics_avg(models_testx_testy, labels_, thres=0.3):
    def calc(model, test_x, test_y):
        predictions = model.predict(test_x)>thres
        metrics = classification_report(test_y, predictions, target_names=labels_, output_dict=True)
        metrics_df = pd.DataFrame.from_dict(metrics)
        h = hamming_loss(test_y, predictions)
        roc = roc_auc_score(test_y, predictions, average='micro')
        return metrics_df, h, roc

    model_1, test_x_first, test_y_first = models_testx_testy[0]
    metrics_agg, ham, roc = calc(model_1, test_x_first, test_y_first)
    n = len(models_testx_testy)

    for model, test_x, test_y in models_testx_testy[1:]:
        metrics, h, r = calc(model, test_x, test_y)
        metrics_agg += metrics
        ham += h
        roc += r

    return metrics_agg/n, ham/n, roc/n

In [41]:
loaded_arch = 'Conv1D_glorot_uniform'
loaded_models = []
for fold in os.listdir(CROSS_FOLDS):
    print(f"Loading {fold}...")
    test_index = np.load(f"{CROSS_FOLDS}{fold}/test.npy")

    x_test = data[test_index]
    y_test = labels[test_index]
    
    load_dir = EMBEDDINGS_DIR + f"{loaded_arch}_{NUM_EPOCHS}epochs_{EMBEDDING_DIM}D_batchsize{BATCH_SIZE}_5fold-cross-val_{fold}.h5"
    loaded_model = load_model(load_dir)
    
    loaded_models.append((loaded_model, x_test, y_test))
print(f"Finished loading the {loaded_arch} models.")

Loading fold_1...
Loading fold_2...
Loading fold_3...
Loading fold_4...
Loading fold_5...
Finished loading the Conv1D_glorot_uniform models.


In [56]:
avg_results = metrics_avg(loaded_models, labels_index, thres=0.2)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [59]:
avg_results[2]

0.7739318542452324