<a href="https://colab.research.google.com/github/vondersam/sdgs_text_classifier/blob/master/experiments/word_embeddings_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install iterative-stratification
!pip install -I keras

In [6]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, roc_auc_score, hamming_loss, accuracy_score
from keras import optimizers
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import os

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate
from keras.models import Model
from keras.initializers import Constant
# Conv
from keras.layers import Conv1D, MaxPooling1D, Embedding
# LSTM
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout,SpatialDropout1D, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam

import gensim
from gensim.models import Word2Vec

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from collections import Counter

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
base_dir = "gdrive/My Drive/fastai-v3/sdgs/"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
TEXT_DATA_DIR = f"{base_dir}dataset/cleanup_labelled.csv"
EMBEDDINGS_DIR = f"{base_dir}embeddings/word2vec/"

MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
labels_index = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]

In [0]:
df = pd.read_csv(TEXT_DATA_DIR)
df.labels = df.labels.str.split('|').apply(lambda x: [int(i) for i in x])
vocab = Counter()
texts = [word_tokenize(t.lower()) for t in df.text]
for text in texts:
    vocab.update(text)    
model = Word2Vec(texts, size=100, window=5, min_count=5, workers=16, sg=0, negative=5)

In [0]:
word_vectors = model.wv

In [10]:
word_index = {t[0]: i+1 for i,t in enumerate(vocab.most_common(MAX_NUM_WORDS))}
sequences = np.array([[word_index.get(t, 0) for t in text]
             for text in texts])
sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

mskf = MultilabelStratifiedKFold(n_splits=10, random_state=0)
mlb = MultiLabelBinarizer()
labels = np.array(mlb.fit_transform(df.labels))
cross_entropy_files = []
count = 0

for train_index, val_index in mskf.split(sequences, labels):
    count += 1
    print(f"Fold no. {count}")
    x_train = sequences[train_index]
    x_val = sequences[val_index]
    y_train = labels[train_index]
    y_val = labels[val_index]
    cross_entropy_files.append((x_train, x_val, y_train, y_val))

Fold no. 1
Fold no. 2
Fold no. 3
Fold no. 4
Fold no. 5
Fold no. 6
Fold no. 7
Fold no. 8
Fold no. 9
Fold no. 10


In [47]:
count = 0
results = []
arch = 'rnn'
for x_train, x_val, y_train, y_val in cross_entropy_files:
    count += 1
    print(F"Training fold {count}")
    print('Preparing embedding matrix.')
    # prepare embedding matrix
    num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    
    for word, i in word_index.items():
        if i > MAX_NUM_WORDS:
            continue
        try:
            embedding_vector = word_vectors[word]
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
        except:
            pass   
    
    # load pre-trained word embeddings into an Embedding layer
    # note that we set trainable = False so as to keep the embeddings fixed
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    print('Training model.')
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    
    if arch == 'conv':
        # 1D convnet with global maxpooling
        x = Conv1D(128, 5, activation='relu')(embedded_sequences)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = GlobalMaxPooling1D()(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(len(labels_index), activation='sigmoid')(x)
        model = Model(sequence_input, preds)
        model.compile(loss='binary_crossentropy', 
                    optimizer=Adam(lr=0.01), 
                    metrics=['accuracy'])
    
    if arch == 'lstm':
        # biGRU
        embedded_sequences = SpatialDropout1D(0.2)(embedded_sequences)
        x = Bidirectional(CuDNNLSTM(64, return_sequences=False))(embedded_sequences)

        # Output
        x = Dropout(0.2)(x)
        x = BatchNormalization()(x)
        preds = Dense(17, activation='sigmoid')(x)

        # build the model
        model = Model(sequence_input, preds)
        model.compile(loss='binary_crossentropy',
                    #optimizer=Adam(lr=0.0001, clipnorm=.25, beta_1=0.7, beta_2=0.99),
                    optimizer='adam',
                    metrics=[])
        

        x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True, dropout=0.1,
                                                      recurrent_dropout=0.1))(x)
    if arch == "rnn":
        x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(embedded_sequences)

        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)

        x = concatenate([avg_pool, max_pool])

        preds = Dense(17, activation="sigmoid")(x)

        model = Model(sequence_input, preds)

        model.summary() 

        model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.01), metrics=['accuracy'])

    # Fit model    
    model.fit(x_train, y_train,
            batch_size=128,
            epochs=10,
            validation_data=(x_val, y_val))
    results.append((model, x_val, y_val))

Training fold 1
Preparing embedding matrix.
Training model.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_21 (InputLayer)           (None, 1000)         0                                            
__________________________________________________________________________________________________
embedding_21 (Embedding)        (None, 1000, 100)    1614200     input_21[0][0]                   
__________________________________________________________________________________________________
conv1d_23 (Conv1D)              (None, 998, 64)      19264       embedding_21[0][0]               
__________________________________________________________________________________________________
global_average_pooling1d_17 (Gl (None, 64)           0           conv1d_23[0][0]                  
_________________________________________________

In [0]:
count = 0
for model, t_x, t_y in results:
    count += 1
    model.save(EMBEDDINGS_DIR + f"rnn_word2vec_10_epoch-100d-10-fold-cross-val_{count}.h5") 

In [0]:
def metrics_avg(models_testx_testy, labels_, thres=0.3):
  def calc(model, test_x, test_y):
    predictions = model.predict(test_x)>thres
    metrics = classification_report(test_y, predictions, target_names=labels_, output_dict=True)
    metrics_df = pd.DataFrame.from_dict(metrics)
    h = hamming_loss(test_y, predictions)
    roc = roc_auc_score(test_y, predictions, average='micro')
    return metrics_df, h, roc
    
  model_1, test_x_1, test_y_1 = models_testx_testy[0]
  metrics_agg, ham, roc = calc(model_1, test_x_1, test_y_1)
  n = len(models_testx_testy)
  
  for model, test_x, test_y_1 in models_testx_testy[1:]:
    metrics, h, r = calc(model, test_x, test_y_1)
    metrics_agg += metrics
    ham += h
    roc += r
  
  return metrics_agg/n, ham/n, roc/n

In [49]:
labels = [str(i) for i in range(1,18)]
avg_results = metrics_avg(results, labels)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [52]:
avg_results[2]

0.7850336276575716