<a href="https://colab.research.google.com/github/vondersam/sdgs_text_classifier/blob/master/experiments/word_embeddings_glove.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Environment setup.
# The import cell below needs `iterstrat`, which is provided by the
# `iterative-stratification` package and is not part of a default Colab
# runtime, so install it here to keep "Restart & Run All" working.
# The recorded output of this cell shows version 0.1.6 being installed.
import subprocess
import sys

subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "-q", "iterative-stratification>=0.1.6"]
)

# Forced reinstalls of the deep-learning stack, kept disabled:
#!pip install -I tensorflow
#!pip install -I keras

Collecting iterative-stratification
  Downloading https://files.pythonhosted.org/packages/9d/79/9ba64c8c07b07b8b45d80725b2ebd7b7884701c1da34f70d4749f7b45f9a/iterative_stratification-0.1.6-py3-none-any.whl
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.6


In [0]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, hamming_loss, accuracy_score
from keras import optimizers
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import os

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant

In [0]:
from google.colab import drive

# Mount Google Drive and derive the project directory from the mount point.
# base_dir is built as an absolute path so that later cells keep working
# even if the kernel's working directory is not /content.
DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)
base_dir = f"{DRIVE_MOUNT_POINT}/My Drive/fastai-v3/sdgs/"

Mounted at /content/gdrive


In [0]:
# Input locations, both relative to the project directory on Drive.
# Note: despite its name, TEXT_DATA_DIR is the path of the labelled CSV file.
TEXT_DATA_DIR = base_dir + "dataset/cleanup_labelled.csv"
GLOVE_DIR = base_dir + "embeddings/glove/glove.6B/"

# Hyper-parameters used by the cells below.
MAX_SEQUENCE_LENGTH = 1000  # length every token sequence is padded/truncated to
MAX_NUM_WORDS = 20000       # vocabulary cap handed to the tokenizer
EMBEDDING_DIM = 300         # width of each embedding vector
labels_index = list(range(1, 18))  # label ids 1..17; its length sizes the output layer

In [0]:
# Load the pretrained GloVe vectors into a dict mapping each word of the
# embeddings file to its vector (a float32 numpy array).
print('Indexing word vectors.')

embeddings_index = {}
# The file name is derived from EMBEDDING_DIM so the vectors that are loaded
# always match the width of the embedding matrix built later on.
glove_path = os.path.join(GLOVE_DIR, f'glove.6B.{EMBEDDING_DIM}d.txt')
# Explicit encoding: the vocabulary contains non-ASCII tokens, so decoding
# must not depend on the platform's default locale.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ... <vN>", whitespace-separated.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print(f"Found {len(embeddings_index)} word vectors.")

# Read the labelled corpus; the `labels` column holds '|'-separated integers
# which are parsed into a list of ints per row.
print('Processing text dataset')
df = pd.read_csv(TEXT_DATA_DIR)
df['labels'] = df['labels'].str.split('|').map(lambda parts: list(map(int, parts)))

# Fit the tokenizer on the corpus and turn every text into a sequence of
# integer word ids.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

# Model inputs: id sequences padded/truncated to a fixed length.
# Model targets: one binary indicator column per label.
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(df['labels'])
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Cross-validation: split the data into 10 multilabel-stratified
# (train, validation) folds and keep the materialised arrays for training.
#
# shuffle=True is passed together with random_state: scikit-learn's k-fold
# base class (which this splitter builds on) rejects a random_state when
# shuffling is disabled, so the seed alone would fail on current versions.
mskf = MultilabelStratifiedKFold(n_splits=10, shuffle=True, random_state=0)
cross_entropy_files = []

for count, (train_index, test_index) in enumerate(mskf.split(data, labels), start=1):
    print(f"Fold no. {count}")
    x_train, x_val = data[train_index], data[test_index]
    y_train, y_val = labels[train_index], labels[test_index]
    # Each entry is (x_train, x_val, y_train, y_val) for one fold.
    cross_entropy_files.append((x_train, x_val, y_train, y_val))

Indexing word vectors.
Found 400000 word vectors.
Processing text dataset
Found 14736 unique tokens.
Shape of data tensor: (5182, 1000)
Shape of label tensor: (5182, 17)
Fold no. 1
Fold no. 2
Fold no. 3
Fold no. 4
Fold no. 5
Fold no. 6
Fold no. 7
Fold no. 8
Fold no. 9
Fold no. 10


In [0]:
# Build the embedding matrix once: it depends only on the tokenizer
# vocabulary and the pretrained vectors, not on the fold being trained.
print('Preparing embedding matrix.')
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        # Index falls outside the rows allocated above.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index stay all-zeros.
        embedding_matrix[i] = embedding_vector


def _build_cnn_model():
    """Return a freshly initialised, compiled 1D-convnet classifier.

    The network feeds the frozen pretrained embeddings through three
    Conv1D blocks (the last one globally max-pooled), a dense layer, and a
    sigmoid output with one unit per entry of `labels_index`.
    """
    # trainable=False keeps the pretrained embeddings fixed during training.
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    x = Conv1D(128, 5, activation='relu')(embedded_sequences)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(128, activation='relu')(x)
    # Sigmoid + binary cross-entropy: every label is an independent yes/no.
    preds = Dense(len(labels_index), activation='sigmoid')(x)

    cnn = Model(sequence_input, preds)
    cnn.compile(loss='binary_crossentropy',
                optimizer='Adam',
                metrics=['accuracy'])
    return cnn


# Train one independent model per fold; keep each model together with its
# validation split so it can be saved and evaluated by later cells.
results = []
for count, (x_train, x_val, y_train, y_val) in enumerate(cross_entropy_files, start=1):
    print(F"Training fold {count}")
    print('Training model.')
    model = _build_cnn_model()
    model.fit(x_train, y_train,
              batch_size=128,
              epochs=10,
              validation_data=(x_val, y_val))
    results.append((model, x_val, y_val))

In [0]:
# Persist every fold's trained model next to the GloVe files, numbering the
# files from 1. os.path.join is used (as for the GloVe file itself) so the
# result does not depend on GLOVE_DIR ending with a path separator.
for count, (model, _x_val, _y_val) in enumerate(results, start=1):
    model.save(os.path.join(GLOVE_DIR, f"10_epoch-300d-10-fold-cross-val_{count}.h5"))

In [0]:
def metrics_avg(models_testx_testy, labels_, thres=0.3):
  """Average multi-label evaluation metrics over several evaluated models.

  Args:
    models_testx_testy: non-empty sequence of (model, test_x, test_y)
      triples; each model must expose `predict(test_x)` returning one score
      per sample and label (the models trained in this notebook end in a
      sigmoid layer).
    labels_: label names passed to `classification_report` as
      `target_names`.
    thres: a label is predicted positive when its score is strictly greater
      than this value.

  Returns:
    A tuple `(report, hamming, roc_auc)`: the element-wise mean of the
    per-model classification reports (as a DataFrame), the mean Hamming
    loss, and the mean micro-averaged ROC AUC.

  Raises:
    ValueError: if `models_testx_testy` is empty.
  """
  if not models_testx_testy:
    raise ValueError("metrics_avg needs at least one (model, test_x, test_y) triple")

  def calc(model, test_x, test_y):
    """Evaluate one model on its own test split."""
    scores = model.predict(test_x)
    predictions = scores > thres
    metrics = classification_report(test_y, predictions, target_names=labels_, output_dict=True)
    metrics_df = pd.DataFrame.from_dict(metrics)
    h = hamming_loss(test_y, predictions)
    # ROC AUC is a ranking metric: it must see the continuous scores, not
    # the thresholded 0/1 decisions, otherwise it collapses to a single
    # operating point.
    roc = roc_auc_score(test_y, scores, average='micro')
    return metrics_df, h, roc

  per_model = [calc(model, test_x, test_y) for model, test_x, test_y in models_testx_testy]
  n = len(per_model)

  metrics_agg, ham, roc = per_model[0]
  for metrics, h, r in per_model[1:]:
    # Not in-place, so the first report frame is never mutated.
    metrics_agg = metrics_agg + metrics
    ham += h
    roc += r

  return metrics_agg/n, ham/n, roc/n

In [0]:
# Report rows are named after the label ids "1".."17".
# Note: this rebinds `labels`, which previously held the binarized label matrix.
labels = list(map(str, range(1, 18)))
# avg_results is (mean report, mean Hamming loss, mean ROC AUC) over all folds.
avg_results = metrics_avg(results, labels)

In [0]:
# Second element of the averaged results: the mean Hamming loss over the folds.
mean_report, mean_hamming_loss, mean_roc_auc = avg_results
mean_hamming_loss

0.06598673260429105