In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import (LSTM, Activation, Conv1D, Conv2D, Dense,
                                     Embedding, Flatten,
                                     GlobalAveragePooling1D, MaxPooling1D,
                                     MaxPooling2D)
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import (Tokenizer,
                                                 text_to_word_sequence)
from tensorflow.keras.utils import set_random_seed

<IPython.core.display.Javascript object>

In [3]:
from sklearn.datasets import fetch_20newsgroups

<IPython.core.display.Javascript object>

In [4]:
# Download (or load from the local cache) the training subset of 20 Newsgroups.
news = fetch_20newsgroups(subset="train")

<IPython.core.display.Javascript object>

In [5]:
# Raw documents and their target labels, as provided by the dataset bundle.
X, y = news["data"], news["target"]

<IPython.core.display.Javascript object>

In [6]:
# Stratified 80/20 split with a fixed random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

<IPython.core.display.Javascript object>

In [7]:
# Wrap the raw documents in single-column frames ("text") for easier inspection.
X_train_raw = pd.DataFrame({"text": X_train})
X_test_raw = pd.DataFrame({"text": X_test})

<IPython.core.display.Javascript object>

In [8]:
# Average number of whitespace-separated tokens per training document.
X_train_raw["text"].str.split().str.len().mean()

286.5551872721246

<IPython.core.display.Javascript object>

# Train data

## Set tokenizer

In [9]:
# Tokenizer settings, kept as constants so they can be tuned in one place.
NUM_WORDS = 10000  # Number of terms used to build the vocabulary.
LOWER = True
CHAR_LEVEL = False
OOV_TOKEN = "<OOV>"


tokenizer = Tokenizer(
    num_words=NUM_WORDS,
    lower=LOWER,
    split=" ",
    char_level=CHAR_LEVEL,
    oov_token=OOV_TOKEN,
)

<IPython.core.display.Javascript object>

In [10]:
# Fit the tokenizer on the training texts only; the test split is not used here.
tokenizer.fit_on_texts(X_train_raw["text"].to_numpy())

<IPython.core.display.Javascript object>

In [11]:
# index -> word mapping from the fitted tokenizer, plus its inverse (word -> index).
index_word = tokenizer.index_word
word_index = dict(zip(tokenizer.index_word.values(), tokenizer.index_word.keys()))


# Restrict the mapping to indices 1 .. NUM_WORDS - 1, then invert it.
vocabulary = dict((idx, index_word.get(idx)) for idx in range(1, NUM_WORDS))
vocabulary_inverse = dict(zip(vocabulary.values(), vocabulary.keys()))

<IPython.core.display.Javascript object>

In [12]:
# Encode every training document as a list of integer token ids.
texts_to_sequences_train = tokenizer.texts_to_sequences(
    X_train_raw["text"].to_numpy()
)

<IPython.core.display.Javascript object>

In [13]:
# Every sequence is forced to exactly MAXLEN ids: longer ones are cut at the
# end, shorter ones are padded at the end.
MAXLEN = 200
PADDING = "post"
TRUNCATING = "post"

sequences_padded_train = pad_sequences(
    texts_to_sequences_train,
    maxlen=MAXLEN,
    padding=PADDING,
    truncating=TRUNCATING,
)

<IPython.core.display.Javascript object>

# Test data

In [14]:
# Encode the test documents with the tokenizer fitted on the training texts.
texts_to_sequences_test = tokenizer.texts_to_sequences(
    X_test_raw["text"].to_numpy()
)

<IPython.core.display.Javascript object>

In [15]:
# Same padding / truncation settings as for the training sequences.
sequences_padded_test = pad_sequences(
    texts_to_sequences_test,
    maxlen=MAXLEN,
    padding=PADDING,
    truncating=TRUNCATING,
)

<IPython.core.display.Javascript object>

# Set model

In [16]:
# Fix the random seed before any layer is created so runs are repeatable.
set_random_seed(42)

# Number of rows in the embedding table, set to the tokenizer's NUM_WORDS cap.
# NOTE(review): presumably the tokenizer only emits ids below NUM_WORDS (with 0
# as the padding value) -- confirm against the Keras Tokenizer documentation.
INPUT_DIM = NUM_WORDS
INPUT_LENGTH = MAXLEN  # Length of each padded input sequence
EMBEDDING_DIM = 300  # Size of each token's embedding vector

# Embedding -> Conv1D -> global average pooling -> two dense layers (20-way softmax).
model = Sequential(
    [
        Embedding(
            input_dim=INPUT_DIM,
            output_dim=EMBEDDING_DIM,
            input_length=INPUT_LENGTH,
            weights=None,
            trainable=True,
            name="embedding_layer",
        ),
        Conv1D(
            filters=128, kernel_size=5, strides=1, padding="valid", activation="relu"
        ),  # slides a window of kernel_size tokens over the sequence: (None, 200, 300) -> (None, 196, 128)
        GlobalAveragePooling1D(
            data_format="channels_last",
            keepdims=False,
        ),  # averages each of the 128 filter outputs over the 196 positions -> (None, 128)
        Dense(units=24, activation="relu"),
        Dense(units=20, activation="softmax"),  # one output unit per class
    ],
    name="simple",
)


OPTIMIZER = Adam(learning_rate=0.01)
# Sparse variant of the loss: y_train / y_test are passed to fit as-is, not one-hot encoded.
LOSS = SparseCategoricalCrossentropy()


model.compile(optimizer=OPTIMIZER, loss=LOSS, metrics=["accuracy"])

model.summary()

Model: "simple"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_layer (Embedding)  (None, 200, 300)         3000000   
                                                                 
 conv1d (Conv1D)             (None, 196, 128)          192128    
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 24)                3096      
                                                                 
 dense_1 (Dense)             (None, 20)                500       
                                                                 
Total params: 3,195,724
Trainable params: 3,195,724
Non-trainable params: 0
__________________________________________________

<IPython.core.display.Javascript object>

In [17]:
def train(X_train, y_train, *, estimator=None, **kwargs):
    """Fit a model on the given training data and return the result of ``fit``.

    Parameters
    ----------
    X_train
        Training inputs, forwarded unchanged to ``fit``.
    y_train
        Training targets, forwarded unchanged to ``fit``.
    estimator : optional, keyword-only
        Any object exposing ``fit(X, y, **kwargs)``. When omitted, the
        notebook-level ``model`` is used, which is the original behaviour.
        Passing it explicitly removes the hidden dependency on global state.
    **kwargs
        Extra keyword arguments forwarded verbatim to ``fit``
        (e.g. ``epochs``, ``batch_size``, ``validation_data``).

    Returns
    -------
    Whatever ``fit`` returns.
    """
    # Fall back to the global only when no estimator was supplied, so existing
    # calls of the form ``train(X, y, epochs=...)`` keep working unchanged.
    fitted = model if estimator is None else estimator
    return fitted.fit(X_train, y_train, **kwargs)

<IPython.core.display.Javascript object>

In [18]:
# NOTE: the held-out test split doubles as validation data here, so the
# validation metrics reported during training are computed on the test set.
history = train(
    sequences_padded_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(sequences_padded_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<IPython.core.display.Javascript object>

# Model architecture concept

## 1.  Get a specific example

In [82]:
# Toy batch holding one sequence of 10 token ids (the trailing 0s mimic padding).
preprocessed_x = tf.convert_to_tensor(
    [[1, 20, 5, 100, 5, 6, 0, 0, 0, 0]], dtype=tf.int32
)

<IPython.core.display.Javascript object>

## 2. Transform each word into an embedding vector

- Each token (out of 10) is converted into an embedding of size 300.

In [83]:
# A fresh, untrained embedding layer applied to the toy batch:
# each of the 10 token ids becomes a vector of EMBEDDING_DIM values.
x_embeddings = Embedding(
    name="embedding_layer",
    input_dim=INPUT_DIM,
    output_dim=EMBEDDING_DIM,
    input_length=10,
    trainable=True,
    weights=None,
)(preprocessed_x)


x_embeddings

<tf.Tensor: shape=(1, 10, 300), dtype=float32, numpy=
array([[[-0.03792673,  0.01216624,  0.04102094, ..., -0.01477361,
         -0.04610056, -0.01698422],
        [-0.01894388, -0.0339001 , -0.02878075, ..., -0.03635576,
          0.04231792,  0.01827201],
        [ 0.04776961, -0.04850559, -0.02631692, ...,  0.01916956,
          0.00996971, -0.01479719],
        ...,
        [ 0.03692415, -0.00588097,  0.03930279, ..., -0.00416108,
          0.00596811, -0.03943629],
        [ 0.03692415, -0.00588097,  0.03930279, ..., -0.00416108,
          0.00596811, -0.03943629],
        [ 0.03692415, -0.00588097,  0.03930279, ..., -0.00416108,
          0.00596811, -0.03943629]]], dtype=float32)>

<IPython.core.display.Javascript object>

## 3. Apply a convolutional layer

- Group 5 (kernel_size) tokens together in a sliding fashion and apply a filter (creating a feature map)
- Apply 128 filters in total

In [84]:
# 128 filters, each spanning 5 consecutive token embeddings, moved one step at a time.
x_embeddings_conv = Conv1D(128, 5, strides=1)(x_embeddings)
x_embeddings_conv

<tf.Tensor: shape=(1, 6, 128), dtype=float32, numpy=
array([[[ 2.64654662e-02,  1.06289654e-05, -1.30135454e-02,
         -2.79547684e-02,  2.34610531e-02,  3.97426337e-02,
          3.68155316e-02,  4.94785048e-02,  2.91928705e-02,
         -3.15566473e-02,  1.38124311e-02,  1.90439336e-02,
          4.89882454e-02, -2.47744191e-03, -3.07232095e-03,
         -2.79719252e-02,  2.07156893e-02,  8.27492774e-03,
          1.23292804e-02,  9.28917900e-03, -6.77116960e-02,
         -1.85454898e-02,  6.34525344e-02, -1.16845500e-02,
         -6.65146019e-03, -2.90338136e-02, -5.14927730e-02,
         -8.54387656e-02, -1.15467841e-02, -1.13069443e-02,
         -1.21714929e-02,  4.29453552e-02, -4.84831333e-02,
         -6.04208447e-02,  2.25669276e-02, -2.36066412e-02,
          1.12904310e-02, -1.17048752e-02,  3.95711288e-02,
         -1.61402242e-03,  3.94231230e-02,  5.35211042e-02,
          2.70236991e-02, -1.67256277e-02,  1.27258720e-02,
         -4.29503284e-02,  1.31317060e-02, -3.5

<IPython.core.display.Javascript object>

## 4. Apply GlobalAveragePooling1D

- Average over the sequence axis: each of the 128 feature maps is averaged across its 6 positions, so the tensor of shape (1, 6, 128) becomes (1, 128)

In [85]:
# Collapse the sequence axis by averaging, leaving one value per filter.
GlobalAveragePooling1D(
    keepdims=False,
    data_format="channels_last",
)(x_embeddings_conv)

<tf.Tensor: shape=(1, 128), dtype=float32, numpy=
array([[-0.00039661,  0.03464271, -0.00129973,  0.00406786,  0.01217308,
        -0.00795424, -0.01302964,  0.01047372,  0.02796918, -0.00439331,
         0.02016652, -0.02671128, -0.00685959, -0.00171959,  0.01733313,
        -0.0132616 ,  0.00620547,  0.00616885, -0.00191789,  0.02794292,
        -0.04255196,  0.02352474,  0.02057702, -0.00173663,  0.03349061,
         0.02800533, -0.03141077, -0.01991761, -0.00660919, -0.00307905,
        -0.00464081,  0.03078742,  0.00919534, -0.01324844,  0.01367689,
         0.01417028,  0.00709938,  0.01651193, -0.02865957,  0.05791071,
         0.01045931,  0.03069473,  0.01056758,  0.01090864, -0.03178004,
        -0.03055568, -0.02150802, -0.00407515, -0.01395745,  0.00524548,
        -0.01123413, -0.00152871,  0.01154399,  0.00334777,  0.00355038,
        -0.01850641,  0.01710602, -0.01762464,  0.01466906,  0.00349631,
         0.02972666,  0.06265616, -0.02652083, -0.00128035, -0.0146708 ,
 

<IPython.core.display.Javascript object>

In [73]:
# Sanity check: global average pooling is the mean over the sequence axis (axis=1).
# The comparison uses a floating-point tolerance instead of exact equality,
# because the two means are computed by different implementations and may
# differ in the last bits.
np.allclose(
    GlobalAveragePooling1D(data_format="channels_last", keepdims=False)(
        x_embeddings_conv
    ).numpy(),
    np.mean(x_embeddings_conv, axis=1),
)

True

<IPython.core.display.Javascript object>