In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import (
    LSTM,
    Activation,
    Conv1D,
    Conv2D,
    Dense,
    Embedding,
    Flatten,
    GlobalAveragePooling1D,
    MaxPooling1D,
    MaxPooling2D,
)
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.utils import set_random_seed

<IPython.core.display.Javascript object>

In [3]:
from sklearn.datasets import fetch_20newsgroups

<IPython.core.display.Javascript object>

In [4]:
# Load the 20 Newsgroups corpus. `subset="train"` is scikit-learn's default and is
# spelled out here so it is clear that only the official training part is fetched.
news = fetch_20newsgroups(subset="train")

<IPython.core.display.Javascript object>

In [5]:
# Raw documents and their newsgroup labels.
X, y = news["data"], news["target"]

<IPython.core.display.Javascript object>

In [6]:
# Hold out 20% of the documents; stratifying on `y` keeps the class proportions
# the same in both parts, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

<IPython.core.display.Javascript object>

In [7]:
# Wrap the raw documents in single-column frames for convenient inspection.
X_train_raw = pd.DataFrame({"text": X_train})
X_test_raw = pd.DataFrame({"text": X_test})

<IPython.core.display.Javascript object>

In [8]:
# Average number of whitespace-separated tokens per training document.
tokens_per_document = X_train_raw["text"].str.split().str.len()
tokens_per_document.mean()

286.5551872721246

<IPython.core.display.Javascript object>

# Train data

## Set tokenizer

In [9]:
# Tokenizer settings, kept as constants because later cells reuse them.
OOV_TOKEN = "<OOV>"  # stand-in for any word outside the kept vocabulary
LOWER = True  # lowercase the text before tokenizing
CHAR_LEVEL = False  # tokenize on words, not on characters
NUM_WORDS = 10000  # vocabulary cap: only the most frequent words are kept

tokenizer_options = dict(
    num_words=NUM_WORDS,
    oov_token=OOV_TOKEN,
    lower=LOWER,
    char_level=CHAR_LEVEL,
    split=" ",
)
tokenizer = Tokenizer(**tokenizer_options)

<IPython.core.display.Javascript object>

In [10]:
# Learn the word index from the training documents only.
train_corpus = X_train_raw["text"].to_numpy()
tokenizer.fit_on_texts(train_corpus)

<IPython.core.display.Javascript object>

In [11]:
# Index <-> word lookups over the tokenizer's full (uncapped) vocabulary.
index_word = tokenizer.index_word
# The tokenizer already maintains the inverse mapping; copy it instead of
# rebuilding it by inverting `index_word`.
word_index = dict(tokenizer.word_index)


# Capped vocabulary: indices 1 .. NUM_WORDS - 1.
# Indices the tokenizer never assigned are skipped, so a corpus with fewer than
# NUM_WORDS - 1 distinct words no longer produces `None` words here (and a
# `None` key in `vocabulary_inverse`).
vocabulary = {i: index_word[i] for i in range(1, NUM_WORDS) if i in index_word}
vocabulary_inverse = {word: i for i, word in vocabulary.items()}

<IPython.core.display.Javascript object>

In [12]:
# Encode every training document as a list of vocabulary indices.
train_texts = X_train_raw["text"].to_numpy()
texts_to_sequences_train = tokenizer.texts_to_sequences(train_texts)

<IPython.core.display.Javascript object>

In [13]:
# Sequences are padded and truncated at their end ("post") to exactly MAXLEN ids.
PADDING = "post"
TRUNCATING = "post"
MAXLEN = 200

pad_options = dict(maxlen=MAXLEN, padding=PADDING, truncating=TRUNCATING)
sequences_padded_train = pad_sequences(texts_to_sequences_train, **pad_options)

<IPython.core.display.Javascript object>

# Test data

In [14]:
# Encode the test documents with the tokenizer fitted on the training set.
test_texts = X_test_raw["text"].to_numpy()
texts_to_sequences_test = tokenizer.texts_to_sequences(test_texts)

<IPython.core.display.Javascript object>

In [15]:
# Pad/truncate the test sequences with the same settings as the training set.
sequences_padded_test = pad_sequences(
    texts_to_sequences_test,
    maxlen=MAXLEN,
    padding=PADDING,
    truncating=TRUNCATING,
)

<IPython.core.display.Javascript object>

# Set model

In [50]:
set_random_seed(42)

# Embedding table size (a common rule of thumb is len(vocab) + 1); here it is set
# to the tokenizer's NUM_WORDS cap.
INPUT_DIM = NUM_WORDS
INPUT_LENGTH = MAXLEN  # number of token ids in each padded sequence
EMBEDDING_DIM = 300

# Layers are listed in forward order; the shapes in the comments omit the batch axis.
simple_layers = [
    # token ids (200,) -> dense vectors (200, 300)
    Embedding(
        input_dim=INPUT_DIM,
        output_dim=EMBEDDING_DIM,
        input_length=INPUT_LENGTH,
        weights=None,
        trainable=True,
        name="embedding_layer",
    ),
    # A window of kernel_size consecutive tokens slides over the sequence;
    # each of the 128 filters produces one feature map -> (196, 128)
    Conv1D(
        filters=128,
        kernel_size=5,
        strides=1,
        padding="valid",
        activation="relu",
    ),
    # Average every feature map over the sequence positions -> (128,)
    GlobalAveragePooling1D(data_format="channels_last", keepdims=False),
    Dense(units=24, activation="relu"),
    # One output unit per newsgroup class.
    Dense(units=20, activation="softmax"),
]

model = Sequential(simple_layers, name="simple")


OPTIMIZER = Adam(learning_rate=0.01)
LOSS = SparseCategoricalCrossentropy()


model.compile(loss=LOSS, optimizer=OPTIMIZER, metrics=["accuracy"])

model.summary()

Model: "simple"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_layer (Embedding)  (None, 200, 300)         3000000   
                                                                 
 conv1d_3 (Conv1D)           (None, 196, 128)          192128    
                                                                 
 global_average_pooling1d_10  (None, 128)              0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_4 (Dense)             (None, 24)                3096      
                                                                 
 dense_5 (Dense)             (None, 20)                500       
                                                                 
Total params: 3,195,724
Trainable params: 3,195,724
Non-trainable params: 0
__________________________________________________

<IPython.core.display.Javascript object>

In [51]:
def train(X_train, y_train, *, net=None, **kwargs):
    """Fit a model on the given data and return what its ``fit`` call returns.

    Parameters
    ----------
    X_train
        Model inputs, forwarded unchanged to ``fit``.
    y_train
        Targets, forwarded unchanged to ``fit``.
    net : optional, keyword-only
        Object exposing ``fit(X, y, **kwargs)``. When omitted, the notebook-level
        ``model`` is used, so existing ``train(X, y, ...)`` calls behave as before.
    **kwargs
        Extra keyword arguments passed straight through to ``fit``
        (for example ``epochs``, ``batch_size``, ``validation_data``).

    Returns
    -------
    The value returned by ``net.fit`` (the training history for a Keras model).
    """
    if net is None:
        # Fall back to the global model defined earlier in the notebook.
        net = model
    return net.fit(X_train, y_train, **kwargs)

<IPython.core.display.Javascript object>

In [52]:
# NOTE(review): the held-out test split doubles as validation data here, so the
# validation metrics reported during training are also the test metrics.
fit_options = dict(
    validation_data=(sequences_padded_test, y_test),
    epochs=10,
    batch_size=128,
)
history = train(sequences_padded_train, y_train, **fit_options)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<IPython.core.display.Javascript object>

# Model architecture concept

## 1.  Get a specific example

In [19]:
# Toy input: a batch holding one sequence of 10 token ids, ending in four zeros.
example_token_ids = [1, 20, 5, 100, 5, 6, 0, 0, 0, 0]
preprocessed_x = tf.convert_to_tensor([example_token_ids])

<IPython.core.display.Javascript object>

## 2. Transform each word into an embedding vector

- Each token (out of 10) is converted into an embedding of size 300.

In [20]:
# Stand-alone embedding layer created just for this walkthrough; it is a new
# layer object, not the "embedding_layer" inside `model`.
demo_embedding = Embedding(
    input_dim=INPUT_DIM,
    output_dim=EMBEDDING_DIM,
    input_length=10,
    weights=None,
    trainable=True,
    name="embedding_layer",
)
x_embeddings = demo_embedding(preprocessed_x)


x_embeddings

<tf.Tensor: shape=(1, 10, 300), dtype=float32, numpy=
array([[[-0.01138071, -0.00028052,  0.00486631, ..., -0.03133049,
          0.00459578, -0.00654196],
        [-0.03527732, -0.02593962, -0.00941326, ..., -0.01866294,
          0.04566784,  0.04125491],
        [-0.01446673, -0.04826748, -0.04033958, ...,  0.04247494,
         -0.00073688,  0.02294165],
        ...,
        [ 0.04956419, -0.03067314,  0.03253037, ..., -0.04128636,
          0.00944753, -0.01768367],
        [ 0.04956419, -0.03067314,  0.03253037, ..., -0.04128636,
          0.00944753, -0.01768367],
        [ 0.04956419, -0.03067314,  0.03253037, ..., -0.04128636,
          0.00944753, -0.01768367]]], dtype=float32)>

<IPython.core.display.Javascript object>

# Convolutional layer

- Group 5 (kernel_size) tokens together in a sliding fashion and apply a filter (creating a feature map)
- Apply 128 filters in total

In [21]:
# New convolution layer for the walkthrough. Unlike the Conv1D inside `model`,
# no activation is passed here, so the feature maps can contain negative values.
demo_conv = Conv1D(filters=128, kernel_size=5, strides=1)
x_embeddings_conv = demo_conv(x_embeddings)
x_embeddings_conv

<tf.Tensor: shape=(1, 6, 128), dtype=float32, numpy=
array([[[-5.34108169e-02,  5.47105446e-02,  1.98208597e-02,
         -4.18145321e-02, -2.33813785e-02, -5.71361568e-04,
          2.00376362e-02,  1.33971227e-02,  4.69140150e-02,
         -4.71926443e-02,  4.84785251e-02,  7.59591684e-02,
         -8.63344350e-04, -2.96909790e-02,  3.32187079e-02,
          4.65907007e-02, -6.89424202e-02, -6.22935258e-02,
         -4.74914163e-02, -2.16751639e-02, -3.01345773e-02,
         -1.78914089e-02,  4.55379561e-02,  4.90763932e-02,
          1.27896667e-02, -6.53295685e-03,  4.91316691e-02,
          5.45761585e-02,  5.66323772e-02, -1.55160138e-02,
         -3.11801657e-02,  3.23185660e-02,  1.54267307e-02,
         -1.55090252e-02, -2.99233869e-02, -1.29870167e-02,
          4.71019745e-03,  6.35879338e-02,  8.96916818e-03,
          1.39941657e-02, -2.62946114e-02, -5.47023788e-02,
         -2.68323272e-02, -3.58169265e-02, -6.40312955e-03,
          2.22741328e-02, -8.18172321e-02,  4.3

<IPython.core.display.Javascript object>

# Apply GlobalAveragePooling

- Average the (6, 128) feature map over its 6 sequence positions, giving one mean per filter: (6, 128) -> (1, 128)

In [53]:
# Pool with the same settings as the pooling layer in `model`.
demo_pooling = GlobalAveragePooling1D(data_format="channels_last", keepdims=False)
demo_pooling(x_embeddings_conv)

<tf.Tensor: shape=(1, 128), dtype=float32, numpy=
array([[-0.03793712, -0.01939092, -0.00020089,  0.00590422, -0.01248919,
        -0.00169052, -0.01626821,  0.01151122,  0.02024494, -0.01155284,
         0.03275395,  0.02437269,  0.0387207 , -0.00595966,  0.01693203,
        -0.01769865, -0.0249456 , -0.02558559,  0.0075829 ,  0.02791175,
         0.00852726, -0.0011244 ,  0.04778561, -0.0091789 ,  0.00822638,
         0.00087525, -0.01470401,  0.0055057 ,  0.03334527, -0.02290542,
        -0.01815302,  0.03603454, -0.00260648, -0.01815443, -0.01765213,
        -0.02973837,  0.02120875,  0.02668724, -0.02414597, -0.00555248,
        -0.00076683, -0.01766458, -0.00040278, -0.01252959, -0.00850995,
         0.0351337 , -0.01299454, -0.02464048, -0.00498329,  0.02114153,
         0.01398337,  0.00081485, -0.01975248, -0.0184624 , -0.01325253,
         0.02978374, -0.02401468, -0.02457019, -0.0080511 , -0.01088212,
        -0.01005084,  0.00983927,  0.01635016,  0.04558197, -0.01659774,
 

<IPython.core.display.Javascript object>

In [60]:
# Sanity check: averaging filter 0 over the sequence positions by hand should
# reproduce the first value of the pooled output.
first_filter_map = x_embeddings_conv[0, :, 0].numpy()
first_filter_map.mean()

-0.037937116

<IPython.core.display.Javascript object>