In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import (
    Activation,
    Dense,
    Embedding,
    Conv1D,
    Conv2D,
    Flatten,
    GlobalAveragePooling1D,
    LSTM,
    MaxPooling1D,
    MaxPooling2D,

)
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence

from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed


<IPython.core.display.Javascript object>

In [3]:
from sklearn.datasets import fetch_20newsgroups

<IPython.core.display.Javascript object>

In [4]:
# Load the 20 Newsgroups corpus using the loader's default arguments.
# NOTE(review): the defaults presumably return only the "train" subset, with
# headers, footers and quoted replies left in the text -- confirm against the
# scikit-learn documentation before comparing scores with published results.
news = fetch_20newsgroups()

<IPython.core.display.Javascript object>

In [5]:
# Raw documents (model inputs) and their integer newsgroup labels (targets).
X, y = news["data"], news["target"]

<IPython.core.display.Javascript object>

In [6]:
# Hold out 20% of the documents. Stratifying on the labels keeps the class
# proportions equal in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

<IPython.core.display.Javascript object>

In [7]:
# Wrap each split in a single-column frame so the documents can be addressed
# by column name ("text") in the cells below.
X_train_raw = pd.DataFrame({"text": X_train})
X_test_raw = pd.DataFrame({"text": X_test})

<IPython.core.display.Javascript object>

# Train data

## Set tokenizer

In [8]:
# Tokenizer settings.
NUM_WORDS = 10000  # Cap on the vocabulary size used when encoding texts.
OOV_TOKEN = "<OOV>"  # Stand-in token for words outside the vocabulary.
LOWER = True  # Lower-case the texts before tokenizing.
CHAR_LEVEL = False  # Tokenize by word rather than by character.

tokenizer = Tokenizer(
    num_words=NUM_WORDS,
    oov_token=OOV_TOKEN,
    lower=LOWER,
    char_level=CHAR_LEVEL,
    split=" ",
)

<IPython.core.display.Javascript object>

In [9]:
# Fit the tokenizer on the training documents only, so that the test split
# contributes nothing to the vocabulary it learns.
tokenizer.fit_on_texts(X_train_raw["text"].values)

<IPython.core.display.Javascript object>

In [10]:
# Two-way lookup tables over everything the tokenizer saw during fitting
# (these are NOT limited to NUM_WORDS entries).
index_word = tokenizer.index_word
# The tokenizer already maintains the word -> index mapping; copy it instead
# of re-deriving it by inverting index_word.
word_index = dict(tokenizer.word_index)

# Vocabulary that is actually usable when encoding texts: indices
# 1 .. NUM_WORDS - 1 (index 0 is left for padding). Indices the tokenizer
# never assigned -- which happens when the corpus holds fewer distinct tokens
# than NUM_WORDS - 1 -- are skipped, so neither table ends up with a None
# entry.
vocabulary = {i: index_word[i] for i in range(1, NUM_WORDS) if i in index_word}
vocabulary_inverse = {w: i for i, w in vocabulary.items()}

<IPython.core.display.Javascript object>

In [11]:
# Encode every training document with the fitted tokenizer (one sequence of
# integer word indices per document).
texts_to_sequences_train = tokenizer.texts_to_sequences(X_train_raw["text"].values)

<IPython.core.display.Javascript object>

In [12]:
# Padding settings: every sequence is padded or cut at its end ("post") so
# that all of them come out exactly MAXLEN tokens long.
MAXLEN = 10
PADDING = "post"
TRUNCATING = "post"

sequences_padded_train = pad_sequences(
    texts_to_sequences_train,
    maxlen=MAXLEN,
    padding=PADDING,
    truncating=TRUNCATING,
)

<IPython.core.display.Javascript object>

# Test data

In [13]:
# Encode the test documents with the tokenizer that was fitted on the
# training documents; the tokenizer is not re-fitted here.
texts_to_sequences_test = tokenizer.texts_to_sequences(X_test_raw["text"].values)

<IPython.core.display.Javascript object>

In [14]:
# Apply the same length and padding/truncation settings as for the training
# sequences so both splits have an identical shape.
sequences_padded_test = pad_sequences(
    texts_to_sequences_test,
    maxlen=MAXLEN,
    padding=PADDING,
    truncating=TRUNCATING,
)

<IPython.core.display.Javascript object>

# Set model

In [15]:
# Seed the random generators so weight initialisation and shuffling are
# repeatable.
set_random_seed(42)

# The tokenizer was created with num_words=NUM_WORDS, so the encoded indices
# stay below NUM_WORDS (0 is the padding value); NUM_WORDS embedding rows are
# therefore enough. Without that cap the usual choice is len(vocab) + 1.
INPUT_DIM = NUM_WORDS
# Must equal the width of the padded sequences, so it is tied to MAXLEN
# instead of repeating the literal.
INPUT_LENGTH = MAXLEN
EMBEDDING_DIM = 300
# One output unit per newsgroup, derived from the dataset itself.
NUM_CLASSES = len(news["target_names"])

model = Sequential(
    [
        Embedding(
            input_dim=INPUT_DIM,
            output_dim=EMBEDDING_DIM,
            input_length=INPUT_LENGTH,
            weights=None,
            trainable=True,
            name="embedding_layer",
        ),
        # Average the token embeddings over the sequence axis:
        # (batch, INPUT_LENGTH, EMBEDDING_DIM) -> (batch, EMBEDDING_DIM).
        GlobalAveragePooling1D(
            data_format="channels_last",
            keepdims=False,
        ),
        # NOTE(review): this hidden layer is narrower than the output layer;
        # confirm the bottleneck is intentional.
        Dense(units=6, activation="relu"),
        Dense(units=NUM_CLASSES, activation="softmax"),
    ],
    name="simple",
)


OPTIMIZER = Adam(learning_rate=0.01)
# Targets are integer class ids (not one-hot vectors), hence the sparse loss.
LOSS = SparseCategoricalCrossentropy()


model.compile(optimizer=OPTIMIZER, loss=LOSS, metrics=["accuracy"])

model.summary()

Model: "simple"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_layer (Embedding)  (None, 10, 300)          3000000   
                                                                 
 global_average_pooling1d (G  (None, 300)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 6)                 1806      
                                                                 
 dense_1 (Dense)             (None, 20)                140       
                                                                 
Total params: 3,001,946
Trainable params: 3,001,946
Non-trainable params: 0
_________________________________________________________________


<IPython.core.display.Javascript object>

In [16]:
def train(X_train, y_train, **kwargs):
    """Fit the notebook-level ``model`` and return the result of ``model.fit``.

    Parameters
    ----------
    X_train
        Training inputs, handed to ``model.fit`` unchanged.
    y_train
        Training targets, handed to ``model.fit`` unchanged.
    **kwargs
        Any further keyword arguments, forwarded verbatim to ``model.fit``.

    Returns
    -------
    Whatever ``model.fit`` returns.
    """
    # ``model`` is not a parameter: it is looked up in the global scope when
    # the function is called.
    return model.fit(X_train, y_train, **kwargs)

<IPython.core.display.Javascript object>

In [17]:
# Training hyper-parameters.
EPOCHS = 10
BATCH_SIZE = 128

# NOTE(review): the held-out test split doubles as the validation set here,
# so the validation metrics are not an independent estimate if they are used
# to tune the model.
history = train(
    sequences_padded_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(sequences_padded_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<IPython.core.display.Javascript object>