In [11]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.8.32-py2.py3-none-any.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 14 kB/s eta 0:00:017
[?25hCollecting nvidia-ml-py3>=7.352.0
  Downloading nvidia-ml-py3-7.352.0.tar.gz (19 kB)
Collecting gql==0.2.0
  Downloading gql-0.2.0.tar.gz (18 kB)
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.1-py3-none-any.whl (7.5 kB)
Collecting watchdog>=0.8.3
  Downloading watchdog-0.10.2.tar.gz (95 kB)
[K     |████████████████████████████████| 95 kB 20 kB/s eta 0:00:01
[?25hCollecting subprocess32>=3.5.3
  Downloading subprocess32-3.5.4.tar.gz (97 kB)
[K     |████████████████████████████████| 97 kB 40 kB/s eta 0:00:01
Collecting configparser>=3.8.1
  Downloading configparser-5.0.0-py3-none-any.whl (22 kB)
Collecting Click>=7.0
  Downloading click-7.1.1-py2.py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 18 kB/s eta 0:00:015
[?25hCollecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-a

In [1]:
import os
import nltk
import math
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, BatchNormalization, Activation, Input, Add, Concatenate
from keras_layer_normalization import LayerNormalization
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stopwords = set(stopwords.words("english"))
ps = PorterStemmer()

Using TensorFlow backend.


# Pipeline

### Data Loader

Python provides a lot of packages to load files in different formats. We provide a simple data loader to help you load .csv files.

In [13]:
def load_data(file_name):
    """
    :param file_name: a file name, type: str
    return a list of ids, a list of reviews, a list of labels
    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
    """
    df = pd.read_csv(file_name)

    return df["review_id"], df["text"], df["stars"]

def load_labels(file_name):
    """
    :param file_name: a file name, type: str
    return a list of labels
    """
    return pd.read_csv(file_name)["stars"]

def write_predictions(file_name, pred):
    df = pd.DataFrame(zip(range(len(pred)), pred))
    df.columns = ["review_id", "stars"]
    df.to_csv(file_name, index=False)


### Feature Extractor


The **feature extractor** is one of the most important parts in a pipeline.
In this tutorial, we introduce four different functions to extract features.


In [14]:
def tokenize(text):
    """
    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    return nltk.word_tokenize(text)

def stem(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of stemmed words, type: list
    e.g.
    Input: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    Output: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     results.append(ps.stem(token))
    # return results

    return [ps.stem(token) for token in tokens]

def n_gram(tokens, n=1):
    """
    :param tokens: a list of tokens, type: list
    :param n: the corresponding n-gram, type: int
    return a list of n-gram tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.'], 2
    Output: ['text mine', 'mine is', 'is to', 'to identifi', 'identifi use', 'use inform', 'inform .']
    """
    if n == 1:
        return tokens
    else:
        results = list()
        for i in range(len(tokens)-n+1):
            # tokens[i:i+n] will return a sublist from i th to i+n th (i+n th is not included)
            results.append(" ".join(tokens[i:i+n]))
        return results
    
def filter_stopwords(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of filtered tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    Output: ['text', 'mine', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     if token not in stopwords and not token.isnumeric():
    #         results.append(token)
    # return results

    return [token for token in tokens if token not in stopwords and not token.isnumeric()]

In [15]:
def get_feats_dict(feats, min_freq=-1, max_freq=-1, max_size=-1):
    """
    :param data: a list of features, type: list(list)
    :param min_freq: the lowest fequency that the fequency of a feature smaller than it will be filtered out, type: int
    :param max_freq: the highest fequency that the fequency of a feature larger than it will be filtered out, type: int
    :param max_size: the max size of feature dict, type: int
    return a feature dict that maps features to indices, sorted by frequencies
    # Counter document: https://docs.python.org/3.6/library/collections.html#collections.Counter
    """
    # count all features
    feat_cnt = Counter(feats) # ["text", "text", "mine"] --> {"text": 2, "mine": 1}
    if max_size > 0 and min_freq == -1 and max_freq == -1:
        valid_feats = [f for f, cnt in feat_cnt.most_common(max_size)]
    else:
        valid_feats = list()
        for f, cnt in feat_cnt.most_common():
            if (min_freq == -1 or cnt >= min_freq) and \
                (max_freq == -1 or cnt <= max_freq):
                valid_feats.append(f)
    if max_size > 0 and len(valid_feats) > max_size:
        valid_feats = valid_feats[:max_size]        
    print("Size of features:", len(valid_feats))
    
    # build a mapping from features to indices
    feats_dict = dict(zip(valid_feats, range(len(valid_feats))))
    return feats_dict

def get_onehot_vector(feats, feats_dict):
    """
    :param feats: a list of features, type: list
    :param feats_dict: a dict from features to indices, type: dict
    return a feature vector,
    """
    # initialize the vector as all zeros
    vector = np.zeros(len(feats_dict), dtype=np.float)
    for f in feats:
        # get the feature index, return -1 if the feature is not existed
        f_idx = feats_dict.get(f, -1)
        if f_idx != -1:
            # set the corresponding element as 1
            vector[f_idx] = 1
    return vector

### Classifier

In this tutorial, we introduce a 1-layer perceptron to classify reviews. This perceptron includes 1 dense layer with the softmax activation.
Keras is the easiest deep learning framework so that we choose it to build this network.

In [31]:
def build_classifier(input_size, output_size,
                     l2_reg=0.0,
                     loss="categorical_crossentropy",
                     optimizer="SGD",
                     learning_rate=0.1,
                     metric="accuracy"):
    """
    :param input_size: the dimension of the input, type: int
    :param output_size: the dimension of the prediction, type: int
    :param l2_reg: the weight for the L2 regularizer, type: str
    :param loss: the training loss, type: str
    :param optimizer: the optimizer, type: str
    :param learning_rate: the learning rate for the optimizer, type: float
    :param metric: the metric, type: str
    return a 1-layer perceptron,
    # losses document: https://keras.io/losses/
    # optimizers document: https://keras.io/optimizers/
    # metrics document: https://keras.io/metrics/
    """
    model = Sequential()
    
    # the projection layer
    model.add(Dense(output_size,
                    activation="softmax",
                    input_dim=input_size,
                    kernel_initializer=keras.initializers.he_normal(seed=0),
                    bias_initializer="zeros",
                    kernel_regularizer=keras.regularizers.l2(l2_reg)))
    
    # set the loss, the optimizer, and the metric
    if optimizer == "SGD":
        optimizer = keras.optimizers.SGD(lr=learning_rate)
    elif optimizer == "RMSprop":
        optmizer = keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer == "Adam":
        optmizer = keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        raise NotImplementedError
    model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
    
    return model

### Make Data Ready

Now we have the data loader, feature extractor, and the classifier. We can connect them to finish this pipeline of classification. We follow the setting in Tutorial 2 and only select the features whose frequencies are no less than 5.

In [23]:
train_file = "data/train.csv"
test_file = "data/valid.csv"
#ans_file = "data/ans.csv"
#pred_file = "data/pred.csv"
min_freq = 3

# load data
train_ids, train_texts, train_labels = load_data(train_file)
test_ids, test_texts, _ = load_data(test_file)
test_labels = load_labels(test_file)

# extract features
train_tokens = [tokenize(text) for text in train_texts]
test_tokens = [tokenize(text) for text in test_texts]

train_stemmed = [stem(tokens) for tokens in train_tokens]
test_stemmed = [stem(tokens) for tokens in test_tokens]

train_stemmed = [filter_stopwords(tokens) for tokens in train_stemmed]
test_stemmed = [filter_stopwords(tokens) for tokens in test_stemmed]

train_2_gram = [n_gram(tokens, 2) for tokens in train_stemmed]
train_3_gram = [n_gram(tokens, 3) for tokens in train_stemmed]
test_2_gram = [n_gram(tokens, 2) for tokens in test_stemmed]
test_3_gram = [n_gram(tokens, 3) for tokens in test_stemmed]

# build the feature list
train_feats = list()
for i in range(len(train_ids)):
    train_feats.append(
        train_stemmed[i] + train_2_gram[i] + train_3_gram[i])
test_feats = list()
for i in range(len(test_ids)):
    test_feats.append(
        test_stemmed[i] + test_2_gram[i] + test_3_gram[i])

# build a mapping from features to indices
feats_dict = get_feats_dict(
    chain.from_iterable(train_feats),
    min_freq=5)

train_feats_matrix = np.vstack(
    [get_onehot_vector(f, feats_dict) for f in train_feats])
test_feats_matrix = np.vstack(
    [get_onehot_vector(f, feats_dict) for f in test_feats])

# convert labels to label_matrix
num_classes = max(train_labels)
# convert each label to a ont-hot vector, and then stack vectors as a matrix
train_label_matrix = keras.utils.to_categorical(train_labels-1, num_classes=num_classes)
test_label_matrix = keras.utils.to_categorical(test_labels-1, num_classes=num_classes)

Size of features: 71646


# Multi-layer Perceptron

One vital change in multi-layer perceptron is the number of layers. We stack more layers so that extracted features can be further enhanced in hidden layers.

In [32]:
def build_MLP(input_size, output_size, num_layers, hidden_size,
              activation="relu",
              dropout_rate=0.01,
              batch_norm=True,
              layer_norm=False,
              l2_reg=0.001,
              loss="categorical_crossentropy",
              optimizer="Adam",
              learning_rate=0.1,
              metric="accuracy"):
    """
    :param input_size: the dimension of the input, type: int
    :param output_size: the dimension of the prediction, type: int
    :param num_layers: the number of layers, type: int
    :param hidden_size: the dimension of the hidden states, type: int
    :param activation: the activation type, type: str
    :param dropout_rate: the probability of dropout, type: float
    :param batch_norm: whether to enable batch normalization, type: bool
    :param layer_norm: whether to enable layer normalization, type: bool
    :param l2_reg: the weight for the L2 regularizer, type: str
    :param loss: the training loss, type: str
    :param optimizer: the optimizer, type: str
    :param learning_rate: the learning rate for the optimizer, type: float
    :param metric: the metric, type: str
    return a multi-layer perceptron,
    # activation
    # dropout document: https://keras.io/layers/core/#dropout
    # batch normalization document: https://keras.io/layers/normalization/
    # layer normalization: https://github.com/CyberZHG/keras-layer-normalization
    # losses document: https://keras.io/losses/
    # optimizers document: https://keras.io/optimizers/
    # metrics document: https://keras.io/metrics/
    """
    model = Sequential()
    
    if num_layers == 1:
        model.add(Dense(output_size,
                        activation="softmax",
                        input_dim=input_size,
                        kernel_initializer=keras.initializers.he_normal(seed=0),
                        bias_initializer="zeros",
                        kernel_regularizer=keras.regularizers.l2(l2_reg)))
    else:
        for i in range(num_layers-1):
            if i == 0:
                # fitst layer: input -> hidden
                model.add(Dense(hidden_size,
                                input_dim=input_size,
                                kernel_initializer=keras.initializers.he_normal(seed=0),
                                bias_initializer="zeros",
                                kernel_regularizer=keras.regularizers.l2(l2_reg)))
            else:
                # hidden layers: hidden -> hidden
                model.add(Dense(hidden_size,
                                input_dim=hidden_size,
                                kernel_initializer=keras.initializers.he_normal(seed=0),
                                bias_initializer="zeros",
                                kernel_regularizer=keras.regularizers.l2(l2_reg)))
            # add layer_norm
            if layer_norm:
                model.add(LayerNormalization())
            # add batch_norm
            if batch_norm:
                model.add(BatchNormalization())
            # add activation
            model.add(Activation(activation))
            # add dropout here (set seed as 0 in order to reproduce)
            if dropout_rate > 0.0:
                model.add(Dropout(dropout_rate, seed=0))
        # last layer: hidden -> class
        model.add(Dense(output_size,
                        activation="softmax",
                        input_dim=hidden_size,
                        kernel_initializer=keras.initializers.he_normal(seed=0),
                        bias_initializer="zeros"))
    
    # set the loss, the optimizer, and the metric
    if optimizer == "SGD":
        optimizer = keras.optimizers.SGD(lr=learning_rate)
    elif optimizer == "RMSprop":
        optmizer = keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer == "Adam":
        optmizer = keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        raise NotImplementedError
    model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
    
    return model

Firstly, we build a single layer perceptron, which can achieve test 57.75% accuracy.

In [33]:
os.makedirs("models", exist_ok=True)
model = build_classifier(input_size=len(feats_dict), output_size=num_classes,
                         l2_reg=0.0001)
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join("models", "weights.hdf5"),
    monitor="val_accuracy",
    verbose=0,
    save_best_only=True)

np.random.seed(0)
tf.random.set_random_seed(0)
slp_history = model.fit(train_feats_matrix, train_label_matrix,
                    validation_split=0.1,
                    epochs=20, batch_size=64, verbose=0,
                    callbacks=[checkpointer])
model = keras.models.load_model(os.path.join("models", "weights.hdf5"))

train_score = model.evaluate(train_feats_matrix, train_label_matrix,
                             batch_size=64)
test_score = model.evaluate(test_feats_matrix, test_label_matrix,
                            batch_size=64)
print("training loss:", train_score[0], "training accuracy", train_score[1])
print("test loss:", test_score[0], "test accuracy", test_score[1])

training loss: 0.6165140730857849 training accuracy 0.8051000237464905
test loss: 0.8969401912689209 test accuracy 0.6449999809265137


Let's try a 3-layer MLP whose hidden size is 100. We choose the **ReLU** as activations in hidden layers, which is used more widely. And another popular activation is **Tanh**.

![ReLU](ReLU.png) ![Tanh](Tanh.png)

In [34]:
model = build_MLP(input_size=len(feats_dict), output_size=num_classes,
                  num_layers=3, hidden_size=100, activation="relu",
                  l2_reg=0.005)
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join("models", "weights.hdf5"),
    monitor="val_accuracy",
    verbose=0,
    save_best_only=True)

np.random.seed(0)
tf.random.set_random_seed(0)
mlp_history = model.fit(train_feats_matrix, train_label_matrix,
                    validation_split=0.1,
                    epochs=20, batch_size=100, verbose=0,
                    callbacks=[checkpointer])
model = keras.models.load_model(os.path.join("models", "weights.hdf5"))

train_score = model.evaluate(train_feats_matrix, train_label_matrix,
                             batch_size=100)
test_score = model.evaluate(test_feats_matrix, test_label_matrix,
                            batch_size=100)
print("training loss:", train_score[0], "training accuracy", train_score[1])
print("test loss:", test_score[0], "test accuracy", test_score[1])

training loss: 1.7992390644550325 training accuracy 0.9331499934196472
test loss: 3.0175549149513246 test accuracy 0.5995000004768372


In [None]:
plt.figure(figsize=(10,4))
plt.subplot(1,2,1)
plt.plot(slp_history.history["loss"], label="SLP-training", color="blue", linestyle="dashed")
plt.plot(slp_history.history["val_loss"], label="SLP-validation", color="blue")
plt.plot(mlp_history.history["loss"], label="MLP-training", color="orange", linestyle="dashed")
plt.plot(mlp_history.history["val_loss"], label="MLP-validation", color="orange")
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.legend()
plt.subplot(1,2,2)
plt.plot(slp_history.history["accuracy"], label="SLP-training", color="blue", linestyle="dashed")
plt.plot(slp_history.history["val_accuracy"], label="SLP-validation", color="blue")
plt.plot(mlp_history.history["accuracy"], label="MLP-training", color="orange", linestyle="dashed")
plt.plot(mlp_history.history["val_accuracy"], label="MLP-validation", color="orange")
plt.xlabel("Iteration")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

Unfortunately, this MLP is not as good as SLP. When we check curves, we find MLP can only achieve a better training accuracy. Hence, we also need to solve the overfitting (to improve the test performance) and underfitting (to decrease the training loss) problems.

# Strategies to Reduce Overfitting Con't

### Dropout

Let's set the dropout rate as 0.2 for MLP.

In [35]:
model = build_MLP(input_size=len(feats_dict), output_size=num_classes,
                  num_layers=3, hidden_size=100, activation="relu",
                  l2_reg=0.005, dropout_rate=0.3)
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join("models", "weights.hdf5"),
    monitor="val_accuracy",
    verbose=0,
    save_best_only=True)

np.random.seed(0)
tf.random.set_random_seed(0)
drop_history = model.fit(train_feats_matrix, train_label_matrix,
                    validation_split=0.1,
                    epochs=20, batch_size=100, verbose=0,
                    callbacks=[checkpointer])
model = keras.models.load_model(os.path.join("models", "weights.hdf5"))

train_score = model.evaluate(train_feats_matrix, train_label_matrix,
                             batch_size=100)
test_score = model.evaluate(test_feats_matrix, test_label_matrix,
                            batch_size=100)
print("training loss:", train_score[0], "training accuracy", train_score[1])
print("test loss:", test_score[0], "test accuracy", test_score[1])

training loss: 1.9498669070005417 training accuracy 0.9010499715805054
test loss: 2.5224180102348326 test accuracy 0.6294999718666077


In [None]:
plt.figure(figsize=(10,4))
plt.subplot(1,2,1)
plt.plot(mlp_history.history["loss"], label="MLP-training", color="blue", linestyle="dashed")
plt.plot(mlp_history.history["val_loss"], label="MLP-validation", color="blue")
plt.plot(drop_history.history["loss"], label="MLP-training (dropout)", color="orange", linestyle="dashed")
plt.plot(drop_history.history["val_loss"], label="MLP-validation (dropout)", color="orange")
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.legend()
plt.subplot(1,2,2)
plt.plot(mlp_history.history["accuracy"], label="MLP-training", color="blue", linestyle="dashed")
plt.plot(mlp_history.history["val_accuracy"], label="MLP-validation", color="blue")
plt.plot(drop_history.history["accuracy"], label="MLP-training (dropout)", color="orange", linestyle="dashed")
plt.plot(drop_history.history["val_accuracy"], label="MLP-validation (dropout)", color="orange")
plt.xlabel("Iteration")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

The dropout improves the model performance from 54.25% to 58.25%. 

# Strategies to Reduce Underfitting

### Normalization

We firstly introduce the most common two methods to help model converge faster.

When models know nothing, they must perform worse in face of the first batch examples. And then each model will update its parameters. But the learning rate controls the weight updating. Let's consider two models. Elements of inputs of the first model are always 0 or 1, and inputs of the other model are always the scaled inputs of the first, e.g., 0 or 100. If we provide infinite time and same settings to train the two models with same data except the scale, the two models must be equivalent. But the first model is very likely to converge first because of the smaller scale. Several normalization methods are proposed to address this problem.

The batch normalization is designed to find feature distributions from a batch of data, while the layer normalization is used to find feature distributions from each example. For example, if 0-th feature is always 0 or 100, then the batch normalization can find it; if the average length of sentences is about 20, then the layer normalization can find it.

Let's try batch normalization, layer normalization, and both. In order to reduce other effects, we disable the regularization and the dropout.

In [None]:
histories = list()
for batch_norm in [False, True]:
    for layer_norm in [False, True]:
        model = build_MLP(input_size=len(feats_dict), output_size=num_classes,
                          num_layers=3, hidden_size=100, activation="relu",
                          batch_norm=batch_norm, layer_norm=layer_norm)
        checkpointer = keras.callbacks.ModelCheckpoint(
            filepath=os.path.join("models", "weights.hdf5"),
            monitor="val_accuracy",
            verbose=0,
            save_best_only=True)

        np.random.seed(0)
        tf.random.set_random_seed(0)
        history = model.fit(train_feats_matrix, train_label_matrix,
                            validation_split=0.1,
                            epochs=20, batch_size=100, verbose=0,
                            callbacks=[checkpointer])
        model = keras.models.load_model(os.path.join("models", "weights.hdf5"),
                                        custom_objects={"LayerNormalization": LayerNormalization})

        train_score = model.evaluate(train_feats_matrix, train_label_matrix,
                                     batch_size=100)
        test_score = model.evaluate(test_feats_matrix, test_label_matrix,
                                    batch_size=100)
        
        histories.append((batch_norm, layer_norm, history))
        print("batch normalization:", batch_norm, "layer normalization:", layer_norm)
        print("training loss:", train_score[0], "training accuracy", train_score[1])
        print("test loss:", test_score[0], "test accuracy", test_score[1])
        print()

In [None]:
colors = ["orange", "gray", "red", "blue"]
plt.figure(figsize=(10,4))

for i, (batch_norm, layer_norm, history) in enumerate(histories):
    if batch_norm and layer_norm:
        name = " (bn & ln)"
    elif batch_norm and not layer_norm:
        name = " (bn)"
    elif not batch_norm and layer_norm:
        name = " (ln)"
    else:
        name = ""
    plt.subplot(1,2,1)
    plt.plot(history.history["loss"], label="MLP-training" + name, color=colors[i], linestyle="dashed")
    plt.plot(history.history["val_loss"], label="MLP-validation" + name, color=colors[i])
    plt.subplot(1,2,2)
    plt.plot(history.history["accuracy"], label="MLP-training" + name, color=colors[i], linestyle="dashed")
    plt.plot(history.history["val_accuracy"], label="MLP-validation" + name, color=colors[i])
plt.subplot(1,2,1)
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.legend(fontsize=8)
plt.subplot(1,2,2)
plt.xlabel("Iteration")
plt.ylabel("Accuracy")
# plt.legend()
plt.show()

As you can see, all normalization can improve the training performance significantly and test performance slightly. Once we choose these normalization methods, we still need to use strategies to avoid overfitting.

### Residual Connection

In [None]:
def build_Res_Net(input_size, output_size, num_layers, hidden_size,
              activation="relu",
              dropout_rate=0.0,
              batch_norm=False,
              layer_norm=False,
              l2_reg=0.0,
              loss="categorical_crossentropy",
              optimizer="SGD",
              learning_rate=0.1,
              metric="accuracy"):
    """
    :param input_size: the dimension of the input, type: int
    :param output_size: the dimension of the prediction, type: int
    :param num_layers: the number of layers, type: int
    :param hidden_size: the dimension of the hidden states, type: int
    :param activation: the activation type, type: str
    :param dropout_rate: the probability of dropout, type: float
    :param batch_norm: whether to enable batch normalization, type: bool
    :param layer_norm: whether to enable layer normalization, type: bool
    :param l2_reg: the weight for the L2 regularizer, type: str
    :param loss: the training loss, type: str
    :param optimizer: the optimizer, type: str
    :param learning_rate: the learning rate for the optimizer, type: float
    :param metric: the metric, type: str
    return a multi-layer network with residual connections,
    # activation
    # dropout document: https://keras.io/layers/core/#dropout
    # batch normalization document: https://keras.io/layers/normalization/
    # layer normalization: https://github.com/CyberZHG/keras-layer-normalization
    # losses document: https://keras.io/losses/
    # optimizers document: https://keras.io/optimizers/
    # metrics document: https://keras.io/metrics/
    """
    x = Input(shape=(input_size,))
    
    if num_layers == 1:
        y = Dense(output_size,
                  activation="softmax",
                  input_dim=input_size,
                  kernel_initializer=keras.initializers.he_normal(seed=0),
                  bias_initializer="zeros",
                  kernel_regularizer=keras.regularizers.l2(l2_reg))(x)
    else:
        h = x
        for i in range(num_layers-1):
            if i == 0:
                # fitst layer: input -> hidden
                new_h = Dense(hidden_size,
                          input_dim=input_size,
                          kernel_initializer=keras.initializers.he_normal(seed=0),
                          bias_initializer="zeros",
                          kernel_regularizer=keras.regularizers.l2(l2_reg))(h)
            else:
                new_h = Dense(hidden_size,
                          input_dim=hidden_size,
                          kernel_initializer=keras.initializers.he_normal(seed=0),
                          bias_initializer="zeros",
                          kernel_regularizer=keras.regularizers.l2(l2_reg))(h)
            # add layer_norm
            if layer_norm:
                new_h = LayerNormalization()(new_h)
            # add batch_norm
            if batch_norm:
                new_h = BatchNormalization()(new_h)
            # residual connection
            if i == 0:
                h = new_h
            else:
                h = Add()([h, new_h])
            # add activation
            h = Activation(activation)(h)
            # add dropout here (set seed as 0 in order to reproduce)
            if dropout_rate > 0.0:
                h = Dropout(dropout_rate, seed=0)(h)
        # last layer: hidden -> class
        y = Dense(output_size,
                  activation="softmax",
                  input_dim=hidden_size,
                  kernel_initializer=keras.initializers.he_normal(seed=0),
                  bias_initializer="zeros")(h)
    
    # set the loss, the optimizer, and the metric
    if optimizer == "SGD":
        optimizer = keras.optimizers.SGD(lr=learning_rate)
    elif optimizer == "RMSprop":
        optmizer = keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer == "Adam":
        optmizer = keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        raise NotImplementedError
    model = Model(x, y)
    model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
    
    return model

In [None]:
model = build_Res_Net(input_size=len(feats_dict), output_size=num_classes,
                  num_layers=3, hidden_size=100, activation="relu",
                  l2_reg=0.005, dropout_rate=0.1)
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join("models", "weights.hdf5"),
    monitor="val_accuracy",
    verbose=0,
    save_best_only=True)

np.random.seed(0)
tf.random.set_random_seed(0)
res_history = model.fit(train_feats_matrix, train_label_matrix,
                        validation_split=0.1,
                        epochs=20, batch_size=100, verbose=0,
                        callbacks=[checkpointer])
model = keras.models.load_model(os.path.join("models", "weights.hdf5"))

train_score = model.evaluate(train_feats_matrix, train_label_matrix,
                             batch_size=100)
test_score = model.evaluate(test_feats_matrix, test_label_matrix,
                            batch_size=100)
print("training loss:", train_score[0], "training accuracy", train_score[1])
print("test loss:", test_score[0], "test accuracy", test_score[1])

In [None]:
plt.figure(figsize=(10,4))
plt.subplot(1,2,1)
plt.plot(drop_history.history["loss"], label="MLP-training", color="blue", linestyle="dashed")
plt.plot(drop_history.history["val_loss"], label="MLP-validation", color="blue")
plt.plot(res_history.history["loss"], label="Res-training", color="orange", linestyle="dashed")
plt.plot(res_history.history["val_loss"], label="Res-validation", color="orange")
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.legend()
plt.subplot(1,2,2)
plt.plot(drop_history.history["accuracy"], label="MLP-training", color="blue", linestyle="dashed")
plt.plot(drop_history.history["val_accuracy"], label="MLP-validation", color="blue")
plt.plot(res_history.history["accuracy"], label="Res-training", color="orange", linestyle="dashed")
plt.plot(res_history.history["val_accuracy"], label="Res-validation", color="orange")
plt.xlabel("Iteration")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

Residual connections can also decrease the training loss, especially in the beginning of training.

### Concatenation

In [None]:
def build_Cat_Net(input_size, output_size, num_layers, hidden_size,
              activation="relu",
              dropout_rate=0.0,
              batch_norm=False,
              layer_norm=False,
              l2_reg=0.0,
              loss="categorical_crossentropy",
              optimizer="SGD",
              learning_rate=0.1,
              metric="accuracy"):
    """
    :param input_size: the dimension of the input, type: int
    :param output_size: the dimension of the prediction, type: int
    :param num_layers: the number of layers, type: int
    :param hidden_size: the dimension of the hidden states, type: int
    :param activation: the activation type, type: str
    :param dropout_rate: the probability of dropout, type: float
    :param batch_norm: whether to enable batch normalization, type: bool
    :param layer_norm: whether to enable layer normalization, type: bool
    :param l2_reg: the weight for the L2 regularizer, type: str
    :param loss: the training loss, type: str
    :param optimizer: the optimizer, type: str
    :param learning_rate: the learning rate for the optimizer, type: float
    :param metric: the metric, type: str
    return a multi-layer networks with concatenations,
    # activation
    # dropout document: https://keras.io/layers/core/#dropout
    # batch normalization document: https://keras.io/layers/normalization/
    # layer normalization: https://github.com/CyberZHG/keras-layer-normalization
    # losses document: https://keras.io/losses/
    # optimizers document: https://keras.io/optimizers/
    # metrics document: https://keras.io/metrics/
    """
    x = Input(shape=(input_size,))
    
    if num_layers == 1:
        y = Dense(output_size,
                  activation="softmax",
                  input_dim=input_size,
                  kernel_initializer=keras.initializers.he_normal(seed=0),
                  bias_initializer="zeros",
                  kernel_regularizer=keras.regularizers.l2(l2_reg))(x)
    else:
        h = x
        for i in range(num_layers-1):
            if i == 0:
                # fitst layer: input -> hidden
                new_h = Dense(hidden_size,
                          input_dim=input_size,
                          kernel_initializer=keras.initializers.he_normal(seed=0),
                          bias_initializer="zeros",
                          kernel_regularizer=keras.regularizers.l2(l2_reg))(h)
            else:
                new_h = Dense(hidden_size,
                          kernel_initializer=keras.initializers.he_normal(seed=0),
                          bias_initializer="zeros",
                          kernel_regularizer=keras.regularizers.l2(l2_reg))(h)
            # add layer_norm
            if layer_norm:
                new_h = LayerNormalization()(new_h)
            # add batch_norm
            if batch_norm:
                new_h = BatchNormalization()(new_h)
            # residual connection
            if i == 0:
                h = new_h
            else:
                h = Concatenate()([h, new_h])
            # add activation
            h = Activation(activation)(h)
            # add dropout here (set seed as 0 in order to reproduce)
            if dropout_rate > 0.0:
                h = Dropout(dropout_rate, seed=0)(h)
        # last layer: hidden -> class
        y = Dense(output_size,
                  activation="softmax",
                  kernel_initializer=keras.initializers.he_normal(seed=0),
                  bias_initializer="zeros")(h)
    
    # set the loss, the optimizer, and the metric
    if optimizer == "SGD":
        optimizer = keras.optimizers.SGD(lr=learning_rate)
    elif optimizer == "RMSprop":
        optmizer = keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer == "Adam":
        optmizer = keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        raise NotImplementedError
    model = Model(x, y)
    model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
    
    return model

In [None]:
model = build_Cat_Net(input_size=len(feats_dict), output_size=num_classes,
                  num_layers=3, hidden_size=100, activation="relu",
                  l2_reg=0.005, dropout_rate=0.0)
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join("models", "weights.hdf5"),
    monitor="val_accuracy",
    verbose=0,
    save_best_only=True)

np.random.seed(0)
tf.random.set_random_seed(0)
cat_history = model.fit(train_feats_matrix, train_label_matrix,
                        validation_split=0.1,
                        epochs=20, batch_size=100, verbose=0,
                        callbacks=[checkpointer])
model = keras.models.load_model(os.path.join("models", "weights.hdf5"))

train_score = model.evaluate(train_feats_matrix, train_label_matrix,
                             batch_size=100)
test_score = model.evaluate(test_feats_matrix, test_label_matrix,
                            batch_size=100)
print("training loss:", train_score[0], "training accuracy", train_score[1])
print("test loss:", test_score[0], "test accuracy", test_score[1])

In [None]:
plt.figure(figsize=(10,4))
plt.subplot(1,2,1)
plt.plot(drop_history.history["loss"], label="MLP-training", color="blue", linestyle="dashed")
plt.plot(drop_history.history["val_loss"], label="MLP-validation", color="blue")
plt.plot(res_history.history["loss"], label="Res-training", color="orange", linestyle="dashed")
plt.plot(res_history.history["val_loss"], label="Res-validation", color="orange")
plt.plot(cat_history.history["loss"], label="Cat-training", color="red", linestyle="dashed")
plt.plot(cat_history.history["val_loss"], label="Cat-validation", color="red")
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.legend()
plt.subplot(1,2,2)
plt.plot(drop_history.history["accuracy"], label="MLP-training", color="blue", linestyle="dashed")
plt.plot(drop_history.history["val_accuracy"], label="MLP-validation", color="blue")
plt.plot(res_history.history["accuracy"], label="Res-training", color="orange", linestyle="dashed")
plt.plot(res_history.history["val_accuracy"], label="Res-validation", color="orange")
plt.plot(cat_history.history["accuracy"], label="Cat-training", color="red", linestyle="dashed")
plt.plot(cat_history.history["val_accuracy"], label="Cat-validation", color="red")
plt.xlabel("Iteration")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

The concatenation has similar training curves to the model with residual connections. One drawback is the increased amount of parameters. When the network goes deeper, you need to balance.

# Final Powerful Deep Network

We have learned so many strategies to avoid overfitting and underfitting. You can use all of them or some of them to improve your own deep networks.

In [None]:
model = None
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join("models", "weights.hdf5"),
    monitor="val_accuracy",
    verbose=0,
    save_best_only=True)

np.random.seed(0)
tf.random.set_seed(0)
history = model.fit(train_feats_matrix, train_label_matrix,
                        validation_split=0.1,
                        epochs=20, batch_size=100, verbose=0,
                        callbacks=[checkpointer])
model = keras.models.load_model(os.path.join("models", "weights.hdf5"),
                                custom_objects={"LayerNormalization": LayerNormalization})

train_score = model.evaluate(train_feats_matrix, train_label_matrix,
                             batch_size=100)
test_score = model.evaluate(test_feats_matrix, test_label_matrix,
                            batch_size=100)
print("training loss:", train_score[0], "training accuracy", train_score[1])
print("test loss:", test_score[0], "test accuracy", test_score[1])

In [None]:
plt.figure(figsize=(10,4))
plt.subplot(1,2,1)
plt.plot(drop_history.history["loss"], label="MLP-training", color="blue", linestyle="dashed")
plt.plot(drop_history.history["val_loss"], label="MLP-validation", color="blue")
plt.plot(history.history["loss"], label="Yours-training", color="orange", linestyle="dashed")
plt.plot(history.history["val_loss"], label="Yours-validation", color="orange")
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.legend()
plt.subplot(1,2,2)
plt.plot(drop_history.history["accuracy"], label="MLP-training", color="blue", linestyle="dashed")
plt.plot(drop_history.history["val_accuracy"], label="MLP-validation", color="blue")
plt.plot(history.history["accuracy"], label="Yours-training", color="orange", linestyle="dashed")
plt.plot(history.history["val_accuracy"], label="Yours-validation", color="orange")
plt.xlabel("Iteration")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

# Out of Tutorials

### Optimizers

In this tutorial and previous tutorials, we use the stochastic gradient descent optimizer to optimize the training loss. In fact, there is much cutting-edge research work about optimization. One of the most common is Adam, you can try it by setting the parameter *optimizer*. Of course, Keras provides many more choices for users and you can choose by your validation results.

### Normalization

We introduce the batch normalization and the layer normalization to help model converge. But more robust and higher-efficiency normalization methods have been proposed, such as the instance normalization and the group normalization. If you are interested in normalization, you can read related articles and implement some of them.

### Efficient Architectures

We have moved from the single layer perceptron to the multi-layer perceptron. But that's still not enough. The residual connection and the concatenation can improve deeper networks. Building highway networks is also a way to increase the power of multiple layers. Google also designs a wide & deep network for memorization and generalization. These architectures are still based on these simple linear layers and activations. If you can grasp strategies in tutorial 2 and tutorial 3, I believe you can design promising networks on your own.