In [1]:
# TUMCUD - Total Utility Maxmimization model for Cutting of Under-segmented Data

# You can download the jupyter notebook file by

# !wget --no-check-certificate https://raw.githubusercontent.com/ekapolc/NLP_2021/main/HW2/hw2_nn_word_tokenizer.ipynb

## Word Tokenizer exercise##

In this exercise, you are going to build a set of deep learning models on a (sort of) real world task using Tensorflow and Keras. Tensorflow is a deep learning framwork developed by Google, and Keras is a frontend library built on top of Tensorflow (or Theano, CNTK) to provide an easier way to use standard layers and networks.

To complete this exercise, you will need to build deep learning models for word tokenization in Thai (แบ่งเว้นวรรคภาษาไทย) using NECTEC's BEST corpus. You will build one model for each of the following type:
- Fully Connected (Feedforward) Neural Network
- One-Dimentional Convolution Neural Network (1D-CNN)
- Recurrent Neural Network with Gated Recurrent Unit (GRU)

and one more model of your choice to achieve the highest score possible.

We provide the code for data cleaning and some starter code for keras in this notebook but feel free to modify those parts to suit your needs. You can also complete this exercise using only Tensorflow (without using Keras). Feel free to use additional libraries (e.g. scikit-learn) as long as you have a model for each type mentioned above.

**Don't forget to change hardware accelerator to GPU in Google Colab.**


In [2]:
#Select tensorflow 2.0
%tensorflow_version 2.x

In [3]:
# Run setup code
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google_drive_downloader import GoogleDriveDownloader as gdd
import tensorflow as tf
%matplotlib inline

In [4]:
#Check GPU is available
tf.test.gpu_device_name()

'/device:GPU:0'

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Download dataset
gdd.download_file_from_google_drive(file_id='1iodAqVNWEkiJgH8cWkccsLi_tqoFcMrV',
                                    dest_path='./corpora.tar.gz')

Downloading 1iodAqVNWEkiJgH8cWkccsLi_tqoFcMrV into ./corpora.tar.gz... Done.


In [None]:
!tar xvzf corpora.tar.gz

corpora/
corpora/mnist_data/
corpora/mnist_data/t10k-images-idx3-ubyte.gz
corpora/mnist_data/train-images-idx3-ubyte.gz
corpora/mnist_data/.ipynb_checkpoints/
corpora/mnist_data/vis_utils.py
corpora/mnist_data/__init__.py
corpora/mnist_data/load_mnist.py
corpora/mnist_data/train-labels-idx1-ubyte.gz
corpora/mnist_data/t10k-labels-idx1-ubyte.gz
corpora/BEST/
corpora/BEST/test/
corpora/BEST/test/df_best_article_test.csv
corpora/BEST/test/df_best_encyclopedia_test.csv
corpora/BEST/test/df_best_novel_test.csv
corpora/BEST/test/df_best_news_test.csv
corpora/BEST/train/
corpora/BEST/train/df_best_encyclopedia_train.csv
corpora/BEST/train/df_best_article_train.csv
corpora/BEST/train/df_best_news_train.csv
corpora/BEST/train/df_best_novel_train.csv
corpora/BEST/val/
corpora/BEST/val/df_best_encyclopedia_val.csv
corpora/BEST/val/df_best_news_val.csv
corpora/BEST/val/df_best_article_val.csv
corpora/BEST/val/df_best_novel_val.csv
corpora/.ipynb_checkpoints/
corpora/.ipynb_checkpoints/Word_Tokeniz

In [None]:
!pip install pytictoc

Collecting pytictoc
  Downloading https://files.pythonhosted.org/packages/17/fa/c60631a899d6bb370f58d4cd95162ec3c204cd3987f2d65bd7c6a3270dd5/pytictoc-1.5.1-py2.py3-none-any.whl
Installing collected packages: pytictoc
Successfully installed pytictoc-1.5.1


In [None]:
from pytictoc import TicToc
t = TicToc() #create instance of class

In [None]:
# Prepare data
# You don't need to run the following code as we already did it for you to give everyone the same dataset
# import cattern.data_utils
# cattern.data_utils.generate_best_dataset(os.getcwd()+'/data', create_val=True)

For simplicity, we are going to build a word tokenization model which is a binary classification model trying to predict whether a character is the begining of the word or not (if it is, then there is a space in front of it) and without using any knowledge about type of character (vowel, number, English character etc.).

For example,

'แมวดำน่ารักมาก' -> 'แมว ดำ น่า รัก มาก'

will have these true labels:

[(แ,1), (ม,0), (ว,0) (ด,1), ( ำ,0), (น,1), (-่,0), (า,0), (ร,1), (-ั,0), (ก,0), (ม,1), (า,0), (ก,0)]

In this task, we will use only main character you are trying to predict  and the characters that surround it (the context) as features. However, you can imagine that a more complex model will try to include more knowledge about each character into the model. You can do that too if you feel like it.

In [None]:
# Create a character map
CHARS = [
  '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+',
  ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8',
  '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E',
  'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
  'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  'n', 'o', 'other', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y',
  'z', '}', '~', 'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช',
  'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท',
  'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', 'ร', 'ฤ',
  'ล', 'ว', 'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', 'ฯ', 'ะ', 'ั', 'า',
  'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ฺ', 'เ', 'แ', 'โ', 'ใ', 'ไ',
  'ๅ', 'ๆ', '็', '่', '้', '๊', '๋', '์', 'ํ', '๐', '๑', '๒', '๓',
  '๔', '๕', '๖', '๗', '๘', '๙', '‘', '’', '\ufeff'
]
CHARS_MAP = {v: k for k, v in enumerate(CHARS)}

In [None]:
def create_n_gram_df(df, n_pad):
  """
  Given an input dataframe, create a feature dataframe of shifted characters
  Input:
  df: timeseries of size (N)
  n_pad: the number of context. For a given character at position [idx],
    character at position [idx-n_pad/2 : idx+n_pad/2] will be used 
    as features for that character.
  
  Output:
  dataframe of size (N * n_pad) which each row contains the character, 
    n_pad_2 characters to the left, and n_pad_2 characters to the right
    of that character.
  """
  n_pad_2 = int((n_pad - 1)/2)
  for i in range(n_pad_2):
      df['char-{}'.format(i+1)] = df['char'].shift(i + 1)
      df['char{}'.format(i+1)] = df['char'].shift(-i - 1)
  return df[n_pad_2: -n_pad_2]

In [None]:
def prepare_feature(best_processed_path, option='train'):
  """
  Transform the path to a directory containing processed files 
  into a feature matrix and output array
  Input:
  best_processed_path: str, path to a processed version of the BEST dataset
  option: str, 'train' or 'test'
  """
  # we use padding equals 21 here to consider 10 characters to the left
  # and 10 characters to the right as features for the character in the middle
  n_pad = 21
  n_pad_2 = int((n_pad - 1)/2)
  pad = [{'char': ' ', 'target': True}]
  df_pad = pd.DataFrame(pad * n_pad_2)

  df = []
  # article types in BEST corpus
  article_types = ['article', 'encyclopedia', 'news', 'novel']
  for article_type in article_types:
      df.append(pd.read_csv(os.path.join(best_processed_path, option, 'df_best_{}_{}.csv'.format(article_type, option))))
  
  df = pd.concat(df)
  # pad with empty string feature
  df = pd.concat((df_pad, df, df_pad))

  # map characters to numbers, use 'other' if not in the predefined character set.
  df['char'] = df['char'].map(lambda x: CHARS_MAP.get(x, 80))

  # Use nearby characters as features
  df_with_context = create_n_gram_df(df, n_pad=n_pad)

  char_row = ['char' + str(i + 1) for i in range(n_pad_2)] + \
             ['char-' + str(i + 1) for i in range(n_pad_2)] + ['char']

  # convert pandas dataframe to numpy array to feed to the model
  x_char = df_with_context[char_row].to_numpy()
  y = df_with_context['target'].astype(int).to_numpy()

  return x_char, y

Before running the following commands, we must inform you that our data is quite large and loading the whole dataset at once will **use a lot of memory (~6 GB after processing and up to ~12GB while processing)**. We expect you to be running this on Google Cloud or Google Colab so that you will not run into this problem. But, if, for any reason, you have to run this on your PC or machine with not enough memory, you might need to write a data generator to process a few entries at a time then feed it to the model while training.

For keras, you can use [fit_generator](https://keras.io/getting-started/faq/#how-can-i-use-keras-with-datasets-that-dont-fit-in-memory) to cope with that.

In [None]:
# Path to the preprocessed data
best_processed_path = 'corpora/BEST'

In [None]:
# Load preprocessed BEST corpus
x_train_char, y_train = prepare_feature(best_processed_path, option='train')
x_val_char, y_val = prepare_feature(best_processed_path, option='val')
x_test_char, y_test = prepare_feature(best_processed_path, option='test')

# As a sanity check, we print out the size of the training, val, and test data.
print('Training data shape: ', x_train_char.shape)
print('Training data labels shape: ', y_train.shape)
print('Validation data shape: ', x_val_char.shape)
print('Validation data labels shape: ', y_val.shape)
print('Test data shape: ', x_test_char.shape)
print('Test data labels shape: ', y_test.shape)

Training data shape:  (16461637, 21)
Training data labels shape:  (16461637,)
Validation data shape:  (2035694, 21)
Validation data labels shape:  (2035694,)
Test data shape:  (2271932, 21)
Test data labels shape:  (2271932,)


In [None]:
# Print some entry from the data to make sure it is the same as what you think.
print('First 3 features: ', x_train_char[:3])
print('First 30 class labels', y_train[:30])

First 3 features:  [[112. 140. 114. 148. 130. 142.  94. 142. 128. 128.   1.   1.   1.   1.
    1.   1.   1.   1.   1.   1.  97.]
 [140. 114. 148. 130. 142.  94. 142. 128. 128. 141.  97.   1.   1.   1.
    1.   1.   1.   1.   1.   1. 112.]
 [114. 148. 130. 142.  94. 142. 128. 128. 141. 109. 112.  97.   1.   1.
    1.   1.   1.   1.   1.   1. 140.]]
First 30 class labels [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0]


In [None]:
#print char of feature 1
char = np.array(CHARS)

#A function for displaying our features in text
def print_features(tfeature,label,index):
    feature = np.array(tfeature[index],dtype=int).reshape(21,1)
    #Convert to string
    char_list = char[feature]
    left = ''.join(reversed(char_list[10:20].reshape(10))).replace(" ", "")
    center = ''.join(char_list[20])
    right =  ''.join(char_list[0:10].reshape(10)).replace(" ", "")
    word = ''.join([left,' ',center,' ',right])
    print(center + ': ' + word + "\tpred = "+str(label[index]))

for ind in range(0,30):
    print_features(x_train_char,y_train,ind)

ค:  ค ณะตุลาการร	pred = 1
ณ: ค ณ ะตุลาการรั	pred = 0
ะ: คณ ะ ตุลาการรัฐ	pred = 0
ต: คณะ ต ุลาการรัฐธ	pred = 0
ุ: คณะต ุ ลาการรัฐธร	pred = 0
ล: คณะตุ ล าการรัฐธรร	pred = 0
า: คณะตุล า การรัฐธรรม	pred = 0
ก: คณะตุลา ก ารรัฐธรรมน	pred = 0
า: คณะตุลาก า รรัฐธรรมนู	pred = 0
ร: คณะตุลากา ร รัฐธรรมนูญ	pred = 0
ร: คณะตุลาการ ร ัฐธรรมนูญก	pred = 0
ั: ณะตุลาการร ั ฐธรรมนูญกั	pred = 0
ฐ: ะตุลาการรั ฐ ธรรมนูญกับ	pred = 0
ธ: ตุลาการรัฐ ธ รรมนูญกับค	pred = 0
ร: ุลาการรัฐธ ร รมนูญกับคว	pred = 0
ร: ลาการรัฐธร ร มนูญกับควา	pred = 0
ม: าการรัฐธรร ม นูญกับความ	pred = 0
น: การรัฐธรรม น ูญกับความเ	pred = 0
ู: ารรัฐธรรมน ู ญกับความเป	pred = 0
ญ: รรัฐธรรมนู ญ กับความเป็	pred = 0
ก: รัฐธรรมนูญ ก ับความเป็น	pred = 1
ั: ัฐธรรมนูญก ั บความเป็นอ	pred = 0
บ: ฐธรรมนูญกั บ ความเป็นอง	pred = 0
ค: ธรรมนูญกับ ค วามเป็นองค	pred = 1
ว: รรมนูญกับค ว ามเป็นองค์	pred = 0
า: รมนูญกับคว า มเป็นองค์ก	pred = 0
ม: มนูญกับควา ม เป็นองค์กร	pred = 0
เ: นูญกับความ เ ป็นองค์กรต	pred = 1
ป: ูญกับความเ ป ็นองค์กรตุ	pred = 0
็: ญกับความ

Now, you are going to define the model to be used as your classifier. If you are using Keras, please follow the guideline we provide below. We encourage you to use the Keras Funtional API instead of the Sequential model as it is easier to create complex models (and to read your code). You can find more about Keras Functional API on Keras [documentation](https://keras.io/getting-started/functional-api-guide/).

You need to create a function that return a model you want and write code that invoke _**model.fit**_ to train your model. For example, your function ***get_nn()*** might looks like this:

```python
def get_nn():
  input1 = Input(...)
  ...
  # Create your wonderful model.
  ...
  out = ...
  model = Model(inputs=input1, outputs=out)
  model.compile(...)
return model
```

Also, beware that complex model requires more time to train and your dataset is already quite large. We tested it with a simple 1-hidden-layered feedforward nueral network and it used ~10 mins to train 1 epoch. It got more than 95% accuracy on validation set after the first epoch, so you should aim for a model with accuracy around 96-98%.

# Three-Layer Feedforward Neural Networks

Below, we provide you the code for creating a 3-layer fully connected neural network in keras. This will also serve as the baseline for your other models. Run the code below while making sure you understand what you are doing. Then, report the results.

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

def get_feedforward_nn():
  input1 = Input(shape=(21,))
  x = Dense(100, activation='relu')(input1)
  x = Dense(100, activation='relu')(x)
  x = Dense(100, activation='relu')(x)
  out = Dense(1, activation='sigmoid')(x)

  model = Model(inputs=input1, outputs=out)
  model.compile(optimizer=Adam(),
                loss='binary_crossentropy',
                metrics=['acc'])
  return model

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard

# This is called to clear the original model session in order to use TensorBoard
from tensorflow.keras import backend as K
K.clear_session()

# Path to save model parameters
weight_path_feedforward_nn='./model_weight_feedforward_nn.h5'

# Training callbacks list. TensorBoard() write logs for tensorboard GUI. 
# ModelCheckpoint() writes the resulting model.
# Note that writing to disk takes time (longer than model training time). 
# For other sections, you might not writing any files to disk 
# or write only the graph for TensorBoard.

callbacks_list_feedforward_nn = [
        TensorBoard(log_dir='./Graph/ff', histogram_freq=1, write_graph=True, write_grads=True),
        ModelCheckpoint(
            weight_path_feedforward_nn,
            save_best_only=True,
            save_weights_only=True,
            monitor='val_loss',
            mode='min',
            verbose=1
        )
  ]

print('start training')
verbose = 1
model_feedforward_nn = get_feedforward_nn()

t.tic()
train_params = [(3, 512)]
for (epochs, batch_size) in train_params:
  print("train with {} epochs and {} batch size".format(epochs, batch_size))
  model_feedforward_nn.fit(x_train_char, y_train, epochs=epochs, batch_size=batch_size, verbose=verbose,
                           callbacks=callbacks_list_feedforward_nn,
                           validation_data=(x_val_char, y_val))
t.toc()

start training
train with 3 epochs and 512 batch size
Epoch 1/3

Epoch 00001: val_loss improved from inf to 0.29731, saving model to ./model_weight_feedforward_nn.h5
Epoch 2/3

Epoch 00002: val_loss improved from 0.29731 to 0.25723, saving model to ./model_weight_feedforward_nn.h5
Epoch 3/3

Epoch 00003: val_loss improved from 0.25723 to 0.25058, saving model to ./model_weight_feedforward_nn.h5
Elapsed time is 278.189627 seconds.


In [None]:
################################################################################
# In case you want to close the session, you can re-load the model by          #
################################################################################
# weight_path_feedforward_nn='/data/model_weight_feedforward_nn.h5'
# model_feedforward_nn = get_feedforward_nn()
# model_feedforward_nn.load_weights(weight_path_feedforward_nn)
# model_feedforward_nn._make_predict_function()
# model_feedforward_nn.summary()

In [None]:
from sklearn.metrics import f1_score,precision_score,recall_score

################################################################################
# Write a function to evaluate your model. Your function must make prediction  #
# using the input model and return f-score, precision, and recall of the model.#
# You can make predictions by calling model.predict().                         #
################################################################################
def evaluate(x_test, y_test, model):
  """
  Evaluate model on the splitted 10 percent testing set.
  """
  y_pred = model.predict(x_test)

  #map probability to class
  prob_to_class = lambda p: 1 if p[0]>=0.5 else 0
  y_pred = np.apply_along_axis(prob_to_class,1,y_pred)
    
  f1score = f1_score(y_test,y_pred)
  precision = precision_score(y_test,y_pred)
  recall = recall_score(y_test,y_pred)
  return f1score, precision, recall, y_pred

In [None]:
f1score, precision, recall, y_pred = evaluate(x_test_char, y_test, model_feedforward_nn)

In [None]:
print('F1 : ', f1score)
print('Precision : ', precision)
print('Recall : ', recall)

F1 :  0.8062943766417542
Precision :  0.8494439003917179
Recall :  0.7673166994447472


# Debugging

In order to understand what is going on in your model and where the error is, you should try looking at the inputs your model made wrong predictions.

In this task, write a function to print the characters on test data that got wrong prediction along with its context of size 10 (from [x-10] to [x+10]). Examine a fews of those and write your assumption on where the model got wrong prediction.

In [None]:
# TODO#1
# Write code to show a few of the errors the models made.
pass

In [None]:
#A function for displaying our prediction
def print_prediction(tfeature,label,index,prediction):
    feature = np.array(tfeature[index],dtype=int).reshape(21,1)
    #Convert to string
    char_list = char[feature]
    left = ''.join(reversed(char_list[10:20].reshape(10))).replace(" ", "")
    center = ''.join(char_list[20])
    right =  ''.join(char_list[0:10].reshape(10)).replace(" ", "")
    word = ''.join([left,' ',center,' ',right])
    print(center + ': ' + word + "\tlabel = "+str(label[index]) + "\tpred = "+str(prediction[index]))

In [None]:
y_pred.shape

(2271932,)

In [None]:
ind_wrong = [i for i, e in enumerate(y_test) if y_test[i] != y_pred[i]]

In [None]:
len(ind_wrong)

233393

In [None]:
for ind in ind_wrong[:30]:
    print_prediction(x_test_char,y_test,ind,y_pred)

ิ: ปฏ ิ รูปการศึกษ	label = 0	pred = 1
ท: า:มุมมอง ท างกระบวนทั	label = 1	pred = 0
ท: งทางกระบวน ท ัศน์และบริ	label = 1	pred = 0
บ: วนทัศน์และ บ ริบทสังคมไ	label = 1	pred = 0
ร: นทัศน์และบ ร ิบทสังคมไท	label = 0	pred = 1
ส: น์และบริบท ส ังคมไทยThe	label = 1	pred = 0
ไ: บริบทสังคม ไ ทยTheRefo	label = 1	pred = 0
ท: ริบทสังคมไ ท ยTheRefor	label = 0	pred = 1
ท: tiveกระบวน ท ัศน์และวิธ	label = 1	pred = 0
ว: วนทัศน์และ ว ิธีคิดแบบแ	label = 1	pred = 0
แ: และวิธีคิด แ บบแยกส่วน	label = 1	pred = 0
บ: ะวิธีคิดแบ บ แยกส่วนลด	label = 0	pred = 1
แ: วิธีคิดแบบ แ ยกส่วนลดส	label = 1	pred = 0
ส: ีคิดแบบแยก ส ่วนลดส่วน	label = 1	pred = 0
เ: ้"การศึกษา เ รียนรู้"ใน	label = 1	pred = 0
ร: ศึกษาเรียน ร ู้"ในหลาย	label = 0	pred = 1
ท: ู้"ในหลาย ท ศวรรษที่ผ่	label = 1	pred = 0
ก: าการด้านศึ ก ษาศาสตร์ค	label = 0	pred = 1
เ: ื่องของโรง เ รียนครูอา	label = 1	pred = 0
ง: รย์กระทรว ง ศึกษาธิการ	label = 0	pred = 1
ม: ธิการทบวง ม หาวิทยาลัย	label = 0	pred = 1
อ: ทยาลัยฯมา อ ย่างต่อเนื	label = 1	pred = 0
ภ: ่เรื่องส

# Write your answer here

**Your answer**: TODO#2

My assumptions on where the model got wrong prediction following.
- มีการ predict สระอิ,สระอา เป็นตัวต้นคำ และมีการ predict ไม้มลาย, สระโอ, สระเอ ไม่เป็นตัวต้นคำ
- มีการตัดคำผิดกับคำที่เป็น Subset ในคำที่ยาวกว่า เช่น 
  - "เรียนรู้" model predict ร. ใน "รู้" เป็นตัวต้นคำ ซึ่งอาจจะถูกในบริบทอื่น ๆ
  - "วิพากษ์วิจารณ์" model predict ว. ใน "วิจารณ์" เป็นตัวต้นคำ ซึ่งอาจจะถูกในบริบทอื่น ๆ
  - "ยาวนาน" model predict น. ใน "นาน" เป็นตัวต้นคำ ซึ่งอาจจะถูกในบริบทอื่น ๆ




# Tensorboard #
The code provided also have Tensorboard (a visualization tool that comes with Tensorflow). Note the part that calls it `TensorBoard(log_dir='./Graph/' + graph_name, histogram_freq=1, write_graph=True, write_grads=True)`. This tells Tensorflow to write extra outputs to the `log_dir` which can then be used for visualization.

To start tensorboard do
```
tensorboard --logdir=/full_path_to_your_logs
```

In Tensorboard, you will be able to debug your computation graph which can be hard to keep track in code. This is might seem trivial in Keras, but it is very helpful for Tensorflow. You can see a visualization of the computation graph at the `GRAPH` tab. If you see multiple dense layers (more than 4), this is caused by running the code several times without deleting the log dir. Delete the log dir and re-run the code.

Next, let's look at the scalars tab, we can see the loss and accuracy on the training and validation set as they change over each epoch. This can be useful to detect overfitting.

Another useful tab is the histograms tab (Enable by setting 'write_grads=True'). This plot histograms of the weights, biases, and outputs of each layer. The depth of the histograms show the change over epochs. We can see how the histograms of weights change over the training peroid. This can be used to debug vanishing gradients or getting stuck in local minimas.

There are other useful tabs in Tensorboard, you can read about them in the Keras [documentation](https://keras.io/callbacks/#tensorboard) for tensorboard.

In [None]:
%load_ext tensorboard
%tensorboard --logdir='./Graph/ff'

In [None]:
model_feedforward_nn.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 21)]              0         
_________________________________________________________________
dense (Dense)                (None, 100)               2200      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 22,501
Trainable params: 22,501
Non-trainable params: 0
_________________________________________________________________


# Tensorboard observation

Write your own interpretation of the logs from this example. A simple sentence or two for each tab is sufficient.

**Your answer**: TODO#3
- Tab "Graph" : แสดงแต่ละ layer ของ model โดยประกอบไปด้วย input_1, dense, dense_1, dense_2, dense_3
- Tab "Scalars" : แต่ละ epoch ที่มากขึ้น ทั้ง train แลำ validation set มี loss ที่น้อยลง แปลว่ายังไม่เกิดการ overfit. เพราะุถ้า overfit ในแต่ละ epoch ที่เพิ่มขึ้น loss ใน train จะลดลงจริง แต่ validation จะเพิ่มขึ้น
- Tab "histograms" : Kernel แต่ละ ephoch ดูไม่ต่างกันมาก แต่ Bias มีการออกห่างจาก 0 มากขึ้น

# Dropout

You might notice that the 3-layered feedforward does not use dropout at all. Now, try adding dropout to the model, run, and report the result again.

In [None]:
# TODO#4
# Write a function that return feedforward model with dropout

from tensorflow.keras.layers import Dropout
def get_nn_with_dropout():
  input1 = Input(shape=(21,))
  x = Dense(100, activation='relu')(input1)
  x = Dense(100, activation='relu')(x)
  x = Dense(100, activation='relu')(x)
  x = Dropout(0.2)(x)
  out = Dense(1, activation='sigmoid')(x)

  model = Model(inputs=input1, outputs=out)
  model.compile(optimizer=Adam(),
                loss='binary_crossentropy',
                metrics=['acc'])
  return model

In [None]:
# Train your model
print('start training')
verbose = 2
model_nn_with_dropout = get_nn_with_dropout()
# TODO#5
# Complete the code to train your model with dropout
################################################################################
#                            WRITE YOUR CODE BELOW                             #
################################################################################
train_params = [(3, 512)]

t.tic()
for (epochs, batch_size) in train_params:
  print("train with {} epochs and {} batch size".format(epochs, batch_size))
  model_nn_with_dropout.fit(x_train_char, y_train, epochs=epochs, batch_size=batch_size, verbose=verbose,
                           callbacks=callbacks_list_feedforward_nn,
                           validation_data=(x_val_char, y_val))

t.toc()

start training
train with 3 epochs and 512 batch size
Epoch 1/3
32152/32152 - 89s - loss: 0.3611 - acc: 0.8435 - val_loss: 0.3015 - val_acc: 0.8673

Epoch 00001: val_loss did not improve from 0.25058
Epoch 2/3
32152/32152 - 88s - loss: 0.2933 - acc: 0.8742 - val_loss: 0.2643 - val_acc: 0.8880

Epoch 00002: val_loss did not improve from 0.25058
Epoch 3/3
32152/32152 - 88s - loss: 0.2692 - acc: 0.8865 - val_loss: 0.2526 - val_acc: 0.8953

Epoch 00003: val_loss did not improve from 0.25058
Elapsed time is 265.838696 seconds.


In [None]:
f1score2, precision2, recall2, y_pred2 = evaluate(x_test_char, y_test, model_nn_with_dropout)
print('F1 : ', f1score2)
print('Precision : ', precision2)
print('Recall : ', recall2)

F1 :  0.8074659949204719
Precision :  0.8491825850529838
Recall :  0.7696561855792242


# Convolution Neural Networks

Now, you are going to implement you own 1d-convolution neural networks with the following structure:
input -> embedding layer (size 32) -> 1D-convolution layer (100 filters of size 5, strides of 1) -> TimeDistributed (Dense size 5) -> fully-connected layer (size 100) -> output.

These parameters are simple guidelines to save your time. You can play with them in the final section.

The results should be better than the feedforward model.

Embedding layers turn the input from a one-hot vector into better representations via some feature transform (a simple matrix multiply in this case). TimeDistributed is Keras' way of specifying that the layer of the network should be distributed along time (the first dimension) as shown in the picture below.

<img src="https://raw.githubusercontent.com/ekapolc/nlp_course/master/HW1/images/configuration.png">

Note you need to flatten() before the final fully connected layer because of dimension mis-match.

Do consult keras documentation on how to use [embedding layers](https://keras.io/layers/embeddings/) and [1D-cnn](https://keras.io/layers/convolutional/).


In [None]:
len(CHARS)

178

In [None]:
################################################################################
# TODO#6:                                                                      #
# Write a function that returns keras convolution nueral network model.        #
# You can choose any normalization methods, activation function, as well as    #
# any hyperparameter the way you want. Your goal is to predict a score         #
# between [0,1] for each input whether it is the beginning of the word or not. #
#                                                                              #
# Hint: You should read keras documentation to see the list of available       #
# layers and options you can use.                                              #
################################################################################

from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Flatten

# input -> embedding layer (size 32) -> 1D-convolution layer (100 filters of size 5, strides of 1) -> 
# TimeDistributed (Dense size 5) -> fully-connected layer (size 100) -> output.
def get_conv1d_nn():
  input1 = Input(shape=(21,))
  x = Embedding(len(CHARS), 32, input_length=21)(input1)
  x = Conv1D(filters=100, kernel_size=5, strides=1, padding='same', activation='relu')(x)
  x = TimeDistributed(Dense(5))(x)
  x = Flatten(data_format='channels_last')(x)
  x = Dense(100, activation='relu')(x)
  out = Dense(1, activation='sigmoid')(x)

  model = Model(inputs=input1, outputs=out)
  model.compile(optimizer=Adam(),
                loss='binary_crossentropy',
                metrics=['acc'])

  return model

In [None]:
################################################################################
# TODO#7:                                                                      #
# Write code that call model.fit, or model.fit_generator if you have data      #
# generator, to train you models. Make sure you have validation_data as an     # 
# argument and use verbose=2 to generate one log line per epoch. Select your   #
# batch size carefully as it will affect your model's ability to converge and  #
# time needed for one epoch.                                                   #
################################################################################
print('start training conv1d')
model_conv1d_nn = get_conv1d_nn()
################################################################################
#                            WRITE YOUR CODE BELOW                             #
################################################################################

t.tic()
train_params = [(3, 512)]
for (epochs, batch_size) in train_params:
  print("train with {} epochs and {} batch size".format(epochs, batch_size))
  model_conv1d_nn.fit(x_train_char, y_train, epochs=epochs, batch_size=batch_size, verbose=verbose,
                           callbacks=callbacks_list_feedforward_nn,
                           validation_data=(x_val_char, y_val))
t.toc()

start training conv1d
train with 3 epochs and 512 batch size
Epoch 1/3
32152/32152 - 174s - loss: 0.0778 - acc: 0.9703 - val_loss: 0.0661 - val_acc: 0.9756

Epoch 00001: val_loss improved from 0.25058 to 0.06606, saving model to ./model_weight_feedforward_nn.h5
Epoch 2/3
32152/32152 - 163s - loss: 0.0581 - acc: 0.9786 - val_loss: 0.0622 - val_acc: 0.9776

Epoch 00002: val_loss improved from 0.06606 to 0.06219, saving model to ./model_weight_feedforward_nn.h5
Epoch 3/3
32152/32152 - 162s - loss: 0.0537 - acc: 0.9804 - val_loss: 0.0584 - val_acc: 0.9787

Epoch 00003: val_loss improved from 0.06219 to 0.05840, saving model to ./model_weight_feedforward_nn.h5
Elapsed time is 499.972907 seconds.


In [None]:
f1score3, precision3, recall3, y_pred3 = evaluate(x_test_char, y_test, model_conv1d_nn)
print('F1 : ', f1score3)
print('Precision : ', precision3)
print('Recall : ', recall3)

F1 :  0.9655679777105581
Precision :  0.9568474770820085
Recall :  0.9744488938385107


In [None]:
model_conv1d_nn.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 21)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 21, 32)            5696      
_________________________________________________________________
conv1d (Conv1D)              (None, 21, 100)           16100     
_________________________________________________________________
time_distributed (TimeDistri (None, 21, 5)             505       
_________________________________________________________________
flatten (Flatten)            (None, 105)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 100)               10600     
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 101 

# GRU

Implement your GRU model with the following structure: input -> embedding layer (size 32) -> GRU layer (size 32) -> fully-connected layer (size 100) -> output.


These parameters are simple guidelines to save your time. You can play with them in the final section.

The result should be better than the feedforward model and at least on par with your CNN model.

Do consult keras documentation on how to use [embedding layers](https://keras.io/layers/embeddings/) and [GRUs](https://keras.io/layers/recurrent/).


In [None]:
################################################################################
# TODO#8                                                                       #
# Write a function that returns keras GRU network moded. You can choose any    #
# normalization methods, activation function, as well as any hyperparameter    #
# the way you want. Your goal is to predict a score between [0,1] for each     #
# input whether it is the beginning of the word or not.                        #
#                                                                              #
# Hint: You should read keras documentation to see the list of available       #
# layers and options you can use.                                              #
################################################################################

from tensorflow.keras.layers import GRU

# input -> embedding layer (size 32) -> GRU layer (size 32) -> fully-connected layer (size 100) -> output.
def get_gru():
  input1 = Input(shape=(21,))
  x = Embedding(len(CHARS), 32)(input1)
  x = GRU(32)(x)
  x = Dense(100, activation='relu')(x)
  out = Dense(1, activation='sigmoid')(x)

  model = Model(inputs=input1, outputs=out)
  model.compile(optimizer=Adam(),
                loss='binary_crossentropy',
                metrics=['acc'])

  return model

In [None]:
################################################################################
# TODO#9                                                                       #
# Write code that call model.fit, or model.fit_generator if you have data      #
# generator, to train you models. Make sure you have validation_data as an     # 
# argument and use verbose=2 to generate one log line per epoch. Select your   #
# batch size carefully as it will affect your model's ability to converge and  #
# time needed for one epoch.                                                   #
################################################################################
print('start training conv1d')
model_gru = get_gru()
################################################################################
#                            WRITE YOUR CODE BELOW                             #
################################################################################

t.tic()
train_params = [(3, 512)]
for (epochs, batch_size) in train_params:
  print("train with {} epochs and {} batch size".format(epochs, batch_size))
  model_gru.fit(x_train_char, y_train, epochs=epochs, batch_size=batch_size, verbose=verbose,
                           callbacks=callbacks_list_feedforward_nn,
                           validation_data=(x_val_char, y_val))

t.toc()

start training conv1d
train with 3 epochs and 512 batch size
Epoch 1/3
32152/32152 - 203s - loss: 0.1217 - acc: 0.9512 - val_loss: 0.0899 - val_acc: 0.9662

Epoch 00001: val_loss did not improve from 0.05840
Epoch 2/3
32152/32152 - 200s - loss: 0.0774 - acc: 0.9711 - val_loss: 0.0739 - val_acc: 0.9731

Epoch 00002: val_loss did not improve from 0.05840
Epoch 3/3
32152/32152 - 201s - loss: 0.0687 - acc: 0.9748 - val_loss: 0.0699 - val_acc: 0.9751

Epoch 00003: val_loss did not improve from 0.05840
Elapsed time is 605.106170 seconds.


In [None]:
f1score4, precision4, recall4, y_pred4 = evaluate(x_test_char, y_test, model_gru)
print('F1 : ', f1score4)
print('Precision : ', precision4)
print('Recall : ', recall4)

F1 :  0.9589290172158895
Precision :  0.9444180624446616
Recall :  0.9738928512191076


# Final Section
# Keras playground

Now, train the best model you can do for this task. You can use any model structure and function available. Remember that trainig time increases with the complexity of the model. You might find TensorBoard helpful in tuning of complicated models.

Your model should be better than your CNN or GRU model in the previous sections.

Some ideas to try
1. Tune the parameters
2. Bi-directional GRU
3. CNN-GRU model

In [None]:
################################################################################
# TODO#10                                                                      #
# Write a function that returns keras your best model. You can use anything    #
# you want. The goal here is to create the best model you can think of.        #
# Your model should get f-score more than 97% from calling evaluate().         #
#                                                                              #
# Hint: You should read keras documentation to see the list of available       #
# layers and options you can use.                                              #
################################################################################

from tensorflow.keras.layers import Bidirectional

def get_my_best():
  input1 = Input(shape=(21,))
  x = Embedding(len(CHARS), 32, input_length=21)(input1)
  x = Conv1D(filters=100, kernel_size=5, strides=1, padding='same', activation='relu')(x)
  x = Bidirectional(GRU(32))(x)
  x = Dense(100, activation='relu')(x)
  out = Dense(1, activation='sigmoid')(x)

  model = Model(inputs=input1, outputs=out)
  model.compile(optimizer=Adam(),
                loss='binary_crossentropy',
                metrics=['acc'])
  return model

In [None]:
################################################################################
# TODO#11                                                                      #
# Write code that call model.fit, or model.fit_generator if you have data      #
# generator, to train you models. Make sure you have validation_data as an     # 
# argument and use verbose=2 to generate one log line per epoch. Select your   #
# batch size carefully as it will affect your model's ability to converge and  #
# time needed for one epoch.                                                   #
#                                                                              #
# Hint: Read about callbacks_list argument on the documentation. You might     #
# find  ReduceLROnPlateau() and ModelCheckpoint() useful for your training     #
# process. Feel free to use any other callback function available.             #
################################################################################
print('start training conv1d')
my_best_model = get_my_best_model()
################################################################################
#                            WRITE YOUR CODE BELOW                             #
################################################################################

print('start training my best model')
my_best_model = get_my_best_model()

t.tic()
train_params = [(3, 512)]
for (epochs, batch_size) in train_params:
  print("train with {} epochs and {} batch size".format(epochs, batch_size))
  my_best_model.fit(x_train_char, y_train, epochs=epochs, batch_size=batch_size, verbose=verbose,
                           callbacks=callbacks_list_feedforward_nn,
                           validation_data=(x_val_char, y_val))

t.toc()

start training conv1d
start training my best model
train with 3 epochs and 512 batch size
Epoch 1/3
32152/32152 - 321s - loss: 0.0834 - acc: 0.9675 - val_loss: 0.0604 - val_acc: 0.9781

Epoch 00001: val_loss did not improve from 0.05083
Epoch 2/3
32152/32152 - 315s - loss: 0.0513 - acc: 0.9813 - val_loss: 0.0539 - val_acc: 0.9809

Epoch 00002: val_loss did not improve from 0.05083
Epoch 3/3
32152/32152 - 314s - loss: 0.0454 - acc: 0.9838 - val_loss: 0.0507 - val_acc: 0.9825

Epoch 00003: val_loss improved from 0.05083 to 0.05066, saving model to ./model_weight_feedforward_nn.h5
Elapsed time is 950.980801 seconds.


In [None]:
f1score5, precision5, recall5, y_pred5 = evaluate(x_test_char, y_test, my_best_model)
print('F1 : ', f1score5)
print('Precision : ', precision5)
print('Recall : ', recall5)

F1 :  0.9721200723077491
Precision :  0.9661223738365
Recall :  0.9781927035202869
