<a href="https://colab.research.google.com/github/xixilili/MSDS_458_Public/blob/master/MSDS458_Assignment_03/MSDS458_Assignment_03_Experiment_B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Experiment B: RNN
Try several experiments by tweaking (i) the architecture and (ii) bidirectional vs. unidirectional layers and other hyperparameters, including regularization.

### Load Package

In [None]:
import datetime
from packaging import version
from collections import Counter
import numpy as np
import pandas as pd
import time

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

### Load and Process Data

In [None]:
# register  ag_news_subset so that tfds.load doesn't generate a checksum (mismatch) error
!python -m tensorflow_datasets.scripts.download_and_prepare --register_checksums --datasets=ag_news_subset

# Example Approaches to Split Data Set
# dataset, info = tfds.load('ag_news_subset', with_info=True,  split=['train[:]','test[:1000]', 'test[1000:]'],
dataset, info = tfds.load('ag_news_subset', with_info=True,  split=['train[:95%]','train[95%:]', 'test[:]'],
# dataset, info = tfds.load('ag_news_subset', with_info=True,  split=['train[:114000]','train[114000:]', 'test[:]'],
                          as_supervised=True)
train_dataset, validation_dataset, test_dataset = dataset
# train_dataset, test_dataset = dataset['train'],dataset['test']

2022-02-18 21:05:47.986702: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
I0218 21:05:47.987008 139701657638784 download_and_prepare.py:200] Running download_and_prepare for dataset(s):
ag_news_subset
I0218 21:05:47.988306 139701657638784 dataset_info.py:361] Load dataset info from /root/tensorflow_datasets/ag_news_subset/1.0.0
I0218 21:05:47.989976 139701657638784 download_and_prepare.py:138] download_and_prepare for dataset ag_news_subset/1.0.0...
I0218 21:05:47.990235 139701657638784 dataset_builder.py:299] Reusing dataset ag_news_subset (/root/tensorflow_datasets/ag_news_subset/1.0.0)
[1mname: "ag_news_subset"
description: "AG is a collection of more than 1 million news articles.\nNews articles have been gathered from more than 2000  news sources by ComeToMyHead in more than 1 year of activity.\nComeToMyHead is an academic news search engine which has been running since July, 2004.\nThe dataset

In [None]:
BUFFER_SIZE = 10000  # shuffle buffer size
BATCH_SIZE = 64

# NOTE: this cell rebinds the dataset names, so re-running it without reloading
# the data would batch the already-batched datasets a second time.

# Training and validation data are shuffled before batching; test data keeps its order.
train_dataset = (
    train_dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
validation_dataset = (
    validation_dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
test_dataset = (
    test_dataset
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

### Explore Vocab

In [None]:
def explore_vocab(train_dataset,encoder):
  """Encode every document of ``train_dataset`` and collect token statistics.

  Works on both unbatched and batched datasets. Previously a batched dataset
  produced one "document" per batch: ``doc_sizes`` held batch sizes and
  ``corpus`` held whole padded rows instead of individual tokens.

  Args:
    train_dataset: object whose ``as_numpy_iterator()`` yields ``(text, label)``
      pairs; ``text`` is either a single document or a batch of documents.
    encoder: callable turning ``text`` into an object whose ``.numpy()`` returns
      a 1-D array of token ids (single document) or a 2-D
      ``(documents, tokens)`` array (batch).

  Returns:
    ``(corpus, doc_sizes)``: the flat list of token ids over all documents, and
    the list of per-document token counts.
  """
  doc_sizes = []
  corpus = []
  for example, _ in train_dataset.as_numpy_iterator():
    token_ids = encoder(example).numpy()
    if token_ids.ndim == 1:
      # Single document: use the encoded ids as they are.
      documents = [token_ids]
    else:
      # Batch: rows are padded to a common length. Drop the padding so sizes
      # reflect real tokens. Assumes id 0 is the padding id (the
      # TextVectorization convention) -- confirm if another encoder is used.
      documents = [row[row != 0] for row in token_ids]
    for tokens in documents:
      doc_sizes.append(len(tokens))
      corpus += list(tokens)
  return  corpus, doc_sizes

### Compile, Train Model

In [None]:
def namestr(obj, namespace):
  """Return the first key of ``namespace`` bound to ``obj`` itself.

  Matching is by identity (``is``), not equality. Returns ``None`` when no key
  of ``namespace`` refers to ``obj``.
  """
  return next((key for key in namespace if namespace[key] is obj), None)

In [None]:
import datetime as dt

def compile_train_model(model, epoch):
  """Compile, train and evaluate ``model``, then plot its learning curves.

  Reads the notebook-level ``train_dataset``, ``validation_dataset`` and
  ``test_dataset``. Training stops early when ``val_accuracy`` has not improved
  for 2 consecutive epochs.

  Args:
    model: Keras model to compile and fit (compiled in place).
    epoch: maximum number of training epochs.

  Returns:
    A one-row DataFrame holding the last epoch's training/validation metrics
    plus ``test_loss``, ``test_accuracy``, ``process_time`` (seconds spent in
    fit + evaluate), ``epochs_setting`` (requested) and ``epochs_actual``
    (epochs actually run).
  """
  #compile model
  model.compile(optimizer=tf.keras.optimizers.Adam(1e-4)
              ,loss=tf.keras.losses.SparseCategoricalCrossentropy() # with from_logits=True the last layer would not need a softmax activation
              ,metrics=['accuracy'])

  start_datetime = dt.datetime.now()

  #train model
  history = model.fit(train_dataset
                    ,epochs = epoch
                    ,validation_data=validation_dataset
                    ,callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)]
                    )

  #evaluate model
  loss, accuracy = model.evaluate(test_dataset)
  print('test set accuracy: ', accuracy * 100)

  # Measured after evaluation, so the runtime covers training and testing.
  runtime = (dt.datetime.now() - start_datetime).total_seconds()

  #training and validation performance metrics, one row per epoch
  history_df = pd.DataFrame(history.history)

  #loss and accuracy for training and validation data
  losses = history.history['loss']
  accs = history.history['accuracy']
  val_losses = history.history['val_loss']
  val_accs = history.history['val_accuracy']
  epochs = len(losses)

  # Copy the last-epoch row: assigning new columns to the bare tail(1) slice
  # raises SettingWithCopyWarning.
  result = history_df.tail(1).copy()
  result['test_loss'] = loss
  result['test_accuracy'] = accuracy
  result['process_time'] = runtime
  result['epochs_setting'] = epoch
  result['epochs_actual'] = epochs

  plt.figure(figsize=(16, 4))
  for i, metrics in enumerate(zip([losses, accs], [val_losses, val_accs], ['Loss', 'Accuracy'])):
      plt.subplot(1, 2, i + 1)
      plt.plot(range(epochs), metrics[0], label='Training {}'.format(metrics[2]))
      plt.plot(range(epochs), metrics[1], label='Validation {}'.format(metrics[2]))
      plt.xlabel('Epoch')
      plt.legend()
      # The title uses the notebook variable name bound to this model object.
      plt.title('{0} with {1} epochs'.format(namestr(model, globals()), epoch))
  plt.show()

  return result

## Simple RNN

### Encoder

In [None]:
VOCAB_SIZE = 1000

# Text-to-token-id layer, capped at VOCAB_SIZE tokens.
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE,
)
# Build the vocabulary from the text only; the labels are dropped.
encoder.adapt(train_dataset.map(lambda text, label: text))
vocab = np.array(encoder.get_vocabulary())

In [None]:
# Encode the training data and gather the token list and per-item sizes.
corpus, doc_sizes = explore_vocab(train_dataset=train_dataset, encoder=encoder)

In [None]:
# Summary statistics of the encoder vocabulary and the encoded training corpus.
vocab = np.array(encoder.get_vocabulary())
num_vocab_words_in_corpus = len(vocab)

num_words, num_articles = len(corpus), len(doc_sizes)
min_token_in_a_article, max_token_in_a_article = min(doc_sizes), max(doc_sizes)

# One value per line, in the order: vocabulary size, words, articles, min, max.
print(
    num_vocab_words_in_corpus,
    num_words,
    num_articles,
    min_token_in_a_article,
    max_token_in_a_article,
    sep='\n',
)

### Create Model

In [None]:
# Single-layer, unidirectional simple RNN classifier.
# tf.keras.layers.RNN is the generic wrapper that expects a cell instance and
# has no `units` argument, so the concrete SimpleRNN layer is used instead.
simpleRNN = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=64, mask_zero=True),
    tf.keras.layers.SimpleRNN(units=64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(4, activation='softmax'),  # num_classes = 4
])


In [None]:
# Two stacked unidirectional simple RNN layers.
# tf.keras.layers.RNN is the generic wrapper that expects a cell instance and
# has no `units` argument, so the concrete SimpleRNN layer is used instead.
Multiplelayer_simpleRNN = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=64, mask_zero=True),
    # The first layer returns the full sequence so the second one has a sequence to consume.
    tf.keras.layers.SimpleRNN(units=64, return_sequences=True),
    tf.keras.layers.SimpleRNN(units=64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(4, activation='softmax'),  # num_classes = 4
])

In [None]:
# Single bidirectional simple RNN layer.
# tf.keras.layers.RNN(64) would treat 64 as the cell object and fail; the
# concrete SimpleRNN layer takes the number of units directly.
BidirectionalRNN = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=64, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(4, activation='softmax'),  # num_classes = 4
])

In [None]:
# Two stacked bidirectional simple RNN layers (64 then 32 units per direction).
# tf.keras.layers.RNN(64) would treat 64 as the cell object and fail; the
# concrete SimpleRNN layer takes the number of units directly.
Multiplelayer_BidirectionalRNN = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=64, mask_zero=True),
    # The first layer returns the full sequence so the second one has a sequence to consume.
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(4, activation='softmax'),  # num_classes = 4
])

In [None]:
# Two stacked bidirectional simple RNN layers with recurrent dropout on the
# first one and a Dropout layer before the output.
# tf.keras.layers.RNN is the generic cell wrapper and accepts neither a unit
# count nor `recurrent_dropout`; the concrete SimpleRNN layer accepts both.
# The closing `])` was also missing from this cell.
Multiplelayer_BidirectionalRNN_Dropout = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=64, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(64, return_sequences=True, recurrent_dropout=0.25)),
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(4, activation='softmax'),  # num_classes = 4
])


- **Recurrent dropout** — a variant of dropout used to fight overfitting in recurrent layers.
- **Stacking recurrent layers** — increases the representational power of the model (at the cost of a higher computational load).
- **Bidirectional recurrent layers** — present the same information to a recurrent network in different ways, increasing accuracy and mitigating forgetting issues.

### Simple RNN

In [None]:
# Train and evaluate the single-layer model for at most 1 epoch.
result_simpleRNN = compile_train_model(model=simpleRNN, epoch=1)

In [None]:
# Attach the corpus statistics to the one-row result, then put the model label first.
result_simpleRNN = result_simpleRNN.assign(
    vocab_size=VOCAB_SIZE,
    num_words=num_words,
    num_articles=num_articles,
    min_token_in_a_article=min_token_in_a_article,
    max_token_in_a_article=max_token_in_a_article,
)
new_col = ['SimpleRNN']
result_simpleRNN.insert(0, 'Model', new_col)

In [None]:
# Start the results table from the previously saved results.
# NOTE(review): unpickling can execute arbitrary code -- only load 'resultsAa.pkl'
# if it comes from a trusted source (presumably an earlier experiment notebook; confirm).
prev_results_df = pd.read_pickle('resultsAa.pkl')
# DataFrame.append is deprecated (pandas 1.4) and removed in pandas 2.0; pd.concat is the equivalent.
assignment_result_table = pd.concat([prev_results_df, result_simpleRNN], ignore_index=True)
assignment_result_table

### Multiplelayer RNN

In [None]:
# Train and evaluate the stacked unidirectional model for at most 1 epoch.
result_msimpleRNN = compile_train_model(model=Multiplelayer_simpleRNN, epoch=1)

In [None]:
# Attach the corpus statistics to the one-row result, then put the model label first.
result_msimpleRNN = result_msimpleRNN.assign(
    vocab_size=VOCAB_SIZE,
    num_words=num_words,
    num_articles=num_articles,
    min_token_in_a_article=min_token_in_a_article,
    max_token_in_a_article=max_token_in_a_article,
)
new_col = ['Multiplelayer RNN']
result_msimpleRNN.insert(0, 'Model', new_col)

In [None]:
# DataFrame.append is deprecated (pandas 1.4) and removed in pandas 2.0; pd.concat is the equivalent.
# Re-running this cell adds the same row again.
assignment_result_table = pd.concat([assignment_result_table, result_msimpleRNN], ignore_index=True)
assignment_result_table

### Bidirectional RNN

In [None]:
# Train and evaluate the bidirectional model for at most 1 epoch.
result_BiRNN = compile_train_model(model=BidirectionalRNN, epoch=1)

In [None]:
# Attach the corpus statistics to the one-row result, then put the model label first.
result_BiRNN = result_BiRNN.assign(
    vocab_size=VOCAB_SIZE,
    num_words=num_words,
    num_articles=num_articles,
    min_token_in_a_article=min_token_in_a_article,
    max_token_in_a_article=max_token_in_a_article,
)
new_col = ['Bidirectional RNN']
result_BiRNN.insert(0, 'Model', new_col)

In [None]:
# DataFrame.append is deprecated (pandas 1.4) and removed in pandas 2.0; pd.concat is the equivalent.
# Re-running this cell adds the same row again.
assignment_result_table = pd.concat([assignment_result_table, result_BiRNN], ignore_index=True)
assignment_result_table

### Multiplelayer Bidirectional RNN

In [None]:
# Train and evaluate the stacked bidirectional model for at most 1 epoch.
result_MBiRNN = compile_train_model(model=Multiplelayer_BidirectionalRNN, epoch=1)

In [None]:
# Attach the corpus statistics to the one-row result, then put the model label first.
result_MBiRNN = result_MBiRNN.assign(
    vocab_size=VOCAB_SIZE,
    num_words=num_words,
    num_articles=num_articles,
    min_token_in_a_article=min_token_in_a_article,
    max_token_in_a_article=max_token_in_a_article,
)
new_col = ['Multiplelayer Bidirectional RNN']
result_MBiRNN.insert(0, 'Model', new_col)

In [None]:
# DataFrame.append is deprecated (pandas 1.4) and removed in pandas 2.0; pd.concat is the equivalent.
# Re-running this cell adds the same row again.
assignment_result_table = pd.concat([assignment_result_table, result_MBiRNN], ignore_index=True)
assignment_result_table

### Multiplelayer Bidirectional RNN Dropout

In [None]:
# Train and evaluate the stacked bidirectional model with dropout for at most 1 epoch.
result_MBiDRNN = compile_train_model(model=Multiplelayer_BidirectionalRNN_Dropout, epoch=1)

In [None]:
# Attach the corpus statistics to the one-row result, then put the model label first.
result_MBiDRNN = result_MBiDRNN.assign(
    vocab_size=VOCAB_SIZE,
    num_words=num_words,
    num_articles=num_articles,
    min_token_in_a_article=min_token_in_a_article,
    max_token_in_a_article=max_token_in_a_article,
)
new_col = ['Multiplelayer Bidirectional RNN with Dropout']
result_MBiDRNN.insert(0, 'Model', new_col)

In [None]:
# DataFrame.append is deprecated (pandas 1.4) and removed in pandas 2.0; pd.concat is the equivalent.
# Re-running this cell adds the same row again.
assignment_result_table = pd.concat([assignment_result_table, result_MBiDRNN], ignore_index=True)
assignment_result_table