In [1]:
import os
import random
import time

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import seaborn as sns

# Seed value used as the default for the seeding helpers below and as the
# random_state of the train/test split.
SEED = 0

def set_seeds(seed=SEED):
    """Seed every random number generator this notebook relies on.

    Sets the PYTHONHASHSEED environment variable and seeds Python's
    `random` module, TensorFlow and NumPy's legacy global generator.

    Args:
        seed: Integer seed applied to all generators. Defaults to SEED.
    """
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment only influences processes spawned afterwards, not the hash
    # randomization of the kernel that is already running.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)

def set_global_determinism(seed=SEED):
    """Seed all RNGs and configure TensorFlow for repeatable execution.

    Besides calling `set_seeds`, this sets the `TF_DETERMINISTIC_OPS` and
    `TF_CUDNN_DETERMINISTIC` environment flags to '1' and limits
    TensorFlow's inter-op and intra-op thread pools to one thread each.

    Args:
        seed: Integer seed forwarded to `set_seeds`. Defaults to SEED.
    """
    set_seeds(seed=seed)

    # NOTE(review): TensorFlow >= 2.8 provides
    # tf.config.experimental.enable_op_determinism() as the supported
    # replacement for these environment flags; consider switching.
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

    # Single-threaded pools; presumably chosen to remove run-to-run variation
    # caused by parallel scheduling, at the cost of slower execution.
    tf.config.threading.set_inter_op_parallelism_threads(1)
    tf.config.threading.set_intra_op_parallelism_threads(1)

# Fix all seeds and determinism settings up front so the notebook is reproducible,
# then report which TensorFlow build is in use.
set_global_determinism(SEED)
print(f'Tensorflow Version: {tf.__version__}')

2022-11-03 20:59:17.546841: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-03 20:59:17.754169: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-03 20:59:18.540162: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-11-03 20:59:18.540233: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

Tensorflow Version: 2.10.0


In [2]:
# Importing dataset
# The .npz archive is opened in a context manager so its underlying file
# handle is closed deterministically, even if an expected key is missing.
# Indexing the archive reads each array fully into memory, so X and Y stay
# valid after the archive is closed.
with np.load('char_normalized_dataset.npz') as dataset:
    X = dataset['x']
    Y = dataset['y']
del dataset  # do not leave the closed archive object in the notebook namespace

print(X.shape)
print(Y.shape)

# Every sample must have exactly one label; fail here rather than in a later cell.
assert X.shape[0] == Y.shape[0], (
    f'Sample/label count mismatch: {X.shape[0]} samples vs {Y.shape[0]} labels'
)

(230, 14, 1536)
(230,)


In [3]:
# The goal of this work is a model that is robust across all users, so the
# dataset is split at random with scikit-learn's train_test_split (80% train,
# 20% test, no stratification).
# Future work can test whether different splitting techniques improve the
# model's performance, e.g. splitting according to subjects/labels.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=SEED,
)

print(f'''
X_train shape:{X_train.shape} -> Train Labels: {y_train.shape[0]}
X_test shape:{X_test.shape} -> Test Labels: {y_test.shape[0]}''')


X_train shape:(184, 14, 1536) -> Train Labels: 184
X_test shape:(46, 14, 1536) -> Test Labels: 46


In [4]:
# Verifying label distribution of train and test samples
def _label_counts(labels):
    """Return the number of samples per label value, indexed by label."""
    frame = pd.DataFrame(labels, columns=['label'])
    return frame.groupby(['label'])['label'].count()


display('Train Label distribution', _label_counts(y_train))
display('Test Label distribution', _label_counts(y_test))

'Train Label distribution'

label
0.0    19
1.0    17
2.0    19
3.0    19
4.0    20
5.0    18
6.0    18
7.0    19
8.0    17
9.0    18
Name: label, dtype: int64

'Test Label distribution'

label
0.0    4
1.0    6
2.0    4
3.0    4
4.0    3
5.0    5
6.0    5
7.0    4
8.0    6
9.0    5
Name: label, dtype: int64

In [5]:
# Creating our Transformer from Scratch
## Based on https://github.com/Kyubyong/transformer

# Short alias for tf.data.AUTOTUNE.
AUTOTUNE = tf.data.AUTOTUNE
# NOTE(review): presumably the mini-batch size for the input pipeline/training —
# confirm where it is consumed.
batch_size = 32

# Project-local helper modules (_modules, _utils), followed by progress-bar and
# logging utilities.
from _modules import get_token_embeddings, ff, positional_encoding, multihead_attention, label_smoothing, noam_scheme
from _utils import convert_idx_to_token_tensor
from tqdm import tqdm
import logging

# Configure the root logger at INFO level. basicConfig does nothing if the root
# logger already has handlers configured.
logging.basicConfig(level=logging.INFO)