# Experiments

In [1]:
# Attach Google Drive to the Colab runtime; the project directory and the
# CSV files used below are read from under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
sys.path.append('/content/drive/MyDrive/encrypted_text_proj')
!pip install wandb



In [3]:
import os

import wandb
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import random as python_random
from wandb.keras import WandbCallback
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

from src.datapipeline import Datapipeline
from src.conv1dmodel import Conv1DModel
from src.bilstmmodel import BiLSTMModel
from src.postprocess import add_prediction_to_test_data

%load_ext autoreload
%autoreload 2

# Set the random seeds
os.environ['TF_CUDNN_DETERMINISTIC'] = '1' 
python_random.seed(hash("setting random seeds") % 2**32 - 1)
np.random.seed(hash("improves reproducibility") % 2**32 - 1)
tf.random.set_seed(hash("by removing stochasticity") % 2**32 - 1)

In [4]:
# Authenticate this runtime with Weights & Biases via the wandb CLI
# (prompts for an API key when none is stored yet).
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
# --- Locations of the raw data and preprocessing settings ---
train_path = '/content/drive/MyDrive/encrypted_text_proj/data/train.csv'
test_path = '/content/drive/MyDrive/encrypted_text_proj/data/test.csv'
max_sequence_len = 250
label_mapping = {"class_1": 0, "class_2": 1, "class_3": 2}

# --- Read both CSV files into DataFrames ---
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

# --- Hold out a shuffled quarter of the training rows for validation ---
test_size = 0.25
random_state = 0
train_data, val_data = train_test_split(
    train_df,
    shuffle=True,
    random_state=random_state,
    test_size=test_size,
)

# --- Turn the DataFrames into model inputs and labels ---
# The training split is transformed first; the validation split then goes
# through the test-data path with `is_validation=True`.
datapipeline = Datapipeline(max_sequence_len, label_mapping)
X_train, y_train = datapipeline.transform_train_data(train_data)
X_val, y_val = datapipeline.transform_test_data(val_data, is_validation=True)

# 1D CNN

In [6]:
# Hyperparameters of the 1D-CNN experiment; the whole dict is logged to W&B.
# NOTE: the 'optimizer' entry is only logged - the code below always builds Adam.
config = {
    'optimizer': 'adam',
    'learning_rate': 0.002,
    'loss': 'sparse_categorical_crossentropy',
    'batch_size': 32,
    'epochs': 50,
    'model_params': {
        'input_size': max_sequence_len,
        'num_classes': 3,
        'embedding_size': 50,
        'dropout_p': 0.5,
        'vocab_size': 38,
        'batch_normalisation': True,
        'conv_layers': [[32, 3, -1], [32, 3, -1], [32, 3, -1]],
        'dense_layers': [1024],
    },
}
run = wandb.init(project='encrypt_text_1dcnn', config=config)
# From here on, read the settings back from W&B so the logged values and the
# values actually used cannot drift apart.
config = wandb.config

# Reset Keras' global state so layer names and graph state from a previous
# run of this (or another) training cell do not leak into this model.
tf.keras.backend.clear_session()

model = Conv1DModel(**config['model_params']).build_model()
optimizer = Adam(learning_rate=config['learning_rate'])
model.compile(optimizer=optimizer,
              loss=config['loss'],
              metrics=['accuracy'])
model.summary()

# Callbacks.
# restore_best_weights=True: without it, `model` is left with the weights of
# the last epoch (up to `patience` epochs past the best one) instead of the
# weights of the best validation epoch.
# NOTE(review): depending on the TF/Keras version, the restore may only happen
# when early stopping actually triggers - confirm for the installed version.
early_stopping = EarlyStopping(monitor='val_accuracy',
                               mode='max',
                               patience=10,
                               restore_best_weights=True)
wandb_callback = WandbCallback(monitor='val_accuracy', save_weights_only=True)

# Training
history = model.fit(X_train,
                    y_train,
                    validation_data=(X_val, y_val),
                    batch_size=config['batch_size'],
                    epochs=config['epochs'],
                    callbacks=[wandb_callback, early_stopping])

# Close the W&B run; `finish()` replaces the deprecated alias `join()`.
run.finish()

[34m[1mwandb[0m: Currently logged in as: [33mvtzc630[0m (use `wandb login --relogin` to force relogin)


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 250)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 250, 50)           1900      
_________________________________________________________________
conv1d (Conv1D)              (None, 248, 32)           4832      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 246, 32)           3104      
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 244, 32)           3104      
_________________________________________________________________
batch_normalization (BatchNo (None, 244, 32)           128       
_________________________________________________________________
flatten (Flatten)            (None, 7808)              0     

VBox(children=(Label(value=' 0.06MB of 0.06MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,41.0
loss,0.1233
accuracy,0.96165
val_loss,1.66134
val_accuracy,0.72303
_runtime,186.0
_timestamp,1622376213.0
_step,41.0
best_val_accuracy,0.73136
best_epoch,31.0


0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██
loss,███▇▇▆▆▅▅▄▄▃▃▃▃▃▂▂▂▂▂▂▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
accuracy,▁▁▂▂▃▃▄▅▅▅▆▆▆▇▇▇▇▇▇▇████████████████████
val_loss,▁▂▁▂█▁▃▂▃▁▂▂▂▂▂▁▂▃▂▂▂▂▂▂▃▂▂▃▃▂▃▂▃▃▂▃▂▃▃▂
val_accuracy,▂▁▂▂▂▃▃▄▅▆▆▆▇▇▆▇▇▇▇▇▇▇▇▇▇▇█▇████████████
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██


# BiLSTM

In [7]:
# Hyperparameters of the BiLSTM experiment; the whole dict is logged to W&B.
# NOTE: the 'optimizer' entry is only logged - the code below always builds Adam.
# NOTE(review): the run recorded with this notebook ends at val_accuracy ~0.34
# with loss ~1.097, i.e. roughly chance level for three classes. The learning
# rate of 0.01 may be too high for this model - TODO confirm with a sweep.
config = {
    'optimizer': 'adam',
    'learning_rate': 0.01,
    'loss': 'sparse_categorical_crossentropy',
    'batch_size': 32,
    'epochs': 30,
    'model_params': {
        'input_size': max_sequence_len,
        'num_classes': 3,
        'embedding_size': 100,
        'dropout_p': 0.5,
        'vocab_size': 38,
        'batch_normalisation': True,
        'bilstm_layers': [[16, 0]],
        'dense_layers': [1024],
    },
}
run = wandb.init(project='encrypt_text_bilstm', config=config)
# From here on, read the settings back from W&B so the logged values and the
# values actually used cannot drift apart.
config = wandb.config

# Reset Keras' global state so layer names and graph state from a previous
# run of this (or another) training cell do not leak into this model.
tf.keras.backend.clear_session()

model = BiLSTMModel(**config['model_params']).build_model()
optimizer = Adam(learning_rate=config['learning_rate'])
model.compile(optimizer=optimizer,
              loss=config['loss'],
              metrics=['accuracy'])
model.summary()

# Callbacks.
# restore_best_weights=True: without it, `model` is left with the weights of
# the last epoch (up to `patience` epochs past the best one) instead of the
# weights of the best validation epoch.
# NOTE(review): depending on the TF/Keras version, the restore may only happen
# when early stopping actually triggers - confirm for the installed version.
early_stopping = EarlyStopping(monitor='val_accuracy',
                               mode='max',
                               patience=5,
                               restore_best_weights=True)
wandb_callback = WandbCallback(monitor='val_accuracy', save_weights_only=True)

# Training
history = model.fit(X_train,
                    y_train,
                    validation_data=(X_val, y_val),
                    batch_size=config['batch_size'],
                    epochs=config['epochs'],
                    callbacks=[wandb_callback, early_stopping])

# Close the W&B run; `finish()` replaces the deprecated alias `join()`.
run.finish()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 250)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 250, 100)          3800      
_________________________________________________________________
bidirectional (Bidirectional (None, 250, 32)           14976     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 32)                6272      
_________________________________________________________________
batch_normalization (BatchNo (None, 32)                128       
_________________________________________________________________
dense (Dense)                (None, 1024)              33792     
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0     

VBox(children=(Label(value=' 0.27MB of 0.27MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,17.0
loss,1.09654
accuracy,0.34982
val_loss,1.09505
val_accuracy,0.3427
_runtime,282.0
_timestamp,1622376518.0
_step,17.0
best_val_accuracy,0.3839
best_epoch,12.0


0,1
epoch,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
loss,▇▃▃▂▃▄▁▂▁▁▁▂▁▁██▂▂
accuracy,▄▂▇█▃▃▂▆▇█▅▃▄▁▂▄▄▁
val_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▄▁▁
val_accuracy,▃▅▁▇▅▅▅▇▆▅▅▅█▅▅▅▅▂
_runtime,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
_timestamp,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
