In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
import tensorflow_ranking as tfr
from sklearn.model_selection import KFold
from tqdm import tqdm 
# import tensorflow_recommenders as tfrs

In [3]:
## constants ##
# Fraction of examples held out by train_test_split for validation.
test_size_percent = 0.2 # 20% test/val data
# CSV read into val_df below.
val_data_path = "./train/validation_data.csv"
# NOTE(review): these two keys are not referenced in the cells shown here;
# presumably they are column names of the CSV -- confirm before relying on them.
more_toxic_key = "more_toxic"
less_toxic_key = "less_toxic"
# Width of one model's dense output (last axis of the saved .npy array).
dense_dim = 768
# NOTE(review): hidden_dim is not referenced in the cells shown here; getModels()
# sizes its layers from combined_output.shape[-1] instead.
hidden_dim = dense_dim*3 # 3 models
# Default batch size of CombinedEmbeddingGenerator.
batch_size=32

# Only referenced by the commented-out custom margin loss in getModels().
margin = 0.5 # maybe less
# Directory handed to the TensorBoard callback.
log_dir = "./log/unitaryAI-dense-layer-ensemble/"

In [4]:
# Read the CSV at val_data_path; in the cells shown here only its shape is inspected.
val_df = pd.read_csv(val_data_path)
val_df.shape

(30108, 3)

In [5]:
# Load the saved dense-layer outputs straight from the .npy file.
dense_output = np.load('./output/unitaryAI_validation_data_dense_output.npy')

# Axis layout: (dataloaders, examples, models, dense_dim)
dense_output.shape

(2, 30108, 3, 768)

In [6]:
# Concatenate every model's dense output along the feature axis, giving one
# vector of size (n_models * dense_dim) per example. The number of models is
# read from dense_output.shape[2] rather than being hard-coded to three slices,
# so the cell keeps working if the ensemble size changes.
combined_output = np.concatenate(
    [dense_output[:, :, model_idx, :] for model_idx in range(dense_output.shape[2])],
    axis=-1,
)
combined_output.shape

(2, 30108, 2304)

In [7]:
# Split example positions (axis 1 of combined_output) into train / validation
# index lists with a fixed seed.
example_indices = list(range(combined_output.shape[1]))
tr_ind, val_ind = train_test_split(
    example_indices,
    test_size=test_size_percent,
    random_state=2343,
)
len(tr_ind), len(val_ind)

(24086, 6022)

In [8]:
## cross check
# print(dense_output[0, 0, 0, :5])
# print(dense_output[0, 0, 1, :5])
# print(dense_output[0, 0, 2, :5])

# print("idx 1")
# print(dense_output[1, 0, 0, :5])
# print(dense_output[1, 0, 1, :5])
# print(dense_output[1, 0, 2, :5])

# model_idx = 2
# combined_output[0, 0, model_idx*dense_dim: model_idx*dense_dim + 5]

In [9]:
class CombinedEmbeddingGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields batches of (less toxic, more toxic) embedding pairs.

    Every batch has the form ``([less_toxic_batch, more_toxic_batch], targets)``.
    Each target row is ``[0, 1]``, the label layout used with
    ``tfr.keras.losses.PairwiseHingeLoss`` (second item of the pair ranked higher).

    Args:
        less_toxic_combined_embeddings: array of embeddings; row ``i`` is paired
            with row ``i`` of ``more_toxic_combined_embeddings``. Must support
            ``len()`` and indexing with an integer index array.
        more_toxic_combined_embeddings: array of embeddings for the other half
            of each pair.
        batch_size: number of pairs per batch; the final batch may be smaller.
        shuffle: when True, the pair order is permuted on construction and at
            the end of every epoch.
    """

    def __init__(self, less_toxic_combined_embeddings, more_toxic_combined_embeddings,  batch_size=batch_size, shuffle=True):
        self.less_toxic_combined_embeddings = less_toxic_combined_embeddings
        self.more_toxic_combined_embeddings = more_toxic_combined_embeddings
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Positions into both embedding arrays; shuffling this keeps pairs aligned.
        self.indexes = np.arange(len(self.less_toxic_combined_embeddings))
        self.on_epoch_end() # shuffle once

    def __len__(self):
        """Return the number of batches per epoch (ceiling of n_pairs / batch_size)."""
        n_pairs = len(self.less_toxic_combined_embeddings)
        # Ceiling division. The former expression
        #   n // bs + 1 if n % bs != 0 else 0
        # parsed as `(n // bs + 1) if ... else 0` and therefore reported ZERO
        # batches whenever n was an exact multiple of the batch size.
        return (n_pairs + self.batch_size - 1) // self.batch_size

    def on_epoch_end(self):
        """Permute ``self.indexes`` in place when shuffling is enabled."""
        if self.shuffle:
            # A fresh RandomState(42) is created on every call, so the same
            # permutation is applied to the current order each time. The order
            # still changes between epochs, and is reproducible across runs.
            np.random.RandomState(42).shuffle(self.indexes)

    def __getitem__(self, idx): # idx -> index batch
        """Return batch number ``idx`` as ``([less_toxic, more_toxic], targets)``."""
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        less_toxic_combined_embeddings = tf.convert_to_tensor(self.less_toxic_combined_embeddings[indexes], dtype=tf.float64)
        more_toxic_combined_embeddings = tf.convert_to_tensor(self.more_toxic_combined_embeddings[indexes], dtype=tf.float64)
        # targets = tf.convert_to_tensor([-1]*len(indexes), dtype=tf.float32) # for pytorch marginranking loss
        targets = tf.convert_to_tensor([[0, 1]]*len(indexes), dtype=tf.float32) # for tfr.keras.losses.PairwiseHingeLoss()

        return [less_toxic_combined_embeddings, more_toxic_combined_embeddings], targets


In [10]:
# Index 0 on the first axis feeds the "less toxic" side of each pair and
# index 1 the "more toxic" side; rows are selected by the split indices.
combined_embeddings_train = CombinedEmbeddingGenerator(
    less_toxic_combined_embeddings=combined_output[0, tr_ind, :],
    more_toxic_combined_embeddings=combined_output[1, tr_ind, :],
)

combined_embeddings_val = CombinedEmbeddingGenerator(
    less_toxic_combined_embeddings=combined_output[0, val_ind, :],
    more_toxic_combined_embeddings=combined_output[1, val_ind, :],
)

In [11]:
def getModels():
    """Build the toxicity scoring tower and the siamese model that trains it.

    The tower maps one combined embedding to a single score. The siamese model
    applies the same tower to the less-toxic and more-toxic inputs and
    concatenates the two scores, which are trained with
    ``tfr.keras.losses.PairwiseHingeLoss`` and Adam (learning rate 1e-4).

    Returns:
        tuple: ``(siamese, tox_model)`` -- the compiled pairwise model and the
        shared single-input scoring model. The tower summary is printed as a
        side effect.
    """

    def loss(margin=1):  # https://keras.io/examples/vision/siamese_contrastive/
        """Factory for a margin ranking loss on a score difference.

        Kept as an alternative objective; it is not passed to ``compile`` below.
        """
        def margin_loss(y_true, diff):
            hinge = tf.math.maximum( -y_true * diff + margin, 0)
            return tf.math.reduce_mean(hinge)

        return margin_loss

    embedding_dim = combined_output.shape[-1]

    # --- shared scoring tower: dim -> dim -> dim // 2 -> 1 ---
    tower_input = layers.Input((embedding_dim,))
    hidden = layers.Dense(embedding_dim, activation="tanh")(tower_input) # same size as input
    hidden = layers.Dense(embedding_dim//2, activation="tanh")(hidden)
    toxicity_score = layers.Dense(1)(hidden)

    tox_model = keras.Model(inputs=tower_input, outputs=toxicity_score)
    tox_model.summary()

    # --- siamese wrapper: the same tower weights score both sides of a pair ---
    less_toxic_input = layers.Input((embedding_dim,))
    more_toxic_input = layers.Input((embedding_dim,))
    less_toxic_score = tox_model(less_toxic_input)
    more_toxic_score = tox_model(more_toxic_input)

    # Alternative head (difference of scores trained with the custom margin loss):
    # merge_layer = less_toxic_score - more_toxic_score
    # siamese.compile(loss=loss(margin=margin), optimizer=tf.keras.optimizers.Adam(1e-4), metrics=[loss(margin=margin)])

    merge_layer = tf.keras.layers.Concatenate(axis=-1)([less_toxic_score, more_toxic_score])
    siamese = keras.Model(inputs=[less_toxic_input, more_toxic_input], outputs=merge_layer)
    siamese.compile(
        loss=tfr.keras.losses.PairwiseHingeLoss(),
        optimizer=tf.keras.optimizers.Adam(1e-4),
    )
    return siamese, tox_model

In [12]:
# Build a fresh (siamese, tox_model) pair; getModels() also prints the tower summary.
siamese, tox_model = getModels()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 2304)]            0         
_________________________________________________________________
dense (Dense)                (None, 2304)              5310720   
_________________________________________________________________
dense_1 (Dense)              (None, 1152)              2655360   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1153      
Total params: 7,967,233
Trainable params: 7,967,233
Non-trainable params: 0
_________________________________________________________________


2022-02-06 23:58:33.580099: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-02-06 23:58:33.651045: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-02-06 23:58:33.651521: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-02-06 23:58:33.653135: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow wi

In [13]:
# Number of passes over the training generator.
n_epochs = 10

# TensorBoard logs are written under log_dir.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, update_freq=1)

# The generators yield (inputs, targets) tuples, so no separate y argument is passed.
history = siamese.fit(
    combined_embeddings_train, # target inside it
    validation_data=combined_embeddings_val,
    epochs=n_epochs,
    callbacks=[tensorboard_callback]
)

2022-02-06 23:58:35.097972: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2022-02-06 23:58:35.098002: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2022-02-06 23:58:35.098987: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1614] Profiler found 1 GPUs
2022-02-06 23:58:35.099689: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcupti.so.11.2'; dlerror: libcupti.so.11.2: cannot open shared object file: No such file or directory
2022-02-06 23:58:35.181961: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2022-02-06 23:58:35.182143: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1748] CUPTI activity buffer flushed
2022-02-06 23:58:35.267839: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
  2/753 [..............................] - ETA: 2:08 - loss: 0.4785 

2022-02-06 23:58:37.092877: I tensorflow/stream_executor/cuda/cuda_blas.cc:1760] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2022-02-06 23:58:37.126608: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2022-02-06 23:58:37.126658: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2022-02-06 23:58:37.292957: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2022-02-06 23:58:37.293837: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1748] CUPTI activity buffer flushed


 13/753 [..............................] - ETA: 21s - loss: 0.5516

2022-02-06 23:58:37.303369: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:673]  GpuTracer has collected 139 callback api events and 134 activity events. 
2022-02-06 23:58:37.305332: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2022-02-06 23:58:37.315194: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: ./log/unitaryAI-dense-layer-ensemble/train/plugins/profile/2022_02_06_23_58_37

2022-02-06 23:58:37.319294: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for trace.json.gz to ./log/unitaryAI-dense-layer-ensemble/train/plugins/profile/2022_02_06_23_58_37/DESKTOP-KPOCLK7.trace.json.gz
2022-02-06 23:58:37.328821: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: ./log/unitaryAI-dense-layer-ensemble/train/plugins/profile/2022_02_06_23_58_37

2022-02-06 23:58:37.331479: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped too

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
def calculateAcc(preds_less_toxic, preds_more_toxic):
    """Fraction of pairs whose "more toxic" score is strictly greater.

    Args:
        preds_less_toxic: array of scores for the less toxic item of each pair.
        preds_more_toxic: array of scores for the more toxic item, same length.

    Returns:
        Number of pairs with ``more > less`` divided by the number of pairs;
        ties count as incorrect.
    """
    n_pairs = preds_more_toxic.shape[0]
    correctly_ordered = preds_more_toxic > preds_less_toxic
    return np.sum(correctly_ordered) / n_pairs

In [15]:
# Score both sides of every validation pair with the shared tower, batch by
# batch, then report how often the "more toxic" side gets the higher score.
# Column 0 holds the less-toxic scores, column 1 the more-toxic scores.
preds = np.zeros((len(val_ind), 2))
lastidx = 0
for batch in combined_embeddings_val:
    less_tox, more_tox = batch[0]
    rows = slice(lastidx, lastidx + len(less_tox))
    preds[rows, 0] = tox_model.predict(less_tox).squeeze(1)
    preds[rows, 1] = tox_model.predict(more_tox).squeeze(1)
    lastidx = rows.stop

print("acc:", calculateAcc(preds[:, 0], preds[:, 1]))

acc: 0.7143806044503487


In [16]:
# tox_model.save("./output/unitaryAI-dense-layer")

## LOGS ##

1. v1 version: margin ranking loss (as in PyTorch) with the distance taken as a simple difference. Better than all my current models. Weird thing: val_loss < train_loss. I think both val_loss and train_loss need to go down and can be improved.

loss: 0.3318 - margin_loss: 0.3318 - val_loss: 0.3283 - val_margin_loss: 0.3281 acc: 0.7163732979076719

2. Adding batch normalization between hidden dense layers 1 and 2 reduced train_loss about 10x, but made val_loss about 3x worse and reduced accuracy.

loss: 0.0300 - margin_loss: 0.0307 - val_loss: 1.0422 - val_margin_loss: 1.0428 acc: 0.617734971770176

3. Adding batch normalization after the input reduced train_loss about 10x, but made val_loss about 3x worse and dropped accuracy to roughly chance level (~0.5)!

loss: 0.0254 - margin_loss: 0.0254 - val_loss: 0.8329 - val_margin_loss: 0.8331 acc: 0.5059780803719695

4. Both batch normalization -> way worse

loss: 0.0226 - margin_loss: 0.0226 - val_loss: 2.2827 - val_margin_loss: 2.2818 acc: 0.4858850880106277

NOTE: don't apply batch normalization to NLP models, I guess (but it is good for images and CNNs, I think).

5. Changing the activation of the 2 hidden dense layers from relu to tanh didn't help much -- only slightly better performance. """Using tanh now"""

loss: 0.3335 - margin_loss: 0.3335 - val_loss: 0.3306 - val_margin_loss: 0.3303 acc: 0.7178678180006642
loss: 0.3331 - margin_loss: 0.3331 - val_loss: 0.3252 - val_margin_loss: 0.3248 acc: 0.7205247426104284

6. Adding one more dense layer with dense_dim//4 units -> didn't help much. Total dense layers: 4

loss: 0.3341 - margin_loss: 0.3342 - val_loss: 0.3298 - val_margin_loss: 0.3288 acc: 0.7196944536698772
loss: 0.3341 - margin_loss: 0.3342 - val_loss: 0.3260 - val_margin_loss: 0.3250 acc: 0.7123879109930256

7. Removing the last hidden dense layer -> didn't help. Total dense layers: 2. Using 3 dense layers only.
loss: 0.3328 - margin_loss: 0.3328 - val_loss: 0.3293 - val_margin_loss: 0.3292 acc: 0.7122218532049153
loss: 0.3319 - margin_loss: 0.3319 - val_loss: 0.3302 - val_margin_loss: 0.3293 acc: 0.7062437728329458
after 20 epoch loss: 0.3281 - margin_loss: 0.3280 - val_loss: 0.3300 - val_margin_loss: 0.3295 acc: 0.7125539687811359

8. Adding layer normalization between the dense layers -> didn't help.

loss: 0.3337 - margin_loss: 0.3337 - val_loss: 0.3279 - val_margin_loss: 0.3284  acc: 0.7165393556957821

9. pytorch marginrank loss -> -1 (ie second input is greater)  (less_toxic, more_toxic)
loss: 0.3334 - margin_loss: 0.3333 - val_loss: 0.3283 - val_margin_loss: 0.3282 acc: 0.7155430089671205

pytorch marginrank loss -> -1 (ie first input is greater)  (more_toxic, less_toxic, )

loss: 0.3331 - margin_loss: 0.3331 - val_loss: 0.3286 - val_margin_loss: 0.3284 acc: 0.7168714712720027

Note: the custom margin ranking loss is correct, at least.

10. Changed loss from pytorch margin ranking to tfr.keras.losses.PairwiseHingeLoss(). Looks more stable but less strict (less configurable) than the former.

loss: 0.3331 - val_loss: 0.3270 acc: 0.7175357024244438
after 20 epoch: loss: 0.3284 - val_loss: 0.3270 acc: 0.7186981069412155
after 30 epoch: loss: 0.3243 - val_loss: 0.3261 acc: 0.7138824310860179

10 epochs is good enough. 

## K fold

In [17]:
# K-fold cross-validation: train a fresh model per fold, score the held-out
# pairs, and save the scoring tower under a name containing its fold accuracy.
k_folds = 5
n_epochs = 10
# Filled with (accuracy, "fold_<n>") when saving.
output_model_path = "./output/unitaryAI-dense-layer-v2/folds/model_%s_%s"
kf = KFold(n_splits=k_folds, random_state=111, shuffle=True)
for f, (train_index, test_index) in enumerate(kf.split(list(range(combined_output.shape[1])))):
    combined_embeddings_train = CombinedEmbeddingGenerator(combined_output[0, train_index, :], combined_output[1, train_index, :]) # tr_ind
    combined_embeddings_val = CombinedEmbeddingGenerator(combined_output[0, test_index, :], combined_output[1, test_index, :]) # val_ind

    # New weights for every fold so that no fold sees another fold's training.
    siamese, tox_model = getModels()

    history = siamese.fit(
        combined_embeddings_train, # target inside it
        validation_data=combined_embeddings_val,
        epochs=n_epochs,
    )

    # Size the buffer from THIS fold's held-out set. It was previously sized
    # with len(val_ind) from the earlier train/val split; folds are not all the
    # same size, so smaller folds kept an all-zero padding row that counted as
    # a wrong prediction and inflated the accuracy denominator.
    preds = np.zeros((len(test_index), 2))
    lastidx = 0
    for batch in combined_embeddings_val:
        less_tox, more_tox = batch[0][0], batch[0][1]
        preds[lastidx: lastidx + len(less_tox), 0] = tox_model.predict(less_tox).squeeze(1)
        preds[lastidx: lastidx + len(more_tox), 1] = tox_model.predict(more_tox).squeeze(1)
        lastidx += len(less_tox)

    acc = calculateAcc(preds[:, 0], preds[:, 1])
    print("f", f, "acc:", acc)
    model_name = output_model_path%(str(acc), "fold_"+str(f))
    tox_model.save(model_name)

    

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 2304)]            0         
_________________________________________________________________
dense_3 (Dense)              (None, 2304)              5310720   
_________________________________________________________________
dense_4 (Dense)              (None, 1152)              2655360   
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 1153      
Total params: 7,967,233
Trainable params: 7,967,233
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
f 0 acc: 0.7147127200265693


2022-02-07 00:02:03.519674: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./output/unitaryAI-dense-layer-v2/folds/model_0.7147127200265693_fold_0/assets
Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 2304)]            0         
_________________________________________________________________
dense_6 (Dense)              (None, 2304)              5310720   
_________________________________________________________________
dense_7 (Dense)              (None, 1152)              2655360   
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 1153      
Total params: 7,967,233
Trainable params: 7,967,233
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
f 1 acc: 0.71072733311