In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
import tensorflow_ranking as tfr
from sklearn.model_selection import KFold
from tqdm import tqdm 
# import tensorflow_recommenders as tfrs

In [2]:
## constants ##
test_size_percent = 0.2 # 20% test/val data
ruddit_data_path = "./train/removed_redundant_ruddit_with_text.csv"
val_data_path = "./train/validation_data.csv"
more_toxic_key = "more_toxic"
less_toxic_key = "less_toxic"
score_key = "score"

dense_dim = 768
hidden_dim = dense_dim*3 # 3 models
batch_size=32

margin = 0.5 # maybe less
log_dir = "./log/unitaryAI-dense-layer-ensemble/"

## Ruddit

In [3]:
ruddit_df = pd.read_csv(ruddit_data_path)
ruddit_df.shape

(5710, 3)

In [4]:
with open('./output/unitaryAI_ruddit_dense_output.npy', 'rb') as f:
    ruddit_dense_output = np.load(f)

ruddit_dense_output.shape # (examples, models, dense_dim)

(5710, 3, 768)

In [5]:
ruddit_combined_dense_output = np.concatenate([ruddit_dense_output[:, 0, :], ruddit_dense_output[:, 1, :], ruddit_dense_output[:, 2, :]], axis=-1)
ruddit_combined_dense_output.shape

(5710, 2304)

In [6]:
tr_ind, val_ind = train_test_split(list(range(ruddit_combined_dense_output.shape[0])) ,test_size = test_size_percent, random_state = 321)
len(tr_ind), len(val_ind)

(4568, 1142)

In [7]:
ruddit_train = ruddit_combined_dense_output[tr_ind, :]
ruddit_val = ruddit_combined_dense_output[val_ind, :]

y_train = ruddit_df[score_key][tr_ind].values
y_val = ruddit_df[score_key][val_ind].values

ruddit_train.shape, ruddit_val.shape, y_train.shape, y_val.shape

((4568, 2304), (1142, 2304), (4568,), (1142,))

In [8]:
def getTOXModel():
    input = layers.Input((ruddit_combined_dense_output.shape[-1],))
    # print("tox_input", input)
    x = layers.Dense(ruddit_combined_dense_output.shape[-1])(input) # same size as input 
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('relu')(x)
    x = tf.keras.layers.Dropout(0.5)(x)
    x = layers.Dense(ruddit_combined_dense_output.shape[-1]//2)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('relu')(x)
    output = layers.Dense(1, activation="tanh")(x) # toxicity score # needed tanh for ruddit # not used in another notebook with valid_data only
    # print("tox_output", output)

    tox_model = keras.Model(inputs=input, outputs=output)
    tox_model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss= 'mse',# tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    tox_model.summary()
    return tox_model

In [9]:
tox_model = getTOXModel()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 2304)]            0         
_________________________________________________________________
dense (Dense)                (None, 2304)              5310720   
_________________________________________________________________
batch_normalization (BatchNo (None, 2304)              9216      
_________________________________________________________________
activation (Activation)      (None, 2304)              0         
_________________________________________________________________
dropout (Dropout)            (None, 2304)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1152)              2655360   
_________________________________________________________________
batch_normalization_1 (Batch (None, 1152)              4608  

2022-02-08 23:24:19.035095: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-02-08 23:24:19.057376: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-02-08 23:24:19.057805: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-02-08 23:24:19.058254: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow wi

In [10]:
n_epochs = 30

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, update_freq=1)

callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True,)

tox_model.fit(ruddit_train, y_train, epochs=n_epochs, validation_data=(ruddit_val, y_val), callbacks=[tensorboard_callback, callback], batch_size=batch_size, shuffle=True)
# old one acc: 0.7074061773497177 # acc: 0.7080704085021587 # acc: 0.7092328130189306

2022-02-08 23:24:20.113730: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2022-02-08 23:24:20.113762: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2022-02-08 23:24:20.113793: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1614] Profiler found 1 GPUs
2022-02-08 23:24:20.114289: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcupti.so.11.2'; dlerror: libcupti.so.11.2: cannot open shared object file: No such file or directory


Epoch 1/30


2022-02-08 23:24:20.191764: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2022-02-08 23:24:20.191915: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1748] CUPTI activity buffer flushed
2022-02-08 23:24:20.251033: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


  1/143 [..............................] - ETA: 3:04 - loss: 0.4123 - root_mean_squared_error: 0.6421

2022-02-08 23:24:21.539018: I tensorflow/stream_executor/cuda/cuda_blas.cc:1760] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2022-02-08 23:24:21.563267: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2022-02-08 23:24:21.563305: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.


 11/143 [=>............................] - ETA: 6s - loss: 0.6002 - root_mean_squared_error: 0.7747

2022-02-08 23:24:21.863160: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2022-02-08 23:24:21.863670: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1748] CUPTI activity buffer flushed
2022-02-08 23:24:21.877272: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:673]  GpuTracer has collected 181 callback api events and 178 activity events. 
2022-02-08 23:24:21.879550: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2022-02-08 23:24:21.886840: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: ./log/unitaryAI-dense-layer-ensemble/train/plugins/profile/2022_02_08_23_24_21

2022-02-08 23:24:21.891923: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for trace.json.gz to ./log/unitaryAI-dense-layer-ensemble/train/plugins/profile/2022_02_08_23_24_21/DESKTOP-KPOCLK7.trace.json.gz
2022-02-08 23:24:21.899320: I tensorflow/core/profiler/

Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30


<keras.callbacks.History at 0x7f5e504ab520>

In [11]:
# tox_model.save("./output/unitaryAI-dense-layer-ruddit")

## Validation data tuning

In [12]:
# val_df = pd.read_csv(val_data_path)
# val_df.shape

In [13]:
with open('./output/unitaryAI_validation_data_dense_output.npy', 'rb') as f:
    dense_output = np.load(f)

dense_output.shape #  (dataloaders, examples, models, dense_dim)

(2, 30108, 3, 768)

In [14]:
combined_output =  np.concatenate([dense_output[:, :, 0, :], dense_output[:, :, 1, :], dense_output[:, :, 2, :]], axis=-1)
combined_output.shape

(2, 30108, 2304)

In [15]:
tr_ind, val_ind = train_test_split(list(range(combined_output.shape[1])) ,test_size = test_size_percent, random_state = 2343)
len(tr_ind), len(val_ind)

(24086, 6022)

In [16]:
## cross check
# print(dense_output[0, 0, 0, :5])
# print(dense_output[0, 0, 1, :5])
# print(dense_output[0, 0, 2, :5])

# print("idx 1")
# print(dense_output[1, 0, 0, :5])
# print(dense_output[1, 0, 1, :5])
# print(dense_output[1, 0, 2, :5])

# model_idx = 2
# combined_output[0, 0, model_idx*dense_dim: model_idx*dense_dim + 5]

In [17]:
class CombinedEmbeddingGenerator(tf.keras.utils.Sequence): # for validation data
    def __init__(self, less_toxic_combined_embeddings, more_toxic_combined_embeddings,  batch_size=batch_size, shuffle=True):
        self.less_toxic_combined_embeddings = less_toxic_combined_embeddings
        self.more_toxic_combined_embeddings = more_toxic_combined_embeddings
        self.batch_size = batch_size
        self.shuffle = shuffle

        self.indexes = np.arange(len(self.less_toxic_combined_embeddings))
        self.on_epoch_end() # shuffle once

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.less_toxic_combined_embeddings) // self.batch_size + 1 if (len(self.less_toxic_combined_embeddings) % self.batch_size) != 0 else 0
    
    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

    def __getitem__(self, idx): # idx -> index batch
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        less_toxic_combined_embeddings = tf.convert_to_tensor(self.less_toxic_combined_embeddings[indexes], dtype=tf.float64)
        more_toxic_combined_embeddings = tf.convert_to_tensor(self.more_toxic_combined_embeddings[indexes], dtype=tf.float64)
        # targets = tf.convert_to_tensor([-1]*len(indexes), dtype=tf.float32) # for pytorch marginranking loss
        targets = tf.convert_to_tensor([[0, 1]]*len(indexes), dtype=tf.float32) # for tfr.keras.losses.PairwiseHingeLoss()

        return [less_toxic_combined_embeddings, more_toxic_combined_embeddings], targets


In [18]:
combined_embeddings_train = CombinedEmbeddingGenerator(combined_output[0, tr_ind, :], combined_output[1, tr_ind, :]) # tr_ind

combined_embeddings_val = CombinedEmbeddingGenerator(combined_output[0, val_ind, :], combined_output[1, val_ind, :]) # val_ind

In [19]:
def getModels(tox_model):

    def loss(margin=1):  # https://keras.io/examples/vision/siamese_contrastive/
        # def contrastive_loss(y_true, y_pred):
        #     print(y_true, y_pred)
        #     square_pred = tf.math.square(y_pred)
        #     margin_square = tf.math.square(tf.math.maximum(margin - (y_pred), 0))
        #     return tf.math.reduce_mean(
        #         (1 - y_true) * square_pred + (y_true) * margin_square
        #     )

        # return contrastive_loss
        def margin_loss(y_true, diff): # https://pytorch.org/docs/stable/generated/torch.nn.MarginRankingLoss.html 
            # tf.print("y_true", y_true, "diff", diff)
            loss = tf.math.maximum( -y_true * diff + margin, 0)
            # tf.print("loss", loss)
            return tf.math.reduce_mean(loss)

        return margin_loss

    less_toxic_input = layers.Input((combined_output.shape[-1],))
    more_toxic_input = layers.Input((combined_output.shape[-1],))

    tower_1 = tox_model(less_toxic_input)
    tower_2 = tox_model(more_toxic_input)
    merge_layer = tower_1 - tower_2
    siamese = keras.Model(inputs=[less_toxic_input, more_toxic_input], outputs=merge_layer) # try to separate predictions using our embeddings from unitary ai !
    siamese.compile(loss=loss(margin=margin), optimizer=tf.keras.optimizers.Adam(1e-4), metrics=[loss(margin=margin)])

    # merge_layer = tower_1 - tower_2
    # siamese = keras.Model(inputs=[less_toxic_input, more_toxic_input], outputs=merge_layer)
    # siamese.compile(loss=tfr.keras.losses.PairwiseHingeLoss(), optimizer=tf.keras.optimizers.Adam(1e-5), )#metrics=[tfr.keras.losses.PairwiseHingeLoss()])
    # siamese.summary()
    return siamese, tox_model

In [20]:
# tox_model = tf.keras.models.load_model("./output/unitaryAI-dense-layer-ruddit") ### acc: 0.7074061773497177 without finetuning on validation data!!! ###

# NOTE: finetuning on valid data does not help at lot. training loss goes down but validation goes up. Clear overfitting :/ One reason being ruddit and validation data have different rates/standards of toxicity (ie different distribution). So cant really train/fine tune on both :/ ig

In [21]:
siamese, tox_model = getModels(tox_model)

In [22]:
n_epochs = 30

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, update_freq=1)
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True,)

history = siamese.fit(
    combined_embeddings_train, # target inside it
    validation_data=combined_embeddings_val,
    epochs=n_epochs,
    callbacks=[tensorboard_callback, callback]
)

2022-02-08 23:25:07.623738: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2022-02-08 23:25:07.623773: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2022-02-08 23:25:08.125266: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2022-02-08 23:25:08.125413: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1748] CUPTI activity buffer flushed


Epoch 1/30
  2/753 [..............................] - ETA: 1:46 - loss: 0.5154 - margin_loss: 0.5154

2022-02-08 23:25:08.743626: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2022-02-08 23:25:08.743658: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2022-02-08 23:25:08.896249: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2022-02-08 23:25:08.896556: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1748] CUPTI activity buffer flushed
2022-02-08 23:25:08.909310: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:673]  GpuTracer has collected 311 callback api events and 306 activity events. 
2022-02-08 23:25:08.912953: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2022-02-08 23:25:08.920996: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: ./log/unitaryAI-dense-layer-ensemble/train/plugins/profile/2022_02_08_23_25_08

2022-02-08 23:25:08.927492: I tensorflow/core/profiler/rpc/client/save_p

  5/753 [..............................] - ETA: 1:15 - loss: 0.5034 - margin_loss: 0.5034

2022-02-08 23:25:08.955166: I tensorflow/core/profiler/rpc/client/capture_profile.cc:251] Creating directory: ./log/unitaryAI-dense-layer-ensemble/train/plugins/profile/2022_02_08_23_25_08
Dumped tool data for xplane.pb to ./log/unitaryAI-dense-layer-ensemble/train/plugins/profile/2022_02_08_23_25_08/DESKTOP-KPOCLK7.xplane.pb
Dumped tool data for overview_page.pb to ./log/unitaryAI-dense-layer-ensemble/train/plugins/profile/2022_02_08_23_25_08/DESKTOP-KPOCLK7.overview_page.pb
Dumped tool data for input_pipeline.pb to ./log/unitaryAI-dense-layer-ensemble/train/plugins/profile/2022_02_08_23_25_08/DESKTOP-KPOCLK7.input_pipeline.pb
Dumped tool data for tensorflow_stats.pb to ./log/unitaryAI-dense-layer-ensemble/train/plugins/profile/2022_02_08_23_25_08/DESKTOP-KPOCLK7.tensorflow_stats.pb
Dumped tool data for kernel_stats.pb to ./log/unitaryAI-dense-layer-ensemble/train/plugins/profile/2022_02_08_23_25_08/DESKTOP-KPOCLK7.kernel_stats.pb



Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30


In [23]:
def calculateAcc(preds_less_toxic, preds_more_toxic):
    accuracy = np.sum((preds_more_toxic > preds_less_toxic))/preds_more_toxic.shape[0]
    return accuracy

In [24]:
# tox_model.predict(combined_embeddings_val)
preds = np.zeros((len(val_ind), 2))
lastidx = 0
for batch in combined_embeddings_val:
    less_tox, more_tox = batch[0][0], batch[0][1]
    preds[lastidx: lastidx + len(less_tox), 0] = tox_model.predict(less_tox).squeeze(1)
    preds[lastidx: lastidx + len(more_tox), 1] = tox_model.predict(more_tox).squeeze(1)
    lastidx += len(less_tox)

print("acc:", calculateAcc(preds[:, 0], preds[:, 1]))

acc: 0.3249750913317835


In [25]:
# tox_model.save("./output/unitaryAI-dense-layer")

## LOGS ##

1. v1 version: margin ranking from pytorch and distance as  simple diff. better than all my current models. weird thing : val_loss < train_loss. think valid_loss nad train_loss neeed to go down and can be improved 

loss: 0.3318 - margin_loss: 0.3318 - val_loss: 0.3283 - val_margin_loss: 0.3281 acc: 0.7163732979076719

2. adding batch normalization between hidden dense layer 1 and 2 reduced train_loss 10 times but has worse effect on val_loss 3 times and reduced accuracy

loss: 0.0300 - margin_loss: 0.0307 - val_loss: 1.0422 - val_margin_loss: 1.0428 acc: 0.617734971770176

3. adding batch normalization after input reduced train_loss 10 times but  has worse effect on val_loss 3 times and reduced accuracy to half!

loss: 0.0254 - margin_loss: 0.0254 - val_loss: 0.8329 - val_margin_loss: 0.8331 acc: 0.5059780803719695

4. Both batch normalization -> way worse

loss: 0.0226 - margin_loss: 0.0226 - val_loss: 2.2827 - val_margin_loss: 2.2818 acc: 0.4858850880106277

NOTE: dont apply batch normalization to NLP ig (but good for images and cnn i think)

5. changing the 2 hidden dense layer activation from relu to tanh didnt help. A bit better perf. """Using tanh now"""

loss: 0.3335 - margin_loss: 0.3335 - val_loss: 0.3306 - val_margin_loss: 0.3303 acc: 0.7178678180006642
loss: 0.3331 - margin_loss: 0.3331 - val_loss: 0.3252 - val_margin_loss: 0.3248 acc: 0.7205247426104284

6. adding one more dense layer with dense_dim//4 -> didnt help much. Total dense layer: 4

loss: 0.3341 - margin_loss: 0.3342 - val_loss: 0.3298 - val_margin_loss: 0.3288 acc: 0.7196944536698772
loss: 0.3341 - margin_loss: 0.3342 - val_loss: 0.3260 - val_margin_loss: 0.3250 acc: 0.7123879109930256

7. remove one last hidden dense layer. -> didnt help Total dense layer: 2. Using 3 dense layer only
loss: 0.3328 - margin_loss: 0.3328 - val_loss: 0.3293 - val_margin_loss: 0.3292 acc: 0.7122218532049153
loss: 0.3319 - margin_loss: 0.3319 - val_loss: 0.3302 - val_margin_loss: 0.3293 acc: 0.7062437728329458
after 20 epoch loss: 0.3281 - margin_loss: 0.3280 - val_loss: 0.3300 - val_margin_loss: 0.3295 acc: 0.7125539687811359

8. Adding layer normalization between dense layer -> didnt help.

loss: 0.3337 - margin_loss: 0.3337 - val_loss: 0.3279 - val_margin_loss: 0.3284  acc: 0.7165393556957821

9. pytorch marginrank loss -> -1 (ie second input is greater)  (less_toxic, more_toxic)
loss: 0.3334 - margin_loss: 0.3333 - val_loss: 0.3283 - val_margin_loss: 0.3282 acc: acc: 0.7155430089671205

pytorch marginrank loss -> -1 (ie first input is greater)  (more_toxic, less_toxic, )

loss: 0.3331 - margin_loss: 0.3331 - val_loss: 0.3286 - val_margin_loss: 0.3284 acc: 0.7168714712720027

Note: custom marginranking loss is correct atleast

10. Changed loss from pytorch margin ranking to tfr.keras.losses.PairwiseHingeLoss(). Looks more stable but less strict (less configurable) than the former.

loss: 0.3331 - val_loss: 0.3270 acc: 0.7175357024244438
after 20 epoch: loss: 0.3284 - val_loss: 0.3270 acc: 0.7186981069412155
after 30 epoch: loss: 0.3243 - val_loss: 0.3261 acc: 0.7138824310860179

10 epochs is good enough. 

## K fold

In [26]:
# k_folds = 5
# n_epochs = 10
# output_model_path = "./output/unitaryAI-dense-layer-v2/folds/model_%s_%s"
# kf = KFold(n_splits=k_folds, random_state=111, shuffle=True)
# for f, (train_index, test_index) in enumerate(kf.split(list(range(combined_output.shape[1])))):
#     combined_embeddings_train = CombinedEmbeddingGenerator(combined_output[0, train_index, :], combined_output[1, train_index, :]) # tr_ind
#     combined_embeddings_val = CombinedEmbeddingGenerator(combined_output[0, test_index, :], combined_output[1, test_index, :]) # val_ind

    
#     siamese, tox_model = getModels()

#     history = siamese.fit(
#         combined_embeddings_train, # target inside it
#         validation_data=combined_embeddings_val,
#         epochs=n_epochs,
#     )

#     preds = np.zeros((len(val_ind), 2))
#     lastidx = 0
#     for batch in combined_embeddings_val:
#         less_tox, more_tox = batch[0][0], batch[0][1]
#         preds[lastidx: lastidx + len(less_tox), 0] = tox_model.predict(less_tox).squeeze(1)
#         preds[lastidx: lastidx + len(more_tox), 1] = tox_model.predict(more_tox).squeeze(1)
#         lastidx += len(less_tox)

#     acc = calculateAcc(preds[:, 0], preds[:, 1])
#     print("f", f, "acc:", acc)
#     model_name = output_model_path%(str(acc), "fold_"+str(f))
#     tox_model.save(model_name)

    