In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import transformers
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from keras import backend as K

In [2]:
## Constants

# -- Data ----------------------------------------------------------------
train_path = "./train/civil_with_downsample.csv"
text_key, target_key = "txt", "score"  # CSV columns handed to the generator as texts / scores
test_size_percent = 0.05  # only referenced by the commented-out train/validation split
batch_size = 32
max_length = 350  # tokenizer pad/truncate length (commented-out measurement cell notes 323 -> 350)

# -- Model ---------------------------------------------------------------
bert_path = "bert-base-uncased"
lstm_hidden_dim = 64  # BERT's 768-dim output squeezed to 64; the drop is steep, so information may be lost
dropout_rate = 0.3
output_dim = 1

# Model restored as the starting point for fine-tuning.
# Earlier alternative: "./train/jisawbert-1-bilstm-1-epoch/ckpt-loss0.14776-epoch1-batch30000"
load_model_path = "./output/bert-2-bilstm/bert/final_model"

# -- Checkpoints / final output -------------------------------------------
checkpoint_filepath = "./output/bert-2-bilstm-fine-tuning/ckpt-loss={loss:.5f}-epoch={epoch}-batch={batch}"
final_train_filepath = "./output/bert-2-bilstm-fine-tuning/bert/final_model"
save_after_batches = 3000

# -- Logging ---------------------------------------------------------------
log_dir = "./log/bert-2-bilstm-fine-tuning/"
log_freq = 500  # batches

# -- Schedule --------------------------------------------------------------
n_pass = 1
n_epochs = 1
# Abandoned idea (judged not worth it): split the run into pseudo-epochs of a fixed number of batches.
# n_steps_per_epoch = 500*15 # batches # val = 50k # 500*15 = 240k === persomerd after 5 times the val size
# total_batches = n_pass * len(train_data) # pseudo epoch to run  # len(train_data) gives batches
# n_epochs = total_batches//n_steps_per_epoch + 1 if total_batches%n_steps_per_epoch !=0 else 1
# print(total_batches, n_epochs)


In [3]:
# Load the full training CSV from `train_path`; the last expression displays (rows, columns).
whole_df = pd.read_csv(train_path)
whole_df.shape

(1109778, 2)

In [4]:
# tr_ind, val_ind = train_test_split(list(range(len(whole_df))) ,test_size = test_size_percent, random_state = 23)
# len(tr_ind), len(val_ind)

In [5]:
# idx = whole_df["txt"].str.len().idxmax() # split().len() # too slow
# max_length = len(whole_df["txt"][idx].split())
# # max_length = max_length+10 # for safety 
# max_length # 323 -> 350

In [4]:
class CivilDataGenerator(tf.keras.utils.Sequence): # could optimize more like BucketIterator for padding
    """Keras ``Sequence`` that tokenizes raw texts one batch at a time.

    Every batch is ``[input_ids, attention_masks]`` (int32 arrays padded/truncated to the
    module-level ``max_length``), followed by a float32 ``scores`` array when
    ``include_targets`` is true.

    Args:
        texts: NumPy array of strings (it is indexed with an integer index array).
        scores: NumPy array of targets aligned with ``texts``; only read when
            ``include_targets`` is true.
        tokenizer: object exposing ``batch_encode_plus`` (a ``BertTokenizer`` in this notebook).
        batch_size: number of texts per batch.
        shuffle: whether to shuffle the sample order at construction and after every epoch.
        include_targets: return ``(inputs, scores)`` when true, ``inputs`` only otherwise.
    """

    def __init__(self, texts, scores, tokenizer, batch_size=batch_size, shuffle=True, include_targets=True): # texts -> numpy array
        self.texts = texts
        self.scores = scores
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.include_targets = include_targets
        self.tokenizer = tokenizer
        self.indexes = np.arange(len(self.texts))
        # One persistent seeded RNG: the run stays reproducible, and each epoch draws a
        # fresh permutation. Re-creating RandomState(42) per epoch re-applied the same
        # permutation every time. The first shuffle is unchanged.
        self._rng = np.random.RandomState(42)
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch, i.e. ceil(len(texts) / batch_size)."""
        # The previous expression parsed as `(n // bs + 1) if n % bs != 0 else 0` and so
        # reported 0 batches whenever n was an exact multiple of the batch size.
        n_texts = len(self.texts)
        return (n_texts + self.batch_size - 1) // self.batch_size

    def on_epoch_end(self):
        """Reshuffle the sample order in place when ``shuffle`` is enabled."""
        if self.shuffle:
            self._rng.shuffle(self.indexes)

    def __getitem__(self, idx): # idx -> index batch
        """Tokenize and return batch number ``idx`` (the last batch may be shorter)."""
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        texts = self.texts[indexes]

        # Each text is encoded on its own (no sentence pairs), hence no token_type_ids.
        encoded = self.tokenizer.batch_encode_plus(
            texts.tolist(),
            add_special_tokens=True,
            max_length=max_length, # BERT allows up to 512; we use our own shorter limit
            return_attention_mask=True, # needed because every text is padded to max_length
            return_token_type_ids=False,
            padding='max_length',
            # Ask for NumPy directly: the arrays are converted to NumPy below anyway, and
            # this keeps TensorFlow tensor creation out of the data-loading path.
            return_tensors="np",
            truncation=True,
        )

        input_ids = np.asarray(encoded["input_ids"], dtype="int32")
        attention_masks = np.asarray(encoded["attention_mask"], dtype="int32")

        # Targets are included for training/validation, omitted for inference.
        if self.include_targets:
            scores = np.array(self.scores[indexes], dtype="float32")
            return [input_ids, attention_masks], scores
        else:
            return [input_ids, attention_masks]
        
        
        

In [7]:
# Encoded token ids from BERT tokenizer.
# input_ids = tf.keras.layers.Input(
#     shape=(max_length,), dtype=tf.int32, name="input_ids"
# )
# # Attention masks indicates to the model which tokens should be attended to.
# attention_masks = tf.keras.layers.Input(
#     shape=(max_length,), dtype=tf.int32, name="attention_masks"
# )

# # Loading pretrained BERT model.
# bert_model = transformers.TFBertModel.from_pretrained(bert_path)
# # Freeze the BERT model to reuse the pretrained features without modifying them.
# bert_model.trainable = False ## not training bert ##

# bert_output = bert_model.bert(input_ids, attention_mask=attention_masks) # by default hidden_size = 768
# sequence_output = bert_output.last_hidden_state  # (batch_size, sequence_length, hidden_size) # ie each word representation 

# ## for the warning we are good. not using pooled_output for now, https://github.com/huggingface/transformers/issues/5421
# # pooled_output = bert_output.pooler_output # (batch_size, hidden_size) # ie whole text representational (kinda)

# # Add trainable layers on top of frozen layers to adapt the pretrained features on the new data.
# bi_lstm = tf.keras.layers.Bidirectional(
#     tf.keras.layers.LSTM(lstm_hidden_dim, return_sequences=True) 
# )(sequence_output) # (batch_size,  sequence_length, lstm_hidden_dim*2) # merge_mode="concat"

# # bi_lstm = tf.keras.layers.Bidirectional(
# #     tf.keras.layers.LSTM(lstm_hidden_dim, return_sequences=True)
# # )(bi_lstm) # (batch_size,  sequence_length, lstm_hidden_dim*2) # stacked one more BiLSTM bcos with one stack its converging slowly (kind of plateau)

# # Applying hybrid pooling approach to bi_lstm sequence output.
# avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm) # averages over sequence length # (batch_size, lstm_hidden_dim*2)
# max_pool = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm) # (batch_size, lstm_hidden_dim*2)
# concat = tf.keras.layers.concatenate([avg_pool, max_pool]) #(batch_size, lstm_hidden_dim*4)  # two (batch_size, lstm_hidden_dim*2) pools concatenated
# dropout = tf.keras.layers.Dropout(dropout_rate)(concat) #(batch_size, lstm_hidden_dim*4)

# output = tf.keras.layers.Dense(output_dim)(dropout) # 1 since our target is 1 bcos regression 


# model = tf.keras.models.Model(
#     inputs=[input_ids, attention_masks], outputs=output
# )
# model.compile(
#     optimizer=tf.keras.optimizers.Adam(),
#     loss= 'mse',# tf.keras.losses.MeanSquaredError(),
#     metrics=[tf.keras.metrics.MeanSquaredError(), tf.keras.metrics.RootMeanSquaredError()],
# )

# model.summary()

In [5]:
# BERT tokenizer for the `bert_path` checkpoint ("bert-base-uncased"), hence do_lower_case=True.
tokenizer = transformers.BertTokenizer.from_pretrained(bert_path, do_lower_case=True)

In [9]:
# resume # 21000/34681 and 3000/13681 was done => remaining = 13681 batches
# whole_df = whole_df[24000*batch_size:]

In [6]:
# Training generator over every row of `whole_df`: no validation rows are held out
# (the `tr_ind` split that would subset these columns is commented out).
train_data = CivilDataGenerator(
    texts=whole_df[text_key].values,
    scores=whole_df[target_key].values,
    tokenizer=tokenizer,
    batch_size=batch_size,
    shuffle=True,
)
# valid_data = CivilDataGenerator(
#     whole_df[text_key][val_ind].values,
#     whole_df[target_key][val_ind].values,
#     tokenizer,
#     batch_size=batch_size,
#     shuffle=False,
# )

In [11]:
# print("batch size", batch_size)
# n_examples_train = (1-test_size_percent)*whole_df.shape[0]
# print("examples in train_data", (1-test_size_percent)*whole_df.shape[0] )
# print("number of batches in training", n_examples_train//batch_size )

batch size 32
examples in train_data 1054289.0999999999
number of batches in training 32946.0


In [7]:
# Earlier ways of restoring the model, kept for reference:
# model = tf.keras.models.load_model('./bert/ckpt-loss=0.49327-epoch=1-batch=10') #model is around 479.4MB 
# model.load_weights(load_model_path)
# Restore the complete saved model (architecture + weights) from `load_model_path`;
# this is the starting point that is fine-tuned in the cells that follow.
model = tf.keras.models.load_model(load_model_path)

2022-02-03 04:17:00.051460: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-02-03 04:17:00.109700: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-02-03 04:17:00.109944: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-02-03 04:17:00.111082: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow wi

In [13]:
## fine tuning ##
# Mark layer index 2 as trainable; the print on the next line shows which layer that is
# (TFBertMainLayer in the recorded output), i.e. BERT itself is unfrozen for fine-tuning.
model.layers[2].trainable = True
print(model.layers[2], model.layers[2].trainable)
# for l in model.layers:
#     print(l, l.trainable) # BERT LAYER IS false


# NOTE(review): the model is NOT re-compiled after `trainable` is flipped. Keras documents that
# changes to `trainable` should be followed by compile(); confirm the BERT weights really
# receive updates during fit() -- TODO verify (e.g. compare a BERT weight before/after a few steps).
# dunno why compiling again is leading to OOM :/
# model.compile(
#     optimizer=tf.keras.optimizers.Adam(1e-5), # default is 0.001
#     loss= 'mse',# tf.keras.losses.MeanSquaredError(),
#     metrics=[tf.keras.metrics.MeanSquaredError(), tf.keras.metrics.RootMeanSquaredError()],
# )

# Instead of re-compiling, lower the learning rate of the loaded model's optimizer in place,
# then display the variable to confirm the new value.
K.set_value(model.optimizer.learning_rate, 1e-5)
model.optimizer.learning_rate
# K.set_value(model.optimizer.learning_rate, 1e-5)


## fine tuning ## 

<keras.saving.saved_model.load.Custom>TFBertMainLayer object at 0x7fa81d280340> True


<tf.Variable 'learning_rate:0' shape=() dtype=float32, numpy=1e-05>

In [9]:
# Batches per epoch, as reported by CivilDataGenerator.__len__.
len(train_data) # number of batches

34681

In [10]:
# TensorBoard logging into `log_dir`: metrics every `log_freq` batches, histograms every epoch.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, update_freq=log_freq)

# Checkpoint every `save_after_batches` batches, but only when the reported training `loss`
# improved on the best value seen so far (saves disk space; there is no validation loss to monitor).
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor= 'loss',#'val_loss',
    save_freq=save_after_batches, # save after x batches
#     save_weights_only=True # around 444MB # not saving much
    save_best_only=True, # to save some space
    mode="min", # for loss
)

# Keras only starts background loaders for a positive `workers`. The previous `workers=-1`
# is not a "use all cores" value: a non-positive count leaves batch loading on the main
# thread, so `use_multiprocessing=True` had no effect. Use a small pool of threads so that
# tokenization overlaps with GPU compute; threads (not processes) avoid running
# TensorFlow-dependent code in forked workers.
data_loader_workers = 4

with tf.device('/device:GPU:0'):
    # batch_size is fixed by the generator, so it is not passed to fit().
    history = model.fit(
        train_data,
        # validation_data=valid_data, 
        epochs=n_epochs,
        use_multiprocessing=False,
        workers=data_loader_workers,
        # steps_per_epoch=n_steps_per_epoch,
        callbacks=[model_checkpoint_callback, tensorboard_callback],
    )
    # Persist the fine-tuned model once training completes.
    model.save(final_train_filepath)

2022-02-03 04:17:19.586042: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2022-02-03 04:17:19.586080: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2022-02-03 04:17:19.586934: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1614] Profiler found 1 GPUs
2022-02-03 04:17:19.587227: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcupti.so.11.2'; dlerror: libcupti.so.11.2: cannot open shared object file: No such file or directory
2022-02-03 04:17:19.625414: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2022-02-03 04:17:19.625557: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1748] CUPTI activity buffer flushed
2022-02-03 04:17:19.717292: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-02-03 04:17:26.711991: I tensorflow/stream

    1/34681 [..............................] - ETA: 87:07:51 - loss: 0.1210 - mean_squared_error: 0.1210 - root_mean_squared_error: 0.3479

2022-02-03 04:17:29.071666: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2022-02-03 04:17:29.071712: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.


    2/34681 [..............................] - ETA: 15:00:51 - loss: 0.1129 - mean_squared_error: 0.1129 - root_mean_squared_error: 0.3360

2022-02-03 04:17:30.342916: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2022-02-03 04:17:30.343145: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1748] CUPTI activity buffer flushed
2022-02-03 04:17:30.365388: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:673]  GpuTracer has collected 7144 callback api events and 7178 activity events. 
2022-02-03 04:17:30.422742: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2022-02-03 04:17:30.485055: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: ./log/bert-2-bilstm-fine-tuning/train/plugins/profile/2022_02_03_04_17_30

2022-02-03 04:17:30.541983: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for trace.json.gz to ./log/bert-2-bilstm-fine-tuning/train/plugins/profile/2022_02_03_04_17_30/DESKTOP-KPOCLK7.trace.json.gz
2022-02-03 04:17:30.608203: I tensorflow/core/profiler/rpc/clie

 3000/34681 [=>............................] - ETA: 6:21:01 - loss: 0.1111 - mean_squared_error: 0.1111 - root_mean_squared_error: 0.3333

2022-02-03 04:53:40.167291: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./output/bert-2-bilstm-fine-tuning/ckpt-loss=0.11108-epoch=1-batch=3000/assets


INFO:tensorflow:Assets written to: ./output/bert-2-bilstm-fine-tuning/ckpt-loss=0.11108-epoch=1-batch=3000/assets


 6000/34681 [====>.........................] - ETA: 5:45:05 - loss: 0.1102 - mean_squared_error: 0.1102 - root_mean_squared_error: 0.3320



INFO:tensorflow:Assets written to: ./output/bert-2-bilstm-fine-tuning/ckpt-loss=0.11019-epoch=1-batch=6000/assets


INFO:tensorflow:Assets written to: ./output/bert-2-bilstm-fine-tuning/ckpt-loss=0.11019-epoch=1-batch=6000/assets






INFO:tensorflow:Assets written to: ./output/bert-2-bilstm-fine-tuning/ckpt-loss=0.11003-epoch=1-batch=18000/assets


INFO:tensorflow:Assets written to: ./output/bert-2-bilstm-fine-tuning/ckpt-loss=0.11003-epoch=1-batch=18000/assets






INFO:tensorflow:Assets written to: ./output/bert-2-bilstm-fine-tuning/ckpt-loss=0.10993-epoch=1-batch=24000/assets


INFO:tensorflow:Assets written to: ./output/bert-2-bilstm-fine-tuning/ckpt-loss=0.10993-epoch=1-batch=24000/assets






INFO:tensorflow:Assets written to: ./output/bert-2-bilstm-fine-tuning/ckpt-loss=0.10986-epoch=1-batch=27000/assets


INFO:tensorflow:Assets written to: ./output/bert-2-bilstm-fine-tuning/ckpt-loss=0.10986-epoch=1-batch=27000/assets




2022-02-03 11:12:31.026802: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 2812907520 exceeds 10% of free system memory.


INFO:tensorflow:Assets written to: ./output/bert-2-bilstm-fine-tuning/bert/final_model/assets


INFO:tensorflow:Assets written to: ./output/bert-2-bilstm-fine-tuning/bert/final_model/assets


In [12]:
# Metrics recorded by model.fit, one list entry per epoch (a single entry here since n_epochs = 1).
history.history

{'loss': [0.10993647575378418],
 'mean_squared_error': [0.10993647575378418],
 'root_mean_squared_error': [0.3315666913986206]}

In [None]:
# from tensorflow.python.client import device_lib
# print(device_lib.list_local_devices())
# tf.config.list_physical_devices('GPU')[0].name

In [None]:
# TODO: fine-tune BERT
# TODO: use TF-IDF features in the ensemble
# TODO: figure out why the ensemble is not working
# TODO: debug which validation examples the model predicts wrongly