In [20]:
# Test saving and loading models and then training them further:

import datetime
import os
import pickle
import random
import warnings

import pandas as pd
from sklearn.model_selection import train_test_split

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    from tensorflow import keras
    import tensorflow as tf
    from transformers import BertTokenizer, TFBertForSequenceClassification

from utils import local_save_dir


# Working directories. EVAL_DIR lives under the current working directory;
# ENCODING_DIR and MODEL_DIR live in the parent of the working directory.
EVAL_DIR = os.path.join(os.getcwd(), "data", "transfer_learning_evaluation")
# makedirs also creates the intermediate "data" directory and tolerates an
# already-existing target, so a fresh checkout does not fail here.
os.makedirs(EVAL_DIR, exist_ok=True)
ENCODING_DIR = os.path.join(os.path.dirname(os.getcwd()), "encodings")
MODEL_DIR = os.path.join(os.path.dirname(os.getcwd()), "models")


def get_google_drive_download_url(raw_url: str) -> str:
    """Turn a Google Drive sharing link into a direct-download URL.

    Sharing links look like ``https://drive.google.com/file/d/<id>/view?...``.
    The file id is taken as the path segment that follows ``d``, which also
    handles links that have no trailing ``/view`` part. For any other layout
    the second-to-last path segment is used.

    Args:
        raw_url: Sharing link copied from Google Drive.

    Returns:
        ``https://drive.google.com/uc?id=<id>``.
    """
    segments = raw_url.split("/")
    if "d" in segments[:-1]:
        # Strip a query string or fragment glued to the id when the id is the
        # last path segment.
        file_id = segments[segments.index("d") + 1].split("?")[0].split("#")[0]
    else:
        file_id = segments[-2]
    return "https://drive.google.com/uc?id=" + file_id


def shuffle(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of ``df`` in a fixed pseudo-random order.

    The permutation comes from a private ``random.Random(42)``, so it is the
    same on every call and the module-level ``random`` state is left
    untouched. Rows are selected by position, which keeps the row count
    intact even when the index contains duplicate labels.

    Args:
        df: Frame to shuffle; it is not modified.

    Returns:
        A new frame holding the shuffled rows with a fresh ``0..n-1`` index.
    """
    positions = list(range(len(df)))
    random.Random(42).shuffle(positions)
    return df.iloc[positions].reset_index(drop=True)


def base_model():
    """Build and compile the BERT classifier used in this notebook.

    Hyper-parameters follow the Bilal paper:
    https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2

        - model: TFBertForSequenceClassification
        - learning rate: 2e-5
        - epsilon: 1e-8

    Returns:
        The compiled ``TFBertForSequenceClassification`` instance.
    """
    # "bert-base-uncased" checkpoint with a two-label classification head:
    bert_model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

    # Nothing is frozen: the set of frozen names is empty, so every weight ends
    # up flagged as trainable.
    frozen_names = set()
    for weight in bert_model.weights:
        weight._trainable = weight.name not in frozen_names

    # Optimizer, loss and metric are built first, then handed to compile().
    adam = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
    # The loss is configured for logits (from_logits=True).
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    accuracy = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
    bert_model.compile(optimizer=adam, loss=cross_entropy, metrics=[accuracy])

    return bert_model


# Registry of every dataset used below, keyed by dataset name.
datasets = {}

print("bilal")

datasets["bilal"] = {}

bilal_train_url = "https://drive.google.com/file/d/1i54O_JSAVtvP5ivor-ARJRkwSoBFdit1/view?usp=sharing"
bilal_test_url = "https://drive.google.com/file/d/1boRdmasHB6JZDNBrlt6MRB1pUVnxxY-6/view?usp=sharing"

# Both CSVs are read with latin1 decoding.
bilal_train_val = pd.read_csv(
    get_google_drive_download_url(bilal_train_url),
    encoding="latin1",
)
bilal_test = pd.read_csv(
    get_google_drive_download_url(bilal_test_url),
    encoding="latin1",
)
# Hold out 10% of the training file for validation, as per the paper:
bilal_train, bilal_val = train_test_split(bilal_train_val, test_size=0.1, random_state=42)

# Splits first, then the names of the text and label columns.
datasets["bilal"].update(
    train=bilal_train,
    test=bilal_test,
    val=bilal_val,
    x_col="sentence",
    y_col="label",
)

print(f"> train={len(bilal_train):,}, test={len(bilal_test):,}, val={len(bilal_val):,}")


print("yelp")

datasets["yelp"] = {}

yelp_train_url = "https://drive.google.com/file/d/104W3CqRu4hUK1ht7wPfi8r8fDT7xdFCf/view?usp=sharing"
yelp_valid_url = "https://drive.google.com/file/d/1--NRor8D2x5au59_B0LCk9wOHIc8Qh46/view?usp=sharing"
yelp_test_url = "https://drive.google.com/file/d/1-3Czl0HdsMiVnnTQ4ckoAL0mcEDZGpsP/view?usp=sharing"

# Yelp ships with separate train / validation / test files, all read as UTF-8.
yelp_train = pd.read_csv(
    get_google_drive_download_url(yelp_train_url),
    encoding="utf-8",
)
yelp_val = pd.read_csv(
    get_google_drive_download_url(yelp_valid_url),
    encoding="utf-8",
)
yelp_test = pd.read_csv(
    get_google_drive_download_url(yelp_test_url),
    encoding="utf-8",
)

# Splits first, then the names of the text and label columns.
datasets["yelp"].update(
    train=yelp_train,
    test=yelp_test,
    val=yelp_val,
    x_col="text",
    y_col="label",
)

print(f"> train={len(yelp_train):,}, test={len(yelp_test):,}, val={len(yelp_val):,}")


# Make all the encodings:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize(df: pd.DataFrame, x_col: str, n_samples: "int | None" = 100, max_length: int = 320):
    """Tokenize the first ``n_samples`` texts of ``df[x_col]`` with ``bert_tokenizer``.

    Args:
        df: Frame holding the raw text.
        x_col: Name of the text column.
        n_samples: Number of leading rows to encode. The default of 100 keeps
            this test notebook fast; ``None`` encodes every row.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        The tokenizer output with TensorFlow tensors (``return_tensors="tf"``).
        The training cells read its ``input_ids``, ``token_type_ids`` and
        ``attention_mask`` attributes.
    """
    texts = list(df[x_col].iloc[:n_samples].values)
    encodings = bert_tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="tf"
    )
    return encodings


# Make the encodings and save them if not already done:
for name, values in datasets.items():
    dir_path = os.path.join(ENCODING_DIR, f"TEST_{name}")
    # makedirs (rather than mkdir) so that a missing ENCODING_DIR is created
    # too, and an existing directory is not an error.
    os.makedirs(dir_path, exist_ok=True)
    for key in ("train", "val", "test"):
        print(f"{name} - {key}")
        fp = os.path.join(dir_path, f"{key}_tokenized.obj")
        # The cache is keyed by file name only: an existing file is reused
        # as-is, whatever settings it was produced with.
        if os.path.exists(fp):
            print("> already encoded!")
            continue
        print("> encoding ... ", end="")
        x_col = values["x_col"]
        encodings = tokenize(values[key], x_col)
        with open(fp, "wb") as f:
            pickle.dump(encodings, f)
        print("finished!")


bilal
> train=7,200, test=2,000, val=800
yelp
> train=47,146, test=5,894, val=5,893
bilal - train
> encoding ... finished!
bilal - val
> encoding ... finished!
bilal - test
> encoding ... finished!
yelp - train
> encoding ... finished!
yelp - val
> encoding ... finished!
yelp - test
> encoding ... finished!


In [21]:
# Build a freshly compiled classifier with base_model(); the training cell
# below fits this instance.
model = base_model()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Load encodings:
# Each pickled split is stored next to the raw frames in `datasets`, under the
# key "<split>_tokenized".

for name, values in datasets.items():
    dir_path = os.path.join(ENCODING_DIR, f"TEST_{name}")
    for key in ("train", "val", "test"):
        encodings_name = f"{key}_tokenized"
        fp = os.path.join(dir_path, f"{encodings_name}.obj")
        if os.path.exists(fp):
            # NOTE: pickle.load runs arbitrary code from the file; only load
            # encoding files this notebook wrote itself.
            with open(fp, "rb") as f:
                encodings = pickle.load(f)
            values[encodings_name] = encodings
        else:
            # A missing file is only reported; the split is left without encodings.
            print(f"File not found (run make_encodings.py first):\n  {fp}")

In [5]:
# Train model:
# Create directory for storing checkpoints after each epoch:
checkpoint_dir = local_save_dir("checkpoints", model_name = "TEST")
checkpoint_path = checkpoint_dir + "/cp-{epoch:04d}.ckpt"

# Create a callback that saves the model's weights:
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1)

train_encodings = datasets["bilal"]["train_tokenized"]
valid_encodings = datasets["bilal"]["val_tokenized"]

# Labels come from the dataset's declared label column and are cut to the
# number of encoded rows, so inputs and labels stay aligned whatever sample cap
# the encodings were created with.
y_col = datasets["bilal"]["y_col"]
y_train = datasets["bilal"]["train"][y_col].iloc[:len(train_encodings.input_ids)]
y_val = datasets["bilal"]["val"][y_col].iloc[:len(valid_encodings.input_ids)]

# Fit the model saving weights every epoch:
history = model.fit(
    [train_encodings.input_ids, train_encodings.token_type_ids, train_encodings.attention_mask],
    y_train.values,
    validation_data=(
        [valid_encodings.input_ids, valid_encodings.token_type_ids, valid_encodings.attention_mask],
        y_val.values
    ),
    batch_size=16,
    epochs=4,
    callbacks=[cp_callback]
)


Created dir: /home/tp/models/TEST/checkpoints/2022_07_23__03_35_51
Epoch 1/4
Epoch 1: saving model to /home/tp/models/TEST/checkpoints/2022_07_23__03_35_51/cp-0001.ckpt
Epoch 2/4
Epoch 2: saving model to /home/tp/models/TEST/checkpoints/2022_07_23__03_35_51/cp-0002.ckpt
Epoch 3/4
Epoch 3: saving model to /home/tp/models/TEST/checkpoints/2022_07_23__03_35_51/cp-0003.ckpt
Epoch 4/4
Epoch 4: saving model to /home/tp/models/TEST/checkpoints/2022_07_23__03_35_51/cp-0004.ckpt


In [31]:
# Persist the weights of the model trained above (weights only, HDF5 file).
# local_save_dir returns the directory to write into; judging by the cell
# output it creates a new timestamped directory on each call.
model_dir = local_save_dir("full_model_weights", model_name = "TEST")
model.save_weights(f"{model_dir}/pretrained_weights.h5")

Created dir: /home/tp/models/TEST/full_model_weights/2022_07_23__03_51_47


In [33]:
# Directory holding the weights saved in the previous cell. Taken from
# `model_dir` instead of a hard-coded absolute path so it stays valid on other
# machines and on re-runs (each run gets a new directory).
model_weights_path = model_dir

In [37]:
def freeze_except_classifier(model):
    """Flag every weight as untrainable except those of the classifier.

    A weight counts as part of the classifier when the second ``/``-separated
    component of its name is ``"classifier"``. Names with no second component
    are treated as non-classifier weights instead of raising ``IndexError``.

    Args:
        model: Object exposing ``weights``; each weight needs a ``name`` and
            has its ``_trainable`` flag overwritten in place.

    Returns:
        The same ``model`` object, for call chaining.
    """
    trainable, untrainable = 0, 0
    for w in model.weights:
        name_parts = w.name.split("/")
        is_classifier = len(name_parts) > 1 and name_parts[1] == "classifier"
        w._trainable = is_classifier
        if is_classifier:
            trainable += 1
        else:
            untrainable += 1
    # The counters tally individual weights, although the message says "layers".
    print(f"Model now has {trainable} trainable layers, {untrainable} untrainable.")
    return model


# Rebuild the architecture, restore the weights saved after the first training
# run, then freeze everything except the classifier weights.
loaded_model = base_model()
# `model_dir` must still point at the directory that holds pretrained_weights.h5.
loaded_model.load_weights(f"{model_dir}/pretrained_weights.h5")
# Disabled alternative: loaded_model = keras.models.load_model(model_path)
# NOTE(review): the trainable flags are changed after base_model() has already
# compiled the model and compile() is not called again — confirm the freeze
# actually takes effect during fit().
loaded_model = freeze_except_classifier(loaded_model)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model now has 2 trainable layers, 199 untrainable.


In [38]:
# Point the training inputs at the Yelp encodings for the fine-tuning run.
# Only the training encodings are rebound in this cell.
train_encodings = datasets["yelp"]["train_tokenized"]

In [39]:
dataset_name = "yelp"

# Derive the run name from dataset_name so the two cannot drift apart.
finetune_name = f"TEST_CLASSIFIER_FINETUNED_ON_{dataset_name}"

# Create directory for storing checkpoints after each epoch:
checkpoint_dir = local_save_dir("checkpoints", model_name = finetune_name)
checkpoint_path = checkpoint_dir + "/cp-{epoch:04d}.ckpt"

# Create a callback that saves the model's weights:
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1)

# Bind inputs AND labels from the same dataset in this cell. Previously
# `valid_encodings` was whatever an earlier cell had left behind, so the
# validation inputs came from a different dataset than the validation labels.
train_encodings = datasets[dataset_name]["train_tokenized"]
valid_encodings = datasets[dataset_name]["val_tokenized"]

y_col = datasets[dataset_name]["y_col"]
# Cut the labels to the number of encoded rows so inputs and labels line up.
y_train = datasets[dataset_name]["train"][y_col].iloc[:len(train_encodings.input_ids)]
y_val = datasets[dataset_name]["val"][y_col].iloc[:len(valid_encodings.input_ids)]

# Fit the model saving weights every epoch:
history = loaded_model.fit(
    [train_encodings.input_ids, train_encodings.token_type_ids, train_encodings.attention_mask],
    y_train.values,
    validation_data=(
        [valid_encodings.input_ids, valid_encodings.token_type_ids, valid_encodings.attention_mask],
        y_val.values
    ),
    batch_size=16,
    epochs=4,
    callbacks=[cp_callback]
)

print("Saving model ...")
model_dir = local_save_dir("full_model", model_name = finetune_name)
# Write into the directory just created instead of the current working directory.
loaded_model.save_weights(os.path.join(model_dir, "pretrained_weights.h5"))


print("Saving history ...")
hist_dir = local_save_dir("history", model_name = finetune_name)
with open(os.path.join(hist_dir, "hist_dict"), "wb") as f:
    pickle.dump(history.history, f)

print("Finished!")


Created dir: /home/tp/models/TEST_CLASSIFIER_FINETUNED_ON_yelp/checkpoints/2022_07_23__03_54_05
Epoch 1/4








Epoch 1: saving model to /home/tp/models/TEST_CLASSIFIER_FINETUNED_ON_yelp/checkpoints/2022_07_23__03_54_05/cp-0001.ckpt
Epoch 2/4
Epoch 2: saving model to /home/tp/models/TEST_CLASSIFIER_FINETUNED_ON_yelp/checkpoints/2022_07_23__03_54_05/cp-0002.ckpt
Epoch 3/4
Epoch 3: saving model to /home/tp/models/TEST_CLASSIFIER_FINETUNED_ON_yelp/checkpoints/2022_07_23__03_54_05/cp-0003.ckpt
Epoch 4/4
Epoch 4: saving model to /home/tp/models/TEST_CLASSIFIER_FINETUNED_ON_yelp/checkpoints/2022_07_23__03_54_05/cp-0004.ckpt
Saving model ...
Created dir: /home/tp/models/TEST_CLASSIFIER_FINETUNED_ON_yelp/full_model/2022_07_23__03_54_34
Saving history ...
Created dir: /home/tp/models/TEST_CLASSIFIER_FINETUNED_ON_yelp/history/2022_07_23__03_54_35
Finished!
