<a href="https://colab.research.google.com/github/toby-p/w266-final-project/blob/main/Bilal_et_al_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers --quiet

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from transformers import BertTokenizer, TFBertForSequenceClassification 

In [None]:
# Code for helping save models to GDrive after training:

import datetime
import os

from google.colab import drive

# Mount Google Drive:
drive.mount("/content/gdrive")

# Directory where models will be stored in GDrive:
MODEL_DIR = "/content/gdrive/MyDrive/models"

# Make the directories for storing results if they don't exist yet:
if not os.path.exists(MODEL_DIR):
    os.mkdir(MODEL_DIR)


def gdrive_save_dir(*subdir: str, model_name: str = "test_model"): 
    """Create timestamped directory in GDrive for storing checkpoints or models.
    
    Args:
        subdir: optional subdirectories of the main model directory
            (e.g. `checkpoints`, `final_model`, etc.)
        model_name: main name for directory specifying the model being saved.
    """
    model_dir = f"{MODEL_DIR}/{model_name}"
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    for s in subdir:
        model_dir = f"{model_dir}/{s}"
        if not os.path.exists(model_dir):
            os.mkdir(model_dir)
    now = datetime.datetime.now()
    now_str = now.strftime("%Y_%m_%d__%H_%M_%S")
    dir_path = f"{model_dir}/{now_str}"
    os.mkdir(dir_path)
    print(f"Created checkpoint dir: {dir_path}")
    return dir_path


gdrive_save_dir("checkpoints", model_name = "test_model")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Created checkpoint dir: /content/gdrive/MyDrive/models/test_model/checkpoints/2022_07_17__23_02_17


'/content/gdrive/MyDrive/models/test_model/checkpoints/2022_07_17__23_02_17'

## Create train and test data

In [None]:
# Using the data downloaded from here: https://sites.google.com/view/review-helpfulness-prediction/datasets
# Saved publicly in Google Drive.

def get_google_drive_download_url(raw_url: str):
    return "https://drive.google.com/uc?id=" + raw_url.split("/")[-2]


train_url = "https://drive.google.com/file/d/1i54O_JSAVtvP5ivor-ARJRkwSoBFdit1/view?usp=sharing"
test_url = "https://drive.google.com/file/d/1boRdmasHB6JZDNBrlt6MRB1pUVnxxY-6/view?usp=sharing"

bilal_train = pd.read_csv(get_google_drive_download_url(train_url), encoding="latin1")
bilal_test = pd.read_csv(get_google_drive_download_url(test_url), encoding="latin1")

In [None]:
bilal_train.tail()

Unnamed: 0,id,sentence,label
7995,89260,Easy access off the 101 lots of parking in the...,0
7996,62116,Meh. I went in for some accessories and a part...,0
7997,11115,Worst customer service ever. I called the stor...,0
7998,11885,I had my Canon Rebel T1i repaired after I drop...,0
7999,53295,Great store a little short on boys youth sizes...,0


In [None]:
bilal_test.tail()

Unnamed: 0,id,sentence,label
1995,63354,Big sale this week. All sort of little gadets ...,0
1996,45423,The new owner and management are great. I didn...,0
1997,12024,Came here to check out their Patio Furniture. ...,0
1998,89218,I brought in a flash drive with a 3-page docum...,0
1999,45672,Super helpful. Taught me exactly how to gel st...,0


In [None]:
# See if the classes are even:
print("Train class balance: ", bilal_train["label"].mean())
print("Test class balance: ", bilal_test["label"].mean())

Train class balance:  0.5
Test class balance:  0.5


In [None]:
x_train_full = bilal_train["sentence"]
y_train_full = bilal_train["label"]
x_test = bilal_test["sentence"]
y_test = bilal_test["label"]

In [None]:
# Split train into 90-10 split for train-validation as per the paper:
x_train, x_val, y_train, y_val = train_test_split(x_train_full, y_train_full, test_size=0.1)

print(f"Shape X_train: {x_train.shape}")
print(f"Shape X_valid: {x_val.shape}")
print(f"Shape y_train: {y_train.shape}")
print(f"Shape y_val: {y_val.shape}")

Shape X_train: (7200,)
Shape X_valid: (800,)
Shape y_train: (7200,)
Shape y_val: (800,)


## Tokenize inputs

In [None]:
# Using BERT base uncased tokenizer as per the paper:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Use sequence length 320, which achieved best accuracy and F1-score of all sequence lengths tried in the paper:
# https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/4
max_length = 320

train_encodings = bert_tokenizer(
    list(x_train.values), 
    max_length=max_length,
    truncation=True,
    padding='max_length', 
    return_tensors='tf'
)

valid_encodings = bert_tokenizer(
    list(x_val.values), 
    max_length=max_length,
    truncation=True,
    padding='max_length', 
    return_tensors='tf'
)

test_encodings = bert_tokenizer(
    list(x_test.values), 
    max_length=max_length,
    truncation=True,
    padding='max_length', 
    return_tensors='tf'
)

## Model 1: Baseline model with unfrozen classification layers

In [None]:

MODEL_NAME = "bilal_classifier_layers_only"


def bilal_classifier_layers_only():
    """Create a BERT model using the model and parameters specified in the Bilal paper:
    https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2

        - model: TFBertForSequenceClassification
        - learning rate: 2e-5
        - epsilon: 1e-8
    """
    # Using the TFBertForSequenceClassification as specified in the paper:
    bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Freeze all layers except the last 4 which are the pooled classification layers:
    untrainable = [w.name for w in bert_model.weights[:-4]]
    trainable = [w.name for w in bert_model.weights[-4:]]

    for w in bert_model.weights:
        if w.name in untrainable:
            w._trainable = False
        elif w.name in trainable:
            w._trainable = True

    # Compile the model:
    bert_model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-08),
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
        metrics = [tf.keras.metrics.SparseCategoricalAccuracy("accuracy")]
    )

    return bert_model


model = bilal_classifier_layers_only()
print(model.summary())

# Train the model using the specifications from the paper: https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2
# -- epochs = 4
# -- batch_size = 32

# Create directory for storing checkpoints after each epoch:
checkpoint_dir = gdrive_save_dir("checkpoints", model_name = MODEL_NAME)
checkpoint_path = checkpoint_dir + "/cp-{epoch:04d}.ckpt"

# Create a callback that saves the model's weights:
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1)

# Fit the model saving weights every epoch:
history = model.fit(
    [train_encodings.input_ids, train_encodings.token_type_ids, train_encodings.attention_mask], 
    y_train.values,
    validation_data=(
        [valid_encodings.input_ids, valid_encodings.token_type_ids, valid_encodings.attention_mask], 
        y_val.values
        ),
    batch_size=32, 
    epochs=4,
    callbacks=[cp_callback]
)

# Save the entire model to GDrive:
model_dir = gdrive_save_dir("full_model", model_name = MODEL_NAME)
model.save(model_dir)

# Save scores on the test set:
test_score = model.evaluate([test_encodings.input_ids, test_encodings.token_type_ids, test_encodings.attention_mask], y_test)
print("Test loss:", test_score[0])
print("Test accuracy:", test_score[1])
score_fp = os.path.join(model_dir, "test_score.txt")
with open(score_fp, "w") as f:
    f.write(f"Test loss = {test_score[0]}\n")
    f.write(f"Test accuracy = {test_score[1]}\n")

# Save predictions and classification_report:
predictions = model.predict([test_encodings.input_ids, test_encodings.token_type_ids, test_encodings.attention_mask])
preds_fp = os.path.join(model_dir, "test_predictions.csv")
pred_df = pd.DataFrame(predictions.to_tuple()[0], columns=["pred_prob_0", "pred_prob_1"])
pred_df["yhat"] = pred_df[["pred_prob_0", "pred_prob_1"]].values.argmax(1)
pred_df["y"] = y_test
pred_df["category"] = np.where((pred_df["yhat"] == 1) & (pred_df["y"] == 1), "tp", None)
pred_df["category"] = np.where((pred_df["yhat"] == 0) & (pred_df["y"] == 0), "tn", pred_df["category"])
pred_df["category"] = np.where((pred_df["yhat"] == 1) & (pred_df["y"] == 0), "fp", pred_df["category"])
pred_df["category"] = np.where((pred_df["yhat"] == 0) & (pred_df["y"] == 1), "fn", pred_df["category"])
pred_df.to_csv(preds_fp, encoding="utf-8", index=False)
report = classification_report(y_test, pred_df["yhat"])
report_fp = os.path.join(model_dir, "classification_report.txt")
with open(report_fp, "w") as f:
    for line in report.split("\n"):
        f.write(f"{line}\n")
print(f"{MODEL_NAME} - test set results")
print(report)

## Model 2: removing dropout

Model seems to be better at not overfitting than in the paper. Since the paper didn't mention dropout, we try the same model again but with no dropout layer.

In [None]:
def bilal_bert_model_no_dropout():
    """Create a BERT model using the model and parameters specified in the Bilal paper:
    https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2

        - model: TFBertForSequenceClassification
        - learning rate: 2e-5
        - epsilon: 1e-8

    This time removes dropout from classification layer to see if it better matches paper's results.
    """
    # Using the TFBertForSequenceClassification as specified in the paper:
    bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, hidden_dropout_prob=0)

    # Freeze all layers except the last 4 which are the pooled classification layers:
    untrainable = [w.name for w in bert_model.weights[:-4]]
    trainable = [w.name for w in bert_model.weights[-4:]]

    for w in bert_model.weights:
        if w.name in untrainable:
            w._trainable = False
        elif w.name in trainable:
            w._trainable = True

    # Compile the model:
    bert_model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-08),
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
        metrics = [tf.keras.metrics.SparseCategoricalAccuracy("accuracy")]
    )

    return bert_model

In [None]:
model2 = bilal_bert_model_no_dropout()
model2.summary()

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the model using the specifications from the paper: https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2
# -- epochs = 4
# -- batch_size = 32

history2 = model2.fit(
    [train_encodings.input_ids, train_encodings.token_type_ids, train_encodings.attention_mask], 
    y_train.values,
    validation_data=(
        [valid_encodings.input_ids, valid_encodings.token_type_ids, valid_encodings.attention_mask], 
        y_val.values
        ),
    batch_size=32, 
    epochs=4
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
score = model2.evaluate([test_encodings.input_ids, test_encodings.token_type_ids, test_encodings.attention_mask], y_test)

print("Test loss:", score[0])
print("Test accuracy:", score[1])

Test loss: 0.5944662690162659
Test accuracy: 0.7014999985694885


In [None]:
predictions = model2.predict([test_encodings.input_ids, test_encodings.token_type_ids, test_encodings.attention_mask])
preds = predictions.to_tuple()[0].argmax(1)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.71      0.67      0.69      1000
           1       0.69      0.73      0.71      1000

    accuracy                           0.70      2000
   macro avg       0.70      0.70      0.70      2000
weighted avg       0.70      0.70      0.70      2000



## Model 3: Full finetuning (don't freeze any layers)

In [11]:

MODEL_NAME = "bilal_full_finetune"


def bilal_full_finetune():
    """Create a BERT model using the model and parameters specified in the Bilal paper:
    https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2

        - model: TFBertForSequenceClassification
        - learning rate: 2e-5
        - epsilon: 1e-8
    """
    # Using the TFBertForSequenceClassification as specified in the paper:
    bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Don't freeze any layers:
    untrainable = []
    trainable = [w.name for w in bert_model.weights]

    for w in bert_model.weights:
        if w.name in untrainable:
            w._trainable = False
        elif w.name in trainable:
            w._trainable = True

    # Compile the model:
    bert_model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-08),
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
        metrics = [tf.keras.metrics.SparseCategoricalAccuracy("accuracy")]
    )

    return bert_model


model = bilal_full_finetune()
print(model.summary())

# Train the model using the specifications from the paper: https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2
# -- epochs = 4
# -- batch_size = 32

# Create directory for storing checkpoints after each epoch:
checkpoint_dir = gdrive_save_dir("checkpoints", model_name = MODEL_NAME)
checkpoint_path = checkpoint_dir + "/cp-{epoch:04d}.ckpt"

# Create a callback that saves the model's weights:
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1)

# Fit the model saving weights every epoch:
history = model.fit(
    [train_encodings.input_ids, train_encodings.token_type_ids, train_encodings.attention_mask], 
    y_train.values,
    validation_data=(
        [valid_encodings.input_ids, valid_encodings.token_type_ids, valid_encodings.attention_mask], 
        y_val.values
        ),
    batch_size=32, 
    epochs=4,
    callbacks=[cp_callback]
)

# Save the entire model to GDrive:
model_dir = gdrive_save_dir("full_model", model_name = MODEL_NAME)
model.save(model_dir)

# Save scores on the test set:
test_score = model.evaluate([test_encodings.input_ids, test_encodings.token_type_ids, test_encodings.attention_mask], y_test)
print("Test loss:", test_score[0])
print("Test accuracy:", test_score[1])
score_fp = os.path.join(model_dir, "test_score.txt")
with open(score_fp, "w") as f:
    f.write(f"Test loss = {test_score[0]}\n")
    f.write(f"Test accuracy = {test_score[1]}\n")

# Save predictions and classification_report:
predictions = model.predict([test_encodings.input_ids, test_encodings.token_type_ids, test_encodings.attention_mask])
preds_fp = os.path.join(model_dir, "test_predictions.csv")
pred_df = pd.DataFrame(predictions.to_tuple()[0], columns=["pred_prob_0", "pred_prob_1"])
pred_df["yhat"] = pred_df[["pred_prob_0", "pred_prob_1"]].values.argmax(1)
pred_df["y"] = y_test
pred_df["category"] = np.where((pred_df["yhat"] == 1) & (pred_df["y"] == 1), "tp", None)
pred_df["category"] = np.where((pred_df["yhat"] == 0) & (pred_df["y"] == 0), "tn", pred_df["category"])
pred_df["category"] = np.where((pred_df["yhat"] == 1) & (pred_df["y"] == 0), "fp", pred_df["category"])
pred_df["category"] = np.where((pred_df["yhat"] == 0) & (pred_df["y"] == 1), "fn", pred_df["category"])
pred_df.to_csv(preds_fp, encoding="utf-8", index=False)
report = classification_report(y_test, pred_df["yhat"])
report_fp = os.path.join(model_dir, "classification_report.txt")
with open(report_fp, "w") as f:
    for line in report.split("\n"):
        f.write(f"{line}\n")
print(f"{MODEL_NAME} - test set results")
print(report)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
None
Created checkpoint dir: /content/gdrive/MyDrive/models/bilal_full_finetune/checkpoints/2022_07_17__23_02_59
Epoch 1/4
Epoch 1: saving model to /content/gdrive/MyDrive/models/bilal_full_finetune/checkpoints/2022_07_17__23_02_59/cp-0001.ckpt
Epoch 2/4
Epoch 2: saving model to /



INFO:tensorflow:Assets written to: /content/gdrive/MyDrive/models/bilal_full_finetune/full_model/2022_07_18__04_05_30/assets


INFO:tensorflow:Assets written to: /content/gdrive/MyDrive/models/bilal_full_finetune/full_model/2022_07_18__04_05_30/assets


Test loss: 0.5919166803359985
Test accuracy: 0.6955000162124634
bilal_full_finetune - test set results
              precision    recall  f1-score   support

           0       0.69      0.71      0.70      1000
           1       0.70      0.68      0.69      1000

    accuracy                           0.70      2000
   macro avg       0.70      0.70      0.70      2000
weighted avg       0.70      0.70      0.70      2000

