In [1]:
!pip install transformers --quiet

[K     |████████████████████████████████| 4.4 MB 5.3 MB/s 
[K     |████████████████████████████████| 596 kB 66.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 50.2 MB/s 
[K     |████████████████████████████████| 101 kB 10.3 MB/s 
[?25h

In [2]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from transformers import BertTokenizer, TFBertForSequenceClassification 

## Create train and test data

In [3]:
# Using the data downloaded from here: https://sites.google.com/view/review-helpfulness-prediction/datasets
# Saved publicly in Google Drive.

def get_google_drive_download_url(raw_url: str) -> str:
    """Convert a Google Drive share link into a direct-download URL.

    Args:
        raw_url: A share link such as
            ``https://drive.google.com/file/d/<FILE_ID>/view?usp=sharing``.

    Returns:
        ``https://drive.google.com/uc?id=<FILE_ID>``.

    The file ID is read from the path segment that follows ``/d/``, so the link
    does not have to end in ``/view...``. Links without a ``/d/`` segment fall
    back to the second-to-last ``/``-separated component.
    """
    marker = "/d/"
    if marker in raw_url:
        # Keep only the segment right after "/d/", dropping any trailing path or query string.
        after_marker = raw_url.split(marker, 1)[1]
        file_id = after_marker.split("/", 1)[0].split("?", 1)[0]
    else:
        file_id = raw_url.split("/")[-2]
    return "https://drive.google.com/uc?id=" + file_id


# Public share links for the two CSV files.
train_url = "https://drive.google.com/file/d/1i54O_JSAVtvP5ivor-ARJRkwSoBFdit1/view?usp=sharing"
test_url = "https://drive.google.com/file/d/1boRdmasHB6JZDNBrlt6MRB1pUVnxxY-6/view?usp=sharing"

# Download train first, then test; both files are decoded as latin1.
bilal_train, bilal_test = (
    pd.read_csv(get_google_drive_download_url(share_link), encoding="latin1")
    for share_link in (train_url, test_url)
)

In [4]:
# Preview the last rows of the training frame to check the columns that were loaded.
bilal_train.tail()

Unnamed: 0,id,sentence,label
7995,89260,Easy access off the 101 lots of parking in the...,0
7996,62116,Meh. I went in for some accessories and a part...,0
7997,11115,Worst customer service ever. I called the stor...,0
7998,11885,I had my Canon Rebel T1i repaired after I drop...,0
7999,53295,Great store a little short on boys youth sizes...,0


In [5]:
# Preview the last rows of the test frame to check the columns that were loaded.
bilal_test.tail()

Unnamed: 0,id,sentence,label
1995,63354,Big sale this week. All sort of little gadets ...,0
1996,45423,The new owner and management are great. I didn...,0
1997,12024,Came here to check out their Patio Furniture. ...,0
1998,89218,I brought in a flash drive with a 3-page docum...,0
1999,45672,Super helpful. Taught me exactly how to gel st...,0


In [6]:
# The mean of the label column is reported as the class balance of each split.
train_balance = bilal_train["label"].mean()
test_balance = bilal_test["label"].mean()

print("Train class balance: ", train_balance)
print("Test class balance: ", test_balance)

Train class balance:  0.5
Test class balance:  0.5


In [7]:
# Review text is the model input; the label column is the target.
x_train_full, y_train_full = bilal_train["sentence"], bilal_train["label"]
x_test, y_test = bilal_test["sentence"], bilal_test["label"]

In [8]:
# Split train into 90-10 split for train-validation as per the paper.
# A fixed random_state keeps the split (and every metric computed downstream)
# repeatable across kernel restarts.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_full,
    y_train_full,
    test_size=0.1,
    random_state=42,
)

print(f"Shape X_train: {x_train.shape}")
print(f"Shape X_valid: {x_val.shape}")
print(f"Shape y_train: {y_train.shape}")
print(f"Shape y_val: {y_val.shape}")

Shape X_train: (7200,)
Shape X_valid: (800,)
Shape y_train: (7200,)
Shape y_val: (800,)


## Tokenize inputs

In [9]:
# Using BERT base uncased tokenizer as per the paper:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Use sequence length 320, which achieved best accuracy and F1-score of all sequence lengths tried in the paper:
# https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/4
max_length = 320


def encode_sentences(sentences):
    """Tokenize one data split into fixed-length TensorFlow tensors.

    Args:
        sentences: Object exposing ``.values`` (here, the pandas Series of review
            texts for one split).

    Returns:
        The tokenizer output for the split, with every sentence truncated or
        padded to ``max_length`` tokens so all splits share one input shape.
    """
    return bert_tokenizer(
        list(sentences.values),
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='tf'
    )


# One shared helper guarantees train, validation and test are encoded identically.
train_encodings = encode_sentences(x_train)
valid_encodings = encode_sentences(x_val)
test_encodings = encode_sentences(x_test)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Model 1: Baseline model with unfrozen classification layers

In [None]:
def bilal_bert_model(learning_rate=2e-5, epsilon=1e-8):
    """Create a BERT model using the model and parameters specified in the Bilal paper:
    https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2

        - model: TFBertForSequenceClassification
        - learning rate: 2e-5
        - epsilon: 1e-8

    The embeddings and the transformer encoder are frozen; the pooler and the
    classification head (the last 4 weights of the model) stay trainable.

    Args:
        learning_rate: Adam learning rate. Defaults to the paper's 2e-5.
        epsilon: Adam epsilon. Defaults to the paper's 1e-8.

    Returns:
        The compiled model, ready for ``fit``.
    """
    # Using the TFBertForSequenceClassification as specified in the paper:
    bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Freeze through the public Keras `trainable` flag on the sub-layers instead of
    # writing the private `_trainable` attribute of each variable, so the layer
    # bookkeeping (and therefore model.summary()) reflects what is actually frozen.
    bert_model.bert.embeddings.trainable = False
    bert_model.bert.encoder.trainable = False

    # Compile after freezing so the training step is built with the final set of
    # trainable weights.
    bert_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy("accuracy")]
    )

    return bert_model

In [None]:
# Build the baseline model and print its layer / parameter summary.
model = bilal_bert_model()
model.summary()

In [None]:
# Train the model using the specifications from the paper: https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2
# -- epochs = 4
# -- batch_size = 32

# The three tokenizer outputs are passed to the model as one positional list, in this order.
train_inputs = [
    train_encodings.input_ids,
    train_encodings.token_type_ids,
    train_encodings.attention_mask,
]
valid_inputs = [
    valid_encodings.input_ids,
    valid_encodings.token_type_ids,
    valid_encodings.attention_mask,
]

history = model.fit(
    train_inputs,
    y_train.values,
    validation_data=(valid_inputs, y_val.values),
    batch_size=32,
    epochs=4,
)

In [None]:
# Evaluate on the held-out test split; `score` holds the loss followed by the compiled metric.
test_inputs = [
    test_encodings.input_ids,
    test_encodings.token_type_ids,
    test_encodings.attention_mask,
]
score = model.evaluate(test_inputs, y_test)
test_loss, test_accuracy = score[0], score[1]

print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

In [None]:
test_inputs = [
    test_encodings.input_ids,
    test_encodings.token_type_ids,
    test_encodings.attention_mask,
]
predictions = model.predict(test_inputs)

# The first element of the output tuple is presumably the per-class logits;
# the predicted class of each row is the index of its largest value.
logits = predictions.to_tuple()[0]
preds = logits.argmax(1)
print(classification_report(y_test, preds))

## Model 2: removing dropout

The model seems to overfit less than the one reported in the paper. Since the paper doesn't mention dropout, we train the same model again with dropout disabled.

In [None]:
def bilal_bert_model_no_dropout(learning_rate=2e-5, epsilon=1e-8):
    """Create a BERT model using the model and parameters specified in the Bilal paper:
    https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2

        - model: TFBertForSequenceClassification
        - learning rate: 2e-5
        - epsilon: 1e-8

    This variant loads the model with ``hidden_dropout_prob=0`` to see if it better
    matches the paper's results. The embeddings and the transformer encoder are
    frozen; the pooler and the classification head stay trainable.

    NOTE(review): per the BertConfig documentation, ``hidden_dropout_prob`` covers
    the fully connected layers of the embeddings, encoder and pooler and is the
    fallback for the classification head's dropout, so this turns off more than
    the head's dropout layer. Attention dropout is a separate setting and is not
    changed here. Confirm this matches the intended experiment.

    Args:
        learning_rate: Adam learning rate. Defaults to the paper's 2e-5.
        epsilon: Adam epsilon. Defaults to the paper's 1e-8.

    Returns:
        The compiled model, ready for ``fit``.
    """
    # Using the TFBertForSequenceClassification as specified in the paper:
    bert_model = TFBertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=2, hidden_dropout_prob=0
    )

    # Freeze through the public Keras `trainable` flag on the sub-layers instead of
    # writing the private `_trainable` attribute of each variable, so the layer
    # bookkeeping (and therefore model.summary()) reflects what is actually frozen.
    bert_model.bert.embeddings.trainable = False
    bert_model.bert.encoder.trainable = False

    # Compile after freezing so the training step is built with the final set of
    # trainable weights.
    bert_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy("accuracy")]
    )

    return bert_model

In [None]:
# Build the no-dropout variant and print its layer / parameter summary.
model2 = bilal_bert_model_no_dropout()
model2.summary()

In [None]:
# Train the model using the specifications from the paper: https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2
# -- epochs = 4
# -- batch_size = 32

# The three tokenizer outputs are passed to the model as one positional list, in this order.
train_inputs = [
    train_encodings.input_ids,
    train_encodings.token_type_ids,
    train_encodings.attention_mask,
]
valid_inputs = [
    valid_encodings.input_ids,
    valid_encodings.token_type_ids,
    valid_encodings.attention_mask,
]

history2 = model2.fit(
    train_inputs,
    y_train.values,
    validation_data=(valid_inputs, y_val.values),
    batch_size=32,
    epochs=4,
)

## Model 3: Don't freeze any layers

In [11]:
def bilal_bert_model_no_freeze(learning_rate=2e-5, epsilon=1e-8):
    """Create a BERT model using the model and parameters specified in the Bilal paper:
    https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2

        - model: TFBertForSequenceClassification
        - learning rate: 2e-5
        - epsilon: 1e-8

    This time no layer is frozen, to see if full fine-tuning better matches the
    paper's results.

    Args:
        learning_rate: Adam learning rate. Defaults to the paper's 2e-5.
        epsilon: Adam epsilon. Defaults to the paper's 1e-8.

    Returns:
        The compiled model, ready for ``fit``.
    """
    # Using the TFBertForSequenceClassification as specified in the paper:
    bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Make "train everything" explicit through the public Keras flag; this replaces
    # the per-variable loop that wrote the private `_trainable` attribute (its
    # "untrainable" branch could never run because that list was empty).
    bert_model.trainable = True

    # Compile the model:
    bert_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy("accuracy")]
    )

    return bert_model

In [12]:
# Build the fully trainable variant and print its layer / parameter summary.
model3 = bilal_bert_model_no_freeze()
model3.summary()

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Train the model using the specifications from the paper: https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2
# -- epochs = 4
# -- batch_size = 32

# The three tokenizer outputs are passed to the model as one positional list, in this order.
train_inputs = [
    train_encodings.input_ids,
    train_encodings.token_type_ids,
    train_encodings.attention_mask,
]
valid_inputs = [
    valid_encodings.input_ids,
    valid_encodings.token_type_ids,
    valid_encodings.attention_mask,
]

history3 = model3.fit(
    train_inputs,
    y_train.values,
    validation_data=(valid_inputs, y_val.values),
    batch_size=32,
    epochs=4,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
# Evaluate on the held-out test split; `score` holds the loss followed by the compiled metric.
test_inputs = [
    test_encodings.input_ids,
    test_encodings.token_type_ids,
    test_encodings.attention_mask,
]
score = model3.evaluate(test_inputs, y_test)
test_loss, test_accuracy = score[0], score[1]

print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

In [None]:
test_inputs = [
    test_encodings.input_ids,
    test_encodings.token_type_ids,
    test_encodings.attention_mask,
]
predictions = model3.predict(test_inputs)

# The first element of the output tuple is presumably the per-class logits;
# the predicted class of each row is the index of its largest value.
logits = predictions.to_tuple()[0]
preds = logits.argmax(1)
print(classification_report(y_test, preds))

In [None]:
# Persist model 3, including optimizer state, to a path relative to the working directory.
# The save options are collected in one place so they are easy to inspect and adjust.
save_options = dict(
    overwrite=True,
    include_optimizer=True,
    save_format=None,
    signatures=None,
    options=None,
    save_traces=True,
)
model3.save("./bilal_model3", **save_options)


In [None]:
# Show the kernel's working directory, which is what the relative path "./bilal_model3" resolves against.
%pwd

In [None]:
# List the working directory, presumably to confirm the saved model is present.
%ls