<a href="https://colab.research.google.com/github/toby-p/w266-final-project/blob/main/Amazon_product_data_transfer_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers --quiet

[K     |████████████████████████████████| 4.4 MB 5.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 69.6 MB/s 
[K     |████████████████████████████████| 596 kB 78.2 MB/s 
[K     |████████████████████████████████| 101 kB 10.5 MB/s 
[?25h

In [None]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from transformers import BertTokenizer, TFBertForSequenceClassification 

In [None]:
# Code for helping save models to GDrive after training:

import datetime
import os

from google.colab import drive

# Mount Google Drive:
drive.mount("/content/gdrive")

# Directory where models will be stored in GDrive:
MODEL_DIR = "/content/gdrive/MyDrive/models"

# Create the storage directory together with any missing parents; this is a
# no-op when the directory already exists, so the cell is safe to re-run:
os.makedirs(MODEL_DIR, exist_ok=True)


def gdrive_save_dir(*subdir: str, model_name: str = "test_model") -> str:
    """Create timestamped directory in GDrive for storing checkpoints or models.

    The new directory is
    `MODEL_DIR/<model_name>/<subdir...>/<YYYY_MM_DD__HH_MM_SS>`. Every missing
    parent directory (including `MODEL_DIR` itself) is created first.

    Args:
        subdir: optional subdirectories of the main model directory
            (e.g. `checkpoints`, `final_model`, etc.)
        model_name: main name for directory specifying the model being saved.

    Returns:
        Path of the newly created timestamped directory.

    Raises:
        FileExistsError: if the timestamped directory already exists, e.g. when
            called twice with the same arguments within the same second.
    """
    # Paths are joined with "/" so the returned string has the same form that
    # callers extend with "/..." suffixes.
    parent_dir = "/".join([MODEL_DIR, model_name, *subdir])
    os.makedirs(parent_dir, exist_ok=True)

    now_str = datetime.datetime.now().strftime("%Y_%m_%d__%H_%M_%S")
    dir_path = f"{parent_dir}/{now_str}"
    # Plain mkdir on purpose: an existing timestamped directory is an error
    # rather than something to silently reuse.
    os.mkdir(dir_path)
    print(f"Created checkpoint dir: {dir_path}")
    return dir_path


# Smoke test of the helper: creates a timestamped directory under
# test_model/checkpoints in MODEL_DIR and displays the returned path.
gdrive_save_dir("checkpoints", model_name = "test_model")

Mounted at /content/gdrive
Created checkpoint dir: /content/gdrive/MyDrive/models/test_model/checkpoints/2022_07_17__20_21_56


'/content/gdrive/MyDrive/models/test_model/checkpoints/2022_07_17__20_21_56'

## Create train and test data

In [None]:
# The train/test/validation CSVs were produced in a separate notebook and are
# fetched here from the project's GitHub repository:
train_url = "https://raw.githubusercontent.com/toby-p/w266-final-project/main/data/amazon/train.csv"
test_url = "https://raw.githubusercontent.com/toby-p/w266-final-project/main/data/amazon/test.csv"
val_url = "https://raw.githubusercontent.com/toby-p/w266-final-project/main/data/amazon/val.csv"

# Read the three splits in order (train, test, val), decoding each as latin1:
amazon_train, amazon_test, amazon_val = (
    pd.read_csv(split_url, encoding="latin1")
    for split_url in (train_url, test_url, val_url)
)

In [None]:
# Preview the last rows of the training split.
amazon_train.tail()

Unnamed: 0,id,sentence,label
7995,89260,Easy access off the 101 lots of parking in the...,0
7996,62116,Meh. I went in for some accessories and a part...,0
7997,11115,Worst customer service ever. I called the stor...,0
7998,11885,I had my Canon Rebel T1i repaired after I drop...,0
7999,53295,Great store a little short on boys youth sizes...,0


In [None]:
# Preview the last rows of the test split.
amazon_test.tail()

Unnamed: 0,id,sentence,label
1995,63354,Big sale this week. All sort of little gadets ...,0
1996,45423,The new owner and management are great. I didn...,0
1997,12024,Came here to check out their Patio Furniture. ...,0
1998,89218,I brought in a flash drive with a 3-page docum...,0
1999,45672,Super helpful. Taught me exactly how to gel st...,0


In [None]:
# Preview the last rows of the validation split.
amazon_val.tail()

In [None]:
# Model inputs (review text) and labels for each split.
# NOTE(review): the saved `.tail()` outputs in this notebook show a `sentence`
# column rather than `reviewText` — confirm the column name against the CSVs.
x_train_full = amazon_train["reviewText"]
y_train_full = amazon_train["label"]
x_val = amazon_val["reviewText"]
y_val = amazon_val["label"]
x_test = amazon_test["reviewText"]
y_test = amazon_test["label"]

# The validation split comes from its own file, so the whole training file is
# used for training. `x_train` / `y_train` are the names the shape-check,
# tokenization and `model.fit` cells refer to; without them those cells raise
# NameError on a fresh kernel.
x_train = x_train_full
y_train = y_train_full

In [None]:
# Report the shape of every input/label split, inputs first and labels second.
splits = {
    "x_train": x_train,
    "x_val": x_val,
    "x_test": x_test,
    "y_train": y_train,
    "y_val": y_val,
    "y_test": y_test,
}
for split_name, split_values in splits.items():
    print(f"Shape {split_name}: {split_values.shape}")

Shape X_train: (7200,)
Shape X_valid: (800,)
Shape y_train: (7200,)
Shape y_val: (800,)


## Tokenize inputs

In [None]:
# The paper uses the BERT base uncased tokenizer, so the same one is loaded here:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Sequence length 320 gave the best accuracy and F1-score of all lengths tried in the paper:
# https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/4
max_length = 320

# Every split is tokenized with identical settings: truncate/pad to
# `max_length` and return TensorFlow tensors.
tokenizer_kwargs = dict(
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
)

train_encodings = bert_tokenizer(list(x_train.values), **tokenizer_kwargs)
valid_encodings = bert_tokenizer(list(x_val.values), **tokenizer_kwargs)
test_encodings = bert_tokenizer(list(x_test.values), **tokenizer_kwargs)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Model 1: Baseline Amazon product data fine-tuned model

* All layers unfrozen;
* Same parameters as best Bilal baseline model;
* Trained for 4 epochs.

In [None]:

MODEL_NAME = "amazon_finetune"


def amazon_finetune():
    """Build and compile the BERT classifier with no layers frozen.

    Uses the model and parameters specified in the Bilal paper:
    https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2

        - model: TFBertForSequenceClassification
        - learning rate: 2e-5
        - epsilon: 1e-8

    Returns:
        The compiled `TFBertForSequenceClassification` model.
    """
    # Pre-trained classifier named in the paper, configured with two labels:
    bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # No layer is frozen: the frozen group is empty and every weight name is
    # placed in the trainable group.
    frozen_names = []
    tunable_names = [weight.name for weight in bert_model.weights]

    for weight in bert_model.weights:
        if weight.name in frozen_names:
            weight._trainable = False
            continue
        if weight.name in tunable_names:
            weight._trainable = True

    # Optimizer, loss and metric are built up front, then handed to compile().
    # The loss is configured with from_logits=True.
    adam = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    accuracy = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
    bert_model.compile(optimizer=adam, loss=cross_entropy, metrics=[accuracy])

    return bert_model


# Build the compiled model and show its layer/parameter summary.
model = amazon_finetune()
# `summary()` prints the table itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
model.summary()

# Training follows the specifications from the paper: https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2
# -- epochs = 4
# -- batch_size = 32

# Fresh timestamped directory that receives one checkpoint per epoch:
checkpoint_dir = gdrive_save_dir("checkpoints", model_name=MODEL_NAME)
checkpoint_path = checkpoint_dir + "/cp-{epoch:04d}.ckpt"

# Callback that writes the model's weights (weights only) to `checkpoint_path`:
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)

# The model receives the three tokenizer outputs in this order:
# input ids, token type ids, attention mask.
train_inputs = [
    train_encodings.input_ids,
    train_encodings.token_type_ids,
    train_encodings.attention_mask,
]
valid_inputs = [
    valid_encodings.input_ids,
    valid_encodings.token_type_ids,
    valid_encodings.attention_mask,
]

# Fit the model, checkpointing through the callback above:
history = model.fit(
    train_inputs,
    y_train.values,
    validation_data=(valid_inputs, y_val.values),
    batch_size=32,
    epochs=4,
    callbacks=[cp_callback],
)

# Persist the whole trained model in a new timestamped GDrive directory:
model_dir = gdrive_save_dir("full_model", model_name=MODEL_NAME)
model.save(model_dir)

# Evaluate on the held-out test split; index 0 of the result is reported as
# the loss and index 1 as the accuracy:
test_inputs = [
    test_encodings.input_ids,
    test_encodings.token_type_ids,
    test_encodings.attention_mask,
]
test_score = model.evaluate(test_inputs, y_test)
print("Test loss:", test_score[0])
print("Test accuracy:", test_score[1])

# Store the same two numbers next to the saved model:
score_fp = os.path.join(model_dir, "test_score.txt")
with open(score_fp, "w") as score_file:
    score_file.writelines([
        f"Test loss = {test_score[0]}\n",
        f"Test accuracy = {test_score[1]}\n",
    ])

# Save predictions and classification_report:
test_inputs = [
    test_encodings.input_ids,
    test_encodings.token_type_ids,
    test_encodings.attention_mask,
]
predictions = model.predict(test_inputs)
preds_fp = os.path.join(model_dir, "test_predictions.csv")

# NOTE(review): the loss is configured with from_logits=True, so these two
# columns presumably hold raw logits rather than probabilities. The column
# names are kept so previously saved CSVs stay comparable.
pred_df = pd.DataFrame(predictions.to_tuple()[0], columns=["pred_prob_0", "pred_prob_1"])
pred_df["yhat"] = pred_df[["pred_prob_0", "pred_prob_1"]].values.argmax(1)
# Assign labels by position (`.values`) so a non-default index on `y_test`
# cannot misalign rows or introduce NaNs.
pred_df["y"] = y_test.values

# Confusion-matrix cell for each row, keyed by (predicted, actual). Pairs
# outside the table map to None. A plain lookup is used because numpy is not
# imported in this notebook.
outcome_labels = {(1, 1): "tp", (0, 0): "tn", (1, 0): "fp", (0, 1): "fn"}
pred_df["category"] = [
    outcome_labels.get(pair)
    for pair in zip(pred_df["yhat"].tolist(), pred_df["y"].tolist())
]
pred_df.to_csv(preds_fp, encoding="utf-8", index=False)

report = classification_report(y_test, pred_df["yhat"])
report_fp = os.path.join(model_dir, "classification_report.txt")
with open(report_fp, "w") as f:
    # Same bytes as writing every line of the report followed by a newline.
    f.write(report + "\n")
print(f"{MODEL_NAME} - test set results")
print(report)