In [1]:
# Study: Natural Language Processing with Deep Learning
# Dataset: Dead By Daylight Steam Reviews
# Authors: Willian Oliveira and Julierme Silva
# Start: 10/04/2023
# Study Motivation: Train a machine to classify products based on user reviews
# Notebook Motivation: The purpose of this notebook is to train a BERT Transformer to classify the reviews of the Dead By Daylight game on Steam
# Study Status: Finished

In [2]:
# Importing the libraries and setting up the environment

import random
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    accuracy_score,
)


def set_seed(seed):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Args:
        seed: Integer handed unchanged to ``random.seed``,
            ``np.random.seed`` and ``tf.random.set_seed``.
    """
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)


# Fix the seed once, up front, so the run is reproducible
SEED = 0
set_seed(SEED)

In [3]:
# Load the raw review CSV and discard every row that contains a missing value

RAW_REVIEWS_CSV = 'data/raw/dbd_english_reviews.csv'
df = pd.read_csv(RAW_REVIEWS_CSV).dropna()
df.head()

Unnamed: 0,review_id,review_text,recommended
0,136024101,good game\n,True
1,136022433,BHVR implemented dc penalty cos they apparentl...,False
2,136022116,เกมหมาๆ คิลเก่งก็เก่งไปเลย กากก็กากสัส กดโซโล่...,False
3,136019421,This game is very fun unless you get fucked by...,True
4,136019209,Why?,False


In [4]:
# Hold out 30% of the reviews, then cut that hold-out in half to obtain the
# validation and test sets. The split happens BEFORE undersampling (imbalanced-learn)
# so that only the training data is rebalanced, which avoids data leakage.

X = df["review_text"]
y = df["recommended"]

HOLDOUT_FRACTION = 0.3  # share of all reviews kept out of training
TEST_SHARE_OF_HOLDOUT = 0.5  # share of the hold-out that becomes the test set

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=HOLDOUT_FRACTION, random_state=SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=TEST_SHARE_OF_HOLDOUT, random_state=SEED
)

# Undersample the training set only; the reviews are handed to the sampler as a
# single-column 2-D array

rus = RandomUnderSampler(random_state=SEED)
X_train_2d, y_train = rus.fit_resample(X_train.to_numpy().reshape(-1, 1), y_train)

# Flatten the resampled column back into a 1-D array of reviews for the tokenizer

X_train = X_train_2d.ravel()

In [5]:
# Number of reviews in the (undersampled) training, validation and test splits
tuple(split.shape for split in (X_train, X_val, X_test))

((46178,), (25985,), (25986,))

In [5]:
# Load the pretrained "bert-base-cased" tokenizer that encodes the reviews

BERT_CHECKPOINT = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

def tokenize_data(texts, labels, max_length=128, encoder=None):
    """Encode review texts into fixed-length model inputs.

    All texts are encoded in a single batched tokenizer call instead of one
    call (and one set of tensors) per review.

    Args:
        texts: Iterable of review texts. Items are passed through ``str`` so a
            stray non-string value does not abort the whole batch.
        labels: Iterable of target labels, one per text.
        max_length: Length of every encoded sequence; shorter reviews are
            padded and longer ones truncated to this many tokens.
        encoder: Optional tokenizer callable. When omitted, the notebook-level
            ``tokenizer`` is used.

    Returns:
        Tuple ``(input_ids, attention_masks, target_labels)`` of NumPy arrays.
        The first two are int32 with shape ``(n_texts, max_length)``.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """
    text_batch = [str(text) for text in texts]
    target_labels = np.array(list(labels))

    # zip() would silently drop the surplus items; fail loudly instead
    if len(text_batch) != len(target_labels):
        raise ValueError(
            f"texts and labels differ in length: "
            f"{len(text_batch)} != {len(target_labels)}"
        )

    # Nothing to encode: return correctly shaped empty inputs without
    # calling the tokenizer at all
    if not text_batch:
        return (
            np.zeros((0, max_length), dtype=np.int32),
            np.zeros((0, max_length), dtype=np.int32),
            target_labels,
        )

    if encoder is None:
        encoder = tokenizer

    encoded = encoder(
        text_batch,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='np',
    )

    # int32 matches the dtype of the tensors the per-row TF encoding produced
    input_ids = np.asarray(encoded['input_ids'], dtype=np.int32)
    attention_masks = np.asarray(encoded['attention_mask'], dtype=np.int32)

    return input_ids, attention_masks, target_labels

# Encode each split into token ids, attention masks and label arrays
X_train_ids, X_train_masks, y_train_labels = tokenize_data(
    texts=X_train, labels=y_train
)
X_val_ids, X_val_masks, y_val_labels = tokenize_data(
    texts=X_val, labels=y_val
)
X_test_ids, X_test_masks, y_test_labels = tokenize_data(
    texts=X_test, labels=y_test
)


In [6]:
# Build the classifier: pretrained "bert-base-cased" weights plus a 2-label
# sequence-classification head

PRETRAINED_NAME = "bert-base-cased"
config = BertConfig.from_pretrained(PRETRAINED_NAME, num_labels=2)
model = TFBertForSequenceClassification.from_pretrained(
    PRETRAINED_NAME,
    config=config,
)

# Compile with Adam and a sparse cross-entropy computed on the raw logits
# (from_logits=True), tracking accuracy

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ["accuracy"]
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(loss=loss, metrics=metrics, optimizer=optimizer)

Downloading tf_model.h5:   0%|          | 0.00/527M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Fine-tune the model on the training split, evaluating on the validation split
# after every epoch

train_inputs = [X_train_ids, X_train_masks]
val_inputs = [X_val_ids, X_val_masks]

history = model.fit(
    x=train_inputs,
    y=y_train_labels,
    validation_data=(val_inputs, y_val_labels),
    batch_size=16,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [20]:
# Persist the fine-tuned model and its tokenizer in separate directories

MODEL_DIR = "models/dbd_reviews_bert"
TOKENIZER_DIR = "models/dbd_reviews_bert_tokenizer"

model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(TOKENIZER_DIR)

('models/dbd_reviews_bert_tokenizer\\tokenizer_config.json',
 'models/dbd_reviews_bert_tokenizer\\special_tokens_map.json',
 'models/dbd_reviews_bert_tokenizer\\vocab.txt',
 'models/dbd_reviews_bert_tokenizer\\added_tokens.json')

In [21]:
# Evaluating the model on the test split (split off before the training set
# was undersampled, so it keeps the original class balance)

# The classification head outputs raw, unnormalised logits, one column per label
test_logits = model.predict([X_test_ids, X_test_masks]).logits

# Softmax converts the logits into class probabilities. This matters for ROC AUC:
# with two logits the class-1 probability is sigmoid(l1 - l0), so samples must be
# ranked by that value; ranking by the class-1 logit alone can order them
# differently and distort the score.
y_pred_probs = tf.nn.softmax(test_logits, axis=-1).numpy()

# argmax over the label axis already yields a 1-D array, so no squeeze is needed
# (squeeze would collapse a single-sample prediction to a 0-d array)
y_pred_labels = np.argmax(y_pred_probs, axis=-1)

accuracy = accuracy_score(y_test_labels, y_pred_labels)
precision = precision_score(y_test_labels, y_pred_labels)
recall = recall_score(y_test_labels, y_pred_labels)
f1 = f1_score(y_test_labels, y_pred_labels)
roc_auc = roc_auc_score(y_test_labels, y_pred_probs[:, 1])

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC AUC Score: {roc_auc:.2f}")



Accuracy: 0.86
Precision: 0.96
Recall: 0.86
F1 Score: 0.91
ROC AUC Score: 0.93
