In [1]:
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
import time

from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from transformers import BertTokenizer, TFBertModel, BertModel, BertConfig, create_optimizer

from sklearn.model_selection import train_test_split
from sklearn import metrics

from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm.notebook import tqdm
from torch import cuda
from scipy import interp
from itertools import cycle

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [2]:
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
df_train= pd.read_excel("labeled_raw_data.xlsx")
df_train.shape

(1955, 7)

In [4]:
df_train=df_train.dropna()
df_train.shape

(1937, 7)

In [5]:
labels_to_check = ['food_quality', 'environment', 'service', 'convenience', 'cost_effectiveness']

for label in labels_to_check:
    pos_col = f'{label}_pos'
    neu_col = f'{label}_neu'
    neg_col = f'{label}_neg'

    df_train[pos_col] = np.where(df_train[label] == 1, 1, 0)
    df_train[neu_col] = np.where(df_train[label] == 0, 1, 0)
    df_train[neg_col] = np.where(df_train[label] == -1, 1, 0)

In [6]:
df_train = df_train.drop(labels_to_check, axis=1)
df_train.shape

(1937, 17)

In [7]:
label_cols = ['food_quality_pos', 'food_quality_neu', 'food_quality_neg',	'environment_pos', 'environment_neu','environment_neg',
              'service_pos',	'service_neu', 'service_neg', 'convenience_pos', 'convenience_neu', 'convenience_neg',
              'cost_effectiveness_pos',	'cost_effectiveness_neu',	'cost_effectiveness_neg']
label_cols

['food_quality_pos',
 'food_quality_neu',
 'food_quality_neg',
 'environment_pos',
 'environment_neu',
 'environment_neg',
 'service_pos',
 'service_neu',
 'service_neg',
 'convenience_pos',
 'convenience_neu',
 'convenience_neg',
 'cost_effectiveness_pos',
 'cost_effectiveness_neu',
 'cost_effectiveness_neg']

In [8]:
bert_model_name = 'bert-base-multilingual-uncased'

tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)
MAX_LEN = 128

def tokenize_sentences(sentences, tokenizer, max_seq_len = MAX_LEN):
    tokenized_sentences = []

    for sentence in tqdm(sentences):
        tokenized_sentence = tokenizer.encode(
                            sentence,                  # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = max_seq_len,  # Truncate all sentences.
                            )

        tokenized_sentences.append(tokenized_sentence)

    return tokenized_sentences

def create_attention_masks(tokenized_and_padded_sentences):
    attention_masks = []

    for sentence in tokenized_and_padded_sentences:
        att_mask = [int(token_id > 0) for token_id in sentence]
        attention_masks.append(att_mask)

    return np.asarray(attention_masks)

input_ids = tokenize_sentences(df_train['text'], tokenizer, MAX_LEN)
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
attention_masks = create_attention_masks(input_ids)

  0%|          | 0/1937 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [9]:
labels =  df_train[label_cols].values
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=0, test_size=0.2)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=0, test_size=0.2)

train_size = len(train_inputs)
validation_size = len(validation_inputs)

In [21]:
BATCH_SIZE=32
NR_EPOCHS=3
def create_dataset(data_tuple, epochs=1, batch_size=BATCH_SIZE, buffer_size=10000, train=True):
    dataset = tf.data.Dataset.from_tensor_slices(data_tuple)
    if train:
        dataset = dataset.shuffle(buffer_size=buffer_size)
    dataset = dataset.repeat(epochs)
    dataset = dataset.batch(batch_size)
    if train:
        dataset = dataset.prefetch(1)

    return dataset

train_dataset = create_dataset((train_inputs, train_masks, train_labels), epochs=NR_EPOCHS, batch_size=BATCH_SIZE)
validation_dataset = create_dataset((validation_inputs, validation_masks, validation_labels), epochs=NR_EPOCHS, batch_size=BATCH_SIZE)

In [22]:
class BertClassifier(tf.keras.Model):
        def __init__(self, bert: TFBertModel, num_classes: int):
            super().__init__()
            self.bert = bert
            self.classifier = Dense(num_classes, activation='sigmoid')

        @tf.function
        def call(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
            outputs = self.bert(input_ids,
                                   attention_mask=attention_mask,
                                   token_type_ids=token_type_ids,
                                   position_ids=position_ids,
                                   head_mask=head_mask)
            cls_output = outputs[1]
            cls_output = self.classifier(cls_output)

            return cls_output

model = BertClassifier(TFBertModel.from_pretrained(bert_model_name), len(label_cols))

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [23]:
lw=2
steps_per_epoch = train_size // BATCH_SIZE
validation_steps = validation_size // BATCH_SIZE

# Loss Function
loss_object = tf.keras.losses.BinaryCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
train_loss = tf.keras.metrics.Mean(name='train_loss')
validation_loss = tf.keras.metrics.Mean(name='test_loss')

# Optimizer
warmup_steps = steps_per_epoch // 3
total_steps = steps_per_epoch * NR_EPOCHS - warmup_steps
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

# Metrics
train_precision_metrics = [tf.keras.metrics.Precision() for _ in range(len(label_cols))]
train_recall_metrics = [tf.keras.metrics.Recall() for _ in range(len(label_cols))]
train_f1_metrics = [tf.keras.metrics.Mean(name='train_f1') for _ in range(len(label_cols))]
train_accuracy_metrics = [tf.keras.metrics.BinaryAccuracy() for i in range(len(label_cols))]

validation_precision_metrics = [tf.keras.metrics.Precision() for _ in range(len(label_cols))]
validation_recall_metrics = [tf.keras.metrics.Recall() for _ in range(len(label_cols))]
validation_f1_metrics = [tf.keras.metrics.Mean(name='validation_f1') for _ in range(len(label_cols))]
validation_accuracy_metrics = [tf.keras.metrics.BinaryAccuracy() for i in range(len(label_cols))]

@tf.function
def train_step(model, token_ids, masks, labels):
    labels = tf.dtypes.cast(labels, tf.float32)

    with tf.GradientTape() as tape:
        predictions = model(token_ids, attention_mask=masks)
        loss = loss_object(labels, predictions)
    #Loss Function into gradient
    gradients = tape.gradient(loss, model.trainable_variables)
    #Apply gradient to optimizer
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)

    for i, precision in enumerate(train_precision_metrics):
        precision.update_state(labels[:, i], predictions[:, i])

    for i, recall in enumerate(train_recall_metrics):
        recall.update_state(labels[:, i], predictions[:, i])

    for i, accuracy in enumerate(train_accuracy_metrics):
        accuracy.update_state(labels[:, i], predictions[:, i])

    for i, f1 in enumerate(train_f1_metrics):
        precision_value = train_precision_metrics[i].result()
        recall_value = train_recall_metrics[i].result()
        f1_value = 2 * ((precision_value * recall_value) / (precision_value + recall_value + 1e-10))
        f1.update_state(f1_value)

@tf.function
def validation_step(model, token_ids, masks, labels):
    labels = tf.dtypes.cast(labels, tf.float32)

    predictions = model(token_ids, attention_mask=masks, training=False)
    v_loss = loss_object(labels, predictions)

    validation_loss(v_loss)

    for i, precision in enumerate(validation_precision_metrics):
        precision.update_state(labels[:, i], predictions[:, i])

    for i, recall in enumerate(validation_recall_metrics):
        recall.update_state(labels[:, i], predictions[:, i])

    for i, accuracy in enumerate(validation_accuracy_metrics):
        accuracy.update_state(labels[:,i], predictions[:,i])

    for i, f1 in enumerate(validation_f1_metrics):
        precision_value = validation_precision_metrics[i].result()
        recall_value = validation_recall_metrics[i].result()
        f1_value = 2 * ((precision_value * recall_value) / (precision_value + recall_value + 1e-10))
        f1.update_state(f1_value)

def print_metrics(metrics, label_cols, metric_name):
    for i, label_name in enumerate(label_cols):
        print(f"{label_name} {metric_name}: {metrics[i].result()}")
        metrics[i].reset_states()

def train(model, train_dataset, val_dataset, train_steps_per_epoch, val_steps_per_epoch, epochs):
    for epoch in range(epochs):
        print('=' * 50, f"EPOCH {epoch+1}", '=' * 50)

        start = time.time()

        for i, (token_ids, masks, labels) in enumerate(tqdm(train_dataset, total=train_steps_per_epoch)):
            train_step(model, token_ids, masks, labels)
            if i % 500 == 0:
                print(f'\nTrain Step: {i}, Loss: {train_loss.result()}')
                for i, label_name in enumerate(label_cols):
                    print(f"{label_name} precision {train_precision_metrics[i].result()}")
                    print(f"{label_name} recall {train_recall_metrics[i].result()}")
                    print(f"{label_name} f1-score {train_f1_metrics[i].result()}")
                    print(f"{label_name} accuracy {train_accuracy_metrics[i].result()}")
                    train_precision_metrics[i].reset_states()
                    train_recall_metrics[i].reset_states()
                    train_f1_metrics[i].reset_states()
                    train_accuracy_metrics[i].reset_states()

        for i, (token_ids, masks, labels) in enumerate(tqdm(val_dataset, total=val_steps_per_epoch)):
            validation_step(model, token_ids, masks, labels)

        print(f'\nEpoch {epoch+1}, Validation Loss: {validation_loss.result()}, Time: {time.time()-start}\n')

        for i, label_name in enumerate(label_cols):
            print(f"{label_name} precision {validation_precision_metrics[i].result()}")
            print(f"{label_name} recall {validation_recall_metrics[i].result()}")
            print(f"{label_name} f1-score {validation_f1_metrics[i].result()}")
            print(f"{label_name} accuracy {validation_accuracy_metrics[i].result()}")
            validation_precision_metrics[i].reset_states()
            validation_recall_metrics[i].reset_states()
            validation_f1_metrics[i].reset_states()
            validation_accuracy_metrics[i].reset_states()
        print('\n')


train(model, train_dataset, validation_dataset, steps_per_epoch, validation_steps, NR_EPOCHS)



  0%|          | 0/48 [00:00<?, ?it/s]


Train Step: 0, Loss: 0.730596661567688
food_quality_pos precision 0.5625
food_quality_pos recall 1.0
food_quality_pos f1-score 0.7200000286102295
food_quality_pos accuracy 0.5625
food_quality_neu precision 0.25
food_quality_neu recall 1.0
food_quality_neu f1-score 0.4000000059604645
food_quality_neu accuracy 0.25
food_quality_neg precision 0.19354838132858276
food_quality_neg recall 1.0
food_quality_neg f1-score 0.3243243098258972
food_quality_neg accuracy 0.21875
environment_pos precision 0.0
environment_pos recall 0.0
environment_pos f1-score 0.0
environment_pos accuracy 0.65625
environment_neu precision 0.8125
environment_neu recall 0.5199999809265137
environment_neu f1-score 0.6341463327407837
environment_neu accuracy 0.53125
environment_neg precision 0.0625
environment_neg recall 1.0
environment_neg f1-score 0.11764705926179886
environment_neg accuracy 0.0625
service_pos precision 0.3125
service_pos recall 1.0
service_pos f1-score 0.4761904776096344
service_pos accuracy 0.3125
se

  0%|          | 0/12 [00:00<?, ?it/s]


Epoch 1, Validation Loss: 0.2836870551109314, Time: 86.66949653625488

food_quality_pos precision 0.8512820601463318
food_quality_pos recall 0.8383838534355164
food_quality_pos f1-score 0.8368892669677734
food_quality_pos accuracy 0.842783510684967
food_quality_neu precision 0.9103448390960693
food_quality_neu recall 0.8571428656578064
food_quality_neu f1-score 0.8737377524375916
food_quality_neu accuracy 0.9097937941551208
food_quality_neg precision 0.0
food_quality_neg recall 0.0
food_quality_neg f1-score 0.0
food_quality_neg accuracy 0.907216489315033
environment_pos precision 0.7272727489471436
environment_pos recall 0.22857142984867096
environment_pos f1-score 0.360847145318985
environment_pos accuracy 0.8453608155250549
environment_neu precision 0.8158640265464783
environment_neu recall 0.9599999785423279
environment_neu f1-score 0.8804826736450195
environment_neu accuracy 0.8015463948249817
environment_neg precision 0.0
environment_neg recall 0.0
environment_neg f1-score 0.0
en

  0%|          | 0/48 [00:00<?, ?it/s]


Train Step: 0, Loss: 0.36106494069099426
food_quality_pos precision 0.7169917821884155
food_quality_pos recall 0.929618775844574
food_quality_pos f1-score 0.7670541405677795
food_quality_pos accuracy 0.7432752251625061
food_quality_neu precision 0.6987847089767456
food_quality_neu recall 0.5453929305076599
food_quality_neu f1-score 0.4459371566772461
food_quality_neu accuracy 0.7809339165687561
food_quality_neg precision 0.10256410390138626
food_quality_neg recall 0.03611738234758377
food_quality_neg f1-score 0.09592805802822113
food_quality_neg accuracy 0.8779857754707336
environment_pos precision 0.4059829115867615
environment_pos recall 0.11229314655065536
environment_pos f1-score 0.147520050406456
environment_pos accuracy 0.808478593826294
environment_neu precision 0.7893133163452148
environment_neu recall 0.9407821297645569
environment_neu f1-score 0.7662460803985596
environment_neu accuracy 0.7609210014343262
environment_neg precision 0.024911031126976013
environment_neg recall 

  0%|          | 0/12 [00:00<?, ?it/s]


Epoch 2, Validation Loss: 0.270094633102417, Time: 38.80980086326599

food_quality_pos precision 0.89570552110672
food_quality_pos recall 0.7373737096786499
food_quality_pos f1-score 0.811716616153717
food_quality_pos accuracy 0.8221649527549744
food_quality_neu precision 0.7833333611488342
food_quality_neu recall 0.9155844449996948
food_quality_neu f1-score 0.8432148694992065
food_quality_neu accuracy 0.8659793734550476
food_quality_neg precision 0.800000011920929
food_quality_neg recall 0.2222222238779068
food_quality_neg f1-score 0.3782379925251007
food_quality_neg accuracy 0.9226804375648499
environment_pos precision 0.8799999952316284
environment_pos recall 0.3142857253551483
environment_pos f1-score 0.4813916087150574
environment_pos accuracy 0.8685566782951355
environment_neu precision 0.8529411554336548
environment_neu recall 0.9666666388511658
environment_neu f1-score 0.9059138894081116
environment_neu accuracy 0.8453608155250549
environment_neg precision 0.0
environment_neg 

  0%|          | 0/48 [00:00<?, ?it/s]


Train Step: 0, Loss: 0.27752986550331116
food_quality_pos precision 0.9279857277870178
food_quality_pos recall 0.9531307220458984
food_quality_pos f1-score 0.9268237352371216
food_quality_pos accuracy 0.9289864301681519
food_quality_neu precision 0.940536618232727
food_quality_neu recall 0.8781313300132751
food_quality_neu f1-score 0.8908878564834595
food_quality_neu accuracy 0.9436195492744446
food_quality_neg precision 0.9120879173278809
food_quality_neg recall 0.1890660524368286
food_quality_neg f1-score 0.09999527037143707
food_quality_neg accuracy 0.9216699004173279
environment_pos precision 0.8857142925262451
environment_pos recall 0.6905040740966797
environment_pos f1-score 0.7036013007164001
environment_pos accuracy 0.9268345236778259
environment_neu precision 0.905793309211731
environment_neu recall 0.9717245101928711
environment_neu f1-score 0.926302433013916
environment_neu accuracy 0.9005810022354126
environment_neg precision 0.0
environment_neg recall 0.0
environment_neg 

  0%|          | 0/12 [00:00<?, ?it/s]


Epoch 3, Validation Loss: 0.26378822326660156, Time: 38.82088351249695

food_quality_pos precision 0.8418604731559753
food_quality_pos recall 0.9141414165496826
food_quality_pos f1-score 0.8774168491363525
food_quality_pos accuracy 0.8685566782951355
food_quality_neu precision 0.9275362491607666
food_quality_neu recall 0.8311688303947449
food_quality_neu f1-score 0.8825584650039673
food_quality_neu accuracy 0.907216489315033
food_quality_neg precision 0.5483871102333069
food_quality_neg recall 0.4722222089767456
food_quality_neg f1-score 0.4936988353729248
food_quality_neg accuracy 0.9149484634399414
environment_pos precision 0.6440678238868713
environment_pos recall 0.5428571701049805
environment_pos f1-score 0.5732854008674622
environment_pos accuracy 0.8634020686149597
environment_neu precision 0.8999999761581421
environment_neu recall 0.9300000071525574
environment_neu f1-score 0.9132111072540283
environment_neu accuracy 0.8659793734550476
environment_neg precision 0.6666666865348

In [26]:
model.summary()

Model: "bert_classifier_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 tf_bert_model_2 (TFBertMod  multiple                  167356416 
 el)                                                             
                                                                 
 dense_2 (Dense)             multiple                  11535     
                                                                 
Total params: 167367951 (638.46 MB)
Trainable params: 167367951 (638.46 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Test data
df_test = pd.read_csv('test_data_text.csv')

In [None]:
df_test.columns

Index(['Unnamed: 0', 'review_id', 'text'], dtype='object')

In [None]:
df_test.shape

(1000000, 3)

In [None]:
df_test=df_test.dropna()
df_test.shape

(1000000, 3)

In [None]:
df_test.columns

Index(['Unnamed: 0', 'review_id', 'text'], dtype='object')

In [None]:
test_input_ids = tokenize_sentences(df_test['text'], tokenizer, MAX_LEN)
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
test_attention_masks = create_attention_masks(test_input_ids)

  0%|          | 0/1000000 [00:00<?, ?it/s]

In [None]:
def create_inference_dataset(inputs, masks, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((inputs, masks))
    dataset = dataset.batch(batch_size)
    return dataset

In [None]:
TEST_BATCH_SIZE = 128
test_dataset = create_inference_dataset(test_input_ids, test_attention_masks, batch_size=TEST_BATCH_SIZE)

In [None]:
predictions_list = []
test_steps = len(test_dataset)
for i, (token_ids, masks) in enumerate(tqdm(test_dataset, total=test_steps)):
    predictions = model(token_ids, attention_mask=masks).numpy()
    predictions_list.append(predictions)

# Concatenate predictions from different batches
all_predictions = np.concatenate(predictions_list, axis=0)

  0%|          | 0/7813 [00:00<?, ?it/s]

In [None]:
all_predictions.shape

(1000000, 15)

In [None]:
all_predictions[1]

array([0.6700232 , 0.04468255, 0.2521374 , 0.02636506, 0.9393675 ,
       0.0547648 , 0.389549  , 0.05211012, 0.5224159 , 0.03187032,
       0.966963  , 0.01328482, 0.015187  , 0.934005  , 0.0546991 ],
      dtype=float32)

In [None]:
num_labels = all_predictions.shape[1] // 3

# Reshape the array into groups of three values
reshaped_predictions = all_predictions.reshape(-1, num_labels, 3)
reshaped_predictions[1]

array([[0.6700232 , 0.04468255, 0.2521374 ],
       [0.02636506, 0.9393675 , 0.0547648 ],
       [0.389549  , 0.05211012, 0.5224159 ],
       [0.03187032, 0.966963  , 0.01328482],
       [0.015187  , 0.934005  , 0.0546991 ]], dtype=float32)

In [None]:
reshaped_predictions.shape

(1000000, 5, 3)

In [None]:
max_indices = np.argmax(reshaped_predictions, axis=2)

# Create new labels based on the maximum value
new_labels = np.zeros_like(reshaped_predictions)
new_labels[np.arange(len(new_labels))[:, np.newaxis], np.arange(reshaped_predictions.shape[1]), max_indices] = 1


new_labels[1]

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

In [None]:
def map_labels(label):
    if np.array_equal(label, [1, 0, 0]):
        return 1
    elif np.array_equal(label, [0, 1, 0]):
        return 0
    elif np.array_equal(label, [0, 0, 1]):
        return -1
    else:
        return None

# Apply the mapping function to each sub-array
mapped_labels = np.apply_along_axis(map_labels, axis=2, arr=new_labels)
mapped_labels[1]

array([ 1,  0, -1,  0,  0])

In [None]:
mapped_labels.shape

(1000000, 5)

In [None]:
columns = ['food_quality', 'environment', 'service', 'convenience', 'cost_effectiveness']
df_mapped_labels = pd.DataFrame(mapped_labels.reshape(len(df_test), -1), columns=columns)
df_test_labeled = pd.concat([df_test[['review_id', 'text']], df_mapped_labels], axis=1)
df_test_labeled.shape

(1000000, 7)

In [None]:
df_test_labeled.loc[999, 'text']

'Had a fantastic brunch with Ashleigh this afternoon! Bottle of champagne and juice sides with entrees only $50 steal! Will def be returning!'

In [None]:
print(df_test_labeled.iloc[999, 2:])

food_quality          1
environment           0
service               0
convenience           0
cost_effectiveness    1
Name: 999, dtype: object


In [None]:
# Store in a CSV file
df_test_labeled.to_csv('df_test_labeled.csv', index=False)