In [None]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
from official.nlp import optimization  # to create AdamW optimizer

# Set TensorFlow's Python logger to the ERROR level so lower-severity
# messages (INFO/WARNING) do not clutter the notebook output.
tf.get_logger().setLevel('ERROR')

In [None]:
# Download and extract the IMDB review archive into the current directory.
# NOTE(review): none of the other visible cells read `dataset_dir` or
# `train_dir` (training data comes from a CSV under data/train) -- confirm
# this cell is still needed.
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

dataset = tf.keras.utils.get_file('aclImdb_v1.tar.gz', url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')

# The archive unpacks into an 'aclImdb' folder next to the returned path.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

train_dir = os.path.join(dataset_dir, 'train')

# remove unused folders to make it easier to load the data
remove_dir = os.path.join(train_dir, 'unsup')
# Guard the removal so that re-running this cell does not raise
# FileNotFoundError when 'unsup' has already been deleted.
if os.path.isdir(remove_dir):
  shutil.rmtree(remove_dir)

In [None]:
# TF Hub handles: a small uncased-English BERT encoder and the
# uncased-English BERT text preprocessor.
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

# Load each handle as a standalone Keras layer (preprocessor first, then encoder).
# NOTE(review): these two layer objects are not referenced by the other visible
# cells; build_classifier_model() creates its own layers from the handles.
bert_preprocess_model = hub.KerasLayer(handle=tfhub_handle_preprocess)
bert_model = hub.KerasLayer(handle=tfhub_handle_encoder)

In [None]:
# Define Classifier model
def build_classifier_model(num_classes=9, dropout_rate=0.1):
  """Build the text classifier: raw strings -> BERT -> dropout -> softmax.

  Args:
    num_classes: Number of units in the final softmax layer. Defaults to 9,
      the number of financial-transaction categories.
    dropout_rate: Dropout rate applied to the pooled BERT output before the
      classification layer. Defaults to 0.1.

  Returns:
    An uncompiled `tf.keras.Model` that takes a batch of scalar strings
    (input named 'text') and outputs `num_classes` softmax probabilities
    per example.
  """
  # One scalar string per example; tokenization happens inside the model.
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  # trainable=True so the encoder weights are fine-tuned along with the head.
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(dropout_rate)(net) # helps prevent overfitting
  net = tf.keras.layers.Dense(num_classes, activation='softmax', name='classifier')(net)
  return tf.keras.Model(text_input, net)

In [None]:
import os

# Sanity check: show where the notebook is running from
current_dir = os.getcwd()
print("Current working directory:", current_dir)

# 'data/train' is a relative path, so it is resolved against the directory above
train_files = os.listdir('data/train')
print("Files in 'data/train':", train_files)

In [None]:
# Prepare training, validation, and test datasets

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

seed = 1
text_column = 'description'  # column holding the transaction text fed to the model
data = pd.read_csv('data/train/financial_transactions_training_dataset_bert.csv')
features = data.drop('category_num', axis=1)
labels = data['category_num']

# Split the data into training, validation, and test sets
features_train, features_temp, labels_train, labels_temp = train_test_split(features, labels, test_size=0.4, random_state=seed)  # 60% training
features_val, features_test, labels_val, labels_test = train_test_split(features_temp, labels_temp, test_size=0.5, random_state=seed)  # 20% validation, 20% test


def _make_dataset(split_features, split_labels):
  """Pair each text string of a split with its label as a tf.data.Dataset.

  Only the text column is used, as a 1-D array: the classifier input is a
  scalar string per example (shape=()), so passing the full 2-D feature
  frame would give the wrong rank and feed any extra columns to the model.
  """
  return tf.data.Dataset.from_tensor_slices(
      (split_features[text_column].values, split_labels.values))


# Convert the pandas objects into TensorFlow Datasets
train_dataset = _make_dataset(features_train, labels_train)
val_dataset = _make_dataset(features_val, labels_val)
test_dataset = _make_dataset(features_test, labels_test)

# Shuffle and batch the datasets
# NOTE(review): a batch size of 1 makes training slow and updates noisy;
# consider a larger training batch size.
train_batch_size = 1
test_batch_size = 1
# Seed the shuffle so the example order is repeatable across runs.
train_dataset = train_dataset.shuffle(len(features_train), seed=seed).batch(train_batch_size)
# Validation and test data are only batched: their aggregate metrics do not
# depend on order, and keeping the test order fixed lets predictions be
# matched back to rows of features_test.
val_dataset = val_dataset.batch(train_batch_size)
test_dataset = test_dataset.batch(test_batch_size)


In [None]:
# Define optimizer for training
epochs = 5
init_lr = 3e-5

# Total number of training steps = batches per epoch * number of epochs
steps_per_epoch = tf.data.experimental.cardinality(train_dataset).numpy()
num_train_steps = steps_per_epoch * epochs
# Warmup steps are set to 10% of the total training steps
num_warmup_steps = int(0.1 * num_train_steps)

optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

In [None]:
# Build and compile the classifier model
classifier_model = build_classifier_model()

# The sparse cross-entropy variant is used with the 'category_num' labels
# (presumably integer class ids -- confirm against the CSV).
classifier_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Fine-tune the classifier; `history` keeps what fit() returns for later inspection
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(
    train_dataset,
    epochs=epochs,
    validation_data=val_dataset,
)

In [None]:
# Evaluate the model on the test dataset
test_metrics = classifier_model.evaluate(test_dataset)
# evaluate() returns the loss followed by the compiled metric (accuracy)
loss, accuracy = test_metrics

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

In [None]:
# Get actual and predicted categories for each transaction in the test dataset in a dataframe

# Run a single predict() over the whole test dataset instead of one call per
# batch inside a Python loop; predict() uses only the inputs of each
# (text, label) element. The result has one row of class probabilities per example.
predicted_probabilities = classifier_model.predict(test_dataset)
predicted_categories = tf.argmax(predicted_probabilities, axis=1).numpy().tolist()

# Gather every label of every batch (not just the first element of each),
# so this stays correct if the test batch size is ever raised above 1.
actual_categories = tf.concat([label for _, label in test_dataset], axis=0).numpy().tolist()

# test_dataset is batched but never shuffled, so both lists are in the same
# row order as features_test and are matched to it by position.
test_data = pd.DataFrame({'text': features_test['description'], 'actual_category': actual_categories, 'predicted_category': predicted_categories})
test_data