In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

In [None]:
### Load data

# Read the labelled training transactions from disk
df = pd.read_csv('data/financial_transactions_training_dataset.csv')

# Number of cross-validation folds used by the evaluation cell
n_folds = 5

# Model inputs (the 'description' column) and their string labels (the 'category' column)
descriptions = df['description'].values
categories = df['category'].values

# Fixed label order: a category's position in this list is its integer class id
categories_text = [
    'Food',
    'Personal & Miscellaneous',
    'Savings & Investments',
    'Entertainment',
    'Education',
    'Living Expenses',
    'Transportation',
    'Healthcare',
    'Travel',
]
categories_num_map = {label: class_id for class_id, label in enumerate(categories_text)}

# Replace every string label with its integer class id
# (a label that is missing from the map raises KeyError)
categories = np.array([categories_num_map[label] for label in categories])

In [None]:
### Test model using k-fold cross-validation

# Seed shared by the fold splitter and (offset per fold) the TensorFlow/NumPy/Python RNGs
cv_seed = 42

# Stratified splitter with a fixed shuffle seed, so the fold membership is repeatable
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=cv_seed)

# One [loss, accuracy] pair per fold, as returned by model.evaluate
scores = []

for fold_number, (train_index, test_index) in enumerate(skf.split(descriptions, categories)):
    # Keras keeps global state for every model that is created; reset it so
    # that models from earlier folds do not pile up in memory.
    tf.keras.backend.clear_session()

    # Seed Python, NumPy and TensorFlow so weight initialisation and batch
    # shuffling are repeatable for this fold (some GPU ops may still be
    # non-deterministic).
    tf.keras.utils.set_random_seed(cv_seed + fold_number)

    descriptions_train, descriptions_test = descriptions[train_index], descriptions[test_index]
    categories_train, categories_test = categories[train_index], categories[test_index]

    # Build the vocabulary from the training fold only, so no tokens leak
    # in from the held-out fold.
    vectorizer = TextVectorization(output_mode='int')
    vectorizer.adapt(descriptions_train)

    # Raw strings -> token ids -> embeddings -> BiLSTM -> class probabilities
    model = tf.keras.models.Sequential([
        vectorizer,
        layers.Embedding(input_dim=len(vectorizer.get_vocabulary()), output_dim=64, mask_zero=True),
        layers.Bidirectional(layers.LSTM(64)),
        layers.Dense(32, activation='relu'),
        # One output unit per known category
        layers.Dense(len(categories_text), activation='softmax')
    ])

    # Integer class ids as targets -> sparse categorical cross-entropy
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # The held-out fold is passed as validation data only to report per-epoch
    # metrics; nothing in this cell uses it to alter training.
    model.fit(descriptions_train, categories_train, epochs=10, validation_data=(descriptions_test, categories_test))

    # Final [loss, accuracy] on the held-out fold
    score = model.evaluate(descriptions_test, categories_test, verbose=0)
    scores.append(score)

# Index 1 of each score is the accuracy metric requested in compile()
accuracies = [fold_score[1] for fold_score in scores]
print('Accuracies for each fold: ', accuracies)
print('Mean Accuracy: ', np.mean(accuracies))


In [None]:
### Train RNN model on entire dataset, then save

# Make sure the output directory exists *before* the training run, so a
# missing folder cannot make the save fail after all epochs have finished.
# (makedirs succeeds if the directory is already there.)
tf.io.gfile.makedirs('./models')

# Learn the token vocabulary from every available description
vectorizer = TextVectorization(output_mode='int')
vectorizer.adapt(descriptions)

# Raw strings -> token ids -> embeddings -> BiLSTM -> class probabilities
model = tf.keras.models.Sequential([
    vectorizer,
    layers.Embedding(input_dim=len(vectorizer.get_vocabulary()), output_dim=64, mask_zero=True),
    layers.Bidirectional(layers.LSTM(64)),
    layers.Dense(32, activation='relu'),
    # One output unit per known category
    layers.Dense(len(categories_text), activation='softmax')
])

# Integer class ids as targets -> sparse categorical cross-entropy
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(descriptions, categories, epochs=10)

# Persist the whole pipeline (the vectorizer is the model's first layer)
model.save('./models/financial_transactions_categorization_rnn_model.keras')

In [None]:
# Reload the saved model from disk and classify a few sample descriptions
model = tf.keras.models.load_model('./models/financial_transactions_categorization_rnn_model.keras')
test_descriptions = [
    'Rent',
    'Chipotle',
    'Pizza',
    'McDonalds',
    'Food',
    'Thai Food',
    'PAYPAL  TWITCHINTER',
    'Persona 3 Reload',
    'WEX INC DIRECT DEP',
]
predictions = model.predict(test_descriptions)

# For each row, the position of the highest score selects its label in categories_text
predicted_categories = [categories_text[class_id] for class_id in predictions.argmax(axis=-1)]
for desc, pred in zip(test_descriptions, predicted_categories):
    print(f'{desc}: {pred}')