# Categorizing Memo Data

## Imports and Packages

In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split


import re
import nltk
import string
import os

2023-11-05 22:39:37.137570: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-05 22:39:37.186661: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(torch.version.cuda, device)

11.8 cuda:0


## Preprocessing

In [3]:
df = pd.read_parquet('Transacation_outflows_3k.pqt')

We only keep the most important 9 categories since other categories have a relatively low frequency. At this point, we will prioritize the data with the 9 categories below. 

In [4]:
df = df[df["category_description"] != df['memo_clean']]
df = df.reset_index(drop=True)
df['category_description'].value_counts()

category_description
GENERAL_MERCHANDISE    516039
FOOD_AND_BEVERAGES     467667
GROCERIES              220227
TRAVEL                  59555
PETS                     8539
EDUCATION                3895
RENT                     3453
OVERDRAFT                3324
MORTGAGE                 1047
Name: count, dtype: int64

In [5]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/wxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Lowercase and Stopwords Removal

In [6]:
#lowercase all
df['memo_clean'] = df['memo_clean'].str.lower()

In [7]:
stop = stopwords.words('english')
def remove_stop(df):

        #remove stopwords in the list
        df['memo_clean'] = df['memo_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
remove_stop(df)

### Punctuation Removal

In [8]:
df['memo_clean'] = df['memo_clean'].str.strip()

punctuation_to_keep = {'-', "'"}
punctuation_to_remove = ''.join(set(string.punctuation) - punctuation_to_keep)

# Escape punctuation characters that need to be escaped
punctuation_to_remove = re.escape(punctuation_to_remove)

# Remove specified punctuation and handle '-'
df['memo_clean'] = df['memo_clean'].str.replace(f'[{punctuation_to_remove}-]', '', regex=True)

# Replace underscores with spaces
df['memo_clean'] = df['memo_clean'].str.replace('_', ' ')

Since there is censored personal information in the transaction code, we will remove all XXXX in the memo_clean data. 

In [9]:
# alphabet set

alphabet = set('abcdefghijklmnopqrstuvwxyz')

def process_memo(memo):
    splits = memo.split(' ')
    results = [s for s in splits if not alphabet.intersection(set(s)) == set('x') and s not in ['dates', 'date'] and s.count('x') < 3]
    return ' '.join(results)

df['memo_clean'] = df['memo_clean'].apply(process_memo)

In [10]:
df = df.sample(50_000, random_state=42)

## Train, Validation, and Test Split

We split the data into training, validation, and test data. To ensure that they all have a similar proportion of each category, we will enable Stratify in the train_test_split function. 

In [11]:
X = df['memo_clean']
y = df['category_description']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

In [12]:
# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(set(y)))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Tokenize the data
X_train_encoded = tokenizer(X_train.tolist(), truncation=True, padding=True)
X_val_encoded = tokenizer(X_val.tolist(), truncation=True, padding=True)
X_test_encoded = tokenizer(X_test.tolist(), truncation=True, padding=True)

label_to_id = {label: idx for idx, label in enumerate(sorted(set(y)))}
y_train_numeric = [label_to_id[label] for label in y_train]
y_val_numeric = [label_to_id[label] for label in y_val]
y_test_numeric = [label_to_id[label] for label in y_test]

# Convert the numeric labels to a tensor
y_train_tensor = torch.tensor(y_train_numeric)

class MemoDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    
# Create a dataset
train_dataset = MemoDataset(X_train_encoded, y_train_numeric)
val_dataset = MemoDataset(X_val_encoded, y_val_numeric)
test_dataset = MemoDataset(X_test_encoded, y_test_numeric)

We convert the categorical data into numeric so it's ready to be processed. 

In [14]:
label_to_id

{'EDUCATION': 0,
 'FOOD_AND_BEVERAGES': 1,
 'GENERAL_MERCHANDISE': 2,
 'GROCERIES': 3,
 'MORTGAGE': 4,
 'OVERDRAFT': 5,
 'PETS': 6,
 'RENT': 7,
 'TRAVEL': 8}

In [15]:
# Prepare the training arguments and trainer
training_args = TrainingArguments(
    output_dir='./model',
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_strategy='epoch',
#     evaluation_strategy='steps',
#     eval_steps=50,
#     logging_steps=50,
)

# Adjust the Trainer (same as before)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [16]:
training_args.device

device(type='cuda', index=0)

## Fine Tuning

We now fine tune and save our model. This may take a while if not using a GPU, so if you have downloaded the pretrained `./bank_transaction_model`, feel free to move to skip this section and move to **Model Evaluation**.

In [17]:
trainer.train()

Step,Training Loss
500,0.7645
1000,0.4442
1500,0.4026
2000,0.3707
2500,0.3572
3000,0.3311
3500,0.304
4000,0.25
4500,0.2138
5000,0.2088


TrainOutput(global_step=11250, training_loss=0.23618417680528428, metrics={'train_runtime': 716.7076, 'train_samples_per_second': 125.574, 'train_steps_per_second': 15.697, 'total_flos': 1942621674840000.0, 'train_loss': 0.23618417680528428, 'epoch': 3.0})

In [18]:
# Evaluate the model
trainer.evaluate()

{'eval_loss': 0.2889961302280426, 'eval_runtime': 13.1271, 'eval_samples_per_second': 761.782, 'eval_steps_per_second': 95.223, 'epoch': 3.0}


In [19]:
# Save the model
trainer.save_model('./bank_transaction_model')

## Model Evaluation 

We save the fine-tuned model so it's ready to be used for inference. 

In [20]:
# Load the saved model
model = BertForSequenceClassification.from_pretrained('./bank_transaction_model')
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

We run predictions on our test dataset.

In [21]:
pred = trainer.predict(test_dataset=test_dataset)

In [22]:
pred

PredictionOutput(predictions=array([[-1.3649408 ,  0.2511525 ,  6.5304484 , ..., -2.4363623 ,
        -2.3003874 , -1.867918  ],
       [-1.8531574 ,  9.03224   , -0.61177796, ..., -2.5696285 ,
        -2.8026402 , -1.5367205 ],
       [-2.004621  ,  9.03414   , -0.50371367, ..., -2.8041766 ,
        -2.8890438 , -1.5023757 ],
       ...,
       [-1.9455248 ,  9.025847  , -0.51613456, ..., -2.782323  ,
        -2.8183486 , -1.4846706 ],
       [-2.180538  , -0.2927844 ,  9.481965  , ..., -2.608956  ,
        -2.294376  , -1.3611125 ],
       [-1.9396539 ,  9.02098   , -0.4382331 , ..., -2.8683412 ,
        -2.8205342 , -1.4936892 ]], dtype=float32), label_ids=array([2, 1, 1, ..., 1, 2, 1]), metrics={'test_loss': 0.278065025806427, 'test_runtime': 16.1549, 'test_samples_per_second': 619.007, 'test_steps_per_second': 77.376})

In [23]:
y_pred = pred.predictions.argmax(1)

In [26]:
#accuracy rate 
(y_pred == y_test_numeric).mean()

0.9509

In [27]:
id_to_label = {val:key for key, val in label_to_id.items()}

In [29]:
# Inference
memo_statement = "Amazon"
input_data = tokenizer(memo_statement, padding=True, truncation=True, return_tensors='pt')
output = model(**input_data)
predicted_label = torch.argmax(output.logits).item()
print(f"Predicted Label: {predicted_label}, {id_to_label[predicted_label]}")

Predicted Label: 2, GENERAL_MERCHANDISE
