In [1]:
import pandas as pd
import numpy as np
import re
import Levenshtein
from datetime import datetime
from thefuzz import process
import os
import yaml
from utils import *

## Import Data `transactions` and `categories`

In [2]:
# Load the transactions csv file
# Only the dataset name is passed here; where the file is looked up is decided
# inside utils.import_transactions (not visible in this notebook).
transactions_df = import_transactions('dataset_transactions') # Insert the name of the CSV file here. For the path, see utils.py

# Load categories .yaml file and parse the structure
# NOTE(review): later cells index `categories['key']` and call
# `categories.drop_duplicates(subset='key', ...)`, so this is presumably a
# pandas DataFrame with a `key` column -- confirm against utils.parse_categories.
categories = parse_categories()

## Preprocess the `description` field and Run the Primary Match 

In [3]:
# clean descriptions
# The output of preprocess_desc is stored as a new `string_keywords` column on
# transactions_df. This is an in-place column assignment, so the frame loaded
# in the previous cell is modified rather than copied.
transactions_df['string_keywords'] = preprocess_desc(transactions_df)

# Apply the primary match
# Inputs: the category keys, the `string_keywords` column created above, and
# the full transactions frame. The result feeds the secondary match cell.
# NOTE(review): the matching strategy lives in utils.primary_match and is not
# visible here.
finance_data_score_df = primary_match(categories['key'], transactions_df['string_keywords'], transactions_df)

## Sequence the `description` and Run the Secondary Match

In [4]:
# Preprocess the descriptions' strings to form a list of single words
# Works from the `string_keywords` column produced in the preprocessing cell.
# NOTE(review): the exact structure returned by sequence_keywords is defined in
# utils.py -- confirm before relying on it.
dataset_keywords = sequence_keywords(transactions_df['string_keywords'])

# Apply the secondary match
# Inputs: the category keys, the sequenced keywords from above, and the output
# of the primary match (finance_data_score_df). The result is the frame that
# the categorisation logic cell operates on.
finance_matches_df = secondary_match(categories['key'], dataset_keywords, finance_data_score_df)

## Compute the `Categorisation Logic`

In [5]:
# Decide the final keyword for every transaction: categorisation_logic is
# called once per row and its result is stored in the `key_final` column.
finance_matches_df['key_final'] = finance_matches_df.apply(categorisation_logic, axis='columns')

# Attach the category details (per the original note: brand and category) that
# belong to each chosen keyword. Only the first row per `key` is kept so the
# left join cannot multiply transaction rows.
categories_unique = categories.drop_duplicates(subset=['key'])
transactions_categorised_df = finance_matches_df.merge(
    categories_unique,
    how='left',
    left_on='key_final',
    right_on='key',
)

## Filter for Unassigned categories and save to CSV

In [6]:
# Filter unassigned categories
# NOTE(review): what counts as "unassigned" is decided inside
# utils.filter_unassigned and is not visible here.
unassigned_cat = filter_unassigned(transactions_categorised_df)

# save the unassigned categories dataset as csv
# The path is relative (../datasets/), so it resolves against the kernel's
# current working directory.
# NOTE(review): to_csv is called with defaults; if unassigned_cat is a pandas
# DataFrame, its index is written as the first CSV column -- confirm this is wanted.
file_path = os.path.join('..', 'datasets', 'unassigned_cat_output.csv')
unassigned_cat.to_csv(file_path)

## Save the `Categorised Transaction` data to CSV

In [7]:
# save the file as csv
# `file_path` is reassigned here; after this cell it points at the categorised
# output, no longer at the unassigned-categories file from the previous cell.
file_path = os.path.join('..', 'datasets', 'finance_matches_df_output.csv')
transactions_categorised_df.to_csv(file_path)

# Preview how many rows were in the initial and final datasets and how many
# unassigned categories there are to work on. Equal input and output counts
# indicate that the category merge neither dropped nor duplicated transactions.
count_unassigned = len(unassigned_cat)
count_transactions_categorised = len(transactions_categorised_df)

print(f'Input transaction dataset contains {len(transactions_df)} lines')
print(f'Output categorised dataset contains {count_transactions_categorised}')
print(f'There are {count_unassigned} unassigned categories')

Input transaction dataset contains 2020 lines
Output categorised dataset contains 2020
There are 15 unassigned categories


## Pivot and Visualise the data

In [9]:
# Total `euro value` per `category`, shown as the cell output.
# NOTE(review): dropna=False is passed explicitly; the blank-labelled row in
# the saved output is presumably the transactions left without a category --
# confirm against the unassigned-categories export.
pivot = transactions_categorised_df.pivot_table(index='category', values='euro value', aggfunc='sum', dropna=False)
pivot

Unnamed: 0_level_0,euro value
category,Unnamed: 1_level_1
1. house,-31337.48
2. shopping,-14764.38
3. food,-14521.29
4. services,-9516.33
5. transport,-9213.78
6. leisure,-12503.96
7. other,-10826.94
,-529.9
