In [24]:
import pandas as pd
import numpy as np
import re
import Levenshtein
from datetime import datetime
from thefuzz import process
import os
import yaml
from utils import *

# --- Transaction categorisation pipeline -----------------------------------
# Helper functions (import_transactions, preprocess_desc, load_categories,
# primary_match, sequence_keywords, secondary_match, categorisation_logic)
# come from `utils`; their exact behaviour is not visible in this cell.

# 1. Load the raw transactions and derive a keyword string per description.
transactions_df = import_transactions()
transactions_df['string_keywords'] = preprocess_desc(transactions_df)

# 2. Load the brand -> category lookup table.
categories = load_categories()
brand_names = categories['brand']

# 3. Primary match: whole keyword string against the known brands.
#    (Original note: uses thefuzz's process.extract - confirm in utils.)
best_matches = primary_match(brand_names, transactions_df['string_keywords'])
# best_matches is joined through its index, which must therefore hold the
# keyword strings.
finance_data_score_df = transactions_df.merge(
    best_matches,
    how='inner',
    left_on='string_keywords',
    right_index=True,
)

# 4. Secondary match: word-by-word comparison against the brands.
#    (Original note: Levenshtein distance per word, best hit per description
#    - confirm in utils.)
dataset_keywords = sequence_keywords(transactions_df['string_keywords'])
best_word_match_df = secondary_match(brand_names, dataset_keywords)

# Attach the word-level result to each transaction; 'description_index' is
# only the join key and is dropped straight after.
finance_matches_df = finance_data_score_df.merge(
    best_word_match_df,
    how='inner',
    left_index=True,
    right_on='description_index',
)
finance_matches_df.drop(columns='description_index', inplace=True)

# 5. Prepare for the final category match: flag the rows where the primary
#    and secondary methods picked the same brand.
finance_matches_df['methods_match'] = finance_matches_df['string_match_brand'].eq(
    finance_matches_df['word_match_brand']
)

# 6. Row-wise decision on the final brand for each description.
finance_matches_df['brand_final'] = finance_matches_df.apply(
    categorisation_logic, axis=1
)

# 7. Translate brands to higher-level categories (e.g. amazon: Shopping,
#    rent: House); the duplicate 'brand' join column is removed afterwards.
transactions_categorised_df = finance_matches_df.merge(
    categories,
    how='inner',
    left_on='brand_final',
    right_on='brand',
)
transactions_categorised_df.drop(columns='brand', inplace=True)

# 8. Persist the categorised transactions as CSV and preview the last rows.
file_path = os.path.join('..', 'datasets', 'finance_matches_df_output.csv')
transactions_categorised_df.to_csv(file_path)

display(transactions_categorised_df.tail())

Unnamed: 0,owner,date,description,euro value,account,string_keywords,string_match_brand,string_match_score,word_in_desc,word_position,word_match_brand,word_match_distance,methods_match,brand_final,category
2631,filippo+charlotte,2023-10-27,Überweisung - OTTO payments - CXA-VJC-AZV-E2B,-52.94,DKB 3033,überweisung otto payments e2b,otto payments,90,otto,2,netto,2,False,otto payments,1. house
2632,filippo+charlotte,2023-10-27,Überweisung - OTTO payments - CXA-VJC-AZV-E2B,-52.94,DKB 3033,überweisung otto payments e2b,otto payments,90,otto,2,netto,2,False,otto payments,1. house
2633,filippo+charlotte,2023-06-05,Überweisung - Sehsterne EV - Kostenbeteiligung...,-174.0,DKB 3033,überweisung sehsterne ev kostenbeteiligung ter...,charlotte quirin - charlotte quirin und filipp...,86,fuer,8,uber,2,False,charlotte quirin - charlotte quirin und filipp...,1. house
2634,filippo+charlotte,2023-04-13,Überweisung - Sehsterne EV - Kostenbeteiligung...,-46.0,DKB 3033,überweisung sehsterne ev kostenbeteiligung ter...,sehsterne ev,90,sehsterne,2,sehsterne ev,3,True,sehsterne ev,4. services
2635,filippo+charlotte,2023-06-13,Überweisung - Waldschloss Parow GbR - Anzahlun...,-126.8,DKB 3033,überweisung waldschloss parow anzahlung res. nr.,waldschloss parow,90,res.,5,rewe,2,False,waldschloss parow,6. leisure
