In [None]:
# !pip3 install langchain_community

In [2]:
from langchain_community.llms import Ollama

In [3]:
llm = Ollama(model="llama2")
llm.invoke("The first man on the moon was ...")

'\nThe first man on the moon was Neil Armstrong. He stepped foot on the moon\'s surface on July 20, 1969, during the Apollo 11 mission. Armstrong famously declared "That\'s one small step for man, one giant leap for mankind" as he became the first person to walk on the lunar surface.'

In [8]:
llm.invoke("Can you add an appropriate category next to each of the following expenses. Respond with a list of categories separated by commas. For example, Spotify AB by Adyen - \
Entertainment, Beta Boulders Ams Amsterdam Nld - Sports, etc.: \
ISS Catering Services De Meern, Vishandel Sier AMSTELVEEN, Ministerie van Justitie en Veiligheid, Etos AMSTERDAM NLD, Bistro Bar Amsterdam")

'\nCertainly! Here are the categories for each of the expenses you provided:\n\n* Spotify AB by Adyen - Entertainment\n* Beta Boulders Ams Amsterdam Nld - Sports\n* ISS Catering Services De Meern - Food and Beverage\n* Vishandel Sier AMSTELVEEN - Retail\n* Ministerie van Justitie en Veiligheid - Government\n* Etos AMSTERDAM NLD - Retail\n* Bistro Bar Amsterdam - Food and Beverage'

### Read transaction data

In [5]:
# Read the transactions_2022_2023.csv file 
import pandas as pd
df = pd.read_csv("transactions_2022_2023.csv")
df.head()

Unnamed: 0,Date,Name / Description,Expense/Income,Amount (EUR)
0,2023-12-30,Belastingdienst,Expense,9.96
1,2023-12-30,Tesco Breda,Expense,17.53
2,2023-12-30,Monthly Appartment Rent,Expense,451.0
3,2023-12-30,Vishandel Sier Amsterdam,Expense,12.46
4,2023-12-29,Selling Paintings,Income,13.63


In [14]:
# Get unique transactions in the Name / Description column
unique_transactions = df["Name / Description"].unique()
len(unique_transactions)

23

In [45]:
unique_transactions[1:10]

array(['Tesco Amstelveen', 'Monthly Appartment Rent',
       'Vishandel Sier Amstelveen', 'Selling Paintings',
       'Spotify Ab By Adyen', 'Tls Bv Inz Ov-Chipkaart',
       'Tikkie Zakelijk', 'Tk Maxx Amsterdam Da', 'Consulting'],
      dtype=object)

### Categorise bank transactions with Llama2

In [15]:
# Get index list
#https://stackoverflow.com/questions/47518609/for-loop-range-and-interval-how-to-include-last-step
def hop(start, stop, step):
    for i in range(start, stop, step):
        yield i
    yield stop

index_list = list(hop(0, len(unique_transactions), 30))
index_list

[0, 23]

In [7]:
# Output validation
from pydantic import BaseModel, field_validator
from typing import List

# Validate response format - check if it actually contains hyphen ("-")
class ResponseChecks(BaseModel):
    data: List[str]

    @field_validator("data")
    def check(cls, value):
        for item in value:
            if len(item) > 0:
                assert "-" in item, "String does not contain hyphen."

# Test validation
ResponseChecks(data = ['Hello - World', 'Hello - there!'])

ResponseChecks(data=None)

In [10]:
def categorize_transactions(transaction_names, llm):
    response = llm.invoke("Can you add an appropriate category to the following expenses. For example: Spotify AB by Adyen - Entertainment, Beta Boulders Ams Amsterdam Nld - Sport, etc.. Categories should be less than 4 words. " + transaction_names)
    response = response.split('\n')

    # Keep only the lines in between blank lines (removing the explaination lines at the beginning and end of the response)
    blank_indexes = [index for index in range(len(response)) if response[index] == '']
    if len(blank_indexes) == 1:
        response = response[(blank_indexes[0] + 1):]
    else:
        response = response[(blank_indexes[0] + 1) : blank_indexes[1]]

    # Print response and validate if it is in the correct format
    print(response)
    ResponseChecks(data = response)
    
    # Put in dataframe
    categories_df = pd.DataFrame({'Transaction vs category': response})
    categories_df[['Transaction', 'Category']] = categories_df['Transaction vs category'].str.split(' - ', expand=True)
    
    return categories_df

In [11]:
# Test out the function
categorize_transactions('ISS Catering Services De Meern, Vishandel Sier AMSTELVEEN, Etos AMSTERDAM NLD, Bistro Bar Amsterdam',
                        llm)

['1. Spotify AB by Adyen - Entertainment', '2. Beta Boulders Ams Amsterdam Nld - Sport', '3. ISS Catering Services De Meern - Food', '4. Vishandel Sier AMSTELVEEN - Grocery', '5. Etos AMSTERDAM NLD - Convenience', '6. Bistro Bar Amsterdam - Food']


Unnamed: 0,Transaction vs category,Transaction,Category
0,1. Spotify AB by Adyen - Entertainment,1. Spotify AB by Adyen,Entertainment
1,2. Beta Boulders Ams Amsterdam Nld - Sport,2. Beta Boulders Ams Amsterdam Nld,Sport
2,3. ISS Catering Services De Meern - Food,3. ISS Catering Services De Meern,Food
3,4. Vishandel Sier AMSTELVEEN - Grocery,4. Vishandel Sier AMSTELVEEN,Grocery
4,5. Etos AMSTERDAM NLD - Convenience,5. Etos AMSTERDAM NLD,Convenience
5,6. Bistro Bar Amsterdam - Food,6. Bistro Bar Amsterdam,Food


In [16]:
# Intialise the categories_df_all dataframe
categories_df_all = pd.DataFrame()
max_tries = 7

# Loop through the index_list
for i in range(0, len(index_list)-1):
    transaction_names = unique_transactions[index_list[i]:index_list[i+1]]
    transaction_names = ','.join(transaction_names)

    # Try and validate output, if it fails, try again for max_tries=7 times
    for j in range(1, max_tries):
        try:
            categories_df = categorize_transactions(transaction_names, llm)
            categories_df_all = pd.concat([categories_df_all, categories_df], ignore_index=True)
            
        except:
            if j < max_tries:
                continue
            else:
                raise Exception(f"Cannot categorise transactions indexes {i} to {i+1}.")
        break

['1. Belastingdienst - Taxes', '2. Tesco Breda - Groceries', '3. Monthly Appartment Rent - Housing', '4. Vishandel Sier Amsterdam - Food', '5. Selling Paintings - Art/Crafts', '6. Spotify Ab By Adyen - Entertainment', '7. Tk Maxx Amsterdam Da - Shopping', '8. Consulting - Professional Services', '9. Aidsfonds - Charity', '10. TLS BV Inz Ov-Chipkaart - Transportation', '11. Etos Amsterdam - Groceries', '12. Beta Boulders Ams Amsterdam - Fitness', '13. Salary - Personal Finance', '14. Bouldermuur Bv Amsterdam - Professional Services', '15. Birtat Restaurant Amsterdam - Dining', '16. Freelancing - Personal Finance', '17. Tikkie - Shopping', '18. Blogging - Personal Finance', '19. Taxi Utrecht - Transportation', '20. Apple Services - Technology', '21. Amazon Lux - Online Shopping', '22. Classpass* Monthly - Fitness', '23. Audible Uk AdblCo/Pymt Gbr - Entertainment']


In [18]:
categories_df_all.head()

Unnamed: 0,Transaction vs category,Transaction,Category
0,1. Belastingdienst - Taxes,1. Belastingdienst,Taxes
1,2. Tesco Breda - Groceries,2. Tesco Breda,Groceries
2,3. Monthly Appartment Rent - Housing,3. Monthly Appartment Rent,Housing
3,4. Vishandel Sier Amsterdam - Food,4. Vishandel Sier Amsterdam,Food
4,5. Selling Paintings - Art/Crafts,5. Selling Paintings,Art/Crafts


In [50]:
# categories_df_all.to_csv("categories_df_all.csv", index=False)

In [5]:
# Get unique categories in categories_df_all
unique_categories = categories_df_all["Category"].unique()
unique_categories

array([nan, 'Taxes', 'Groceries', 'Housing', 'Shopping', 'Art/Crafts',
       'Entertainment', 'Transportation', 'Business Services',
       'Professional Services', 'Charity/Donations', 'Food/Beverage',
       'Travel', 'Self-Employment', 'Financial Services', 'Technology',
       'Home Improvement', 'Business', 'Miscellaneous', 'Food & Beverage',
       'Health & Beauty', 'Grocery', 'Finance', 'Sport',
       'Food and Beverage', 'Travel and Transportation', 'Retail',
       'Health and Wellness', 'Education', 'Construction and Maintenance',
       'Health and Beauty', 'Clothing', 'Clothing and Accessories',
       'Travel and Leisure', 'Art & Food', 'Local Government',
       'E-commerce', 'Shipping', 'Sports & Fitness', 'Food and Drink',
       'Accommodation', 'Energy', 'Marketing and Advertising',
       'Convenience Store', 'Home and Garden', 'Real Estate',
       'Language Services', 'Legal Services', 'Sports and Fitness',
       'Business and Services', 'Travel and Tourism', '

In [7]:
# Drop NA values
categories_df_all = categories_df_all.dropna()

# If category contains "Food", then categorise as "Food and Drinks"
categories_df_all.loc[categories_df_all['Category'].str.contains("Food"), 'Category'] = "Food and Drinks"
# If category contains "Clothing", then categorise as "Clothing"
categories_df_all.loc[categories_df_all['Category'].str.contains("Clothing"), 'Category'] = "Clothing"
# If category contains "Services", then categorise as "Services"
categories_df_all.loc[categories_df_all['Category'].str.contains("Services"), 'Category'] = "Services"
# If category contains "Health" or "Wellness", then categorise as "Health and Wellness"
categories_df_all.loc[categories_df_all['Category'].str.contains("Health|Wellness"), 'Category'] = "Health and Wellness"
# If category contains "Sport", then categorise as "Sport
#  and Fitness"
categories_df_all.loc[categories_df_all['Category'].str.contains("Sport"), 'Category'] = "Sport and Fitness"
# If category contains "Travel", then categorise as "Travel"
categories_df_all.loc[categories_df_all['Category'].str.contains("Travel"), 'Category'] = "Travel"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categories_df_all.loc[categories_df_all['Category'].str.contains("Food"), 'Category'] = "Food and Drinks"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categories_df_all.loc[categories_df_all['Category'].str.contains("Clothing"), 'Category'] = "Clothing"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categories_df_all.loc[categories_df_all['Category'].str.contains("Services"), 'Category'] = "Services"
A value is trying to be set on a copy of a slice from a DataFram

In [8]:
# Remove the numbering eg "1. " from Transaction column
categories_df_all['Transaction'] = categories_df_all['Transaction'].str.replace(r'\d+\.\s+', '')
categories_df_all

  categories_df_all['Transaction'] = categories_df_all['Transaction'].str.replace(r'\d+\.\s+', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categories_df_all['Transaction'] = categories_df_all['Transaction'].str.replace(r'\d+\.\s+', '')


Unnamed: 0,Transaction vs category,Transaction,Category
2,1. Belastingdienst - Taxes,Belastingdienst,Taxes
3,2. Tesco Amstelveen - Groceries,Tesco Amstelveen,Groceries
4,3. Monthly Appartment Rent - Housing,Monthly Appartment Rent,Housing
5,4. Vishandel Sier Amstelveen - Shopping,Vishandel Sier Amstelveen,Shopping
6,5. Selling Paintings - Art/Crafts,Selling Paintings,Art/Crafts
...,...,...,...
351,14. Amazon Lux - Online Shopping,Amazon Lux,Online Shopping
352,15. Classpass* Monthly Missoula Usa - Subscrip...,Classpass* Monthly Missoula Usa,Subscription
353,16. Flowingdata Livermore Usa - Business,Flowingdata Livermore Usa,Business
354,17. Audible Uk AdblCo/Pymt Gbr - Entertainment,Audible Uk AdblCo/Pymt Gbr,Entertainment


In [32]:
# Merge the categories_df_all with the transactions_2022_2023.csv dataframe (df)
df = pd.read_csv("transactions_2022_2023.csv")
df.loc[df['Name / Description'].str.contains("Spotify"), 'Name / Description'] = "Spotify Ab By Adyen"
df = pd.merge(df, categories_df_all, left_on='Name / Description', right_on='Transaction', how='left')
df

Unnamed: 0,Date,Name / Description,Expense/Income,Amount (EUR),Transaction vs category,Transaction,Category
0,2023-12-30,Belastingdienst,Expense,9.96,1. Belastingdienst - Taxes,Belastingdienst,Taxes
1,2023-12-30,Tesco Amstelveen,Expense,17.53,2. Tesco Amstelveen - Groceries,Tesco Amstelveen,Groceries
2,2023-12-30,Monthly Appartment Rent,Expense,451.00,3. Monthly Appartment Rent - Housing,Monthly Appartment Rent,Housing
3,2023-12-30,Vishandel Sier Amstelveen,Expense,12.46,4. Vishandel Sier Amstelveen - Shopping,Vishandel Sier Amstelveen,Shopping
4,2023-12-29,Selling Paintings,Income,13.63,5. Selling Paintings - Art/Crafts,Selling Paintings,Art/Crafts
...,...,...,...,...,...,...,...
1561,2022-01-16,Amazon Lux,Expense,24.11,14. Amazon Lux - Online Shopping,Amazon Lux,Online Shopping
1562,2022-01-15,Classpass* Monthly Missoula Usa,Expense,30.08,15. Classpass* Monthly Missoula Usa - Subscrip...,Classpass* Monthly Missoula Usa,Subscription
1563,2022-01-15,Flowingdata Livermore Usa,Expense,17.98,16. Flowingdata Livermore Usa - Business,Flowingdata Livermore Usa,Business
1564,2022-01-14,Audible Uk AdblCo/Pymt Gbr,Expense,11.00,,,


In [45]:
df.to_csv("transactions_2022_2023_categorized.csv", index=False)