In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os

#Dataframe Visual Settings
pd.set_option('display.max_rows',None)
pd.set_option('display.max_rows',None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth',400)

def clean_bank_statement_file(filename):
    """
    Extract, modify & clean bank statement raw file for neater format
    """
    #Cleaning data
    # csv_file = '\Bank Statements\'' + filename
    df = pd.read_csv(filename,
                     usecols=['Transaction Number','Date','Memo','Amount Debit','Amount Credit'],
                     skiprows=3)
    
    df.fillna({'Amount Debit':0, 'Amount Credit':0},inplace=True)
    df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
    df.sort_values(by=['Date'],ascending=False,ignore_index=True,inplace=True)
    df['Memo'].fillna('Default',inplace=True)
    df['Transaction Number'] = df['Transaction Number'].apply(lambda x: x.split('**')[1])
    df['Purchase Date'] = df[df['Memo'].str.contains('1574 ',na=False)]['Memo']\
        .apply(lambda x: x.split('1574 ')[1])
    df['Purchase Date'] = pd.to_datetime(df['Purchase Date'],format = '%m/%d %H:%M')
    df['Purchase Date'] = df['Purchase Date']+pd.DateOffset(years=120)
    df.loc[df['Purchase Date'].isna(),'Purchase Date'] = df['Date'].copy()
    df['Memo'] = df['Memo'].apply(lambda x: x.split('*****')[0])
    df['Content'] = df['Transaction Number'] + ' ' + df['Memo']
    df['Amount'] = df['Amount Debit'].astype(float) + df['Amount Credit'].astype(float)
    df['Category'] = 'Default'
    df.drop(df[df['Content'].str.contains("INTERNET TRANSFER",na=False)].index,inplace=True)
    df.drop(df[df['Content'].str.contains("DDXXXX5941",na=False)].index,inplace=True)
    df.reset_index(drop=True,inplace=True)
    df.drop(['Transaction Number','Memo','Amount Debit','Amount Credit'],
            axis=1,inplace=True)
    df.sort_values(by=['Purchase Date'],ascending=False, inplace=True)
    return df

def categorize(df):
    """
    For common transactions, categorize these early, and use to train a model
    """
    #Easier Categorization
    df.loc[df['Content'].str.contains("UBER EATS",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("HARRIS",na=False),'Category'] = 'Groceries'
    df.loc[df['Content'].str.contains("GIANT",na=False),'Category'] = 'Groceries'
    df.loc[df['Content'].str.contains("USAA",na=False),'Category'] = 'USAA Insurance'
    df.loc[df['Content'].str.contains("Accenture",na=False),'Category'] = 'Pay Check'
    df.loc[df['Content'].str.contains("XSPORT",na=False),'Category'] = 'Gym'
    df.loc[df['Content'].str.contains("DISTRICT MARTIAL ARTS",na=False),'Category'] = 'Gym'
    df.loc[df['Content'].str.contains("PARKING",na=False),'Category'] = 'Tolls/Uber/Metro/Parking'
    df.loc[df['Content'].str.contains("NAZRET",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("TAJ OF INDIA",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("DCPILLAR",na=False),'Category'] = 'Tithe'
    df.loc[df['Content'].str.contains("GOOGLE",na=False),'Category'] = 'Entertainment'
    df.loc[df['Content'].str.contains("VENMO/CASHOUT",na=False),'Category'] = 'Venmo Extra'
    df.loc[df['Content'].str.contains("CITGO",na=False),'Category'] = 'Gas'
    df.loc[df['Content'].str.contains("SHELL",na=False),'Category'] = 'Gas'
    df.loc[df['Content'].str.contains("PUPATELLA",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("GOOD COMPANY DONUT",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("STARBUCKS",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("UBER TRIP",na=False),'Category'] = 'Tolls/Uber/Metro/Parking'
    df.loc[df['Content'].str.contains("VERIZON",na=False),'Category'] = 'Utilities'
    df.loc[df['Content'].str.contains("WASHINGTON GAS",na=False),'Category'] = 'Utilities'
    df.loc[df['Content'].str.contains("ENERGY",na=False),'Category'] = 'Utilities'
    df.loc[df['Content'].str.contains("TOM COLEMAN",na=False),'Category'] = 'Phone'
    df.loc[df['Content'].str.contains("STDNT LOAN",na=False),'Category'] = 'Student Loans'
    df.loc[(df['Content'].str.contains("VENMO/PAYMENTWALTER COLEMAN Default",na=False)) &
           (df['Amount'] == -668.75),'Category'] = 'Rent'
    df.loc[df['Content'].str.contains("Margaret Coleman",na=False),'Category'] = 'Extra'
    df.loc[df['Content'].str.contains("Person-to-Person TransferPAYPAL",na=False),'Category'] = 'Extra'
    return df 

In [3]:
filename='JanuaryChecking.csv'
df = clean_bank_statement_file(filename)
df = categorize(df)

In [4]:
df.head()

Unnamed: 0,Date,Purchase Date,Content,Amount,Category
0,2021-01-29,2021-01-29,Security deposit TO: DDXXXX5941 Default,-442.82,Default
2,2021-01-29,2021-01-29,USAA PC/PAYMENTWALTER COLEMAN Default,-187.68,USAA Insurance
5,2021-01-27,2021-01-27,VENMO/CASHOUTWALTER COLEMAN Default,669.8,Venmo Extra
41,2021-01-21,2021-01-21,"Accenture Federa/PAYRLL DEPColeman, Walter J",2099.81,Pay Check
43,2021-01-20,2021-01-20,VENMO/PAYMENTWALTER COLEMAN Default,-48.22,Default


In [5]:
df.to_csv('JanuaryCheckingCleaned.csv')

In [10]:
df = pd.read_csv('JanuaryCheckingCleaned.csv')

In [11]:
df.head()

Unnamed: 0,Date,Purchase Date,Content,Amount,Category
0,1/4/2021 0:00,12/31/2020 2:52,POS PURCHASE Non-PINMCDONALD'S F4976 MECHANICSVILL VA1,-3.18,Dining Out
1,1/4/2021 0:00,12/28/2020 10:13,POS PURCHASE Non-PINEL CUSCATLECO RESTAURAN ALEXANDRIA VA791732,-6.67,Dining Out
2,1/27/2021 0:00,1/26/2020 23:49,POS PURCHASE Non-PINTST* LE KON ARLINGTON VA 504325,-23.75,Dining Out
3,1/27/2021 0:00,1/25/2020 20:50,POS PURCHASE Non-PINSUPER POLLO ARLINGTON VA IN8700,-15.16,Dining Out
4,1/25/2021 0:00,1/24/2020 20:21,POS PURCHASE Non-PINSNOOZE NORTH LAMAR AUSTIN TX 0000JY,-34.12,Dining Out


In [12]:
df.groupby('Category').sum()

Unnamed: 0_level_0,Amount
Category,Unnamed: 1_level_1
Dining Out,-475.57
Entertainment,-12.97
Extra,775.0
Gas,-109.01
Groceries,-386.08
Gym,-180.95
Misc,-1057.17
Pay Check,4361.81
Rent,-845.0
Tithe,-399.23


In [29]:
filename='Feb19Checking.csv'
df = clean_bank_statement_file(filename)
df = categorize(df)

In [30]:
df.to_csv('Feb19Checking.csv')

PermissionError: [Errno 13] Permission denied: 'Feb19Checking.csv'

In [17]:
df.head()

Unnamed: 0,Date,Purchase Date,Content,Amount,Category
1,2021-02-19,2021-02-19,"Accenture Federa/PAYRLL DEPColeman, Walter J",2099.81,Pay Check
27,2021-02-08,2021-02-08,Pillar DC/Pillar DCPILLAR CHURCH OF WASHI,-282.7,Tithe
28,2021-02-08,2021-02-08,VZ WIRELESS VE/VZW WEBPAYWALTER *COLEMAN,-79.08,Default
29,2021-02-08,2021-02-08,VENMO/PAYMENTWALTER COLEMAN Default,-25.0,Default
30,2021-02-08,2021-02-08,VENMO/PAYMENTWALTER COLEMAN Default,-18.03,Default


In [18]:
df[df['Category'] == 'Default']

Unnamed: 0,Date,Purchase Date,Content,Amount,Category
28,2021-02-08,2021-02-08 00:00:00,VZ WIRELESS VE/VZW WEBPAYWALTER *COLEMAN,-79.08,Default
29,2021-02-08,2021-02-08 00:00:00,VENMO/PAYMENTWALTER COLEMAN Default,-25.0,Default
30,2021-02-08,2021-02-08 00:00:00,VENMO/PAYMENTWALTER COLEMAN Default,-18.03,Default
0,2021-02-19,2020-02-18 22:03:00,POS PURCHASE Non-PINPAYPAL *BUSINESSINS TRI 402-935-7733 NYINC000,-1.0,Default
11,2021-02-16,2020-02-15 03:41:00,POS PURCHASE Non-PINMIDAS. ARLINGTON VA 00,-214.57,Default
10,2021-02-16,2020-02-14 12:11:00,POS PURCHASE Non-PINLA MADELEINE #025 BAILEYS XRDS VA124133,-5.61,Default
7,2021-02-16,2020-02-13 16:26:00,POS PURCHASE Non-PINMIDAS. ARLINGTON VA 00,-100.0,Default
8,2021-02-16,2020-02-13 00:01:00,POS PURCHASE Non-PINTUPELO HONEY L010 ARLINGTON VA608475,-30.96,Default
6,2021-02-16,2020-02-12 10:07:00,POS PURCHASE Non-PINQueen Mothers 703-9978474 VA 000000,-24.29,Default
14,2021-02-12,2020-02-11 23:56:00,POS PURCHASE Non-PINTST* TORTAS Y TACOS LA ARLINGTON VA539499,-12.56,Default


In [19]:
df[df['Category'] == 'Default'].sum()

Content     VZ WIRELESS VE/VZW WEBPAYWALTER *COLEMANVENMO/PAYMENTWALTER COLEMAN DefaultVENMO/PAYMENTWALTER COLEMAN DefaultPOS PURCHASE Non-PINPAYPAL *BUSINESSINS TRI 402-935-7733 NYINC000 POS PURCHASE Non-PINMIDAS. ARLINGTON VA 00POS PURCHASE Non-PINLA MADELEINE #025 BAILEYS XRDS VA124133 POS PURCHASE Non-PINMIDAS. ARLINGTON VA 00POS PURCHASE Non-PINTUPELO HONEY L010 ARLINGTON VA608475 POS PURCHASE Non-PI...
Amount                                                                                                                                                                                                                                                                                                                                                                                                             -1387.67
Category                                                                                                                                                                        

In [25]:
for cat in df[df['Category'] == 'Default']['Content']:
    print("Content", cat)

VZ WIRELESS VE/VZW WEBPAYWALTER *COLEMAN
VENMO/PAYMENTWALTER COLEMAN Default
VENMO/PAYMENTWALTER COLEMAN Default
POS PURCHASE Non-PINPAYPAL *BUSINESSINS TRI 402-935-7733 NYINC000 
POS PURCHASE Non-PINMIDAS. ARLINGTON VA 00
POS PURCHASE Non-PINLA MADELEINE #025 BAILEYS XRDS VA124133 
POS PURCHASE Non-PINMIDAS. ARLINGTON VA 00
POS PURCHASE Non-PINTUPELO HONEY L010 ARLINGTON VA608475 
POS PURCHASE Non-PINQueen Mothers 703-9978474 VA 000000
POS PURCHASE Non-PINTST* TORTAS Y TACOS LA ARLINGTON VA539499 
POS PURCHASE Non-PINQueen Mothers 703-9978474 VA 000000
POS PURCHASE Non-PINAMAZON.COM*EX1UJ8P23 AM AMZN.COM/BILL WAIN7400 
POS PURCHASE Non-PINBOB EDITHS DINER - C 703-8541401 VA762551 
POS PURCHASE Non-PINGEORGE MASON LIBERTY ARLINGTON VAGGML00 
POS PURCHASE Non-PINBOWL AMERICA INC DRANES STERLING VA781201 
POS PURCHASE Non-PINBOWL AMERICA INC DRANES STERLING VA781201 
Person-to-Person TransferCASH APP*MICHEE MUT 8774174551 CAIN1070 
POS PURCHASE Non-PINTST* NORTHSIDE SOCIAL C ARLINGTON VA

In [31]:
for cat in df[df['Category'] == 'Default']['Content']:
    print("Content", cat)
    print("Category is: ")
    inp = input()
    df.loc[df['Content'] == cat, 'Category'] = inp

Content VZ WIRELESS VE/VZW WEBPAYWALTER *COLEMAN
Category is: 
Phone
Content VENMO/PAYMENTWALTER COLEMAN Default
Category is: 
Utilities
Content VENMO/PAYMENTWALTER COLEMAN Default
Category is: 
Utilities
Content POS PURCHASE Non-PINPAYPAL *BUSINESSINS TRI 402-935-7733 NYINC000 
Category is: 
Misc
Content POS PURCHASE Non-PINMIDAS. ARLINGTON VA 00
Category is: 
Car Misc
Content POS PURCHASE Non-PINLA MADELEINE #025 BAILEYS XRDS VA124133 
Category is: 
Restaurants
Content POS PURCHASE Non-PINMIDAS. ARLINGTON VA 00
Category is: 
Car Misc
Content POS PURCHASE Non-PINTUPELO HONEY L010 ARLINGTON VA608475 
Category is: 
Restaurants
Content POS PURCHASE Non-PINQueen Mothers 703-9978474 VA 000000
Category is: 
Restaurants
Content POS PURCHASE Non-PINTST* TORTAS Y TACOS LA ARLINGTON VA539499 
Category is: 
Restaurants
Content POS PURCHASE Non-PINQueen Mothers 703-9978474 VA 000000
Category is: 
Restaurants
Content POS PURCHASE Non-PINAMAZON.COM*EX1UJ8P23 AM AMZN.COM/BILL WAIN7400 
Category is: 

In [32]:
df.groupby('Category').sum()

Unnamed: 0_level_0,Amount
Category,Unnamed: 1_level_1
Car Misc,-314.57
Dining Out,-84.37
Entertainment,-16.99
Gas,-38.33
Groceries,-257.75
Gym,-180.95
Misc,-660.13
Pay Check,4037.43
Phone,-79.08
Restaurants,-240.53


In [28]:
df['Category']

1                                                                                              Pay Check
27                                                                                                 Tithe
28    <bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x00000137B3957130>>
29    <bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x00000137B3957130>>
30    <bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x00000137B3957130>>
34                                                                                             Pay Check
0     <bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x00000137B3957130>>
2                                                                                          Entertainment
3                                                                                              Groceries
5                                                      

In [None]:
groupby
upload to excel
that's literally all you do

Data vis
line of each month
pie for individual new month
bar for all/new (so two bar charts)

Extra extra credit
Boxplot for restaurant spending
Hourly timeseries chart for spending
Day-of-week timeseries chart for spending