In [4]:
# https://www.kaggle.com/shakedzy/alone-in-the-woods-using-theil-s-u-for-survival
# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
# https://towardsdatascience.com/random-forest-in-python-24d0893d51c0

import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import math
from collections import Counter
# import numpy as np
import seaborn as sns
# import pandas as pd
import scipy.stats as ss
# import matplotlib.pyplot as plt
import sklearn.preprocessing as sp
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from subprocess import check_output


#Dataframe Visual Settings
pd.set_option('display.max_rows',None)
pd.set_option('display.max_rows',None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth',400)

def PIN_col(x):
    if("POS PURCHASE Non-PIN" in x):
        return "POS PURCHASE Non-PIN"
    elif("POS PURCHASE with PIN" in x):
        return "POS PURCHASE with PIN"
    elif("VENMO" in x):
        return "VENMO"
    else:
        return "Other"
    
def remove_payment_method(x):
    if("POS PURCHASE Non-PIN" in x):
        return x.split("POS PURCHASE Non-PIN")[1]
    elif("POS PURCHASE with PIN" in x):
        return x.split("POS PURCHASE with PIN")[1]
    else:
        return x

def middle_words(x):
    if(len(x) > 2):
        return x[1:-1]
    else:
        return "N/A"

def clean_bank_statement_file(filename):
    """
    Extract, modify & clean bank statement raw file for neater format
    """
    #Cleaning data
    # csv_file = '\Bank Statements\'' + filename
    df = pd.read_csv(filename,
                     usecols=['Transaction Number','Date','Memo','Amount Debit','Amount Credit'],
                     skiprows=3)
    
    df.fillna({'Amount Debit':0, 'Amount Credit':0},inplace=True)
    df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
    df.sort_values(by=['Date'],ascending=False,ignore_index=True,inplace=True)
    df['Memo'].fillna('Default',inplace=True)
    df['Transaction Number'] = df['Transaction Number'].apply(lambda x: x.split('**')[1])
    df['Purchase Date'] = df[df['Memo'].str.contains('1574 ',na=False)]['Memo']\
        .apply(lambda x: x.split('1574 ')[1])
    df['Purchase Date'] = pd.to_datetime(df['Purchase Date'],format = '%m/%d %H:%M')
    df['Purchase Date'] = df['Purchase Date']+pd.DateOffset(years=120)
    df.loc[df['Purchase Date'].isna(),'Purchase Date'] = df['Date'].copy()
    df['Memo'] = df['Memo'].apply(lambda x: x.split('*****')[0])
    df['Content'] = df['Transaction Number'] + ' ' + df['Memo']
    df['Amount'] = df['Amount Debit'].astype(float) + df['Amount Credit'].astype(float)
    df['Category'] = 'Default'
    df['Payment_Method'] = df['Content'].apply(lambda x: PIN_col(x))
    df["Content"] = df["Content"].apply(lambda x: remove_payment_method(x))
    df['Purchase Time'] = [d.time() for d in df['Purchase Date']]
    df['Purchase Date'] = [d.date() for d in df['Purchase Date']]
    df['Verification Date'] = df['Date'].copy()
    df['Content_Word_Length'] = df['Content'].apply(lambda x: len(x.split(' ')))
    df['First_Word'] = df['Content'].apply(lambda x: x.split(' ')[0])
    df['Last_Word'] = df['Content'].apply(lambda x: x.split(' ')[-1])
    df['Middle_Words'] = df['Content'].apply(lambda x: middle_words(x.split(' ')))
    df.drop(df[df['Content'].str.contains("INTERNET TRANSFER",na=False)].index,inplace=True)
    df.drop(df[df['Content'].str.contains('DDXXXX5941',na=False)].index, inplace=True)
    df.reset_index(drop=True,inplace=True)
    df.drop(['Transaction Number','Memo','Amount Debit','Amount Credit'],
            axis=1,inplace=True)
    df.sort_values(by=['Purchase Date'],ascending=False, inplace=True)
    return df

def categorize(df):
    """
    For common transactions, categorize these early, and use to train a model
    """
    #Easier Categorization
    df.loc[df['Content'].str.contains("UBER EATS",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("HARRIS",na=False),'Category'] = 'Groceries'
    df.loc[df['Content'].str.contains("GIANT",na=False),'Category'] = 'Groceries'
    df.loc[df['Content'].str.contains("USAA",na=False),'Category'] = 'USAA Insurance'
    df.loc[df['Content'].str.contains("Accenture",na=False),'Category'] = 'Pay Check'
    df.loc[df['Content'].str.contains("XSPORT",na=False),'Category'] = 'Gym'
    df.loc[df['Content'].str.contains("DISTRICT MARTIAL ARTS",na=False),'Category'] = 'Gym'
    df.loc[df['Content'].str.contains("PARKING",na=False),'Category'] = 'Tolls/Uber/Metro/Parking'
    df.loc[df['Content'].str.contains("NAZRET",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("TAJ OF INDIA",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("DCPILLAR",na=False),'Category'] = 'Tithe'
    df.loc[df['Content'].str.contains("GOOGLE",na=False),'Category'] = 'Entertainment'
    df.loc[df['Content'].str.contains("VENMO/CASHOUT",na=False),'Category'] = 'Venmo Extra'
    df.loc[df['Content'].str.contains("CITGO",na=False),'Category'] = 'Gas'
    df.loc[df['Content'].str.contains("SHELL",na=False),'Category'] = 'Gas'
    df.loc[df['Content'].str.contains("PUPATELLA",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("GOOD COMPANY DONUT",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("STARBUCKS",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("UBER TRIP",na=False),'Category'] = 'Tolls/Uber/Metro/Parking'
    df.loc[df['Content'].str.contains("VERIZON",na=False),'Category'] = 'Utilities'
    df.loc[df['Content'].str.contains("WASHINGTON GAS",na=False),'Category'] = 'Utilities'
    df.loc[df['Content'].str.contains("ENERGY",na=False),'Category'] = 'Utilities'
    df.loc[df['Content'].str.contains("TOM COLEMAN",na=False),'Category'] = 'Phone'
    df.loc[df['Content'].str.contains("STDNT LOAN",na=False),'Category'] = 'Student Loans'
    df.loc[(df['Content'].str.contains("VENMO/PAYMENTWALTER COLEMAN Default",na=False)) &
           (df['Amount'] == -845),'Category'] = 'Rent'
    df.loc[df['Content'].str.contains("Margaret Coleman",na=False),'Category'] = 'Extra'
    df.loc[df['Content'].str.contains("Person-to-Person TransferPAYPAL",na=False),'Category'] = 'Extra'
    df.loc[df['Content'].str.contains("Tortas y Tacos",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("CROWNE PLAZA",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("CROWNE PLAZA",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("Emmaus Family Couns",na=False),'Category'] = 'Medical'
    df.loc[df['Content'].str.contains("ADVANCED HEALTH CARE",na=False),'Category'] = 'Medical'
    df.loc[df['Content'].str.contains("AMZN Mktp",na=False),'Category'] = 'Misc'
    df.loc[df['Content'].str.contains("Amazon web services",na=False),'Category'] = 'Misc'
    df.loc[df['Content'].str.contains("ALDI",na=False),'Category'] = 'Groceries'
    df.loc[df['Content'].str.contains("FOOD LION",na=False),'Category'] = 'Groceries'
    df.loc[df['Content'].str.contains("Audible",na=False),'Category'] = 'Entertainment'
    df.loc[df['Content'].str.contains("PIZZA",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("CROWNE PLAZA",na=False),'Category'] = 'Dining Out'
    #set positive defaults to Misc
    df.loc[(df['Category'] == 'Default') & (df['Amount'] > 0), 'Category'] = 'Extra'
    df.loc[(df['Purchase Time'] == dt.time(0,0,0)) & (df['Category'] == 'Default') & \
             (df['First_Word'] != 'VENMO/PAYMENTWALTER'), 'Category'] = 'Misc'
    df.loc[(df['Category'] == 'Default') & (df['First_Word'] == 'SQ'), 'Category'] = 'Dining Out'
    df.loc[(df['Category'] == "Default") & (df["First_Word"] != "VENMO/PAYMENTWALTER"), "Category"] = "Misc"
    df.loc[df['Content'].str.contains("Pizza",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("Amzn",na=False),'Category'] = 'Misc'
    df.loc[df['Content'].str.contains("Pollo",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("VZ WIRELESS",na=False),'Category'] = 'Phone'
    df.loc[df['Content'].str.contains("PARKMOBILE",na=False),'Category'] = 'Tolls/Uber/Metro/Parking'
    return df 

def create_grand_file(directory):
    """
    Extract based on string file names
    """
    grand_df = pd.DataFrame()
    temp_df = pd.DataFrame()

    #Get list of files
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            temp_df = clean_bank_statement_file(filename)
            temp_df = categorize(temp_df)
            grand_df = pd.concat([grand_df,temp_df],ignore_index=True)
    
    return grand_df

In [65]:
# Start with February 2021
filename = 'FebruaryChecking.csv'
feb_df = clean_bank_statement_file(filename)
feb_df = categorize(feb_df)

In [16]:
feb_df

Unnamed: 0,Date,Purchase Date,Content,Amount,Category,Payment_Method,Purchase Time,Verification Date,Content_Word_Length,First_Word,Last_Word,Middle_Words
2,2021-02-25,2021-02-25,Feb 2nd Paycheck Savings TO: DDXXXX5941,-173.82,Misc,Other,00:00:00,2021-02-25,6,Feb,DDXXXX5941,"[2nd, Paycheck, Savings, TO:]"
3,2021-02-24,2021-02-24,VZ WIRELESS VE/VZW WEBPAYWALTER *COLEMAN,-104.2,Misc,Other,00:00:00,2021-02-24,5,VZ,*COLEMAN,"[WIRELESS, VE/VZW, WEBPAYWALTER]"
4,2021-02-24,2021-02-24,Pillar DC/Pillar DCPILLAR CHURCH OF WASHI,-310.97,Tithe,Other,00:00:00,2021-02-24,6,Pillar,WASHI,"[DC/Pillar, DCPILLAR, CHURCH, OF]"
6,2021-02-23,2021-02-23,VENMO/PAYMENTWALTER COLEMAN Default,-46.41,Default,VENMO,00:00:00,2021-02-23,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
16,2021-02-22,2021-02-22,USAA PC/PAYMENTWALTER COLEMAN Default,-187.68,USAA Insurance,Other,00:00:00,2021-02-22,4,USAA,Default,"[PC/PAYMENTWALTER, COLEMAN]"
18,2021-02-22,2021-02-22,VENMO/PAYMENTWALTER COLEMAN Default,-845.0,Rent,VENMO,00:00:00,2021-02-22,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
17,2021-02-22,2021-02-22,VENMO/PAYMENTWALTER COLEMAN Default,-91.03,Default,VENMO,00:00:00,2021-02-22,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
21,2021-02-19,2021-02-19,"Accenture Federa/PAYRLL DEPColeman, Walter J",2099.81,Pay Check,Other,00:00:00,2021-02-19,5,Accenture,J,"[Federa/PAYRLL, DEPColeman,, Walter]"
27,2021-02-16,2021-02-16,Midas and groceries FROM: DDXXXX5941,200.0,Extra,Other,00:00:00,2021-02-16,5,Midas,DDXXXX5941,"[and, groceries, FROM:]"
26,2021-02-16,2021-02-16,Midas repair FROM: DDXXXX5941 Default,459.0,Extra,Other,00:00:00,2021-02-16,5,Midas,Default,"[repair, FROM:, DDXXXX5941]"


In [None]:
for cat in df[df['Category'] == 'Default']['Content']:
    print("Content", cat)
    print("Category is: ")
    inp = input()
    df.loc[df['Content'] == cat, 'Category'] = inp

In [66]:
feb_df[feb_df['Category'] == 'Default']

Unnamed: 0,Date,Purchase Date,Content,Amount,Category,Payment_Method,Purchase Time,Verification Date,Content_Word_Length,First_Word,Last_Word,Middle_Words
5,2021-02-23,2021-02-23,VENMO/PAYMENTWALTER COLEMAN Default,-46.41,Default,VENMO,00:00:00,2021-02-23,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
16,2021-02-22,2021-02-22,VENMO/PAYMENTWALTER COLEMAN Default,-91.03,Default,VENMO,00:00:00,2021-02-22,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
48,2021-02-08,2021-02-08,VENMO/PAYMENTWALTER COLEMAN Default,-25.0,Default,VENMO,00:00:00,2021-02-08,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
49,2021-02-08,2021-02-08,VENMO/PAYMENTWALTER COLEMAN Default,-18.03,Default,VENMO,00:00:00,2021-02-08,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]


In [67]:
# 52 internet
# 51 dining out w Eric
# 17 combined utilities (water, trash, sewage)
# 6 electricity
feb_df.loc[48, 'Category'] = 'Dining Out'
feb_df.loc[[5,16,49],'Category'] = 'Utilities'

In [78]:
# Start with February 2021
filename = 'March2021.csv'
mar_df = clean_bank_statement_file(filename)
mar_df = categorize(mar_df)
mar_df[mar_df['Category'] == 'Default']

Unnamed: 0,Date,Purchase Date,Content,Amount,Category,Payment_Method,Purchase Time,Verification Date,Content_Word_Length,First_Word,Last_Word,Middle_Words
3,2021-03-29,2021-03-29,VENMO/PAYMENTWALTER COLEMAN Default,-68.0,Default,VENMO,00:00:00,2021-03-29,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
12,2021-03-25,2021-03-25,VENMO/PAYMENTWALTER COLEMAN Default,-43.21,Default,VENMO,00:00:00,2021-03-25,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
60,2021-03-18,2021-03-18,VENMO/PAYMENTWALTER COLEMAN Default,-2.0,Default,VENMO,00:00:00,2021-03-18,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
59,2021-03-18,2021-03-18,VENMO/PAYMENTWALTER COLEMAN Default,-15.0,Default,VENMO,00:00:00,2021-03-18,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
99,2021-03-01,2021-03-01,VENMO/PAYMENTWALTER COLEMAN Default,-7.0,Default,VENMO,00:00:00,2021-03-01,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
98,2021-03-01,2021-03-01,VENMO/PAYMENTWALTER COLEMAN Default,-2.0,Default,VENMO,00:00:00,2021-03-01,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]


In [21]:
mar_df

Unnamed: 0,Date,Purchase Date,Content,Amount,Category,Payment_Method,Purchase Time,Verification Date,Content_Word_Length,First_Word,Last_Word,Middle_Words
2,2021-03-30,2021-03-30,ARLINGTON COUNTY/ARLCO PMTWALTER Default,-50.0,Misc,Other,00:00:00,2021-03-30,4,ARLINGTON,Default,"[COUNTY/ARLCO, PMTWALTER]"
3,2021-03-29,2021-03-29,VENMO/PAYMENTWALTER COLEMAN Default,-68.0,Default,VENMO,00:00:00,2021-03-29,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
10,2021-03-29,2021-03-29,Wtf FROM: DDXXXX5941 Default,418.59,Extra,Other,00:00:00,2021-03-29,4,Wtf,Default,"[FROM:, DDXXXX5941]"
14,2021-03-25,2021-03-25,USAA PC/PAYMENTWALTER COLEMAN Default,-187.72,USAA Insurance,Other,00:00:00,2021-03-25,4,USAA,Default,"[PC/PAYMENTWALTER, COLEMAN]"
15,2021-03-25,2021-03-25,VENMO/PAYMENTWALTER COLEMAN Default,-845.0,Rent,VENMO,00:00:00,2021-03-25,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
13,2021-03-25,2021-03-25,VENMO/PAYMENTWALTER COLEMAN Default,-43.21,Default,VENMO,00:00:00,2021-03-25,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
20,2021-03-24,2021-03-24,April bills FROM: DDXXXX5941 Default,1436.6,Extra,Other,00:00:00,2021-03-24,5,April,Default,"[bills, FROM:, DDXXXX5941]"
44,2021-03-22,2021-03-22,Test FROM: DDXXXX5941 Default,40.0,Extra,Other,00:00:00,2021-03-22,4,Test,Default,"[FROM:, DDXXXX5941]"
48,2021-03-22,2021-03-22,ATM WD FEE WA2703CHASE 85 PIKE ST SEATTLE WA,-3.0,Misc,Other,00:00:00,2021-03-22,9,ATM,WA,"[WD, FEE, WA2703CHASE, 85, PIKE, ST, SEATTLE]"
41,2021-03-22,2021-03-22,Clarence FROM: DDXXXX5941 Default,200.0,Extra,Other,00:00:00,2021-03-22,4,Clarence,Default,"[FROM:, DDXXXX5941]"


In [26]:
mar_df.drop(mar_df[mar_df['Content'].str.contains('DDXXXX5941')],axis=1,inplace=True)

In [48]:
feb_df

Unnamed: 0,Date,Purchase Date,Content,Amount,Category,Payment_Method,Purchase Time,Verification Date,Content_Word_Length,First_Word,Last_Word,Middle_Words
2,2021-02-24,2021-02-24,VZ WIRELESS VE/VZW WEBPAYWALTER *COLEMAN,-104.2,Utilities,Other,00:00:00,2021-02-24,5,VZ,*COLEMAN,"[WIRELESS, VE/VZW, WEBPAYWALTER]"
3,2021-02-24,2021-02-24,Pillar DC/Pillar DCPILLAR CHURCH OF WASHI,-310.97,Tithe,Other,00:00:00,2021-02-24,6,Pillar,WASHI,"[DC/Pillar, DCPILLAR, CHURCH, OF]"
5,2021-02-23,2021-02-23,VENMO/PAYMENTWALTER COLEMAN Default,-46.41,Utilities,VENMO,00:00:00,2021-02-23,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
15,2021-02-22,2021-02-22,USAA PC/PAYMENTWALTER COLEMAN Default,-187.68,USAA Insurance,Other,00:00:00,2021-02-22,4,USAA,Default,"[PC/PAYMENTWALTER, COLEMAN]"
17,2021-02-22,2021-02-22,VENMO/PAYMENTWALTER COLEMAN Default,-845.0,Rent,VENMO,00:00:00,2021-02-22,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
16,2021-02-22,2021-02-22,VENMO/PAYMENTWALTER COLEMAN Default,-91.03,Utilities,VENMO,00:00:00,2021-02-22,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
20,2021-02-19,2021-02-19,"Accenture Federa/PAYRLL DEPColeman, Walter J",2099.81,Pay Check,Other,00:00:00,2021-02-19,5,Accenture,J,"[Federa/PAYRLL, DEPColeman,, Walter]"
47,2021-02-08,2021-02-08,VZ WIRELESS VE/VZW WEBPAYWALTER *COLEMAN,-79.08,Utilities,Other,00:00:00,2021-02-08,5,VZ,*COLEMAN,"[WIRELESS, VE/VZW, WEBPAYWALTER]"
48,2021-02-08,2021-02-08,VENMO/PAYMENTWALTER COLEMAN Default,-25.0,Dining Out,VENMO,00:00:00,2021-02-08,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
49,2021-02-08,2021-02-08,VENMO/PAYMENTWALTER COLEMAN Default,-18.03,Utilities,VENMO,00:00:00,2021-02-08,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]


In [51]:
feb_df[feb_df['Category'] == 'Misc']

Unnamed: 0,Date,Purchase Date,Content,Amount,Category,Payment_Method,Purchase Time,Verification Date,Content_Word_Length,First_Word,Last_Word,Middle_Words
6,2021-02-23,2020-02-22,PAYPAL *MMAHQ BJJHQ 845-786-1900 NYIN8700,-94.0,Misc,POS PURCHASE Non-PIN,05:24:00,2021-02-23,6,PAYPAL,,"[*MMAHQ, BJJHQ, 845-786-1900, NYIN8700]"
11,2021-02-22,2020-02-21,AMAZON.COM*ZB81U4C73 AM AMZN.COM/BILL WAIN7600,-17.66,Misc,POS PURCHASE Non-PIN,04:45:00,2021-02-22,5,AMAZON.COM*ZB81U4C73,,"[AM, AMZN.COM/BILL, WAIN7600]"
13,2021-02-22,2020-02-20,FAVCHEFENDALKACHE 844-226-2449 AZIN4400,-10.0,Misc,POS PURCHASE Non-PIN,05:17:00,2021-02-22,4,FAVCHEFENDALKACHE,,"[844-226-2449, AZIN4400]"
18,2021-02-19,2020-02-18,PAYPAL *BUSINESSINS TRI 402-935-7733 NYINC000,-1.0,Misc,POS PURCHASE Non-PIN,22:03:00,2021-02-19,6,PAYPAL,,"[*BUSINESSINS, TRI, 402-935-7733, NYINC000]"
31,2021-02-16,2020-02-15,MIDAS. ARLINGTON VA 00,-214.57,Misc,POS PURCHASE Non-PIN,03:41:00,2021-02-16,4,MIDAS.,0.0,"[ARLINGTON, VA]"
25,2021-02-16,2020-02-13,MIDAS. ARLINGTON VA 00,-100.0,Misc,POS PURCHASE Non-PIN,16:26:00,2021-02-16,4,MIDAS.,0.0,"[ARLINGTON, VA]"
44,2021-02-08,2020-02-08,AMAZON.COM*EX1UJ8P23 AM AMZN.COM/BILL WAIN7400,-28.61,Misc,POS PURCHASE Non-PIN,10:55:00,2021-02-08,5,AMAZON.COM*EX1UJ8P23,,"[AM, AMZN.COM/BILL, WAIN7400]"
43,2021-02-08,2020-02-06,GEORGE MASON LIBERTY ARLINGTON VAGGML00,-38.33,Misc,POS PURCHASE Non-PIN,12:52:00,2021-02-08,6,GEORGE,,"[MASON, LIBERTY, ARLINGTON, VAGGML00]"
40,2021-02-08,2020-02-06,Person-to-Person TransferCASH APP*MICHEE MUT 8774174551 CAIN1070,-12.0,Misc,Other,05:34:00,2021-02-08,7,Person-to-Person,,"[TransferCASH, APP*MICHEE, MUT, 8774174551, CAIN1070]"
35,2021-02-08,2020-02-05,PAYPAL *AIRBNB HM2BPSC 4029357733 CAIN7500,-369.74,Misc,POS PURCHASE Non-PIN,19:01:00,2021-02-08,6,PAYPAL,,"[*AIRBNB, HM2BPSC, 4029357733, CAIN7500]"


In [68]:
feb_df.loc[[7,29,28,26,32,33,45,41,42,38,65,56,57],'Category'] = 'Dining Out'

In [69]:
feb_df.loc[12,'Category'] = 'Entertainment'

In [70]:
feb_df.loc[43,'Category'] = 'Gas'

In [54]:
feb_df

Unnamed: 0,Date,Purchase Date,Content,Amount,Category,Payment_Method,Purchase Time,Verification Date,Content_Word_Length,First_Word,Last_Word,Middle_Words
2,2021-02-24,2021-02-24,VZ WIRELESS VE/VZW WEBPAYWALTER *COLEMAN,-104.2,Utilities,Other,00:00:00,2021-02-24,5,VZ,*COLEMAN,"[WIRELESS, VE/VZW, WEBPAYWALTER]"
3,2021-02-24,2021-02-24,Pillar DC/Pillar DCPILLAR CHURCH OF WASHI,-310.97,Tithe,Other,00:00:00,2021-02-24,6,Pillar,WASHI,"[DC/Pillar, DCPILLAR, CHURCH, OF]"
5,2021-02-23,2021-02-23,VENMO/PAYMENTWALTER COLEMAN Default,-46.41,Utilities,VENMO,00:00:00,2021-02-23,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
15,2021-02-22,2021-02-22,USAA PC/PAYMENTWALTER COLEMAN Default,-187.68,USAA Insurance,Other,00:00:00,2021-02-22,4,USAA,Default,"[PC/PAYMENTWALTER, COLEMAN]"
17,2021-02-22,2021-02-22,VENMO/PAYMENTWALTER COLEMAN Default,-845.0,Rent,VENMO,00:00:00,2021-02-22,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
16,2021-02-22,2021-02-22,VENMO/PAYMENTWALTER COLEMAN Default,-91.03,Utilities,VENMO,00:00:00,2021-02-22,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
20,2021-02-19,2021-02-19,"Accenture Federa/PAYRLL DEPColeman, Walter J",2099.81,Pay Check,Other,00:00:00,2021-02-19,5,Accenture,J,"[Federa/PAYRLL, DEPColeman,, Walter]"
47,2021-02-08,2021-02-08,VZ WIRELESS VE/VZW WEBPAYWALTER *COLEMAN,-79.08,Utilities,Other,00:00:00,2021-02-08,5,VZ,*COLEMAN,"[WIRELESS, VE/VZW, WEBPAYWALTER]"
48,2021-02-08,2021-02-08,VENMO/PAYMENTWALTER COLEMAN Default,-25.0,Dining Out,VENMO,00:00:00,2021-02-08,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
49,2021-02-08,2021-02-08,VENMO/PAYMENTWALTER COLEMAN Default,-18.03,Utilities,VENMO,00:00:00,2021-02-08,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]


In [75]:
feb_df.groupby('Category').sum()

Unnamed: 0_level_0,Amount,Content_Word_Length
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Car Misc,-314.57,8
Dining Out,-444.26,176
Entertainment,-40.98,15
Gas,-78.86,12
Groceries,-283.7,56
Gym,-180.95,11
Misc,-793.79,57
Pay Check,4037.43,10
Phone,-183.28,10
Rent,-845.0,3


In [74]:
feb_df.to_excel('FebruaryFormatted.xlsx',index=False)

In [72]:
feb_df.loc[feb_df['Content'].str.contains('MIDAS',na=False), 'Category'] = 'Car Misc'

In [73]:
feb_df[feb_df['Category'] == 'Utilities']

Unnamed: 0,Date,Purchase Date,Content,Amount,Category,Payment_Method,Purchase Time,Verification Date,Content_Word_Length,First_Word,Last_Word,Middle_Words
5,2021-02-23,2021-02-23,VENMO/PAYMENTWALTER COLEMAN Default,-46.41,Utilities,VENMO,00:00:00,2021-02-23,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
16,2021-02-22,2021-02-22,VENMO/PAYMENTWALTER COLEMAN Default,-91.03,Utilities,VENMO,00:00:00,2021-02-22,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
49,2021-02-08,2021-02-08,VENMO/PAYMENTWALTER COLEMAN Default,-18.03,Utilities,VENMO,00:00:00,2021-02-08,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]


In [76]:
feb_df[feb_df['Amount'] > 0]

Unnamed: 0,Date,Purchase Date,Content,Amount,Category,Payment_Method,Purchase Time,Verification Date,Content_Word_Length,First_Word,Last_Word,Middle_Words
20,2021-02-19,2021-02-19,"Accenture Federa/PAYRLL DEPColeman, Walter J",2099.81,Pay Check,Other,00:00:00,2021-02-19,5,Accenture,J,"[Federa/PAYRLL, DEPColeman,, Walter]"
50,2021-02-05,2021-02-05,"Accenture Federa/PAYRLL DEPColeman, Walter J",1937.62,Pay Check,Other,00:00:00,2021-02-05,5,Accenture,J,"[Federa/PAYRLL, DEPColeman,, Walter]"


In [81]:
mar_df[mar_df['Amount'] > 0]['Amount'].sum()

5291.23

In [82]:
mar_df['Amount'].sum()

2015.4100000000012

In [83]:
mar_df.groupby('Category').sum()

Unnamed: 0_level_0,Amount,Content_Word_Length
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Default,-137.21,18
Dining Out,-196.15,81
Entertainment,-4.99,7
Extra,1400.0,5
Groceries,-308.69,66
Gym,-21.95,5
Misc,-1424.47,382
Pay Check,3875.23,10
Rent,-845.0,3
Tolls/Uber/Metro/Parking,-70.56,40


In [4]:
mar_df.head()

Unnamed: 0,Date,Purchase Date,Content,Amount,Category,Payment_Method,Purchase Time,Verification Date,Content_Word_Length,First_Word,Last_Word,Middle_Words
2,2021-03-30,2021-03-30,ARLINGTON COUNTY/ARLCO PMTWALTER Default,-50.0,Misc,Other,00:00:00,2021-03-30,4,ARLINGTON,Default,"[COUNTY/ARLCO, PMTWALTER]"
3,2021-03-29,2021-03-29,VENMO/PAYMENTWALTER COLEMAN Default,-68.0,Default,VENMO,00:00:00,2021-03-29,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
14,2021-03-25,2021-03-25,VENMO/PAYMENTWALTER COLEMAN Default,-845.0,Rent,VENMO,00:00:00,2021-03-25,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
12,2021-03-25,2021-03-25,VENMO/PAYMENTWALTER COLEMAN Default,-43.21,Default,VENMO,00:00:00,2021-03-25,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
13,2021-03-25,2021-03-25,USAA PC/PAYMENTWALTER COLEMAN Default,-187.72,USAA Insurance,Other,00:00:00,2021-03-25,4,USAA,Default,"[PC/PAYMENTWALTER, COLEMAN]"


In [5]:
mar_df['Amount'].sum()

2015.4100000000012

In [6]:
mar_df[mar_df['Category'] == 'Default']

Unnamed: 0,Date,Purchase Date,Content,Amount,Category,Payment_Method,Purchase Time,Verification Date,Content_Word_Length,First_Word,Last_Word,Middle_Words
3,2021-03-29,2021-03-29,VENMO/PAYMENTWALTER COLEMAN Default,-68.0,Default,VENMO,00:00:00,2021-03-29,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
12,2021-03-25,2021-03-25,VENMO/PAYMENTWALTER COLEMAN Default,-43.21,Default,VENMO,00:00:00,2021-03-25,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
60,2021-03-18,2021-03-18,VENMO/PAYMENTWALTER COLEMAN Default,-2.0,Default,VENMO,00:00:00,2021-03-18,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
59,2021-03-18,2021-03-18,VENMO/PAYMENTWALTER COLEMAN Default,-15.0,Default,VENMO,00:00:00,2021-03-18,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
99,2021-03-01,2021-03-01,VENMO/PAYMENTWALTER COLEMAN Default,-7.0,Default,VENMO,00:00:00,2021-03-01,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
98,2021-03-01,2021-03-01,VENMO/PAYMENTWALTER COLEMAN Default,-2.0,Default,VENMO,00:00:00,2021-03-01,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]


In [8]:
mar_df.to_excel('March21_NeedBettCatzn.xlsx',index='False')

In [1]:
import importlib

In [3]:
importlib.import_module('Fin_Statement.py')

NameError: name 'null' is not defined

In [5]:
filename = 'May15_2021_Checking.csv'

In [6]:
df = clean_bank_statement_file(filename)

In [7]:
df = categorize(df)

In [8]:
df

Unnamed: 0,Date,Purchase Date,Content,Amount,Category,Payment_Method,Purchase Time,Verification Date,Content_Word_Length,First_Word,Last_Word,Middle_Words
0,2021-05-14,2021-05-14,VENMO/PAYMENTWALTER COLEMAN Default,-15.0,Default,VENMO,00:00:00,2021-05-14,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
5,2021-05-13,2021-05-13,VENMO/PAYMENTWALTER COLEMAN Default,-99.29,Default,VENMO,00:00:00,2021-05-13,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
18,2021-05-07,2021-05-07,Pillar DC/Pillar DCPILLAR CHURCH OF WASHI,-310.97,Tithe,Other,00:00:00,2021-05-07,6,Pillar,WASHI,"[DC/Pillar, DCPILLAR, CHURCH, OF]"
22,2021-05-06,2021-05-06,"Accenture Federa/PAYRLL DEPColeman, Walter J",2099.81,Pay Check,Other,00:00:00,2021-05-06,5,Accenture,J,"[Federa/PAYRLL, DEPColeman,, Walter]"
20,2021-05-06,2021-05-06,VENMO/PAYMENTWALTER COLEMAN Default,-15.0,Default,VENMO,00:00:00,2021-05-06,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
29,2021-05-03,2021-05-03,VENMO/PAYMENTWALTER COLEMAN Default,-8.0,Default,VENMO,00:00:00,2021-05-03,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
4,2021-05-14,2020-05-14,HARRIS TEETER #3 950 S ARLINGTON VA999999,-23.91,Groceries,POS PURCHASE with PIN,10:07:00,2021-05-14,8,HARRIS,,"[TEETER, #3, 950, S, ARLINGTON, VA999999]"
1,2021-05-14,2020-05-13,INOVA - HEALTH CARE SER FALLS CHURCH VA771520,-50.0,Misc,POS PURCHASE Non-PIN,02:58:00,2021-05-14,9,INOVA,,"[-, HEALTH, CARE, SER, FALLS, CHURCH, VA771520]"
2,2021-05-14,2020-05-13,TST* TORTAS Y TACOS LA ARLINGTON VA648915,-14.21,Misc,POS PURCHASE Non-PIN,00:35:00,2021-05-14,8,TST*,,"[TORTAS, Y, TACOS, LA, ARLINGTON, VA648915]"
3,2021-05-14,2020-05-13,ARLINGTON METER PARKING ARLINGTON VA694452,-2.25,Tolls/Uber/Metro/Parking,POS PURCHASE Non-PIN,23:39:00,2021-05-14,6,ARLINGTON,,"[METER, PARKING, ARLINGTON, VA694452]"


In [9]:
df[df['Category'] == 'Default']

Unnamed: 0,Date,Purchase Date,Content,Amount,Category,Payment_Method,Purchase Time,Verification Date,Content_Word_Length,First_Word,Last_Word,Middle_Words
0,2021-05-14,2021-05-14,VENMO/PAYMENTWALTER COLEMAN Default,-15.0,Default,VENMO,00:00:00,2021-05-14,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
5,2021-05-13,2021-05-13,VENMO/PAYMENTWALTER COLEMAN Default,-99.29,Default,VENMO,00:00:00,2021-05-13,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
20,2021-05-06,2021-05-06,VENMO/PAYMENTWALTER COLEMAN Default,-15.0,Default,VENMO,00:00:00,2021-05-06,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]
29,2021-05-03,2021-05-03,VENMO/PAYMENTWALTER COLEMAN Default,-8.0,Default,VENMO,00:00:00,2021-05-03,3,VENMO/PAYMENTWALTER,Default,[COLEMAN]


In [11]:
df[df['Category'] == 'Dining Out']['Amount'].sum()

-93.43

In [12]:
df.to_excel('Fix_real_quick.xlsx',index=False)

In [14]:
df = pd.read_excel('Fix_real_quick.xlsx')

In [15]:
df.groupby('Category').sum()

Unnamed: 0_level_0,Amount,Content_Word_Length
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Dining Out,-227.99,87
Gas,-45.28,6
Groceries,-175.62,43
Gym,-21.95,5
Medical,-100.0,18
Misc,-297.04,22
Pay Check,2099.81,5
Tithe,-310.97,6
Tolls/Uber/Metro/Parking,-2.7,11
Utilities,-129.29,9


In [16]:
15*4

60

In [17]:
60+(60*.1)

66.0

In [19]:
66*(1.2)

79.2