In [1]:
# https://www.kaggle.com/shakedzy/alone-in-the-woods-using-theil-s-u-for-survival
# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
# https://towardsdatascience.com/random-forest-in-python-24d0893d51c0

import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import math
from collections import Counter
# import numpy as np
import seaborn as sns
# import pandas as pd
import scipy.stats as ss
# import matplotlib.pyplot as plt
import sklearn.preprocessing as sp
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from subprocess import check_output


#Dataframe Visual Settings
pd.set_option('display.max_rows',None)
pd.set_option('display.max_rows',None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth',400)

def PIN_col(x):
    if("POS PURCHASE Non-PIN" in x):
        return "POS PURCHASE Non-PIN"
    elif("POS PURCHASE with PIN" in x):
        return "POS PURCHASE with PIN"
    elif("VENMO" in x):
        return "VENMO"
    else:
        return "Other"
    
def remove_payment_method(x):
    if("POS PURCHASE Non-PIN" in x):
        return x.split("POS PURCHASE Non-PIN")[1]
    elif("POS PURCHASE with PIN" in x):
        return x.split("POS PURCHASE with PIN")[1]
    else:
        return x

def middle_words(x):
    if(len(x) > 2):
        return x[1:-1]
    else:
        return "N/A"

def clean_bank_statement_file(filename):
    """
    Extract, modify & clean bank statement raw file for neater format
    """
    #Cleaning data
    # csv_file = '\Bank Statements\'' + filename
    df = pd.read_csv(filename,
                     usecols=['Transaction Number','Date','Memo','Amount Debit','Amount Credit'],
                     skiprows=3)
    
    df.fillna({'Amount Debit':0, 'Amount Credit':0},inplace=True)
    df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
    df.sort_values(by=['Date'],ascending=False,ignore_index=True,inplace=True)
    df['Memo'].fillna('Default',inplace=True)
    df['Transaction Number'] = df['Transaction Number'].apply(lambda x: x.split('**')[1])
    df['Purchase Date'] = df[df['Memo'].str.contains('1574 ',na=False)]['Memo']\
        .apply(lambda x: x.split('1574 ')[1])
    df['Purchase Date'] = pd.to_datetime(df['Purchase Date'],format = '%m/%d %H:%M')
    df['Purchase Date'] = df['Purchase Date']+pd.DateOffset(years=120)
    df.loc[df['Purchase Date'].isna(),'Purchase Date'] = df['Date'].copy()
    df['Memo'] = df['Memo'].apply(lambda x: x.split('*****')[0])
    df['Content'] = df['Transaction Number'] + ' ' + df['Memo']
    df['Amount'] = df['Amount Debit'].astype(float) + df['Amount Credit'].astype(float)
    df['Category'] = 'Default'
    df['Payment_Method'] = df['Content'].apply(lambda x: PIN_col(x))
    df["Content"] = df["Content"].apply(lambda x: remove_payment_method(x))
    df['Purchase Time'] = [d.time() for d in df['Purchase Date']]
    df['Purchase Date'] = [d.date() for d in df['Purchase Date']]
    df['Verification Date'] = df['Date'].copy()
    df['Content_Word_Length'] = df['Content'].apply(lambda x: len(x.split(' ')))
    df['First_Word'] = df['Content'].apply(lambda x: x.split(' ')[0])
    df['Last_Word'] = df['Content'].apply(lambda x: x.split(' ')[-1])
    df['Middle_Words'] = df['Content'].apply(lambda x: middle_words(x.split(' ')))
    df.drop(df[df['Content'].str.contains("INTERNET TRANSFER",na=False)].index,inplace=True)
    df.drop(df[df['Content'].str.contains('DDXXXX5941',na=False)].index, inplace=True)
    df.reset_index(drop=True,inplace=True)
    df.drop(['Transaction Number','Memo','Amount Debit','Amount Credit'],
            axis=1,inplace=True)
    df.sort_values(by=['Purchase Date'],ascending=False, inplace=True)
    return df

def categorize(df):
    """
    For common transactions, categorize these early, and use to train a model
    """
    #Easier Categorization
    df.loc[df['Content'].str.contains("UBER EATS",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("HARRIS",na=False),'Category'] = 'Groceries'
    df.loc[df['Content'].str.contains("GIANT",na=False),'Category'] = 'Groceries'
    df.loc[df['Content'].str.contains("USAA",na=False),'Category'] = 'USAA Insurance'
    df.loc[df['Content'].str.contains("Accenture",na=False),'Category'] = 'Pay Check'
    df.loc[df['Content'].str.contains("XSPORT",na=False),'Category'] = 'Gym'
    df.loc[df['Content'].str.contains("DISTRICT MARTIAL ARTS",na=False),'Category'] = 'Gym'
    df.loc[df['Content'].str.contains("PARKING",na=False),'Category'] = 'Tolls/Uber/Metro/Parking'
    df.loc[df['Content'].str.contains("NAZRET",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("TAJ OF INDIA",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("DCPILLAR",na=False),'Category'] = 'Tithe'
    df.loc[df['Content'].str.contains("GOOGLE",na=False),'Category'] = 'Entertainment'
    df.loc[df['Content'].str.contains("VENMO/CASHOUT",na=False),'Category'] = 'Venmo Extra'
    df.loc[df['Content'].str.contains("CITGO",na=False),'Category'] = 'Gas'
    df.loc[df['Content'].str.contains("SHELL",na=False),'Category'] = 'Gas'
    df.loc[df['Content'].str.contains("PUPATELLA",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("GOOD COMPANY DONUT",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("STARBUCKS",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("UBER TRIP",na=False),'Category'] = 'Tolls/Uber/Metro/Parking'
    df.loc[df['Content'].str.contains("VERIZON",na=False),'Category'] = 'Utilities'
    df.loc[df['Content'].str.contains("WASHINGTON GAS",na=False),'Category'] = 'Utilities'
    df.loc[df['Content'].str.contains("ENERGY",na=False),'Category'] = 'Utilities'
    df.loc[df['Content'].str.contains("TOM COLEMAN",na=False),'Category'] = 'Phone'
    df.loc[df['Content'].str.contains("STDNT LOAN",na=False),'Category'] = 'Student Loans'
    df.loc[(df['Content'].str.contains("VENMO/PAYMENTWALTER COLEMAN Default",na=False)) &
           (df['Amount'] == -845),'Category'] = 'Rent'
    df.loc[df['Content'].str.contains("Margaret Coleman",na=False),'Category'] = 'Extra'
    df.loc[df['Content'].str.contains("Person-to-Person TransferPAYPAL",na=False),'Category'] = 'Extra'
    df.loc[df['Content'].str.contains("Tortas y Tacos",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("CROWNE PLAZA",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("CROWNE PLAZA",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("Emmaus Family Couns",na=False),'Category'] = 'Medical'
    df.loc[df['Content'].str.contains("ADVANCED HEALTH CARE",na=False),'Category'] = 'Medical'
    df.loc[df['Content'].str.contains("AMZN Mktp",na=False),'Category'] = 'Misc'
    df.loc[df['Content'].str.contains("Amazon web services",na=False),'Category'] = 'Misc'
    df.loc[df['Content'].str.contains("ALDI",na=False),'Category'] = 'Groceries'
    df.loc[df['Content'].str.contains("FOOD LION",na=False),'Category'] = 'Groceries'
    df.loc[df['Content'].str.contains("Audible",na=False),'Category'] = 'Entertainment'
    df.loc[df['Content'].str.contains("PIZZA",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("CROWNE PLAZA",na=False),'Category'] = 'Dining Out'
    #set positive defaults to Misc
    df.loc[(df['Category'] == 'Default') & (df['Amount'] > 0), 'Category'] = 'Extra'
    df.loc[(df['Purchase Time'] == dt.time(0,0,0)) & (df['Category'] == 'Default') & \
             (df['First_Word'] != 'VENMO/PAYMENTWALTER'), 'Category'] = 'Misc'
    df.loc[(df['Category'] == 'Default') & (df['First_Word'] == 'SQ'), 'Category'] = 'Dining Out'
    df.loc[(df['Category'] == "Default") & (df["First_Word"] != "VENMO/PAYMENTWALTER"), "Category"] = "Misc"
    df.loc[df['Content'].str.contains("Pizza",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("Amzn",na=False),'Category'] = 'Misc'
    df.loc[df['Content'].str.contains("Pollo",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("VZ WIRELESS",na=False),'Category'] = 'Phone'
    df.loc[df['Content'].str.contains("PARKMOBILE",na=False),'Category'] = 'Tolls/Uber/Metro/Parking'
    return df 

def create_grand_file(directory):
    """
    Extract based on string file names
    """
    grand_df = pd.DataFrame()
    temp_df = pd.DataFrame()

    #Get list of files
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            temp_df = clean_bank_statement_file(filename)
            temp_df = categorize(temp_df)
            grand_df = pd.concat([grand_df,temp_df],ignore_index=True)
    
    return grand_df

In [3]:
mar_df = pd.read_excel('March21_NeedBettCatzn.xlsx')

In [5]:
mar_df.drop('Unnamed: 0', axis=1, inplace=True)

In [6]:
mar_df.head(5)

Unnamed: 0,Date,Purchase Date,Content,Amount,Category,Payment_Method,Purchase Time,Verification Date,Content_Word_Length,First_Word,Last_Word,Middle_Words
0,2021-03-01,2020-02-26,SUPER POLLO CHARCOAL CH ARLINGTON VAIN7400,-12.08,Dining Out,POS PURCHASE Non-PIN,23:39:00,2021-03-01,7,SUPER,,"['POLLO', 'CHARCOAL', 'CH', 'ARLINGTON', 'VAIN7400']"
1,2021-03-01,2020-02-27,HARRIS TEETER #3 950 S ARLINGTON VA999999,-19.86,Groceries,POS PURCHASE with PIN,16:26:00,2021-03-01,8,HARRIS,,"['TEETER', '#3', '950', 'S', 'ARLINGTON', 'VA999999']"
2,2021-03-01,2020-02-28,HARRIS TEETER #3 950 S ARLINGTON VA999999,-16.18,Groceries,POS PURCHASE with PIN,14:43:00,2021-03-01,8,HARRIS,,"['TEETER', '#3', '950', 'S', 'ARLINGTON', 'VA999999']"
3,2021-03-01,2020-02-28,HARRIS TEETER #0 4250 C ARLINGTON VA999999,-27.93,Groceries,POS PURCHASE with PIN,14:17:00,2021-03-01,8,HARRIS,,"['TEETER', '#0', '4250', 'C', 'ARLINGTON', 'VA999999']"
4,2021-03-02,2020-02-28,NEW ANNANGOL ANNANDALE VA IN6600,-39.42,Dining Out,POS PURCHASE Non-PIN,21:19:00,2021-03-02,5,NEW,IN6600,"['ANNANGOL', 'ANNANDALE', 'VA']"


In [7]:
mar_df[mar_df['Category'] == 'Default']

Unnamed: 0,Date,Purchase Date,Content,Amount,Category,Payment_Method,Purchase Time,Verification Date,Content_Word_Length,First_Word,Last_Word,Middle_Words
91,2021-03-01,2021-03-01,VENMO/PAYMENTWALTER COLEMAN Default,-7.0,Default,VENMO,00:00:00,2021-03-01,3,VENMO/PAYMENTWALTER,Default,['COLEMAN']
92,2021-03-01,2021-03-01,VENMO/PAYMENTWALTER COLEMAN Default,-2.0,Default,VENMO,00:00:00,2021-03-01,3,VENMO/PAYMENTWALTER,Default,['COLEMAN']
96,2021-03-18,2021-03-18,VENMO/PAYMENTWALTER COLEMAN Default,-2.0,Default,VENMO,00:00:00,2021-03-18,3,VENMO/PAYMENTWALTER,Default,['COLEMAN']
97,2021-03-18,2021-03-18,VENMO/PAYMENTWALTER COLEMAN Default,-15.0,Default,VENMO,00:00:00,2021-03-18,3,VENMO/PAYMENTWALTER,Default,['COLEMAN']
100,2021-03-25,2021-03-25,VENMO/PAYMENTWALTER COLEMAN Default,-43.21,Default,VENMO,00:00:00,2021-03-25,3,VENMO/PAYMENTWALTER,Default,['COLEMAN']
103,2021-03-29,2021-03-29,VENMO/PAYMENTWALTER COLEMAN Default,-68.0,Default,VENMO,00:00:00,2021-03-29,3,VENMO/PAYMENTWALTER,Default,['COLEMAN']


In [8]:
# 2 Oats
# 7 Pizza
# 2 Kitchen supplies
# 15 Internet
# 43.21 Electric
# 68 Pinemoor

In [9]:
mar_df.loc[[91,103],'Category'] = 'Dining Out'

In [10]:
mar_df.loc[[97,100],'Category'] = 'Utilities'

In [11]:
mar_df.loc[[96,92],'Category'] = 'Groceries'

In [13]:
mar_df.groupby('Category').sum()

Unnamed: 0_level_0,Amount,Content_Word_Length
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Dining Out,-828.74,273
Entertainment,-204.84,21
Extra,1400.0,5
Gas,-94.59,11
Groceries,-365.98,104
Gym,-21.95,5
Medical,-100.0,16
Misc,-126.64,34
Pay Check,3875.23,10
Rent,-845.0,3


In [15]:
apr_df = clean_bank_statement_file('April21Check.csv')

In [16]:
apr_df = categorize(apr_df)

In [17]:
apr_df.groupby('Category').sum()

Unnamed: 0_level_0,Amount,Content_Word_Length
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Default,-66.26,12
Dining Out,-49.77,24
Entertainment,-4.99,7
Extra,100.0,11
Gas,-45.6,6
Groceries,-364.24,77
Gym,-21.95,21
Misc,-1725.64,210
Pay Check,5222.44,15
Rent,-845.0,3


In [18]:
apr_df['Amount'].sum()

1739.2499999999998

In [19]:
apr_df.to_excel('Apr21_NedBetCat.xlsx',index=False)

In [21]:
apr_df = pd.read_excel('Apr21_NedBetCat.xlsx')

In [22]:
apr_df.head(5)

Unnamed: 0,Date,Purchase Date,Content,Amount,Category,Payment_Method,Purchase Time,Verification Date,Content_Word_Length,First_Word,Last_Word,Middle_Words
0,2021-04-26,2021-04-26,VENMO/PAYMENTWALTER COLEMAN Default,-2.0,Groceries,VENMO,00:00:00,2021-04-26,3,VENMO/PAYMENTWALTER,Default,['COLEMAN']
1,2021-04-22,2021-04-22,VENMO/PAYMENTWALTER COLEMAN Default,-34.74,Utilities,VENMO,00:00:00,2021-04-22,3,VENMO/PAYMENTWALTER,Default,['COLEMAN']
2,2021-04-19,2021-04-19,VENMO/PAYMENTWALTER COLEMAN Default,-14.52,Dining Out,VENMO,00:00:00,2021-04-19,3,VENMO/PAYMENTWALTER,Default,['COLEMAN']
3,2021-04-12,2021-04-12,VENMO/PAYMENTWALTER COLEMAN Default,-15.0,Internet,VENMO,00:00:00,2021-04-12,3,VENMO/PAYMENTWALTER,Default,['COLEMAN']
4,2021-04-19,2020-04-16,NAZRET RESTAURANT 703-3479911 VA768720,-18.42,Dining Out,POS PURCHASE Non-PIN,13:42:00,2021-04-19,5,NAZRET,,"['RESTAURANT', '703-3479911', 'VA768720']"


In [23]:
apr_df.groupby('Category').sum()

Unnamed: 0_level_0,Amount,Content_Word_Length
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Dining Out,-344.24,117
Entertainment,-566.46,39
Extra,259.0,21
Gas,-170.25,21
Groceries,-420.5,94
Gym,-180.95,11
Internet,-15.0,3
Medical,-50.0,8
Misc,-543.19,32
Pay Check,5222.44,15
