In [35]:
import pandas as pd
import os

#Dataframe Visual Settings
pd.set_option('display.max_rows',None)
pd.set_option('display.max_rows',None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth',400)

In [36]:
payment_methods = ('POS PURCHASE Non-PIN','POS PURCHASE with PIN','Person-to-Person Transfer','VENMO')
budget_cells = {'Student Loans':'C2','Car Payment':'C3','USAA Insurance':'C4','Rent':'C5','Phone':'C6',
               'Gas':'C7','Utilities':'C8','Tolls/Uber/Metro/Parking':'C9','Dining Out':'C10','Groceries':'C11',
               'Entertainment':'C12','Tithe':'C13','Medical':'C14','401k':'C15','Maintenance Service':'C16',
               'Gym':'C17','Misc':'C18','Taxes':'C20','Paycheck':'A24','Extra':'B24'}

In [37]:
def clean_bank_statement_file(filename):
    """
    Extract, modify & clean bank statement raw file for neater format
    """
    df = pd.read_csv(filename,
                     usecols=['Transaction Number','Date','Memo','Amount Debit','Amount Credit'],
                     skiprows=3)
    
    df.fillna({'Amount Debit':0, 'Amount Credit':0},inplace=True)
    df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
    df['Bank Date'] = df['Date'].copy()
    df.sort_values(by=['Date'],ascending=False,ignore_index=True,inplace=True)
    df['Memo'].fillna('Default',inplace=True)
    df['Transaction Number'] = df['Transaction Number'].apply(lambda x: x.split('**')[1])
    df['Purchase Date'] = df[df['Memo'].str.contains('1574 ',na=False)]['Memo']\
        .apply(lambda x: x.split('1574 ')[1])
    df['Purchase Date'] = pd.to_datetime(df['Purchase Date'],format = '%m/%d %H:%M')
    df['Purchase Date'] = df['Purchase Date']+pd.DateOffset(years=120)
    df.loc[df['Purchase Date'].isna(),'Purchase Date'] = df['Date'].copy()
    df['Memo'] = df['Memo'].apply(lambda x: x.split('*****')[0])
    df['Content'] = df['Transaction Number'] + ' ' + df['Memo']
    df['Amount'] = df['Amount Debit'].astype(float) + df['Amount Credit'].astype(float)
    df['Category'] = 'Default'
    df.drop(df[df['Content'].str.contains("INTERNET TRANSFER",na=False)].index,inplace=True)
    df.reset_index(drop=True,inplace=True)
    df.drop(['Transaction Number','Memo','Amount Debit','Amount Credit','Date'],
            axis=1,inplace=True)
    df.sort_values(by=['Purchase Date'],ascending=False, inplace=True)
    return df

In [39]:
def categorize(df):
    #Easier Categorization
    df.loc[df['Content'].str.contains("UBER EATS",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("HARRIS",na=False),'Category'] = 'Groceries'
    df.loc[df['Content'].str.contains("GIANT",na=False),'Category'] = 'Groceries'
    df.loc[df['Content'].str.contains("USAA",na=False),'Category'] = 'USAA Insurance'
    df.loc[df['Content'].str.contains("Accenture",na=False),'Category'] = 'Pay Check'
    df.loc[df['Content'].str.contains("XSPORT",na=False),'Category'] = 'Gym'
    df.loc[df['Content'].str.contains("DISTRICT MARTIAL ARTS",na=False),'Category'] = 'Gym'
    df.loc[df['Content'].str.contains("PARKING",na=False),'Category'] = 'Tolls/Uber/Metro/Parking'
    df.loc[df['Content'].str.contains("NAZRET",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("TAJ OF INDIA",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("DCPILLAR",na=False),'Category'] = 'Tithe'
    df.loc[df['Content'].str.contains("GOOGLE",na=False),'Category'] = 'Entertainment'
    df.loc[df['Content'].str.contains("VENMO/CASHOUT",na=False),'Category'] = 'Venmo Extra'
    df.loc[df['Content'].str.contains("CITGO",na=False),'Category'] = 'Gas'
    df.loc[df['Content'].str.contains("SHELL",na=False),'Category'] = 'Gas'
    df.loc[df['Content'].str.contains("PUPATELLA",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("GOOD COMPANY DONUT",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("STARBUCKS",na=False),'Category'] = 'Dining Out'
    df.loc[df['Content'].str.contains("UBER TRIP",na=False),'Category'] = 'Tolls/Uber/Metro/Parking'
    df.loc[df['Content'].str.contains("VERIZON",na=False),'Category'] = 'Utilities'
    df.loc[df['Content'].str.contains("WASHINGTON GAS",na=False),'Category'] = 'Utilities'
    df.loc[df['Content'].str.contains("ENERGY",na=False),'Category'] = 'Utilities'
    df.loc[df['Content'].str.contains("TOM COLEMAN",na=False),'Category'] = 'Phone'
    df.loc[df['Content'].str.contains("STDNT LOAN",na=False),'Category'] = 'Student Loans'
    df.loc[(df['Content'].str.contains("VENMO/PAYMENTWALTER COLEMAN Default",na=False)) &
           (df['Amount'] == -668.75),'Category'] = 'Rent'
    df.loc[df['Content'].str.contains("Margaret Coleman",na=False),'Category'] = 'Extra'
    df.loc[df['Content'].str.contains("Person-to-Person TransferPAYPAL",na=False),'Category'] = 'Extra'
    df.loc[df['Content'].str.contains("Emmaus Family Couns",na=False),'Category'] = 'Medical'
    df.loc[df['Content'].str.contains("AMAZON.COM",na=False),'Category'] = 'Misc'
    df.loc[df['Content'].str.contains("ADVANCED HEALTH CARE",na=False),'Category'] = 'Medical'
    df.loc[df['Content'].str.contains("Audible",na=False),'Category'] = 'Entertainment'
    df.loc[df['Content'].str.contains("Amazon web services",na=False),'Category'] = 'Misc'
    return df

In [40]:
directory = r'C:\Users\waltj\OneDrive\BankParser'

In [41]:
df_list = []
#Get list of files
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        df_list.append(filename)
df_list

['AprilChecking.csv',
 'AugustChecking.csv',
 'JulyChecking.csv',
 'JuneChecking.csv',
 'MarchChecking.csv',
 'MayChecking.csv',
 'NovemberChecking20.csv',
 'OctoberChecking.csv',
 'SeptemberChecking.csv']

In [42]:
#Take each file, and clean it into proper format
#Concatentate each file

grand_df = clean_bank_statement_file(df_list[0])

df_list = df_list[1:]

for file in df_list:
    temp_df = clean_bank_statement_file(file)
    grand_df = pd.concat([grand_df,temp_df],ignore_index=True)

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
grand_df = categorize(grand_df)

In [11]:
grand_df[grand_df['Category'] == 'Default']

Unnamed: 0,Date,Purchase Date,Content,Amount,Category
9,2020-04-23,2020-04-22 19:30:00,POS PURCHASE Non-PINVTG*Emmaus Family Couns 703-7292822 VA000000,-125.0,Default
16,2020-04-21,2020-04-21 01:58:00,POS PURCHASE Non-PINLCA*LABCORP 8008456167 800-845-6167 NCIN7200,-21.82,Default
33,2020-04-17,2020-04-17 00:00:00,"IRS TREAS 310/TAX REFCOLEMAN, WALTER J",2439.0,Default
35,2020-04-16,2020-04-16 10:50:00,POS PURCHASE Non-PINAudible*362757T83 Amzn.com/bill NJIN2000,-14.95,Default
36,2020-04-16,2020-04-16 00:00:00,Mobile Check Deposit Default,126.0,Default
42,2020-04-13,2020-04-12 18:30:00,POS PURCHASE Non-PINSQ *IDIDOS SOCIAL HOUSE Arlington VAINB900,-5.28,Default
46,2020-04-10,2020-04-10 00:00:00,FUNDRISE/2025840550WALTER COLEMAN,1.53,Default
47,2020-04-10,2020-04-10 00:00:00,FUNDRISE GROWTH/2025840550WALTER COLEMAN,0.43,Default
48,2020-04-13,2020-04-09 19:18:00,POS PURCHASE Non-PINVTG*Emmaus Family Couns 703-7292822 VA000000,-125.0,Default
49,2020-04-09,2020-04-09 11:10:00,POS PURCHASE Non-PINAudible*BB60412B3 Amzn.com/bill NJINB900,-14.95,Default


In [12]:
grand_df[grand_df['Category'] == 'Default']['Content'].value_counts()

VENMO/PAYMENTWALTER COLEMAN Default                                    19
POS PURCHASE Non-PINVTG*Emmaus Family Couns 703-7292822 VA000000        8
ROBINHOOD/FundsWalter Coleman Default                                   6
POS PURCHASE Non-PINCROWNE PLAZA NATIONAL R ARLINGTON VA69              5
POS PURCHASE Non-PINTortas y Tacos La Chiqu Arlington VA007885          4
WEBULL FINANCIAL/ACHWALTER COLEMAN                                      3
FUNDRISE/2025840550WALTER COLEMAN                                       3
FUNDRISE GROWTH/2025840550WALTER COLEMAN                                3
POS PURCHASE Non-PINCORADOS RESTAURANT WASHINGTON DC090207              2
POS PURCHASE Non-PINBRONSON OF ARLINGTON ARLINGTON VA000026             2
POS PURCHASE Non-PINWE THE PIZZA ARLINGTON VA 793031                    2
POS PURCHASE Non-PIN708 BOWLERO ARLINGTON 8 ARLINGTON VA501997          2
POS PURCHASE Non-PINAmazon web services aws.amazon.co WAIN0800          2
ARLINGTON COUNTY/ARLCO PMTWALTER Defau

In [13]:
def extra_cat(df):
    df.loc[df['Content'].str.contains("Emmaus Family Couns",na=False),'Category'] = 'Medical'
    df.loc[df['Content'].str.contains("AMAZON.COM",na=False),'Category'] = 'Misc'
    df.loc[df['Content'].str.contains("ADVANCED HEALTH CARE",na=False),'Category'] = 'Medical'
    df.loc[df['Content'].str.contains("Audible",na=False),'Category'] = 'Entertainment'
    df.loc[df['Content'].str.contains("Amazon web services",na=False),'Category'] = 'Misc'
    return df

In [14]:
grand_df = extra_cat(grand_df)

In [15]:
grand_df

Unnamed: 0,Date,Purchase Date,Content,Amount,Category
0,2020-04-29 00:00:00,2020-04-29 12:47:00,POS PURCHASE with PINGIANT 0774 ARLINGTON VA 001,-5.11,Groceries
1,2020-04-29 00:00:00,2020-04-29 02:32:00,Person-to-Person TransferPOP*Margaret Coleman Visa Direct VAPOPMON,250,Extra
2,2020-04-29 00:00:00,2020-04-28 22:44:00,POS PURCHASE Non-PINPP*GOOGLE A MEDIUM CORP 402-935-7733 CAINC000,-4.99,Entertainment
3,2020-04-28 00:00:00,2020-04-28 13:17:00,POS PURCHASE with PINGIANT 0774 ARLINGTON VA 001,-5.11,Groceries
4,2020-04-27 00:00:00,2020-04-26 22:23:00,POS PURCHASE Non-PINUBER EATS HELP.UBER.COM CA INB600,-21.23,Dining Out
5,2020-04-27 00:00:00,2020-04-26 22:23:00,POS PURCHASE Non-PINUBER EATS HELP.UBER.COM CA INB500,-3.7,Dining Out
6,2020-04-24 00:00:00,2020-04-24 15:16:00,POS PURCHASE with PINGIANT 0774 ARLINGTON VA 001,-47.51,Groceries
7,2020-04-23 00:00:00,2020-04-23 03:43:00,POS PURCHASE Non-PINUBER EATS HELP.UBER.COM CA IN1610,-17.49,Dining Out
8,2020-04-23 00:00:00,2020-04-23 03:43:00,POS PURCHASE Non-PINUBER EATS HELP.UBER.COM CA IN8900,-2.09,Dining Out
9,2020-04-23 00:00:00,2020-04-22 19:30:00,POS PURCHASE Non-PINVTG*Emmaus Family Couns 703-7292822 VA000000,-125,Medical


In [19]:
grand_df.groupby('Category')['Amount'].sum()

TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [20]:
grand_df['Category'].value_counts()

Default                     190
Groceries                   130
Dining Out                   89
Tolls/Uber/Metro/Parking     38
Medical                      26
Utilities                    22
Venmo Extra                  21
Pay Check                    20
Gym                          15
Student Loans                13
Entertainment                13
Tithe                        12
Extra                        12
Gas                          12
Misc                         10
Rent                          8
USAA Insurance                8
Phone                         5
Name: Category, dtype: int64

In [21]:
X = grand_df[grand_df['Category'] != 'Default']
y = grand_df[grand_df['Category'] == 'Default']

In [22]:
clf = RandomForestClassifier(n_estimators=10)

In [25]:
grand_df[grand_df['Category'] != 'Default'].drop('Category')

KeyError: "['Category'] not found in axis"

In [26]:
clf.fit(grand_df[grand_df['Category'] != 'Default'].drop('Category',axis=1),
        grand_df[grand_df['Category'] != 'Default']['Category'])

TypeError: float() argument must be a string or a number, not 'Timestamp'

In [27]:
grand_df

Unnamed: 0,Date,Purchase Date,Content,Amount,Category
0,2020-04-29 00:00:00,2020-04-29 12:47:00,POS PURCHASE with PINGIANT 0774 ARLINGTON VA 001,-5.11,Groceries
1,2020-04-29 00:00:00,2020-04-29 02:32:00,Person-to-Person TransferPOP*Margaret Coleman Visa Direct VAPOPMON,250,Extra
2,2020-04-29 00:00:00,2020-04-28 22:44:00,POS PURCHASE Non-PINPP*GOOGLE A MEDIUM CORP 402-935-7733 CAINC000,-4.99,Entertainment
3,2020-04-28 00:00:00,2020-04-28 13:17:00,POS PURCHASE with PINGIANT 0774 ARLINGTON VA 001,-5.11,Groceries
4,2020-04-27 00:00:00,2020-04-26 22:23:00,POS PURCHASE Non-PINUBER EATS HELP.UBER.COM CA INB600,-21.23,Dining Out
5,2020-04-27 00:00:00,2020-04-26 22:23:00,POS PURCHASE Non-PINUBER EATS HELP.UBER.COM CA INB500,-3.7,Dining Out
6,2020-04-24 00:00:00,2020-04-24 15:16:00,POS PURCHASE with PINGIANT 0774 ARLINGTON VA 001,-47.51,Groceries
7,2020-04-23 00:00:00,2020-04-23 03:43:00,POS PURCHASE Non-PINUBER EATS HELP.UBER.COM CA IN1610,-17.49,Dining Out
8,2020-04-23 00:00:00,2020-04-23 03:43:00,POS PURCHASE Non-PINUBER EATS HELP.UBER.COM CA IN8900,-2.09,Dining Out
9,2020-04-23 00:00:00,2020-04-22 19:30:00,POS PURCHASE Non-PINVTG*Emmaus Family Couns 703-7292822 VA000000,-125,Medical


In [43]:
grand_df['Payment Method'] = 'Other'

In [44]:
grand_df.loc[grand_df['Content'].str.contains('POS PURCHASE with PIN'),'Payment Method'] = 'POS PURCHASE with PIN'

In [45]:
grand_df.loc[grand_df['Content'].str.contains('POS PURCHASE Non-PIN'),'Payment Method'] =\
'POS PURCHASE Non-PIN'

In [46]:
grand_df.loc[grand_df['Content'].str.contains('VENMO'),'Payment Method'] = 'VENMO'

In [49]:
grand_df

Unnamed: 0,Bank Date,Purchase Date,Content,Amount,Category,Payment Method
0,2020-04-29,2020-04-29 12:47:00,POS PURCHASE with PINGIANT 0774 ARLINGTON VA 001,-5.11,Default,POS PURCHASE with PIN
1,2020-04-29,2020-04-29 02:32:00,Person-to-Person TransferPOP*Margaret Coleman Visa Direct VAPOPMON,250.0,Default,Person-to-Person Transfer
2,2020-04-29,2020-04-28 22:44:00,POS PURCHASE Non-PINPP*GOOGLE A MEDIUM CORP 402-935-7733 CAINC000,-4.99,Default,POS PURCHASE Non-PIN
3,2020-04-28,2020-04-28 13:17:00,POS PURCHASE with PINGIANT 0774 ARLINGTON VA 001,-5.11,Default,POS PURCHASE with PIN
4,2020-04-27,2020-04-26 22:23:00,POS PURCHASE Non-PINUBER EATS HELP.UBER.COM CA INB600,-21.23,Default,POS PURCHASE Non-PIN
5,2020-04-27,2020-04-26 22:23:00,POS PURCHASE Non-PINUBER EATS HELP.UBER.COM CA INB500,-3.7,Default,POS PURCHASE Non-PIN
6,2020-04-24,2020-04-24 15:16:00,POS PURCHASE with PINGIANT 0774 ARLINGTON VA 001,-47.51,Default,POS PURCHASE with PIN
7,2020-04-23,2020-04-23 03:43:00,POS PURCHASE Non-PINUBER EATS HELP.UBER.COM CA IN1610,-17.49,Default,POS PURCHASE Non-PIN
8,2020-04-23,2020-04-23 03:43:00,POS PURCHASE Non-PINUBER EATS HELP.UBER.COM CA IN8900,-2.09,Default,POS PURCHASE Non-PIN
9,2020-04-23,2020-04-22 19:30:00,POS PURCHASE Non-PINVTG*Emmaus Family Couns 703-7292822 VA000000,-125.0,Default,POS PURCHASE Non-PIN


In [48]:
grand_df.loc[grand_df['Content'].str.contains('Person-to-Person Transfer'),'Payment Method'] = 'Person-to-Person Transfer'

In [50]:
grand_df['Payment Method'].value_counts()

POS PURCHASE Non-PIN         326
POS PURCHASE with PIN        149
Other                        108
VENMO                         49
Person-to-Person Transfer     12
Name: Payment Method, dtype: int64

In [51]:
grand_df[grand_df['Payment Method'] == 'Other']

Unnamed: 0,Bank Date,Purchase Date,Content,Amount,Category,Payment Method
11,2020-04-22,2020-04-22 00:00:00,FEDLOANSERVICING/STDNT LOANWALTER COLEMAN,-1387.62,Default,Other
12,2020-04-22,2020-04-22 00:00:00,USAA.COM PAY EXT/PCWALTER COLEMAN,-243.63,Default,Other
13,2020-04-22,2020-04-22 00:00:00,Pillar DC/Pillar DCPILLAR CHURCH OF WASHI,-109.81,Default,Other
17,2020-04-21,2020-04-21 00:00:00,"Accenture Federa/PAYRLL DEPColeman, Walter J",3311.31,Default,Other
20,2020-04-20,2020-04-20 00:00:00,FEDLOANSERVICING/STDNT LOANWALTER COLEMAN,-1410.02,Default,Other
21,2020-04-20,2020-04-20 00:00:00,Pillar DC/Pillar DCPILLAR CHURCH OF WASHI,-1200.0,Default,Other
33,2020-04-17,2020-04-17 00:00:00,"IRS TREAS 310/TAX REFCOLEMAN, WALTER J",2439.0,Default,Other
36,2020-04-16,2020-04-16 00:00:00,Mobile Check Deposit Default,126.0,Default,Other
40,2020-04-15,2020-04-15 00:00:00,WASHINGTON GAS/PAYMENTEvan Barnes,-88.7,Default,Other
46,2020-04-10,2020-04-10 00:00:00,FUNDRISE/2025840550WALTER COLEMAN,1.53,Default,Other


In [53]:
grand_df[grand_df['Category'] == 'Utilities']

Unnamed: 0,Bank Date,Purchase Date,Content,Amount,Category,Payment Method


In [54]:
grand_df['Category'].value_counts()

Default    644
Name: Category, dtype: int64

In [55]:
grand_df = categorize(grand_df)

In [56]:
grand_df

Unnamed: 0,Bank Date,Purchase Date,Content,Amount,Category,Payment Method
0,2020-04-29,2020-04-29 12:47:00,POS PURCHASE with PINGIANT 0774 ARLINGTON VA 001,-5.11,Groceries,POS PURCHASE with PIN
1,2020-04-29,2020-04-29 02:32:00,Person-to-Person TransferPOP*Margaret Coleman Visa Direct VAPOPMON,250.0,Extra,Person-to-Person Transfer
2,2020-04-29,2020-04-28 22:44:00,POS PURCHASE Non-PINPP*GOOGLE A MEDIUM CORP 402-935-7733 CAINC000,-4.99,Entertainment,POS PURCHASE Non-PIN
3,2020-04-28,2020-04-28 13:17:00,POS PURCHASE with PINGIANT 0774 ARLINGTON VA 001,-5.11,Groceries,POS PURCHASE with PIN
4,2020-04-27,2020-04-26 22:23:00,POS PURCHASE Non-PINUBER EATS HELP.UBER.COM CA INB600,-21.23,Dining Out,POS PURCHASE Non-PIN
5,2020-04-27,2020-04-26 22:23:00,POS PURCHASE Non-PINUBER EATS HELP.UBER.COM CA INB500,-3.7,Dining Out,POS PURCHASE Non-PIN
6,2020-04-24,2020-04-24 15:16:00,POS PURCHASE with PINGIANT 0774 ARLINGTON VA 001,-47.51,Groceries,POS PURCHASE with PIN
7,2020-04-23,2020-04-23 03:43:00,POS PURCHASE Non-PINUBER EATS HELP.UBER.COM CA IN1610,-17.49,Dining Out,POS PURCHASE Non-PIN
8,2020-04-23,2020-04-23 03:43:00,POS PURCHASE Non-PINUBER EATS HELP.UBER.COM CA IN8900,-2.09,Dining Out,POS PURCHASE Non-PIN
9,2020-04-23,2020-04-22 19:30:00,POS PURCHASE Non-PINVTG*Emmaus Family Couns 703-7292822 VA000000,-125.0,Medical,POS PURCHASE Non-PIN


In [57]:
grand_df.sort_values(by=['Purchase Date'],ascending=False)

Unnamed: 0,Bank Date,Purchase Date,Content,Amount,Category,Payment Method
420,2020-11-20,2020-11-20 00:00:00,"Accenture Federa/PAYRLL DEPColeman, Walter J",570.44,Pay Check,Other
421,2020-11-20,2020-11-20 00:00:00,"Accenture Federa/PAYRLL DEPColeman, Walter J",1331.03,Pay Check,Other
422,2020-11-20,2020-11-19 23:04:00,POS PURCHASE Non-PINARLINGTON METER PARKING ARLINGTON VA694452,-1.0,Tolls/Uber/Metro/Parking,POS PURCHASE Non-PIN
423,2020-11-20,2020-11-19 20:48:00,POS PURCHASE with PINHARRIS TEETER #3 950 S ARLINGTON VA999999,-44.56,Groceries,POS PURCHASE with PIN
424,2020-11-19,2020-11-19 14:09:00,POS PURCHASE Non-PINAMZN Mktp US*3X4IW47Q3 Amzn.com/bill WAIN8700,-37.6,Default,POS PURCHASE Non-PIN
425,2020-11-20,2020-11-19 03:23:00,POS PURCHASE Non-PINARLINGTON METER PARKING ARLINGTON VA694452,-2.25,Tolls/Uber/Metro/Parking,POS PURCHASE Non-PIN
426,2020-11-19,2020-11-18 23:15:00,POS PURCHASE Non-PINPP*GOOGLE A MEDIUM CORP 402-935-7733 CAINB500,-4.99,Entertainment,POS PURCHASE Non-PIN
427,2020-11-19,2020-11-18 22:47:00,POS PURCHASE Non-PINARLINGTON METER PARKING ARLINGTON VA694452,-1.0,Tolls/Uber/Metro/Parking,POS PURCHASE Non-PIN
428,2020-11-18,2020-11-18 00:00:00,ARLINGTON COUNTY/ARLCO PMTWALTER Default,-50.0,Default,Other
429,2020-11-19,2020-11-17 21:17:00,POS PURCHASE Non-PINTAJ OF INDIA ARLINGTON VA IN4400,-29.3,Dining Out,POS PURCHASE Non-PIN


In [58]:
grand_df.sort_values(by=['Purchase Date'],ascending=False,inplace=True)

In [59]:
grand_df

Unnamed: 0,Bank Date,Purchase Date,Content,Amount,Category,Payment Method
420,2020-11-20,2020-11-20 00:00:00,"Accenture Federa/PAYRLL DEPColeman, Walter J",570.44,Pay Check,Other
421,2020-11-20,2020-11-20 00:00:00,"Accenture Federa/PAYRLL DEPColeman, Walter J",1331.03,Pay Check,Other
422,2020-11-20,2020-11-19 23:04:00,POS PURCHASE Non-PINARLINGTON METER PARKING ARLINGTON VA694452,-1.0,Tolls/Uber/Metro/Parking,POS PURCHASE Non-PIN
423,2020-11-20,2020-11-19 20:48:00,POS PURCHASE with PINHARRIS TEETER #3 950 S ARLINGTON VA999999,-44.56,Groceries,POS PURCHASE with PIN
424,2020-11-19,2020-11-19 14:09:00,POS PURCHASE Non-PINAMZN Mktp US*3X4IW47Q3 Amzn.com/bill WAIN8700,-37.6,Default,POS PURCHASE Non-PIN
425,2020-11-20,2020-11-19 03:23:00,POS PURCHASE Non-PINARLINGTON METER PARKING ARLINGTON VA694452,-2.25,Tolls/Uber/Metro/Parking,POS PURCHASE Non-PIN
426,2020-11-19,2020-11-18 23:15:00,POS PURCHASE Non-PINPP*GOOGLE A MEDIUM CORP 402-935-7733 CAINB500,-4.99,Entertainment,POS PURCHASE Non-PIN
427,2020-11-19,2020-11-18 22:47:00,POS PURCHASE Non-PINARLINGTON METER PARKING ARLINGTON VA694452,-1.0,Tolls/Uber/Metro/Parking,POS PURCHASE Non-PIN
428,2020-11-18,2020-11-18 00:00:00,ARLINGTON COUNTY/ARLCO PMTWALTER Default,-50.0,Default,Other
429,2020-11-19,2020-11-17 21:17:00,POS PURCHASE Non-PINTAJ OF INDIA ARLINGTON VA IN4400,-29.3,Dining Out,POS PURCHASE Non-PIN


In [60]:
grand_df = grand_df[['Payment Method','Content','Purchase Date','Bank Date','Amount','Category']]

In [61]:
grand_df

Unnamed: 0,Payment Method,Content,Purchase Date,Bank Date,Amount,Category
420,Other,"Accenture Federa/PAYRLL DEPColeman, Walter J",2020-11-20 00:00:00,2020-11-20,570.44,Pay Check
421,Other,"Accenture Federa/PAYRLL DEPColeman, Walter J",2020-11-20 00:00:00,2020-11-20,1331.03,Pay Check
422,POS PURCHASE Non-PIN,POS PURCHASE Non-PINARLINGTON METER PARKING ARLINGTON VA694452,2020-11-19 23:04:00,2020-11-20,-1.0,Tolls/Uber/Metro/Parking
423,POS PURCHASE with PIN,POS PURCHASE with PINHARRIS TEETER #3 950 S ARLINGTON VA999999,2020-11-19 20:48:00,2020-11-20,-44.56,Groceries
424,POS PURCHASE Non-PIN,POS PURCHASE Non-PINAMZN Mktp US*3X4IW47Q3 Amzn.com/bill WAIN8700,2020-11-19 14:09:00,2020-11-19,-37.6,Default
425,POS PURCHASE Non-PIN,POS PURCHASE Non-PINARLINGTON METER PARKING ARLINGTON VA694452,2020-11-19 03:23:00,2020-11-20,-2.25,Tolls/Uber/Metro/Parking
426,POS PURCHASE Non-PIN,POS PURCHASE Non-PINPP*GOOGLE A MEDIUM CORP 402-935-7733 CAINB500,2020-11-18 23:15:00,2020-11-19,-4.99,Entertainment
427,POS PURCHASE Non-PIN,POS PURCHASE Non-PINARLINGTON METER PARKING ARLINGTON VA694452,2020-11-18 22:47:00,2020-11-19,-1.0,Tolls/Uber/Metro/Parking
428,Other,ARLINGTON COUNTY/ARLCO PMTWALTER Default,2020-11-18 00:00:00,2020-11-18,-50.0,Default
429,POS PURCHASE Non-PIN,POS PURCHASE Non-PINTAJ OF INDIA ARLINGTON VA IN4400,2020-11-17 21:17:00,2020-11-19,-29.3,Dining Out


In [62]:
def extra_cat(df):
    df.loc[df['Content'].str.contains("ALDI",na=False),'Category'] = 'Groceries'
    df.loc[df['Content'].str.contains("Audible",na=False),'Category'] = 'Entertainment'
    df.loc[df['Content'].str.contains("Amazon web services",na=False),'Category'] = 'Misc'
    return df

NameError: name 'df' is not defined

In [65]:
grand_df.loc[grand_df['Content'].str.contains("ALDI",na=False)]

Unnamed: 0,Payment Method,Content,Purchase Date,Bank Date,Amount,Category
194,POS PURCHASE with PIN,POS PURCHASE with PINALDI 71180 BAILEY CROSSR VA 537198,2020-07-20 18:27:00,2020-07-21,-4.12,Default
206,POS PURCHASE with PIN,POS PURCHASE with PINALDI 71180 BAILEY CROSSR VA 537198,2020-07-16 17:36:00,2020-07-17,-12.4,Default
243,POS PURCHASE with PIN,POS PURCHASE with PINALDI 71065 ALEXANDRIA VA 299648,2020-06-28 13:47:00,2020-06-29,-28.75,Default


In [66]:
grand_df.loc[grand_df['Content'].str.contains("KROGER",na=False)]

Unnamed: 0,Payment Method,Content,Purchase Date,Bank Date,Amount,Category


In [67]:
grand_df[grand_df['Category'] == 'Default']

Unnamed: 0,Payment Method,Content,Purchase Date,Bank Date,Amount,Category
424,POS PURCHASE Non-PIN,POS PURCHASE Non-PINAMZN Mktp US*3X4IW47Q3 Amzn.com/bill WAIN8700,2020-11-19 14:09:00,2020-11-19,-37.6,Default
428,Other,ARLINGTON COUNTY/ARLCO PMTWALTER Default,2020-11-18 00:00:00,2020-11-18,-50.0,Default
440,POS PURCHASE Non-PIN,POS PURCHASE Non-PINMIDAS. ARLINGTON VA 00,2020-11-14 16:19:00,2020-11-16,-48.0,Default
443,POS PURCHASE Non-PIN,POS PURCHASE Non-PINMCDONALD'S F20663 ARLINGTON VA1,2020-11-13 22:09:00,2020-11-16,-5.71,Default
445,POS PURCHASE Non-PIN,POS PURCHASE Non-PINSQ *BAKESHOP Arlington VA INB800,2020-11-13 03:36:00,2020-11-16,-10.22,Default
448,VENMO,VENMO/PAYMENTWALTER COLEMAN Default,2020-11-12 00:00:00,2020-11-12,-10.0,Default
461,VENMO,VENMO/PAYMENTWALTER COLEMAN Default,2020-11-09 00:00:00,2020-11-09,-1492.0,Default
463,POS PURCHASE Non-PIN,POS PURCHASE Non-PINSQ *BAKESHOP Arlington VA IN8700,2020-11-06 23:32:00,2020-11-09,-3.19,Default
464,POS PURCHASE Non-PIN,POS PURCHASE Non-PINAMZN Mktp US*282R58XG2 Amzn.com/bill WAIN7300,2020-11-06 17:19:00,2020-11-09,-15.89,Default
465,POS PURCHASE Non-PIN,POS PURCHASE Non-PINAMBAR CLARENDON 703-8759663 VA792935,2020-11-06 12:05:00,2020-11-09,-30.36,Default


In [68]:
len(grand_df[grand_df['Category'] == 'Default'])

190

In [None]:
#     State the question and determine required data
# Can I predict the categories from my bank statements?
#     Acquire the data in an accessible format


In [None]:
#     Identify and correct missing data points/anomalies as required
#     Prepare the data for the machine learning model
#     Establish a baseline model that you aim to exceed
#     Train the model on the training data
#     Make predictions on the test data
#     Compare predictions to the known test set targets and calculate performance metrics
#     If performance is not satisfactory, adjust the model, acquire more data, or try a different modeling technique
#     Interpret model and report results visually and numerically