In [18]:
import os
import pickle
import numpy as np
import pandas as pd

os.chdir("../scripts/")
import model
import data

In [19]:
from scipy.sparse import hstack

import sklearn

from sklearn.model_selection import train_test_split, cross_validate

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

### Load and Prep Data

In [47]:
source = data.load_source()
m = model.load_model()

In [48]:
source.head()

Unnamed: 0,date,description,amount,tags,notes,type,check
0,2015-08-24,NASSAU SUSHI PRINCETON NJ,-41.89,Food,,Credit Card,1765.0
1,2015-08-24,DEPT EDUCATION DES:STUDENT LN ID:0000 INDN:TYR...,-166.33,Student Loans,,Checking,2665.0
2,2015-08-24,FIVE GUYS #1107 N. BRUNSWICK NJ,-12.49,Food,,Credit Card,1766.0
3,2015-08-24,STARBUCKS #11363 NORTH BRNorth BrunswiNJ,-8.45,Food,,Credit Card,1768.0
4,2015-08-24,FRUITY YOGURT & CAFE - PRINCETON NJ,-9.04,Food,,Credit Card,1767.0


In [49]:
test_source = source[(source.date.dt.year >= 2019)]
tag_counts = test_source.tags.value_counts()
test_source = test_source[(test_source.tags.isin(tag_counts[tag_counts > 30].index))]

### Build and Train Model

In [23]:
train_x, test_x, train_y, test_y = train_test_split(test_source, test_source.tags, test_size=.20, random_state=376)

In [24]:
ct = ColumnTransformer([('description', TfidfVectorizer(ngram_range=(1,1)), "description"),
                        ('amount', StandardScaler(), ["amount"]),
                        ('type', OrdinalEncoder(), ["type"]) 
                       ])
pipe = Pipeline([('column_trans', ct),
#                   ('chi2_filter', SelectPercentile(chi2, percentile=75)),
                  ('mlp', MLPClassifier(hidden_layer_sizes=(250, 250, 250)))])

In [25]:
cv = cross_validate(pipe, train_x, train_y)
cv, cv['test_score'].mean()

({'fit_time': array([3.43330288, 2.57734299, 3.25310707, 3.49579072, 2.13395214]),
  'score_time': array([0.00866103, 0.00925612, 0.008533  , 0.00868106, 0.00839901]),
  'test_score': array([0.91176471, 0.91911765, 0.92279412, 0.91911765, 0.93014706])},
 0.9205882352941177)

In [26]:
pipe.fit(train_x, train_y)
pipe.score(test_x, test_y)

0.9149560117302052

In [51]:
# Save model

# with open('../data/model.pickle', 'wb') as f:
#     pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

### Analyze Test Results, Errors

In [13]:
test_eval = test_x.copy()
test_eval["pred"] = pipe.predict(test_x)
test_eval["prob"]= pipe.predict_proba(test_x).max(axis=1)
test_eval["act"] = test_y
test_eval[test_eval.pred != test_eval.act].sort_values("prob")

Unnamed: 0,date,description,amount,tags,notes,type,check,pred,prob,act
3999,2021-05-20,ULTA #1523 SOUTH PLAINFINJ,-7.45,Food,,Cash Rewards,2.413829e+22,Home,0.419099,Food
3282,2020-02-20,TOMS.COM USD 800-975-8667 CA,-161.89,Clothing,,Cash Rewards,1585.0,Food,0.424073,Clothing
2559,2019-04-25,SMASHBURGER #1383 EDISON NJ,-10.66,Food,,Cash Rewards,1068.0,Transition,0.474028,Food
2599,2019-05-10,CHICK-FIL-A #01428 EDISON NJ,-8.8,Food,,Cash Rewards,1097.0,Transition,0.474101,Food
2320,2019-01-07,WALGREENS #11313 HAMILTON NJ,-37.8,Transition,,Cash Rewards,886.0,Food,0.482556,Transition
3182,2020-01-06,THE YETEE LLC 331-707-5436 IL,-19.5,Charity,,Cash Rewards,1515.0,Food,0.487793,Charity
2606,2019-05-14,THE HALAL GUYS 5 ASTORIA NY,-10.88,"Work, Food",,Cash Rewards,1107.0,Food,0.490029,"Work, Food"
2327,2019-01-10,CRISP KITCHEN - PHILADELPPHILADELPHIA PA,-9.06,"Work, Food",,Cash Rewards,890.0,Food,0.490482,"Work, Food"
2663,2019-06-01,LOWES #00692* PISCATAWAY NJ,-37.24,Home,,Cash Rewards,1151.0,"Work, Food",0.504578,Home
2325,2019-01-09,WAWA 8093 00080937 PHILADELPHIA PA,-1.08,"Work, Food",,Cash Rewards,889.0,Transition,0.566145,"Work, Food"


In [57]:
pipe.fit(test_source, test_source.tags)
test_source = test_source.copy()

predictions = model.predict(test_source)
probs = model.predict_proba(test_source).max(axis=1)
test_source["pred"] = predictions
test_source["prob"]= probs
test_source["act"] = test_source.tags
test_source[test_source.pred != test_source.act].sort_values("prob")

Unnamed: 0,date,description,amount,tags,notes,type,check,pred,prob,act
3271,2020-02-17,DUNKIN #340935 Q35 HIGHLAND PK NJ,-5.4,"Work, Food",,Cash Rewards,1581.0,Food,0.580551,"Work, Food"
2946,2019-09-30,DUNKIN #340935 Q35 HIGHLAND PK NJ,-4.52,"Work, Food",,Cash Rewards,1356.0,Food,0.580625,"Work, Food"
2708,2019-06-18,DUNKIN #340935 Q35 HIGHLAND PK NJ,-3.93,"Work, Food",,Cash Rewards,1182.0,Food,0.580674,"Work, Food"
2945,2019-09-30,DUNKIN #340935 Q35 HIGHLAND PK NJ,-3.93,"Work, Food",,Cash Rewards,1357.0,Food,0.580674,"Work, Food"
2769,2019-07-16,DUNKIN #340935 Q35 HIGHLAND PK NJ,-3.93,"Work, Food",,Cash Rewards,1227.0,Food,0.580674,"Work, Food"
2696,2019-06-15,DUNKIN #340935 Q35 HIGHLAND PK NJ,-3.67,"Work, Food",,Cash Rewards,1173.0,Food,0.580696,"Work, Food"
3062,2019-11-13,DUNKIN #340935 Q35 HIGHLAND PK NJ,-3.08,"Work, Food",,Cash Rewards,1437.0,Food,0.580746,"Work, Food"
2985,2019-10-15,SQ *PENSTOCK COFFEE HIGHLAND PARKNJ,-2.93,"Work, Food",,Cash Rewards,1384.0,Food,0.730074,"Work, Food"
2960,2019-10-05,SQ *PENSTOCK COFFEE HIGHLAND PARKNJ,-3.73,"Work, Food",,Cash Rewards,1364.0,Food,0.730196,"Work, Food"
2855,2019-08-24,SQ *PENSTOCK COFFEE HIGHLAND PARKNJ,-3.93,"Work, Food",,Cash Rewards,1289.0,Food,0.730226,"Work, Food"


### Feature Exploration

In [56]:
mlp = m

In [63]:
encoder, clf = list(mlp.named_steps.values())
tfidf = list(encoder.named_transformers_.values())[0]
encoder, clf

(ColumnTransformer(transformers=[('description',
                                  TfidfVectorizer(ngram_range=(1, 2)),
                                  'description'),
                                 ('amount', StandardScaler(), ['amount']),
                                 ('type', OrdinalEncoder(), ['type'])]),
 MLPClassifier(hidden_layer_sizes=(250, 250)))

In [58]:
training_source = source[source.date.dt.year >= 2019]
sparse_features = encoder.transform(training_source)
df = pd.DataFrame(sparse_features.toarray()[:,:-2], columns=tfidf.get_feature_names(), index=training_source.index)
df["tags"] = training_source.tags
df.index = training_source.description

In [59]:
df

Unnamed: 0_level_0,0000,0000 indn,000000000000004999,0000000604487435000000,0000000604487435000000 co,000001918,000001918 withdrwl,000002001,000002001 withdrwl,000002700,...,york ny,york times,you,zc9de2xo3,zc9de2xo3 amzn,zelle,zelle transfer,zf6xd49y3,zf6xd49y3 888,tags
description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bank of America DES:CASHREWARD ID:TYRRELL INDN:0000000604487435000000 CO ID:XXXXX90310 PPD,0.0,0.0,0.0,0.227446,0.227446,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Income
ROOTS ASIAN KITCHEN PRINCETON JUNNJ,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Food
Online Banking Transfer Conf# d72891fa5; Breach,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Housing
"INFOSYS LIMITED DES:PAYROLL ID:XAW000000767199 INDN:TYRRELL,THOMAS CO ID:XXXXX60235 PPD",0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Income
PUBLIC STORAGE 27302 800-567-0759 NJ,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Housing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BANDCAMP ALPHA MALE TEA 402-935-7733 CA,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Media
FOREIGN TRANSACTION FEE,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Media
VENMO DES:PAYMENT ID:XXXXX16229188 INDN:THOM TYRRELL CO ID:XXXXX81992 WEB,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Therapy
METLIFE P C DES:INS PAYMNT ID:XXXXX3724 INDN:TYRRELL THOMAS CO ID:XXXXX25441 PPD,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Car


In [60]:
training_source.tags.sort_values().unique()

array(['Amazon', 'Amazon, Media', 'Beep Boop', 'Bill', 'Books', 'Car',
       'Cash', 'Charity', 'Clothing', 'Costco', 'Drugs', 'Food', 'Gas',
       'Gift', 'Groceries', 'HSA', 'Home', 'Housing', 'Income', 'Media',
       'Medical', 'Nasya', 'Phone', 'Savings', 'Student Loans', 'Taxes',
       'Therapy', 'Transition', 'Utilities', 'Vacation', 'Vet',
       'Work, Commute', 'Work, Food'], dtype=object)

In [61]:
tag_dicts = dict(iter(df.groupby("tags")))

for t in df.tags.sort_values().unique():
    print(t, "\n==============")
    tag_df = tag_dicts[t]
    print(tag_df[tag_df!=0].dropna(axis=1, how="all").max(axis=0).drop("tags").sort_values(ascending=False)[:5])
    print()

Amazon 
amzn                      0.495387
twitch purchasesuppoca    0.451805
purchasesuppoca           0.451805
amz twitch                0.451805
twitch                    0.451805
dtype: object

Amazon, Media 
mk4fl5f30 888      0.368447
video mn68038p1    0.368447
video mk4fl5f30    0.368447
video m42hi5sa2    0.368447
m42hi5sa2          0.368447
dtype: object

Beep Boop 
bestbuycom806316643512 888    0.398542
888 bestbuy                   0.398542
bestbuy                       0.398542
bestbuy mn                    0.398542
bestbuycom806316643512        0.398542
dtype: object

Bill 
you              0.488985
thank you        0.488985
thank            0.488985
payment thank    0.488985
chk              0.410442
dtype: object

Books 
bookshop            0.530854
bookshop org        0.530854
org                 0.530854
instarbooks indn    0.334568
instarbooks         0.334568
dtype: object

Car 
172                           0.640381
check 172                     0.640381
princetonp