In [1]:
import os
import pickle
import numpy as np
import pandas as pd

os.chdir("../scripts/")
import model
import data

In [2]:
from scipy.sparse import hstack

import sklearn

from sklearn.model_selection import train_test_split, cross_validate

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier

### Load and Prep Data

In [3]:
source = data.load_source()
m = model.load_model()

In [4]:
source.head()

Unnamed: 0,date,description,amount,tags,notes,type,check
0,2017-01-19,Monthly Maintenance Fee,-12.0,Bank,,Checking,2881.0
1,2017-01-19,STOP & SHOP 0867 HIGHLAND PK NJ,-24.43,Groceries,,Credit Card,2382.0
2,2017-01-20,NETFLIX.COM NETFLIX.COM CA,-10.68,Media,,Credit Card,2383.0
3,2017-01-23,STOP & SHOP 0867 HIGHLAND PK NJ,-70.15,Groceries,,Credit Card,2385.0
4,2017-01-23,STOP & SHOP 0867 HIGHLAND PK NJ,-20.29,Groceries,,Credit Card,2384.0


In [5]:
test_source = source[(source.date.dt.year >= 2019)]
tag_counts = test_source.tags.value_counts()
test_source = test_source[(test_source.tags.isin(tag_counts[tag_counts > 30].index))]

In [6]:
train_x, test_x, train_y, test_y = train_test_split(test_source, test_source.tags, test_size=.20, random_state=376)

### Build and Train Model

In [7]:
ct = ColumnTransformer([('description', TfidfVectorizer(ngram_range=(1,1)), "description"),
                        ('amount', StandardScaler(), ["amount"]),
                        ('type', OrdinalEncoder(), ["type"]) 
                       ])
xgb_pipe = Pipeline([('column_trans', ct),
#                   ('chi2_filter', SelectPercentile(chi2, percentile=75)),
#                   ('mlp', MLPClassifier(hidden_layer_sizes=(250, 250, 250)))]
                 ('xgboost', GradientBoostingClassifier(n_estimators=150))])

mlp_pipe = Pipeline([('column_trans', ct),
#                   ('chi2_filter', SelectPercentile(chi2, percentile=75)),
                  ('mlp', MLPClassifier(hidden_layer_sizes=(250, 250, 250)))])
#                  ('xgboost', GradientBoostingClassifier(n_estimators=150))])

In [8]:
pipe = mlp_pipe

In [9]:
cv = cross_validate(pipe, train_x, train_y)
cv, cv['test_score'].mean()

({'fit_time': array([3.62199211, 3.02642202, 3.13421631, 3.47611094, 2.62195015]),
  'score_time': array([0.00848413, 0.0086751 , 0.00855184, 0.00850105, 0.00853205]),
  'test_score': array([0.91519435, 0.91872792, 0.92226148, 0.91843972, 0.90070922])},
 0.9150665363506503)

In [10]:
pipe.fit(train_x, train_y)
pipe.score(test_x, test_y)

0.9322033898305084

In [11]:
# Save model

# with open('../data/model.pickle', 'wb') as f:
#     pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

### Analyze Test Results, Errors

In [12]:
test_eval = test_x.copy()
test_eval["prob"]= pipe.predict_proba(test_x).max(axis=1)
test_eval["pred"] = pipe.predict(test_x)
test_eval["act"] = test_y

In [13]:
errors = test_eval[test_eval.pred != test_eval.act].sort_values("prob")
(test_eval.tags.value_counts() - errors.act.value_counts())/test_eval.tags.value_counts()

Amazon             NaN
Bill          0.950000
Car           0.833333
Cash               NaN
Charity       0.846154
Clothing      0.833333
Costco             NaN
Food          0.894737
Gas                NaN
Groceries     0.967742
Home               NaN
Housing            NaN
Income             NaN
Media         0.936170
Nasya              NaN
Phone              NaN
Therapy            NaN
Transition    0.900000
Utilities     0.937500
Work, Food    0.864865
dtype: float64

### Feature Exploration

In [14]:
encoder, clf = list(pipe.named_steps.values())
tfidf = list(encoder.named_transformers_.values())[0]
encoder, clf

(ColumnTransformer(transformers=[('description', TfidfVectorizer(),
                                  'description'),
                                 ('amount', StandardScaler(), ['amount']),
                                 ('type', OrdinalEncoder(), ['type'])]),
 MLPClassifier(hidden_layer_sizes=(250, 250, 250)))

In [15]:
training_source = source[source.date.dt.year >= 2019]
sparse_features = encoder.transform(training_source)
df = pd.DataFrame(sparse_features.toarray()[:,:-2], columns=tfidf.get_feature_names(), index=training_source.index)
df["tags"] = training_source.tags
df.index = training_source.description

In [16]:
tag_dicts = dict(iter(df.groupby("tags")))

for t in df.tags.sort_values().unique():
    print(t, "\n==============")
    tag_df = tag_dicts[t]
    print(tag_df[tag_df!=0].dropna(axis=1, how="all").max(axis=0).drop("tags").sort_values(ascending=False)[:5])
    print()

Amazon 
amzn         0.695855
204rz9e71    0.644228
2r49o5431    0.644228
ms0tn2np1    0.644228
9o8uv81u3    0.644228
dtype: object

Amazon, Media 
video    0.430633
3080     0.426292
802      0.426292
wa       0.414522
888       0.38444
dtype: object

Beep Boop 
store         1.0
mn        0.84483
depot    0.835619
888      0.535035
800      0.478244
dtype: object

Bill 
payment            1.0
chk           0.579403
from          0.575383
3294          0.529057
0159243502    0.524058
dtype: object

Books 
ny           1.0
wa      0.785743
866     0.618553
id      0.395824
inst    0.375609
dtype: object

Car 
172                 0.839851
in                  0.798689
princetonparking    0.786574
800                 0.772043
lyft                0.738866
dtype: object

Cash 
park         0.567725
27           0.440389
31           0.437169
square       0.431678
000002936    0.427559
dtype: object

Charity 
11         0.694672
city       0.680407
gofndme    0.641936
redwood    0.610986
707

In [20]:
example = source.iloc[3367]
example

date                                         2021-08-27 00:00:00
description    INFOSYS LIMITED DES:PAYROLL ID:XAW000000767199...
amount                                                   1147.41
tags                                                      Income
notes                                                           
type                                                    Checking
check                                                   32775.74
Name: 3367, dtype: object

In [26]:
encoder.__dict__

{'transformers': [('description', TfidfVectorizer(), 'description'),
  ('amount', StandardScaler(), ['amount']),
  ('type', OrdinalEncoder(), ['type'])],
 'remainder': 'drop',
 'sparse_threshold': 0.3,
 'n_jobs': None,
 'transformer_weights': None,
 'verbose': False,
 '_feature_names_in': array(['date', 'description', 'amount', 'tags', 'notes', 'type', 'check'],
       dtype=object),
 'n_features_in_': 7,
 '_columns': ['description', ['amount'], ['type']],
 '_has_str_cols': True,
 '_df_columns': Index(['date', 'description', 'amount', 'tags', 'notes', 'type', 'check'], dtype='object'),
 '_n_features': 7,
 '_remainder': ('remainder', 'drop', [0, 3, 4, 6]),
 'sparse_output_': True,
 'transformers_': [('description', TfidfVectorizer(), 'description'),
  ('amount', StandardScaler(), ['amount']),
  ('type', OrdinalEncoder(), ['type']),
  ('remainder', 'drop', [0, 3, 4, 6])]}

In [35]:
dir(tfidf)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_n_features',
 '_check_params',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_tfidf',
 '_validate_data',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',
 '_white_spaces',
 '_word_ngrams',
 'analyzer',
 'binary',
 'build_analyzer',
 'build_preprocessor',
 'build_tokenizer',
 'decode',
 'decode_error',
 'dtype',
 'encoding',
 'f

In [43]:
tfidf.build_analyzer()(example.description)

['infosys',
 'limited',
 'des',
 'payroll',
 'id',
 'xaw000000767199',
 'indn',
 'tyrrell',
 'thomas',
 'co',
 'id',
 'xxxxx60235',
 'ppd']

In [44]:
tfidf.build_preprocessor()(example.description)

'infosys limited des:payroll id:xaw000000767199 indn:tyrrell,thomas co id:xxxxx60235 ppd'

In [45]:
tfidf.build_tokenizer()(example.description)

['INFOSYS',
 'LIMITED',
 'DES',
 'PAYROLL',
 'ID',
 'XAW000000767199',
 'INDN',
 'TYRRELL',
 'THOMAS',
 'CO',
 'ID',
 'XXXXX60235',
 'PPD']

In [38]:
example.description

'INFOSYS LIMITED DES:PAYROLL ID:XAW000000767199 INDN:TYRRELL,THOMAS CO ID:XXXXX60235 PPD'