In [27]:
# Assumption:
#  1. All the files given was generated by 'train_test_split' with pre-defined test_size and random_state already
#  2. All personal expenses will have the following keywords in the expense description. Once found, drop the raw.
#     ["personal", "family", "families]

# Load the data
import numpy as np
import pandas as pd
df_employee_src = pd.read_csv('employee.csv')
df_train_src = pd.read_csv('training_data_example.csv')
df_test_src = pd.read_csv('validation_data_example.csv')

# Merge with employee data
df_train = pd.merge(df_train_src, df_employee_src, on='employee id', how='left')
df_test = pd.merge(df_test_src, df_employee_src, on='employee id', how='left')

# Assumption: All personal expenses will have the following words in the expense description. 
# ["personal", "family", "families], pre-processing for personal/business expense

import re
def is_personal_expense(string_param):
    for desc in string_param.strip().lower().split():
        if re.match(r'personal|family|families', desc):
            return 1
    return 0

# Wipe out expense description if personal expense so that the word count doesn't increase.
for i in range(df_train.shape[0]):
    if is_personal_expense(df_train.iloc[i]['expense description']):
        df_train.loc[i, 'expense description'] = ''


In [28]:
# Assumption: the files given was generated by 'train_test_split' with pre-defined test_size and random_state already
X_train = []
for i in range(df_train.shape[0]):
	X_train.append(df_train.iloc[i]['expense description'])
y_train = np.array(df_train['category'])

X_test = []
for i in range(df_test.shape[0]):
    X_test.append(df_test.iloc[i]['expense description'])
y_test = np.array(df_test['category'])


In [29]:
X_test

['Taxi ride',
 'Dinner with Family',
 'Macbook Air Computer',
 'Paper',
 'Pens',
 'Airplane ticket to Miami',
 'Starbucks coffee',
 'Dinner',
 'Dinner with client',
 'Dinner',
 'Dinner',
 'Dinner']

In [30]:

# Basic imports for all chosen algorithms towards different approaches
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn import model_selection


In [31]:
# Step 1: cross validate each text processing algorithm first to see what might be the best to pick up initially.

# cross validate each desired algorithm first to see what might be the best to pick up initially.

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

seed = 7
scoring = 'accuracy'
kfold = model_selection.KFold(n_splits=13, random_state=seed)
models = []
text_clf_svc = Pipeline([('vect', TfidfVectorizer()),
                         ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

text_clf_nb = Pipeline([('vect', TfidfVectorizer()),
                         ('clf', MultinomialNB()),])

text_clf_sgd = Pipeline([('vect', TfidfVectorizer()),
                         ('clf', SGDClassifier(max_iter=5)),
])

models.append(('SVC', text_clf_svc))
models.append(('NB', text_clf_nb))
models.append(('SGD', text_clf_sgd))

# evaluate each algorithm in turn
results = []
names = []
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)


SVC: 0.846154 (0.230769)
NB: 0.692308 (0.417799)
SGD: 0.884615 (0.210663)


In [32]:
# Since SGDClassifier has higher score, so start with it.

In [48]:
# Approach I: SGDClassifier
text_clf_sgd = Pipeline([('vect', TfidfVectorizer()),
                         ('clf-svm', SGDClassifier(max_iter=5)),
])

# fit model with training data
text_clf_sgd.fit(X_train, y_train)

# evaluation on test data
pred = text_clf_sgd.predict(X_test)

print(np.mean(pred == y_test))



0.9166666666666666


In [36]:
# Approach I cont'd
# paramater selection
parameters_sgd = {'vect__ngram_range': [(1, 1), (1, 2), (2,2)],
                  'vect__use_idf': (True, False),
                  'clf-svm__alpha': (1e-1, 1e-2, 1e-3, 1e-4),
                 }
gs_clf_sgd = GridSearchCV(text_clf_sgd, parameters_sgd, n_jobs=2)
gs_clf_sgd = gs_clf_sgd.fit(X_train, y_train)

print(gs_clf_sgd.best_score_)
print(gs_clf_sgd.best_params_)



0.875
{'clf-svm__alpha': 0.1, 'vect__ngram_range': (1, 1), 'vect__use_idf': False}


In [37]:
# Approach I cont'd
# apply parameters
text_clf_sgd = Pipeline([('vect', TfidfVectorizer(ngram_range=(1,1), use_idf=True)),
                         ('clf-svm', SGDClassifier(alpha=1e-2, max_iter=5)),
])

# fit model with training data with improved params
text_clf_sgd.fit(X_train, y_train)

# evaluation on test data
pred = text_clf_sgd.predict(X_test)

print(accuracy_score(y_test, pred))

0.9166666666666666


In [38]:
# End of Approach I. *** Not improved after tuning parameter. `best_score_` is 0.875 ***

# Beginning of Approach II.

In [49]:
# Approach II: OneVsRestClassifier & LinearSVC

text_clf_svc = Pipeline([('vect', TfidfVectorizer()),
                            ('clf-svc', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

# fit model with training data
text_clf_svc.fit(X_train, y_train)

# evaluation on test data
pred = text_clf_svc.predict(X_test)

print(np.mean(pred == y_test))



0.9166666666666666


In [40]:
# Approach II cont'd
#paramater selection
parameters_svc = {'vect__ngram_range': [(1, 1), (1, 2),(2,2)],
                   'vect__use_idf': (True, False)
                 }
gs_clf_svc = GridSearchCV(text_clf_svc, parameters_svc, n_jobs=2)
gs_clf_svc = gs_clf_svc.fit(X_train, y_train)
print(gs_clf_svc.best_score_)
print(gs_clf_svc.best_params_)



0.8333333333333334
{'vect__ngram_range': (1, 1), 'vect__use_idf': True}


In [41]:
# Approach II cont'd
# apply parameters
text_clf_svc = Pipeline([('vect', TfidfVectorizer(ngram_range=(1,1), use_idf=True)),
                            ('clf-svc', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

# fit model with training data
text_clf_svc.fit(X_train, y_train)

# evaluation on test data
pred = text_clf_svc.predict(X_test)

print(accuracy_score(y_test, pred))


0.9166666666666666


In [42]:
# End of Approach II. *** Has improved after tuning parameter. `best_score_` is 0.833. ***

# Beginning of Approach III.

In [43]:
# Approach III: Naive Bayers

text_clf_nb = Pipeline([('vect', TfidfVectorizer()),
                         ('clf-nb', MultinomialNB()),])

# fit model with training data
text_clf_nb = text_clf_nb.fit(X_train, y_train)

# evaluation on test data
pred = text_clf_nb.predict(X_test)

print(np.mean(pred == y_test))




0.8333333333333334


In [44]:
# Approach III cont'd
# paramater selection
parameters_nb = {'vect__ngram_range': [(1, 1), (1, 2), (2,2)],
                  'vect__use_idf': (True, False),
                  'clf-nb__alpha': (1e-1, 1e-2, 1e-3, 1e-4),
                 }
gs_clf_nb = GridSearchCV(text_clf_nb, parameters_nb, n_jobs=2)
gs_clf_nb = gs_clf_nb.fit(X_train, y_train)

print(gs_clf_nb.best_score_)
print(gs_clf_nb.best_params_)





0.875
{'clf-nb__alpha': 0.1, 'vect__ngram_range': (1, 1), 'vect__use_idf': True}


In [45]:
# Approach III cont'd
# apply parameters
text_clf_nb = Pipeline([('vect', TfidfVectorizer(ngram_range=(1,1), use_idf=True)),
                         ('clf-nb', MultinomialNB(alpha=1e-1))])

# fit model with training data with improved params
text_clf_nb.fit(X_train, y_train)

# evaluation on test data
pred = text_clf_nb.predict(X_test)

print(accuracy_score(y_test, pred))

0.9166666666666666


In [46]:
# End of Approach III. *** Has improved after tuning parameter. `best_score_` is 0.875. ***


In [47]:
# Conclusion:

# After examining all three algorithms, They all have accuracy of 0.91666 (after tuning two of them).
# Both SGDClassifier and MultinomialNB (after tuning) would be an ideal pick since they have the same 
# `best_score_` of 0.875.

# However, sincee SGDClassifier has better result after cross validation. 
# I'd pick SGDClassifier algorithm as my first choice to train the model.

# Steps can be done further:
#   - since this is text processing classificaiton and NLP problem, we can even tune with stop words and stemming.
#   - find more text processing classification algorithm and compare them through the same process.
#   - try Neural Networks approach using deep learning.