In [94]:
import pandas as pd
import numpy as np
import sklearn 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.cluster import KMeans, DBSCAN

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn


In [29]:
# read training, validation and employees data files into pandas dataframe
training_data = pd.read_csv("./training_data_example.csv")
validation_data = pd.read_csv("./validation_data_example.csv")
employee_data = pd.read_csv("./employee.csv")

training_data = \
pd.merge(training_data, employee_data, how="inner",on="employee id").drop(['employee address','employee name'],axis=1)

validation_data = \
pd.merge(validation_data, employee_data, how="inner",on="employee id").drop(['employee address','employee name'],axis=1)

# concatenate training and validation data for feature engineering in order to have consistency of columns
training_data['type'] = "training"
validation_data['type'] = "validation"
combined_data = pd.concat([training_data, validation_data], ignore_index = True)

In [30]:
# summary of training data 
combined_data['employee id'] = combined_data['employee id'].astype(str)
# drop date column, since we are not considering it as a forecasting problem
combined_data = combined_data.drop(['date'],axis=1)
print(combined_data.describe(include="all"))

                       category employee id expense description  \
count                        36          36                  36   
unique                        5           7                  20   
top     Meals and Entertainment           4              Dinner   
freq                         17          15                   8   
mean                        NaN         NaN                 NaN   
std                         NaN         NaN                 NaN   
min                         NaN         NaN                 NaN   
25%                         NaN         NaN                 NaN   
50%                         NaN         NaN                 NaN   
75%                         NaN         NaN                 NaN   
max                         NaN         NaN                 NaN   

        pre-tax amount      tax name  tax amount   role      type  
count        36.000000            36   36.000000     36        36  
unique             NaN             2         NaN      4    

In [32]:
# normalize real value features 
min_max_scaler = MinMaxScaler()

combined_data['pre-tax amount'] = min_max_scaler.\
                            fit_transform(np.array(combined_data['pre-tax amount']).reshape(-1, 1))
combined_data['tax amount'] = min_max_scaler.\
                            fit_transform(np.array(combined_data['tax amount']).reshape(-1, 1))       

In [33]:
# get one hot encoded training and validation data for categorical variables 'tax name' and 'role'
combined_data = pd.get_dummies(combined_data, columns=['tax name', 'role'], drop_first=True)
combined_data.columns

Index(['category', 'employee id', 'expense description', 'pre-tax amount',
       'tax amount', 'type', 'tax name_NY Sales tax', 'role_Engineer',
       'role_IT and Admin', 'role_Sales'],
      dtype='object')

In [34]:
# for 'expense description' column generate vocabulary to better parse the description for classification task
#pd.get_dummies(training_data, sep=' ', columns=['expense description'], drop_first=True)
# first remove stop words to reduce the dimensionality of features thus generated
# and convert all words to lower case to disambiguate 
stop = stopwords.words('english')
combined_data['expense_desc_stop_removed'] = combined_data['expense description'].apply(lambda x: ' '.\
                                            join([word.lower() for word in x.split() if word not in (stop)]))


# then generate vocabulary for all words that occur under expense description column and assign 

print("vocabulary generated from expense description column")
print(combined_data['expense_desc_stop_removed'].str.get_dummies(sep=' ').columns)
df_vocab_expenses_ =  combined_data['expense_desc_stop_removed'].str.get_dummies(sep=' ')

joined_train_val_ = combined_data.merge(df_vocab_expenses_, how='outer', left_index=True, right_index=True)
X_train = joined_train_val_[joined_train_val_.type=="training"]
X_validation = joined_train_val_[joined_train_val_.type=="validation"]
X_train.columns

vocabulary generated from expense description column
Index(['air', 'airplane', 'client', 'coffee', 'computer', 'dinner', 'dropbox',
       'family', 'flight', 'hp', 'icloud', 'iphone', 'laptop', 'lunch',
       'macbook', 'miami', 'microsoft', 'ny', 'office', 'paper', 'pens',
       'potential', 'ride', 'starbucks', 'steve', 'subscription', 'taxi',
       'team', 'ticket'],
      dtype='object')


Index(['category', 'employee id', 'expense description', 'pre-tax amount',
       'tax amount', 'type', 'tax name_NY Sales tax', 'role_Engineer',
       'role_IT and Admin', 'role_Sales', 'expense_desc_stop_removed', 'air',
       'airplane', 'client', 'coffee', 'computer', 'dinner', 'dropbox',
       'family', 'flight', 'hp', 'icloud', 'iphone', 'laptop', 'lunch',
       'macbook', 'miami', 'microsoft', 'ny', 'office', 'paper', 'pens',
       'potential', 'ride', 'starbucks', 'steve', 'subscription', 'taxi',
       'team', 'ticket'],
      dtype='object')

In [35]:
# get target/label data by encoding 'category' column
Y_train = X_train['category'] 
le_targets = LabelEncoder()
le_targets.fit(Y_train)
Y_train = le_targets.transform(Y_train)

Y_validation = X_validation['category']
le_validation_targets = LabelEncoder()
le_validation_targets.fit(Y_validation)
Y_validation = le_validation_targets.transform(Y_validation)

In [36]:
# filter out the columns not to be used as training feature set
drop_cols = ['category', 'employee id', 'expense description', 'expense_desc_stop_removed','type']
X_train = X_train.drop(drop_cols, axis=1)
X_validation = X_validation.drop(drop_cols, axis=1)
X_validation.columns


Index(['pre-tax amount', 'tax amount', 'tax name_NY Sales tax',
       'role_Engineer', 'role_IT and Admin', 'role_Sales', 'air', 'airplane',
       'client', 'coffee', 'computer', 'dinner', 'dropbox', 'family', 'flight',
       'hp', 'icloud', 'iphone', 'laptop', 'lunch', 'macbook', 'miami',
       'microsoft', 'ny', 'office', 'paper', 'pens', 'potential', 'ride',
       'starbucks', 'steve', 'subscription', 'taxi', 'team', 'ticket'],
      dtype='object')

array([0.04788078, 0.12339354, 0.0255676 , 0.01644852, 0.05863334,
       0.04012635, 0.007901  , 0.07450459, 0.02064408, 0.01650796,
       0.05401744, 0.11330824, 0.00384615, 0.        , 0.00489718,
       0.        , 0.        , 0.02097785, 0.        , 0.        ,
       0.00830601, 0.01481128, 0.00761834, 0.02471865, 0.01416276,
       0.00848485, 0.        , 0.00771766, 0.12103044, 0.01119617,
       0.        , 0.06025599, 0.04579527, 0.02134273, 0.02590522])

In [69]:
"""
    Build classifier using SVM as hypothesis and select the hyperparameters using 5 fold cross validation.
    Since, we have very limited number of samples SVM (support vector machine) would be a good choice as it has been
    proven to work well in such situation. 
    There's a chance for model to overfit to training data, so it's important to tune regularization parameter 'C'
    cross validation to avoid overfitting as its often the case from small datasets
    'gamma' is the rbf kernel parameter that dictates the infleunce of a single parameter
"""

# Set the parameters by cross-validation
tuning_parameters = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-4],'C': [1, 10, 100]},
                    {'kernel': ['linear'], 'C': [1, 10, 100]}]
scores = ['precision', 'recall', 'f1']
           

In [72]:
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuning_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, Y_train)

    print("Best parameters set found on training set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on training set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full training set.")
    print("The scores are computed on the full validation set.")
    print()
    y_true, y_pred = Y_validation, clf.predict(X_validation)
    print(classification_report(y_true, y_pred))
    print()
    

# Tuning hyper-parameters for precision

Best parameters set found on training set:

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}

Grid scores on training set:

0.128 (+/-0.171) for {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.128 (+/-0.171) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.128 (+/-0.171) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.386 (+/-0.642) for {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
0.128 (+/-0.171) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.128 (+/-0.171) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.602 (+/-0.229) for {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
0.386 (+/-0.642) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.128 (+/-0.171) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.602 (+/-0.229) for {'C': 1, 'kernel': 'linear'}
0.602 (+/-0.229) for {'C': 10, 'kernel': 'linear'}
0.602 (+/-0.229) for {'C': 100, 'kernel': 'linear'}

Detailed classification report:

The model is trained on the full training set.
The scores are computed on the 

In [98]:
"""
    Since, the data is limited its a good idea to try ensemble approach known as Bagging which trains mulitple models 
    randomly selected proportion of dataset in terms of features and samples. 
    This often helps counteract the data limitation and curse ofdimensionality problems. It has a similar effect as 
    oversampling.
"""

# hypothesis: Bagging Classifier with decision tree as base estimator
print("classification model with Bagging Classifier")
# Set the parameters by cross-validation
tuning_parameters = [{'n_estimators': [10,100,500], 
                      'max_samples':[1,0.3,0.7], 
                      'max_features':[1,0.3,0.75]},
                     ]
scores = ['precision', 'recall', 'f1']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(BaggingClassifier(), tuning_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, Y_train)

    print("Best parameters set found on training set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on training set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full training set.")
    print("The scores are computed on the full validation set.")
    print()
    y_true, y_pred = Y_validation, clf.predict(X_validation)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision

Best parameters set found on training set:

{'max_features': 0.75, 'max_samples': 0.7, 'n_estimators': 10}

Grid scores on training set:

0.117 (+/-0.182) for {'max_features': 1, 'max_samples': 1, 'n_estimators': 10}
0.128 (+/-0.171) for {'max_features': 1, 'max_samples': 1, 'n_estimators': 100}
0.128 (+/-0.171) for {'max_features': 1, 'max_samples': 1, 'n_estimators': 500}
0.128 (+/-0.171) for {'max_features': 1, 'max_samples': 0.3, 'n_estimators': 10}
0.128 (+/-0.171) for {'max_features': 1, 'max_samples': 0.3, 'n_estimators': 100}
0.128 (+/-0.171) for {'max_features': 1, 'max_samples': 0.3, 'n_estimators': 500}
0.137 (+/-0.184) for {'max_features': 1, 'max_samples': 0.7, 'n_estimators': 10}
0.128 (+/-0.171) for {'max_features': 1, 'max_samples': 0.7, 'n_estimators': 100}
0.128 (+/-0.171) for {'max_features': 1, 'max_samples': 0.7, 'n_estimators': 500}
0.119 (+/-0.186) for {'max_features': 0.3, 'max_samples': 1, 'n_estimators': 10}
0.128 (+/-0

In [75]:
"""
    In order to discriminate between business and personal expenses when no labeled data is present is an unsupervised
    learning problem. 
    To tackle this, first features needs to be prepared that are apt for a clustering algorithm input. 
    There are 2 real valued features 'pre-tax amount' and 'tax amount' that are used along with the categorical features
    'tax name', 'role','category'. The categorical features need to binarized as we did with classification model in 
    order to map them to real values. 

    Also note 'expense description' can be vital for the clustering algorithm but it leads a high dimensional 
    feature space and with such a small dataset it wouldn't be feasible. But for a sufficiently large dataset, 
    same approach can be followed as in classification feature engineering ie generate vocabulary and binarize.
    
"""

min_max_scaler = MinMaxScaler()
training_data_clustering = training_data
training_data_clustering['pre-tax amount'] = min_max_scaler.\
                            fit_transform(np.array(training_data['pre-tax amount']).reshape(-1, 1))
training_data_clustering['tax amount'] = min_max_scaler.\
                            fit_transform(np.array(training_data['tax amount']).reshape(-1, 1)) 
training_data_clustering = pd.get_dummies(training_data_clustering, columns=['tax name', 'role','category'], \
                                          drop_first=True)
# drop columns 
training_data_clustering = training_data_clustering.drop(['employee id','date','type','expense description'],axis=1).\
                                    astype('float')
X_train_clustering = training_data_clustering.values

X_train_clustering.shape

(24, 10)

In [92]:
# kmeans Clustering algorithm is used with 2 clusters
kmeans = KMeans(n_clusters=2, random_state=0).fit(X_train_clustering)
cluster_index = kmeans.labels_
training_data['cluster_index'] = kmeans.labels_
# cluster index is an arbitrary attribute that distinguishes one cluster from the other
print("note:  cluster_index column shows the cluster a data point belongs to")
print()
print(training_data.drop(['type'],axis=1))

note:  cluster_index column shows the cluster a data point belongs to

          date                 category  employee id  \
0    11/1/2016                   Travel            7   
1   11/21/2016  Meals and Entertainment            7   
2   10/12/2016      Computer - Hardware            7   
3   11/15/2016  Meals and Entertainment            1   
4   12/11/2016      Computer - Software            1   
5    9/18/2016                   Travel            1   
6    11/7/2016                   Travel            1   
7   11/30/2016      Computer - Hardware            3   
8   11/14/2016      Computer - Software            3   
9    11/3/2016      Computer - Software            3   
10   9/30/2016          Office Supplies            3   
11   11/6/2016      Computer - Software            4   
12  11/12/2016                   Travel            4   
13  12/30/2016  Meals and Entertainment            4   
14  12/15/2016  Meals and Entertainment            4   
15   12/1/2016  Meals and Enterta

          date                 category  employee id  \
0    11/1/2016                   Travel            7   
1   11/21/2016  Meals and Entertainment            7   
2   10/12/2016      Computer - Hardware            7   
3   11/15/2016  Meals and Entertainment            1   
4   12/11/2016      Computer - Software            1   
5    9/18/2016                   Travel            1   
6    11/7/2016                   Travel            1   
7   11/30/2016      Computer - Hardware            3   
8   11/14/2016      Computer - Software            3   
9    11/3/2016      Computer - Software            3   
10   9/30/2016          Office Supplies            3   
11   11/6/2016      Computer - Software            4   
12  11/12/2016                   Travel            4   
13  12/30/2016  Meals and Entertainment            4   
14  12/15/2016  Meals and Entertainment            4   
15   12/1/2016  Meals and Entertainment            4   
16   12/8/2016  Meals and Entertainment         