In [1]:
#imports

from numpy import array
from math import sqrt

import pandas as pd                                                                                                
import numpy as np

from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark import SparkContext, SparkConf

from pyspark.ml.linalg import Vectors
from pyspark.sql import SQLContext

import preprocess

In [2]:
# load the data into Pandas dataframes
t_data = pd.read_csv('training_data_example.csv')
v_data = pd.read_csv('validation_data_example.csv')
e_data = pd.read_csv('employee.csv', index_col='employee id')

# Problem description

Here I am assuming that data is small enough that I can fit into single computer. However, I want to take advantage of the distributed computing power available through HPC. Therefore, I run Kmeans clustering algorithm using pyspark and present the results. It is not surprising to see that the clusters formed by pyspark's Kmeans are exactly identical to that of sklearn's as shown below.

In [3]:
# creating spark context

conf = SparkConf()
sc = SparkContext(conf=conf)
sqlCtx = SQLContext(sc)


In [4]:
# Obtain the processed features (excluding expense description) for training and validation datasets
(Xtrain, Xval) = preprocess.data_matrix(t_data, v_data, e_data)
with open("glove.6B.50d.txt", "rb") as lines:
    w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
           for line in lines}
embed_tfidf = preprocess.TfidfEmbeddingVectorizer(w2v)

# merge the "expense description" and "category" columns
t_data['description'] = t_data['expense description'] + " " + t_data['category']
v_data['description'] = v_data['expense description'] + " " + v_data['category']

# combine training and validation data
data = [t_data, v_data]
c_data = pd.concat(data)

embed_tfidf.fit(t_data['description'])
c_embed = embed_tfidf.transform(c_data['description'])

In [5]:
# Convert the numpy array to Spark rdd
#dff = map(lambda x: (int(x[0]), Vectors.dense(x)), c_embed)
#parsedData = sqlCtx.createDataFrame(dff)
parsedData = sc.parallelize(c_embed)

In [6]:
# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=100, initializationMode="random", epsilon=1e-3)

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Sum of Squared Error between the points and cluster center = " + str(WSSSE))

Sum of Squared Error between the points and cluster center = 25.7618534834


In [7]:
# collect the clusters
points = []
for point in range(len(c_embed)):
    points.append(clusters.predict(parsedData.collect()[point]))
points = np.array(points)
cluster1 = c_data[points==1]
cluster2 = c_data[points==0]

In [8]:
cluster1.drop(['description'], axis=1)

Unnamed: 0,date,category,employee id,expense description,pre-tax amount,tax name,tax amount
2,11/30/2016,Computer - Hardware,3,HP Laptop Computer,999.0,CA Sales tax,129.87
3,11/14/2016,Computer - Software,3,Microsoft Office,899.0,CA Sales tax,116.87
4,11/6/2016,Computer - Software,4,Dropbox Subscription,50.0,CA Sales tax,6.5
5,11/3/2016,Computer - Software,3,Dropbox Subscription,50.0,CA Sales tax,6.5
9,10/4/2016,Travel,6,Flight to Miami,200.0,CA Sales tax,26.0
10,10/12/2016,Computer - Hardware,7,Macbook Air Computer,1999.0,NY Sales tax,177.41
11,12/11/2016,Computer - Software,1,iCloud Subscription,15.0,CA Sales tax,1.95
13,9/30/2016,Office Supplies,3,Paper,200.0,CA Sales tax,26.0
15,11/6/2016,Computer - Hardware,6,iPhone,200.0,CA Sales tax,26.0
17,12/3/2016,Meals and Entertainment,5,Starbucks coffee,4.0,CA Sales tax,0.52


In [9]:
cluster2.drop(['description'], axis=1)

Unnamed: 0,date,category,employee id,expense description,pre-tax amount,tax name,tax amount
0,11/1/2016,Travel,7,Taxi ride,40.0,NY Sales tax,3.55
1,11/15/2016,Meals and Entertainment,1,Team lunch,235.0,CA Sales tax,30.55
6,12/9/2016,Meals and Entertainment,6,Coffee with Steve,300.0,CA Sales tax,39.0
7,11/12/2016,Travel,4,Taxi ride,230.0,CA Sales tax,29.9
8,11/21/2016,Meals and Entertainment,7,Client dinner,200.0,NY Sales tax,17.75
12,9/18/2016,Travel,1,Taxi ride,60.0,CA Sales tax,7.8
14,12/30/2016,Meals and Entertainment,4,Dinner with potential client,200.0,CA Sales tax,26.0
16,11/7/2016,Travel,1,Airplane ticket to NY,200.0,CA Sales tax,26.0
18,12/18/2016,Travel,6,Airplane ticket to NY,1500.0,CA Sales tax,195.0
19,12/15/2016,Meals and Entertainment,4,Dinner with client,200.0,CA Sales tax,26.0
