# Distributed version of Question 1 (Spark)

This part is mostly done by searching and implementing as it is my first Spark experience (I was interested in learning basics of Spark).

In [16]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
import pandas as pd

In [2]:
spark = SparkSession.builder.appName('distributed-ml-challenge').getOrCreate()

Read both training and validation csv files.

In [3]:
train = spark.read.csv('training_data_example.csv', header = True, inferSchema = True)
train_count = train.count()

validation = spark.read.csv('validation_data_example.csv', header = True, inferSchema = True)
validation_count = validation.count()

print("number of training / validation samples: {0} / {1}".format(train_count, validation_count))

number of training / validation samples: 24 / 12


In [4]:
validation.show()

+----------+--------------------+-----------+--------------------+--------------+------------+----------+
|      date|            category|employee id| expense description|pre-tax amount|    tax name|tax amount|
+----------+--------------------+-----------+--------------------+--------------+------------+----------+
|11/10/2016|              Travel|          7|           Taxi ride|          30.0|NY Sales tax|      2.66|
|11/12/2016|Meals and Enterta...|          1|  Dinner with Family|         235.0|CA Sales tax|     30.55|
|  9/2/2016| Computer - Hardware|          4|Macbook Air Computer|        4000.0|CA Sales tax|     520.0|
|  9/2/2016|     Office Supplies|          4|               Paper|          20.0|CA Sales tax|       2.6|
|  9/2/2016|     Office Supplies|          4|                Pens|          20.0|CA Sales tax|       2.6|
|11/21/2016|              Travel|          1|Airplane ticket t...|         200.0|CA Sales tax|      26.0|
| 12/4/2016|Meals and Enterta...|          2| 

Concatenate both train and validation data frame in order to make changes easier. They will be separated in next steps for creating model and evaluating its performance.

In [5]:
whole_dataset = train.union(validation)

Code taken from this page: https://docs.databricks.com/spark/latest/mllib/binary-classification-mllib-pipelines.html

In [6]:
cols = whole_dataset.columns
categoricalColumns = ['expense description', 'tax name', 'category']
stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]
label_stringIdx = StringIndexer(inputCol = 'category', outputCol = 'label')
stages += [label_stringIdx]
numericCols = ['employee id', 'pre-tax amount', 'tax amount']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [7]:
pipeline = Pipeline(stages = stages)
pipelineModel = pipeline.fit(whole_dataset)
whole_dataset = pipelineModel.transform(whole_dataset)
selectedCols = ['label', 'features'] + cols
whole_dataset = whole_dataset.select(selectedCols)

Separate train and validation.

In [8]:
train, validation = whole_dataset.randomSplit([0.6, 0.4], seed = 0)
print("Training Dataset Count: {0}".format(train.count()))
print("Test Dataset Count: {0}".format(validation.count()))

Training Dataset Count: 24
Test Dataset Count: 12


In [20]:
models = {'Losistic Regression': LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10),
          'Random Forest': RandomForestClassifier(featuresCol='features', labelCol='label', numTrees=5)}

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [21]:
for key, value in models.items():
    print("\nClassification results using -> {0}".format(key))
    model = value
    model = model.fit(train)
    predictions_train = model.transform(train)
    predictions_validation = model.transform(validation)
    predictions_train.select('category','prediction').show()
    predictions_validation.select('category','prediction').show()
    
    
    train_acc = evaluator.evaluate(predictions_train)
    validation_acc = evaluator.evaluate(predictions_validation)
    print("Train accuracy = {0:.2f} \nValidation accuracy = {1:.2f}".format(train_acc, validation_acc))

    
    



Classification results using -> Losistic Regression
+--------------------+----------+
|            category|prediction|
+--------------------+----------+
|Meals and Enterta...|       0.0|
|Meals and Enterta...|       0.0|
|Meals and Enterta...|       0.0|
|Meals and Enterta...|       0.0|
|Meals and Enterta...|       0.0|
|              Travel|       1.0|
|              Travel|       1.0|
|              Travel|       1.0|
|              Travel|       1.0|
|              Travel|       1.0|
|              Travel|       1.0|
| Computer - Software|       2.0|
| Computer - Software|       2.0|
| Computer - Software|       2.0|
| Computer - Hardware|       3.0|
| Computer - Hardware|       3.0|
|Meals and Enterta...|       0.0|
|Meals and Enterta...|       0.0|
|Meals and Enterta...|       0.0|
|Meals and Enterta...|       0.0|
+--------------------+----------+
only showing top 20 rows

+--------------------+----------+
|            category|prediction|
+--------------------+----------+
|Me