In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [15]:
spark = SparkSession.builder.appName("multilayer_perceptron").getOrCreate()

In [16]:
df = spark.read.csv('test.csv', header=True, inferSchema=True)
df.show(5)

+---+--------------------+--------------------+---+---+------+------+----+------+-----+
|_c0|        rotationRate|    userAcceleration|act| id|weight|height| age|gender|trial|
+---+--------------------+--------------------+---+---+------+------+----+------+-----+
|  0|0.010253424306055027|0.006959199379238966|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|
|  1|0.010920351047470954|0.010672920359489243|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|
|  2|0.008376644793710666|0.007009658764875...|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|
|  3|0.006554577255628314|0.014892331247994722|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|
|  4|0.007723848846268292|0.013001225519157802|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|
+---+--------------------+--------------------+---+---+------+------+----+------+-----+
only showing top 5 rows



In [23]:
vectorAssembler = VectorAssembler(inputCols = ['rotationRate','userAcceleration','weight','height','age'], outputCol = 'features')
v_df = vectorAssembler.transform(df)
v_df.show(5)
# Let's split our data into training data and testing data
indexer = StringIndexer(inputCol = 'act', outputCol ='label')
i_v_df = indexer.fit(v_df).transform(v_df)
i_v_df.show(5)

+---+--------------------+--------------------+---+---+------+------+----+------+-----+--------------------+
|_c0|        rotationRate|    userAcceleration|act| id|weight|height| age|gender|trial|            features|
+---+--------------------+--------------------+---+---+------+------+----+------+-----+--------------------+
|  0|0.010253424306055027|0.006959199379238966|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|[0.01025342430605...|
|  1|0.010920351047470954|0.010672920359489243|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|[0.01092035104747...|
|  2|0.008376644793710666|0.007009658764875...|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|[0.00837664479371...|
|  3|0.006554577255628314|0.014892331247994722|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|[0.00655457725562...|
|  4|0.007723848846268292|0.013001225519157802|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|[0.00772384884626...|
+---+--------------------+--------------------+---+---+------+------+----+------+-----+--------------------+
only showing top 5 

In [24]:
i_v_df.select('act','label').groupBy('act','label').count().show()
splits = i_v_df.randomSplit([0.6,0.4],1)
train_df = splits[0]
test_df = splits[1]
print(train_df.count(),test_df.count(), i_v_df.count())

+---+-----+------+
|act|label| count|
+---+-----+------+
|1.0|  1.0|224816|
|4.0|  2.0|158645|
|5.0|  3.0|104327|
|3.0|  4.0| 58204|
|0.0|  0.0|234657|
|2.0|  5.0| 50246|
+---+-----+------+

498552 332343 830895


In [31]:
layers = [5,5,5,6]
mlp = MultilayerPerceptronClassifier().setLayers(layers).setSeed(13).setMaxIter(10)
mlp_model = mlp.fit(train_df)
pred_df = mlp_model.transform(test_df)
print(pred_df)
pred_df.select('Id','features','label','rawPrediction','probability','prediction').show(5)

DataFrame[_c0: int, rotationRate: double, userAcceleration: double, act: double, id: double, weight: double, height: double, age: double, gender: double, trial: double, features: vector, label: double, rawPrediction: vector, probability: vector, prediction: double]
+---+--------------------+-----+--------------------+--------------------+----------+
| Id|            features|label|       rawPrediction|         probability|prediction|
+---+--------------------+-----+--------------------+--------------------+----------+
|0.0|[0.01025342430605...|  0.0|[0.60496897970682...|[0.28166146587660...|       0.0|
|0.0|[0.00772384884626...|  0.0|[0.60496897970682...|[0.28166146587660...|       0.0|
|0.0|[0.00854399678136...|  0.0|[0.60496897970682...|[0.28166146587660...|       0.0|
|0.0|[0.01131570452954...|  0.0|[0.60496897970682...|[0.28166146587660...|       0.0|
|0.0|[0.01605642877479...|  0.0|[0.60496897970682...|[0.28166146587660...|       0.0|
+---+--------------------+-----+--------------

In [32]:
evaluator = MulticlassClassificationEvaluator(labelCol = 'label', predictionCol = 'prediction', metricName = 'accuracy')
mlpacc = evaluator.evaluate(pred_df)
print(mlpacc)

0.28379716136641964
