## 1. Load dataset

In [12]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
import os

# Obtain the SparkContext explicitly instead of relying on a pre-defined `sc`:
# getOrCreate() returns the already-running context when the kernel provides
# one (e.g. a pyspark shell) and starts a new one otherwise, so this cell also
# works after "Restart Kernel -> Run All" on a plain Python kernel.
sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)


def _read_csv(filename):
    """Load ``../data/<filename>`` as a DataFrame.

    The first line of the file is treated as the header and column types are
    inferred from the data (``inferSchema='true'``).
    """
    return sqlCtx.read.load(os.path.join('..', 'data', filename),
                            format='com.databricks.spark.csv', header='true',
                            inferSchema='true')


df_train = _read_csv('task3_train.csv')
df_test = _read_csv('task3_test.csv')

In [14]:
# Input columns fed to the model, grouped by sensor. The data also contains
# 'UserID', 'UUID', 'Version' and 'TimeStemp', which are not used as features,
# and 'attack', which is the label column rather than a feature.
FEATURES = (
    # Gyroscope statistics
    ['GyroscopeStat_x_MEAN', 'GyroscopeStat_z_MEAN',
     'GyroscopeStat_COV_z_x', 'GyroscopeStat_COV_z_y']
    # Magnetic-field statistics
    + ['MagneticField_x_MEAN', 'MagneticField_z_MEAN',
       'MagneticField_COV_z_x', 'MagneticField_COV_z_y']
    # Pressure
    + ['Pressure_MEAN']
    # Linear-acceleration statistics
    + ['LinearAcceleration_COV_z_x', 'LinearAcceleration_COV_z_y',
       'LinearAcceleration_x_MEAN', 'LinearAcceleration_z_MEAN']
)

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Pack the selected columns into a single vector column named 'features';
# the same assembler is applied to the training and the test split.
assembler = VectorAssembler(inputCols=FEATURES, outputCol='features')
X_train, X_test = (assembler.transform(split) for split in (df_train, df_test))

# transform() keeps every input column (including the 'attack' label) and
# appends 'features', so no separate y_train / y_test frames are needed.
# Display the resulting column names to confirm.
X_train.columns

['UserID',
 'UUID',
 'Version',
 'TimeStemp',
 'GyroscopeStat_x_MEAN',
 'GyroscopeStat_z_MEAN',
 'GyroscopeStat_COV_z_x',
 'GyroscopeStat_COV_z_y',
 'MagneticField_x_MEAN',
 'MagneticField_z_MEAN',
 'MagneticField_COV_z_x',
 'MagneticField_COV_z_y',
 'Pressure_MEAN',
 'LinearAcceleration_COV_z_x',
 'LinearAcceleration_COV_z_y',
 'LinearAcceleration_x_MEAN',
 'LinearAcceleration_z_MEAN',
 'attack',
 'features']

In [15]:
from pyspark.ml.classification import DecisionTreeClassifier
# Decision-tree classifier configuration (not fitted yet).
dt = DecisionTreeClassifier(
    featuresCol='features',   # assembled feature-vector column
    labelCol='attack',        # target column
    impurity='gini',          # split criterion
    maxDepth=5,               # cap on tree depth
    minInstancesPerNode=20,   # each child of a split must keep >= 20 rows
)

In [16]:
from pyspark.ml import Pipeline

# Single-stage pipeline: the decision tree is the only stage, because the
# feature vector is already present in X_train.
pipeline = Pipeline(stages=[dt])
# Fit on the assembled training frame; `model` is the fitted pipeline.
model = pipeline.fit(X_train)

In [17]:
# Apply the fitted pipeline to the assembled test frame; the result carries a
# 'prediction' column alongside the original 'attack' label.
predictions = model.transform(X_test)

In [19]:
# Peek at the first 10 rows of predicted vs. actual labels.
predictions.select('prediction', 'attack').show(10)

+----------+------+
|prediction|attack|
+----------+------+
|       0.0|     0|
|       0.0|     0|
|       0.0|     0|
|       0.0|     0|
|       0.0|     0|
|       0.0|     0|
|       0.0|     0|
|       0.0|     0|
|       0.0|     0|
|       0.0|     0|
+----------+------+
only showing top 10 rows



In [20]:
# Persist predicted vs. actual labels as CSV (with a header row).
# mode='overwrite' keeps the cell re-runnable: without an explicit mode the
# writer raises an error when the output directory already exists, so a second
# run of the notebook would fail here.
predictions.select('prediction', 'attack').write.save(
    path=os.path.join('..', 'output', 'dt_predictions'),
    format='com.databricks.spark.csv', header='true',
    mode='overwrite')

In [23]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Score the test-set predictions by comparing the 'prediction' column with
# the 'attack' label, using accuracy as the metric.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='attack',
    predictionCol='prediction',
)
accuracy = evaluator.evaluate(predictions)
# NOTE(review): the prediction sample displayed earlier in this notebook is
# all class 0; if the labels are heavily imbalanced, accuracy alone can
# overstate model quality -- consider also reporting f1 / recall. TODO confirm.
print('Accuracy:', accuracy)

Accuracy: 0.9993914807302231
