In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Get (or reuse) a SparkContext whose master is "local[*]".
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local[*]")
)

# Get (or reuse) the SparkSession used by every cell below.
spark = SparkSession.builder.getOrCreate()

In [133]:
# Shell command that fetches a2.parquet, the file loaded in the next cell.
# It is commented out, so it does not run; uncomment it to download the file.
#!wget https://github.com/IBM/coursera/raw/master/coursera_ml/a2.parquet

In [134]:
# Load the data set from the local file a2.parquet.
df = spark.read.load('a2.parquet')

# Expose the frame to Spark SQL under the view name "df".
df.createOrReplaceTempView("df")

# Preview the first five rows through a SQL query against that view.
(
    spark
    .sql("SELECT * from df")
    .show(5)
)

+-----+-----------+-------------------+-------------------+-------------------+
|CLASS|   SENSORID|                  X|                  Y|                  Z|
+-----+-----------+-------------------+-------------------+-------------------+
|    0|         26| 380.66434005495194| -139.3470983812975|-247.93697521077704|
|    0|         29| 104.74324299209692| -32.27421440203938|-25.105013725863852|
|    0| 8589934658| 118.11469236129976| 45.916682927433534| -87.97203782706572|
|    0|34359738398| 246.55394030642543|-0.6122810693132044|-398.18662513951506|
|    0|17179869241|-190.32584900181487|  234.7849657520335|-206.34483804019288|
+-----+-----------+-------------------+-------------------+-------------------+
only showing top 5 rows



In [135]:
# Option 1: rename the CLASS column to "label"; the values are left unchanged.
df1 = df.withColumnRenamed(existing='CLASS', new='label')
df1.show(n=5)

+-----+-----------+-------------------+-------------------+-------------------+
|label|   SENSORID|                  X|                  Y|                  Z|
+-----+-----------+-------------------+-------------------+-------------------+
|    0|         26| 380.66434005495194| -139.3470983812975|-247.93697521077704|
|    0|         29| 104.74324299209692| -32.27421440203938|-25.105013725863852|
|    0| 8589934658| 118.11469236129976| 45.916682927433534| -87.97203782706572|
|    0|34359738398| 246.55394030642543|-0.6122810693132044|-398.18662513951506|
|    0|17179869241|-190.32584900181487|  234.7849657520335|-206.34483804019288|
+-----+-----------+-------------------+-------------------+-------------------+
only showing top 5 rows



In [136]:
# Option 2: keep CLASS and add a separate "label" column built by a StringIndexer.
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(
    inputCol='CLASS',
    outputCol='label',
)

# Fit the indexer on the full frame, then apply it to that same frame.
df2 = (
    indexer
    .fit(df)
    .transform(df)
)
df2.show(5)

+-----+-----------+-------------------+-------------------+-------------------+-----+
|CLASS|   SENSORID|                  X|                  Y|                  Z|label|
+-----+-----------+-------------------+-------------------+-------------------+-----+
|    0|         26| 380.66434005495194| -139.3470983812975|-247.93697521077704|  0.0|
|    0|         29| 104.74324299209692| -32.27421440203938|-25.105013725863852|  0.0|
|    0| 8589934658| 118.11469236129976| 45.916682927433534| -87.97203782706572|  0.0|
|    0|34359738398| 246.55394030642543|-0.6122810693132044|-398.18662513951506|  0.0|
|    0|17179869241|-190.32584900181487|  234.7849657520335|-206.34483804019288|  0.0|
+-----+-----------+-------------------+-------------------+-------------------+-----+
only showing top 5 rows



In [137]:
from pyspark.ml.feature import VectorAssembler,Normalizer

# Pack the three sensor axes X, Y and Z into a single vector column "features".
assembler = VectorAssembler(
    inputCols=['X', 'Y', 'Z'],
    outputCol='features',
)

# Normalize the "features" vectors into a new column "features_norm".
normalize = Normalizer(
    inputCol='features',
    outputCol='features_norm',
)


In [138]:
# Logistic-regression classifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# elasticNetParam is not set here (a value of 0.8 is commented out in the original call).
# NOTE(review): no featuresCol is passed, so the classifier presumably reads the
# default "features" column and the "features_norm" output of `normalize` goes
# unused - confirm whether featuresCol='features_norm' was intended.
classifier = LogisticRegression(maxIter=100, regParam=0.1)

# Stages run in order: index the label, assemble the vector, normalize, classify.
pipeline = Pipeline(stages=[indexer, assembler, normalize, classifier])
model = pipeline.fit(df)

# Predictions are made on the same DataFrame the pipeline was fitted on,
# so the score below is a training-set accuracy.
prediction = model.transform(df)

# Accuracy evaluator comparing "prediction" against "label"; reused by later cells.
binEval = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="label",
)
binEval.evaluate(prediction)

0.534516765285996

In [139]:
# Using linear Support vector machine
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import OneHotEncoder

# One-hot encoder for the indexed label. It stays defined so that any cell
# referring to `encoder` keeps working, but it is deliberately NOT a pipeline
# stage: its output column "labelVec" is not read by the classifier, which is
# built without a labelCol and therefore uses the default label column
# ("label", produced by `indexer`). Running it only added an unused column.
encoder = OneHotEncoder(inputCol='label', outputCol='labelVec')

lsvc = LinearSVC(maxIter=10, regParam=0.1)

# Only the stages whose outputs the classifier consumes: "label" and "features".
pipeline = Pipeline(stages=[indexer, assembler, lsvc])
model2 = pipeline.fit(df)

# Scored on the same DataFrame used for fitting, i.e. training-set accuracy.
predict = model2.transform(df)
binEval.evaluate(predict)

0.534516765285996

In [140]:
# Using Gradient boosted tree
from pyspark.ml.classification import GBTClassifier

gbtc = GBTClassifier(labelCol='label', featuresCol='features', maxIter=10)

# The pipeline holds only the stages whose outputs the classifier reads:
# `indexer` yields "label" and `assembler` yields "features". No one-hot
# encoding of the label is included, because its output column would not be
# read by the classifier and it would tie this cell to an `encoder` object
# that is not defined here.
pipeline = Pipeline(stages=[indexer, assembler, gbtc])
model3 = pipeline.fit(df)

# Scored on the same DataFrame used for fitting, i.e. training-set accuracy.
predict = model3.transform(df)
binEval.evaluate(predict)

0.9986850756081526

In [98]:
# Disabled preview: uncomment to print the first four rows of `prediction`.
#prediction.show(4)