In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Reuse the active SparkContext if one exists; otherwise start one whose
# master is "local[*]".
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local[*]")
)

# Session handle used by every spark.read / spark.sql call in this notebook.
spark = SparkSession.builder.getOrCreate()

In [2]:
# Load the data set and register it as the SQL view "df" so the cells below
# can query it with spark.sql.
df = spark.read.load('a2.parquet')
df.createOrReplaceTempView("df")

# Preview the first five rows through the SQL view.
preview = spark.sql("SELECT * from df")
preview.show(5)

+-----+-----------+-------------------+-------------------+-------------------+
|CLASS|   SENSORID|                  X|                  Y|                  Z|
+-----+-----------+-------------------+-------------------+-------------------+
|    0|         26| 380.66434005495194| -139.3470983812975|-247.93697521077704|
|    0|         29| 104.74324299209692| -32.27421440203938|-25.105013725863852|
|    0| 8589934658| 118.11469236129976| 45.916682927433534| -87.97203782706572|
|    0|34359738398| 246.55394030642543|-0.6122810693132044|-398.18662513951506|
|    0|17179869241|-190.32584900181487|  234.7849657520335|-206.34483804019288|
+-----+-----------+-------------------+-------------------+-------------------+
only showing top 5 rows



In [3]:
# Class distribution: number of rows for each value of the class column.
class_counts = spark.sql("SELECT count(class), class from df group by class")
class_counts.show()

+------------+-----+
|count(class)|class|
+------------+-----+
|        1416|    1|
|        1626|    0|
+------------+-----+



In [4]:
# Combine the X, Y and Z input columns into a single 'features' column,
# which is the assembler's output column.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=['X', 'Y', 'Z'],
    outputCol='features',
)

In [5]:
def evaluator(prediction):
    """Score a two-way clustering against the true class, ignoring label order.

    The query counts the rows where ``class != prediction`` and the rows where
    ``class = prediction``, keeps the larger of the two counts and divides it
    by the total number of rows. Taking the larger count means the score does
    not depend on which cluster id happens to line up with which class value.
    Note that the inner alias ``correct`` is used for both branches, so in the
    ``!=`` branch it actually holds the number of mismatches.

    Side effect: registers (or replaces) the temp view named 'prediction' on
    the notebook-level ``spark`` session.

    Parameters
    ----------
    prediction : DataFrame
        Must expose ``class`` and ``prediction`` columns.

    Returns
    -------
    The ``accuracy`` value of the single row produced by the query.
    """
    # The SQL below refers to the data by view name, so publish it first.
    prediction.createOrReplaceTempView('prediction')

    accuracy_query = '''
        select max(correct)/max(total) as accuracy from (

        select sum(correct) as correct, count(correct) as total from (
            select case when class != prediction then 1 else 0 end as correct from prediction 
        ) 
    
        union
    
        select sum(correct) as correct, count(correct) as total from (
            select case when class = prediction then 1 else 0 end as correct from prediction 
        ) 
    )
    '''

    # The outer aggregate yields exactly one row; pull its accuracy field.
    accuracy_values = spark.sql(accuracy_query).rdd.map(lambda row: row.accuracy).collect()
    return accuracy_values[0]

In [6]:
# KMeans clustering (k=2, fixed seed), run through a Pipeline.
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline

clust = KMeans(k=2, seed=1)
pipeline = Pipeline(stages=[assembler, clust])

# Fit on df, then assign a cluster id to each row of the same df.
model = pipeline.fit(df)
prediction = model.transform(df)

prediction.show(5)
print(evaluator(prediction))

+-----+-----------+-------------------+-------------------+-------------------+--------------------+----------+
|CLASS|   SENSORID|                  X|                  Y|                  Z|            features|prediction|
+-----+-----------+-------------------+-------------------+-------------------+--------------------+----------+
|    0|         26| 380.66434005495194| -139.3470983812975|-247.93697521077704|[380.664340054951...|         1|
|    0|         29| 104.74324299209692| -32.27421440203938|-25.105013725863852|[104.743242992096...|         0|
|    0| 8589934658| 118.11469236129976| 45.916682927433534| -87.97203782706572|[118.114692361299...|         1|
|    0|34359738398| 246.55394030642543|-0.6122810693132044|-398.18662513951506|[246.553940306425...|         1|
|    0|17179869241|-190.32584900181487|  234.7849657520335|-206.34483804019288|[-190.32584900181...|         1|
+-----+-----------+-------------------+-------------------+-------------------+--------------------+----

In [7]:
# Hierarchical clustering via bisecting KMeans (k=2, fixed seed).
from pyspark.ml.clustering import BisectingKMeans

bkm = BisectingKMeans(k=2, seed=1)
pipeline = Pipeline(stages=[assembler, bkm])

# Fit on df, then assign a cluster id to each row of the same df.
model = pipeline.fit(df)
prediction2 = model.transform(df)

prediction2.show(5)
print(evaluator(prediction2))

+-----+-----------+-------------------+-------------------+-------------------+--------------------+----------+
|CLASS|   SENSORID|                  X|                  Y|                  Z|            features|prediction|
+-----+-----------+-------------------+-------------------+-------------------+--------------------+----------+
|    0|         26| 380.66434005495194| -139.3470983812975|-247.93697521077704|[380.664340054951...|         0|
|    0|         29| 104.74324299209692| -32.27421440203938|-25.105013725863852|[104.743242992096...|         0|
|    0| 8589934658| 118.11469236129976| 45.916682927433534| -87.97203782706572|[118.114692361299...|         0|
|    0|34359738398| 246.55394030642543|-0.6122810693132044|-398.18662513951506|[246.553940306425...|         0|
|    0|17179869241|-190.32584900181487|  234.7849657520335|-206.34483804019288|[-190.32584900181...|         0|
+-----+-----------+-------------------+-------------------+-------------------+--------------------+----

In [8]:
# use GaussianMixture model
from pyspark.ml.clustering import GaussianMixture

# Pin the seed, as the KMeans and BisectingKMeans cells do, so that a
# Restart & Run All uses the same seed every time. Outputs recorded
# from an earlier unseeded run may differ slightly from a fresh run.
gmm = GaussianMixture().setK(2).setSeed(1)
pipeline = Pipeline(stages=[assembler,gmm])

# Fit on df, then assign a mixture component to each row of the same df.
model = pipeline.fit(df)
prediction3 = model.transform(df)
print(evaluator(prediction3))

0.9927679158448389


Let's break down the nested query inside the evaluator function, step by step.

In [11]:
# Publish the GaussianMixture predictions under the view name used by the
# evaluator's SQL.
prediction3.createOrReplaceTempView('prediction')

# Step 1 (innermost selects): flag every row with 1 or 0.
#   step1a -> 1 where class equals prediction
#   step1b -> 1 where class differs from prediction
step1a = spark.sql(
    'select case when class = prediction then 1 else 0 end as correct from prediction'
)
step1b = spark.sql(
    'select case when class != prediction then 1 else 0 end as correct from prediction'
)

step1a.show(4)
step1b.show(4)

+-------+
|correct|
+-------+
|      0|
|      0|
|      0|
|      0|
+-------+
only showing top 4 rows

+-------+
|correct|
+-------+
|      1|
|      1|
|      1|
|      1|
+-------+
only showing top 4 rows



In [13]:
# Make the per-row flags queryable by name.
step1a.createOrReplaceTempView('step1a')
step1b.createOrReplaceTempView('step1b')

# Step 2: collapse each flag column into one row holding the number of
# flagged rows (sum) and the number of rows overall (count).
step2a = spark.sql(
    'select sum(correct) as correct, count(correct) as total from step1a'
)
step2b = spark.sql(
    'select sum(correct) as correct, count(correct) as total from step1b'
)

step2a.show(4)
step2b.show(4)

+-------+-----+
|correct|total|
+-------+-----+
|     22| 3042|
+-------+-----+

+-------+-----+
|correct|total|
+-------+-----+
|   3020| 3042|
+-------+-----+



In [17]:
# Make both one-row aggregates queryable by name.
step2a.createOrReplaceTempView('step2a')
step2b.createOrReplaceTempView('step2b')

# Step 3: stack the two aggregates into a single result with UNION.
step3 = spark.sql(
    'select * from step2a union select * from step2b'
)
step3.show(4)

+-------+-----+
|correct|total|
+-------+-----+
|     22| 3042|
|   3020| 3042|
+-------+-----+



In [18]:
# Make the stacked aggregates queryable by name.
step3.createOrReplaceTempView('step3')

# Step 4 (outermost select): divide the larger of the two counts by the
# total row count.
step4 = spark.sql(
    'select max(correct)/max(total) as accuracy from step3'
)
step4.show()

+------------------+
|          accuracy|
+------------------+
|0.9927679158448389|
+------------------+



In [19]:
# Final step: step4 holds a single row; extract its accuracy field as a
# plain Python value (the bare last expression is what the cell displays).
accuracy_values = step4.rdd.map(lambda row: row.accuracy).collect()
accuracy_values[0]

0.9927679158448389