## Purpose of script:
#### Reviewing Spark's K-means clustering implementation
#### Referencing Jose Portilla's "Spark and Python for Big Data with PySpark" course

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans

In [3]:
# Name the application 'clustering', then reuse an already-running session
# if there is one or start a new one otherwise.
spark = (
    SparkSession.builder
    .appName('clustering')
    .getOrCreate()
)

In [5]:
# Read the sample K-means dataset into a DataFrame; the LIBSVM source format
# is given through load()'s `format` keyword.
data = spark.read.load(
    '../Datasets/sample_kmeans_data.txt',
    format='libsvm',
)

In [6]:
# preview the loaded rows: a `label` column and a sparse `features` vector
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [7]:
# drop the label column: unsupervised learning does not require labels,
# so only the `features` column is kept (rebinds `data`)
data = data.select('features')

In [8]:
# Configure the K-means estimator through constructor keywords:
#   k    - number of clusters to find
#   seed - fixed random seed so repeated runs give consistent output
km = KMeans(k=2, seed=1)

In [9]:
# fit K-means on the features-only DataFrame; `kmm` is the fitted model
# used for the predictions and cluster centers below
kmm = km.fit(data)

In [12]:
# apply the fitted model: returns the input rows with an added `prediction`
# column holding each row's assigned cluster index
pdt = kmm.transform(data)

In [13]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [14]:
# evaluator with default parameters; its default metric is the silhouette score
evaluator = ClusteringEvaluator()

In [15]:
# model.computeCost() (within-set sum of squared errors) is deprecated and
# ClusteringEvaluator is the suggested replacement. Note that the evaluator
# does NOT return the WSSSE: it reports the silhouette score, which ranges
# from -1 to 1, with values near 1 meaning tight, well-separated clusters.
silhouette = evaluator.evaluate(pdt)
print(silhouette)

0.9997530305375207


In [16]:
# fetch the fitted cluster centers (a list with one coordinate array per
# cluster); the bare `centers` expression makes the cell display them
centers = kmm.clusterCenters()
centers

[array([9.1, 9.1, 9.1]), array([0.1, 0.1, 0.1])]

In [17]:
# show the transformed frame: the original `features` column plus the
# `prediction` column with each row's assigned cluster
pdt.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         1|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+

