In [1]:
from pyspark.sql import SparkSession

# Create (or reuse, if one already exists) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Cluster")
    .getOrCreate()
)

In [11]:
from pyspark.ml.clustering import KMeans

## Load data

In [3]:
import os

# List the working directory to confirm the sample data files are present.
os.listdir(".")

['.ipynb_checkpoints',
 'Clustering Code Along.ipynb',
 'Clustering_Code_Example.ipynb',
 'Clustering_Consulting_Project.ipynb',
 'Clustering_Consulting_Project_SOLUTIONS.ipynb',
 'hack_data.csv',
 'My_doc_example.ipynb',
 'sample_kmeans_data.txt',
 'seeds_dataset.csv',
 'seeds_dataset.txt']

In [4]:
# Read the sample K-Means data set through Spark's "libsvm" data source.
df = spark.read.load("sample_kmeans_data.txt", format="libsvm")

In [7]:
# Preview the loaded rows as a text table (label and features columns).
df.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [6]:
# Report the DataFrame's size as "<row count> <column count>".
print(f"{df.count()} {len(df.columns)}")

6 2


In [8]:
# Keep only the feature vectors; the label column is not passed to the clusterer.
data = df.select(df["features"])

In [9]:
# Preview the feature-only DataFrame that will be used for fitting.
data.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



## Train the K-Means model

In [14]:
# Configure the K-Means estimator: two clusters, with a fixed seed so the
# initialisation (and therefore the result) is reproducible across runs.
cl = KMeans(featuresCol="features", k=2, seed=1)

In [17]:
# Fit the configured K-Means estimator on the feature-only DataFrame.
model = cl.fit(data)

In [23]:
# Within Set Sum of Squared Errors (WSSSE) for the data the model was fit on.
#
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in
# Spark 3.0, so calling it unconditionally breaks on current PySpark.
# The training summary exposes the same quantity as `trainingCost`
# (available since Spark 2.4); fall back to computeCost() only on older
# versions where the summary does not have that attribute.
try:
    wsse = model.summary.trainingCost
except AttributeError:
    wsse = model.computeCost(data)
print(wsse)

0.11999999999994547


In [18]:
# Retrieve the fitted cluster centers (a list with one array per cluster).
centers = model.clusterCenters()

In [19]:
# Display the cluster centers as the cell's output.
centers

[array([0.1, 0.1, 0.1]), array([9.1, 9.1, 9.1])]

In [20]:
# Assign every row to a cluster: transform() returns the input rows with an
# added `prediction` column holding the cluster index.
pred = model.transform(data)

In [21]:
# Show each feature vector alongside its assigned cluster index.
pred.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         0|
|(3,[0,1,2],[0.1,0...|         0|
|(3,[0,1,2],[0.2,0...|         0|
|(3,[0,1,2],[9.0,9...|         1|
|(3,[0,1,2],[9.1,9...|         1|
|(3,[0,1,2],[9.2,9...|         1|
+--------------------+----------+

