In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook (or reuse one that is already
# running), registered under the application name 'cluster'.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [3]:
from pyspark.ml.clustering import KMeans

In [4]:
# Read the sample k-means data file, which is stored in LIBSVM format,
# into a DataFrame.
dataset = spark.read.load("sample_kmeans_data.txt", format='libsvm')

In [5]:
# Preview the loaded rows (up to 20, with long cell values truncated).
dataset.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [6]:
# Keep only the 'features' vector column; this is the frame that is
# passed to KMeans below, so the 'label' column is left out.
final_data = dataset.select(dataset['features'])

In [7]:
# Preview the features-only frame (up to 20 rows, long values truncated).
final_data.show(n=20, truncate=True)

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



In [8]:
# Configure the k-means estimator:
#   k    = 2   -> number of clusters to find
#   seed = 120 -> fixed random seed so every run gives the same result
kmeans = KMeans(k=2, seed=120)

In [9]:
# Fit the configured estimator on the feature vectors to obtain the
# trained clustering model.
model = kmeans.fit(dataset=final_data)

In [10]:
# Evaluate the clustering with the Within Set Sum of Squared Errors (WSSSE).
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in
# Spark 3.0, so fall back to the training summary when it is unavailable.
# `model` was fitted on `final_data`, so the summary's training cost covers
# the same rows that computeCost() is given here.
if hasattr(model, 'computeCost'):
    wssse = model.computeCost(final_data)
else:
    wssse = model.summary.trainingCost
print(wssse)

0.11999999999994547


In [11]:
# Inspect the cluster centers of the fitted model. The bare `centers`
# expression on the last line makes the notebook display the value
# (one array per cluster, as shown in the cell output).
centers = model.clusterCenters()
centers

[array([0.1, 0.1, 0.1]), array([9.1, 9.1, 9.1])]

In [12]:
# Assign every row to a cluster: transform() returns the input frame with
# an added 'prediction' column, which is then previewed.
results = model.transform(dataset=final_data)
results.show(n=20, truncate=True)

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         0|
|(3,[0,1,2],[0.1,0...|         0|
|(3,[0,1,2],[0.2,0...|         0|
|(3,[0,1,2],[9.0,9...|         1|
|(3,[0,1,2],[9.1,9...|         1|
|(3,[0,1,2],[9.2,9...|         1|
+--------------------+----------+



In [13]:
# Re-fit k-means for several values of k (2 through 5) with the same seed,
# printing the cost, the cluster centers and the per-row assignments each
# time. Note: this loop rebinds the notebook-level names kmeans, model,
# wssse, centers and results to the values from the last iteration.

for k in range(2, 6):
    print('-' * 30)
    print('k = ', k)
    kmeans = KMeans().setK(k).setSeed(120)
    model = kmeans.fit(final_data)
    # KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in
    # Spark 3.0; fall back to the training summary when it is unavailable.
    # The model was fitted on final_data, so the training cost covers the
    # same rows computeCost() is given here.
    if hasattr(model, 'computeCost'):
        wssse = model.computeCost(final_data)
    else:
        wssse = model.summary.trainingCost
    print(wssse)
    centers = model.clusterCenters()
    print(centers)
    results = model.transform(final_data)
    results.show()
    print('-' * 30)

------------------------------
k =  2
0.11999999999994547
[array([0.1, 0.1, 0.1]), array([9.1, 9.1, 9.1])]
+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         0|
|(3,[0,1,2],[0.1,0...|         0|
|(3,[0,1,2],[0.2,0...|         0|
|(3,[0,1,2],[9.0,9...|         1|
|(3,[0,1,2],[9.1,9...|         1|
|(3,[0,1,2],[9.2,9...|         1|
+--------------------+----------+

------------------------------
------------------------------
k =  3
0.07499999999994544
[array([0.05, 0.05, 0.05]), array([9.1, 9.1, 9.1]), array([0.2, 0.2, 0.2])]
+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         0|
|(3,[0,1,2],[0.1,0...|         0|
|(3,[0,1,2],[0.2,0...|         2|
|(3,[0,1,2],[9.0,9...|         1|
|(3,[0,1,2],[9.1,9...|         1|
|(3,[0,1,2],[9.2,9...|         1|
+--------------------+----------+

------------------------------
---------------

In [14]:
# end of 14.51
# next : 14.52