In [1]:
from pyspark.ml.clustering import KMeans
from pyspark.sql import SparkSession

In [2]:
# Obtain the SparkSession used by every later cell (named "KMeansExample").
spark = (
    SparkSession.builder
    .appName("KMeansExample")
    .getOrCreate()
)

In [3]:
# Read the sample data with the "libsvm" data source.
# NOTE(review): the path is absolute and environment-specific -- adjust it
# when running this notebook on another machine.
dataset = spark.read.load(
    '/trainer/BigDataSparkSajan/sample_kmeans_data.txt',
    format="libsvm",
)

In [4]:
# Echo the DataFrame so its column names and types are displayed.
dataset

DataFrame[label: double, features: vector]

In [5]:
# Print the rows of the loaded data as a text table.
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [6]:
# Configure the k-means estimator: 2 clusters, with a fixed seed (1).
# Nothing is trained here; fitting happens when .fit() is called.
kmeans = KMeans(k=2, seed=1)

In [7]:
# Echo the configured estimator; its repr is displayed as the cell output.
kmeans

KMeans_449590648a14aa358045

In [8]:
# Fit the configured estimator on the loaded dataset; the result is the model
# used for predictions and cluster centers in the cells that follow.
model = kmeans.fit(dataset)

In [9]:
# Echo the fitted model; its repr is displayed as the cell output.
model

KMeans_449590648a14aa358045

In [10]:
# Make predictions: apply the fitted model to the same dataset it was trained
# on. The result carries an additional `prediction` column (see the table
# printed by the next cell).
predictions = model.transform(dataset)

In [11]:
# Print each row together with the cluster index assigned to it.
predictions.show()

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|           (3,[],[])|         0|
|  1.0|(3,[0,1,2],[0.1,0...|         0|
|  2.0|(3,[0,1,2],[0.2,0...|         0|
|  3.0|(3,[0,1,2],[9.0,9...|         1|
|  4.0|(3,[0,1,2],[9.1,9...|         1|
|  5.0|(3,[0,1,2],[9.2,9...|         1|
+-----+--------------------+----------+



In [12]:
# Retrieve the cluster centers learned by the model (nothing is displayed yet;
# the following cells show them).
centers = model.clusterCenters()

In [13]:
# Echo the raw value returned by clusterCenters().
centers

[array([ 0.1,  0.1,  0.1]), array([ 9.1,  9.1,  9.1])]

In [14]:
# Print a header line followed by every cluster center on its own line.
print("Cluster Centers: ", *centers, sep="\n")

Cluster Centers: 
[ 0.1  0.1  0.1]
[ 9.1  9.1  9.1]


In [15]:
# Stop the SparkSession now that the example is complete.
spark.stop()