In [1]:
import findspark

In [2]:
import os

# Point findspark at the Spark installation. Prefer the SPARK_HOME environment
# variable so the notebook is not tied to one machine's directory layout; fall
# back to the original local install path when SPARK_HOME is unset or empty.
findspark.init(os.environ.get('SPARK_HOME') or '/home/sushant/spark-2.1.0-bin-hadoop2.7')

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Obtain the SparkSession used throughout this notebook, registered under the
# application name 'kmeans'.
spark = (
    SparkSession.builder
    .appName('kmeans')
    .getOrCreate()
)

In [5]:
from pyspark.ml.clustering import KMeans

In [6]:
# Print the current working directory; the relative data path used in the
# load cell below is resolved against it.
!pwd

/home/sushant/Documents/SparkUdemyCourse


In [7]:
# Read the sample k-means dataset (stored in LIBSVM format) into a DataFrame.
# The path is relative to the notebook's working directory.
data = spark.read.load(
    './Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Clustering/sample_kmeans_data.txt',
    format='libsvm',
)

In [8]:
# Preview the loaded DataFrame (columns: label, features).
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



Since this is an unsupervised learning problem, we don't actually need the labels.

In [9]:
# Keep only the 'features' column; the 'label' column is not used for clustering.
dataUnlabeled = data.select('features')

In [10]:
# Show the first 3 rows to confirm that only the features column remains.
dataUnlabeled.show(3)

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
+--------------------+
only showing top 3 rows



In [11]:
# Configure the estimator: k=2 requests two clusters, and seed=1 fixes the
# random number generator that chooses the initial centroids, so repeated
# runs produce the same result.
kmeans = KMeans(k=2, seed=1)

In [13]:
# Fit the configured k-means estimator on the unlabeled feature vectors.
model = kmeans.fit(dataUnlabeled)

In [14]:
# Evaluate the fitted model on the same data it was trained on; the result is
# stored as the within-set sum of squared errors.
# NOTE(review): computeCost is deprecated from Spark 2.4 and removed in 3.0 —
# fine for the Spark 2.1.0 install used here, but confirm before upgrading.
withinSetSumOfSquareErrors = model.computeCost(dataUnlabeled)

In [15]:
# Print the within-set sum of squared errors computed for the fitted model.
print(withinSetSumOfSquareErrors)

0.11999999999994547


In [16]:
# Show every row of the unlabeled data for reference before inspecting the cluster centers.
dataUnlabeled.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



Note that the features are three-dimensional, so the cluster centers will also lie in three-dimensional space.

In [17]:
# List the learned cluster centers: one array per cluster.
model.clusterCenters()

[array([0.1, 0.1, 0.1]), array([9.1, 9.1, 9.1])]

Note that we set K = 2, so every row is assigned to one of two clusters (prediction 0 or 1).

In [18]:
# Apply the fitted model to the data and display the result, which adds a
# 'prediction' column next to 'features'.
model.transform(dataUnlabeled).show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         0|
|(3,[0,1,2],[0.1,0...|         0|
|(3,[0,1,2],[0.2,0...|         0|
|(3,[0,1,2],[9.0,9...|         1|
|(3,[0,1,2],[9.1,9...|         1|
|(3,[0,1,2],[9.2,9...|         1|
+--------------------+----------+

