## DEMO 1: sample_kmeans_data.txt

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook (or reuse one that is already running).
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [3]:
from pyspark.ml.clustering import KMeans

In [4]:
# Read the libsvm-formatted sample file into a DataFrame.
dataset = spark.read.load("sample_kmeans_data.txt", format='libsvm')

In [5]:
# Preview the loaded DataFrame.
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [6]:
# Keep only the feature vectors as the clustering input.
final_data = dataset.select(dataset['features'])

In [7]:
# Preview the features-only DataFrame.
final_data.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



In [8]:
# Configure the K-means estimator: k = 2 clusters, with a fixed seed (120)
# so that the result is the same on every run.
kmeans = KMeans(k=2, seed=120)

In [9]:
# Train the model: fit the configured K-means estimator on the feature vectors.
model = kmeans.fit(final_data)

In [10]:
# Evaluate: within-set sum of squared errors (WSSSE) on the training data.
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in 3.0,
# so fall back to the training summary's cost when the method is unavailable.
if hasattr(model, 'computeCost'):
    wssse = model.computeCost(final_data)
else:
    wssse = model.summary.trainingCost
print(wssse)

0.11999999999994547


In [11]:
# Inspect the cluster centers learned by the model
# (the bare name on the last line makes the notebook display the value).
centers = model.clusterCenters()
centers

[array([0.1, 0.1, 0.1]), array([9.1, 9.1, 9.1])]

In [12]:
# Show the result: transform() appends a 'prediction' column holding the
# cluster index assigned to each row.
results = model.transform(final_data)
results.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         0|
|(3,[0,1,2],[0.1,0...|         0|
|(3,[0,1,2],[0.2,0...|         0|
|(3,[0,1,2],[9.0,9...|         1|
|(3,[0,1,2],[9.1,9...|         1|
|(3,[0,1,2],[9.2,9...|         1|
+--------------------+----------+



In [13]:
# Re-run the clustering for several values of k (2..5) and, for each one,
# print the WSSSE, the cluster centers and the per-row predictions.

for k in range(2, 6):
    print('-' * 30)
    print('k = ', k)
    kmeans = KMeans().setK(k).setSeed(120)  # same seed as above, for repeatable runs
    model = kmeans.fit(final_data)
    # computeCost() was deprecated in Spark 2.4 and removed in 3.0; on newer
    # versions read the training cost from the model summary instead.
    if hasattr(model, 'computeCost'):
        wssse = model.computeCost(final_data)
    else:
        wssse = model.summary.trainingCost
    print(wssse)
    centers = model.clusterCenters()
    print(centers)
    results = model.transform(final_data)
    results.show()
    print('-' * 30)

------------------------------
k =  2
0.11999999999994547
[array([0.1, 0.1, 0.1]), array([9.1, 9.1, 9.1])]
+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         0|
|(3,[0,1,2],[0.1,0...|         0|
|(3,[0,1,2],[0.2,0...|         0|
|(3,[0,1,2],[9.0,9...|         1|
|(3,[0,1,2],[9.1,9...|         1|
|(3,[0,1,2],[9.2,9...|         1|
+--------------------+----------+

------------------------------
------------------------------
k =  3
0.07499999999994544
[array([0.05, 0.05, 0.05]), array([9.1, 9.1, 9.1]), array([0.2, 0.2, 0.2])]
+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         0|
|(3,[0,1,2],[0.1,0...|         0|
|(3,[0,1,2],[0.2,0...|         2|
|(3,[0,1,2],[9.0,9...|         1|
|(3,[0,1,2],[9.1,9...|         1|
|(3,[0,1,2],[9.2,9...|         1|
+--------------------+----------+

------------------------------
---------------

In [14]:
# end of 14.51
# next : 14.52

## DEMO 2: Seeds dataset (seeds_dataset.csv)

In [15]:
# data source 
# https://archive.ics.uci.edu/ml/datasets/seeds
# https://github.com/datapolitan/lede_algorithms/blob/master/class3_2/data/seeds_dataset.txt

In [16]:
# Load the seeds CSV: the first row holds the column names and the column
# types are inferred from the data.
dataset2 = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True)
    .load("seeds_dataset.csv")
)

In [17]:
# Inspect the column names and the inferred types.
dataset2.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_kernel_groove: double (nullable = true)



In [18]:
# Preview the first rows of the seeds data.
dataset2.show()

+-----+---------+-----------+----------------+---------------+---------------------+-----------------------+
| area|perimeter|compactness|length_of_kernel|width_of_kernel|asymmetry_coefficient|length_of_kernel_groove|
+-----+---------+-----------+----------------+---------------+---------------------+-----------------------+
|15.26|    14.84|      0.871|           5.763|          3.312|                2.221|                   5.22|
|14.88|    14.57|     0.8811|           5.554|          3.333|                1.018|                  4.956|
|14.29|    14.09|      0.905|           5.291|          3.337|                2.699|                  4.825|
|13.84|    13.94|     0.8955|           5.324|          3.379|                2.259|                  4.805|
|16.14|    14.99|     0.9034|           5.658|          3.562|                1.355|                  5.175|
|14.38|    14.21|     0.8951|           5.386|          3.312|                2.462|                  4.956|
|14.69|    14.49|  

In [19]:
# import library 
from pyspark.ml.feature import VectorAssembler

In [20]:
# List the column names; all of them are used as the assembler's input columns below.
dataset2.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_kernel_groove']

In [21]:
######## transform data to the form that spark ML can train ########
# Pack every column of dataset2 into a single vector column named 'features'.
assembler = (
    VectorAssembler()
    .setInputCols(dataset2.columns)
    .setOutputCol('features')
)
final_data2 = assembler.transform(dataset2)

In [22]:
final_data2.printSchema() # note the newly added "features" vector column at the end of the schema

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_kernel_groove: double (nullable = true)
 |-- features: vector (nullable = true)



In [23]:
from pyspark.ml.feature import StandardScaler

In [24]:
# Configure a scaler that reads the 'features' column and writes the
# re-scaled vectors to a new 'scaledFeatures' column.
scaler = (
    StandardScaler()
    .setInputCol('features')
    .setOutputCol('scaledFeatures')
)

In [25]:
# Fit the scaler on the assembled data; the resulting model is what
# actually transforms (re-scales) the data in the next step.
scaler_model = scaler.fit(final_data2)

In [26]:
# Transform the data (re-scale), adding the 'scaledFeatures' column.
# Any 'scaledFeatures' column left over from a previous execution is dropped
# first, so re-running this cell does not fail with "Output column
# scaledFeatures already exists" (drop() is a no-op when the column is absent).
final_data2 = scaler_model.transform(final_data2.drop('scaledFeatures'))

In [27]:
final_data2.head(1) # inspect the first row, including the new scaledFeatures column

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_kernel_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

In [28]:
# Configure K-means on the scaled features with k = 3 clusters.
# The seed is pinned (same value as in Demo 1) so that the random
# initialisation, and hence the cluster assignment, is reproducible;
# without it PySpark picks a default seed that can vary between sessions.
kmeans = KMeans(featuresCol='scaledFeatures', k=3, seed=120)

In [29]:
# Train the model on the scaled seeds data.
model = kmeans.fit(final_data2)

In [30]:
# Print the fitting performance (WSSSE on the training data).
# computeCost() was deprecated in Spark 2.4 and removed in 3.0; on newer
# versions read the training cost from the model summary instead.
print('WSSSE :')
if hasattr(model, 'computeCost'):
    print(model.computeCost(final_data2))
else:
    print(model.summary.trainingCost)

WSSSE :
428.8534414407127


In [31]:
# Fetch the cluster centers; the model was fitted on 'scaledFeatures',
# so the centers are expressed in the scaled feature space.
centers = model.clusterCenters()

In [32]:
# Print the cluster centers (one array per cluster).
print (centers)

[array([ 6.32636687, 12.38115343, 37.39222755, 13.9206997 ,  9.75485787,
        2.41428438, 12.28078861]), array([ 4.0648023 , 10.14242485, 35.82143905, 11.81918014,  7.51855717,
        3.19362266, 10.40520609]), array([ 4.91309043, 10.92012526, 37.32658724, 12.37724251,  8.59393872,
        1.82261116, 10.35389957])]


In [33]:
# Check the prediction: show only the cluster index assigned to each row.
# To see every column alongside the prediction, use instead:
#model.transform(final_data2).show()
model.transform(final_data2).select('prediction').show()

+----------+
|prediction|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         0|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         1|
+----------+
only showing top 20 rows

