## Import

In [2]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise create one
# with the application name "Cluster".
spark = (
    SparkSession.builder
    .appName("Cluster")
    .getOrCreate()
)

## Load data

In [3]:
import os
# List the entries of the current working directory.
os.listdir()

['.ipynb_checkpoints',
 'Clustering Code Along.ipynb',
 'Clustering_Code_Example.ipynb',
 'Clustering_Consulting_Project.ipynb',
 'Clustering_Consulting_Project_SOLUTIONS.ipynb',
 'hack_data.csv',
 'My_code_along.ipynb',
 'My_doc_example.ipynb',
 'sample_kmeans_data.txt',
 'seeds_dataset.csv',
 'seeds_dataset.txt']

About the data: the dataset comes from a real experiment at a university. Seeds were randomly selected from 3 different varieties of wheat, and high-resolution X-ray images were taken to measure the features of each selected seed. These measurements form the dataset.

In [5]:
# Read the CSV, taking column names from the first line and letting Spark
# infer each column's type.
df = spark.read.options(header=True, inferSchema=True).csv("seeds_dataset.csv")

In [7]:
# Preview the first five rows.
df.show(5)

+-----+---------+-----------+------------------+------------------+---------------------+----------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|
+-----+---------+-----------+------------------+------------------+---------------------+----------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|            5.22|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|           4.956|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|           4.825|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|           4.805|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|           5.175|
+-----+---------+-----------+------------------+------------------+---------------------+----------------+
only showing top 5 rows



In [8]:
# Dataset size: number of rows, then number of columns.
print(df.count(), len(df.columns))

210 7


In [9]:
# Check the column names and the types inferred when the CSV was read.
df.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



## Clustering

In [10]:
from pyspark.ml.clustering import KMeans

In [12]:
from pyspark.ml.feature import VectorAssembler

### Create vector column

In [13]:
# Combine every column of df into a single vector column named "features".
VecAss = VectorAssembler(
    inputCols=df.columns,
    outputCol="features",
)

In [14]:
# Apply the assembler to df; the result carries the extra "features" column.
data = VecAss.transform(df)

In [15]:
# Preview the first five rows, now including the assembled vector column.
data.show(5)

+-----+---------+-----------+------------------+------------------+---------------------+----------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|            features|
+-----+---------+-----------+------------------+------------------+---------------------+----------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|            5.22|[15.26,14.84,0.87...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|           4.956|[14.88,14.57,0.88...|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|           4.825|[14.29,14.09,0.90...|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|           4.805|[13.84,13.94,0.89...|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|           5.17

In [16]:
# Check the schema after the assembler was applied.
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



### Scale the features

In [19]:
# Scale features!!
from pyspark.ml.feature import StandardScaler

# Scale each feature to unit standard deviation without centering it.
# withMean/withStd are StandardScaler's defaults, spelled out for the reader.
scl = StandardScaler(
    inputCol="features",
    outputCol="scaled",
    withMean=False,
    withStd=True,
)

In [22]:
# Fit the scaler on the assembled data to obtain the fitted scaler model.
scaler = scl.fit(data)

In [23]:
# Apply the fitted scaler; the result gains the "scaled" vector column.
scaled_data = scaler.transform(data)

In [24]:
# Preview the first five rows, now including the scaled vector column.
scaled_data.show(5)

+-----+---------+-----------+------------------+------------------+---------------------+----------------+--------------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|            features|              scaled|
+-----+---------+-----------+------------------+------------------+---------------------+----------------+--------------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|            5.22|[15.26,14.84,0.87...|[5.24452795332028...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|           4.956|[14.88,14.57,0.88...|[5.11393027165175...|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|           4.825|[14.29,14.09,0.90...|[4.91116018695588...|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|           4.805|

### Fit the K-means model

In [28]:
# K-means on the scaled vectors with k=3 clusters (the dataset holds three
# wheat varieties) and a fixed seed for the random initialization.
km = KMeans(featuresCol="scaled", k=3, seed=1)

In [29]:
# Fit K-means on the scaled data; the estimator reads the "scaled" column.
model = km.fit(scaled_data)

In [31]:
# Within-set sum of squared errors (WSSSE) on the training data.
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in 3.0;
# the training summary reports the same quantity as `trainingCost`.
try:
    wsse = model.summary.trainingCost
except AttributeError:  # Spark < 2.4: the summary has no trainingCost
    wsse = model.computeCost(scaled_data)
wsse

428.6333432285446

In [32]:
# get the centers
# The KMeans estimator was configured with featuresCol="scaled", so these
# coordinates are in scaled units, not the original measurement units.
centers = model.clusterCenters()
print(centers)

[array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
        2.41585013, 12.29286107]), array([ 4.07135818, 10.14438097, 35.86461803, 11.81349589,  7.53471695,
        3.18317127, 10.39230304]), array([ 4.94114963, 10.95557919, 37.3028184 , 12.42383591,  8.60815545,
        1.80983376, 10.40657797])]


In [35]:
# prediction
# Run the fitted model over the scaled data and keep only the "prediction"
# column it produces.
pred = model.transform(scaled_data).select("prediction")

In [37]:
# Display the predicted cluster labels (show() prints the first 20 rows by default).
pred.show()

+----------+
|prediction|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         0|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         1|
+----------+
only showing top 20 rows

