## Purpose of script:
#### Reviewing Spark's tree-based models implementation
#### Referencing Jose Portilla's "Spark and Python for Big Data with PySpark" course

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans

In [3]:
spark = SparkSession.builder.appName('clustering').getOrCreate()

In [5]:
data = spark.read.format('libsvm').load('../Datasets/sample_kmeans_data.txt')

In [6]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [7]:
# removing labels as unsupervised learning do not require labels
data = data.select('features')

In [8]:
# set number of clusters with setK
# set seed value to make outputs consistent with setSeed
km = KMeans().setK(2).setSeed(1)

In [9]:
kmm = km.fit(data)

In [12]:
pdt = kmm.transform(data)

In [13]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [14]:
evaluator = ClusteringEvaluator()

In [15]:
# model.computeCost() is deprecated for within set sum of squared error,
# use ClusteringEvaluator instead
silhouette = evaluator.evaluate(pdt)
print(silhouette)

0.9997530305375207


In [16]:
centers = kmm.clusterCenters()
centers

[array([9.1, 9.1, 9.1]), array([0.1, 0.1, 0.1])]

In [17]:
# predictions are added
pdt.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         1|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+



## Example 2 - Seed Dataset

In [18]:
data = spark.read.csv('../Datasets/seeds_dataset.csv', inferSchema=True, header=True)

In [20]:
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [19]:
from pyspark.ml.feature import VectorAssembler

In [21]:
assembler = VectorAssembler(inputCols=data.columns, outputCol='features')

In [22]:
final_data = assembler.transform(data)

In [23]:
from pyspark.ml.feature import StandardScaler

In [24]:
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')

In [25]:
scaler_model = scaler.fit(final_data)

In [26]:
final_data = scaler_model.transform(final_data)

In [27]:
final_data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

In [28]:
km = KMeans(featuresCol='scaledFeatures', k=3)

In [29]:
kmm = km.fit(final_data)

In [30]:
pdt = kmm.transform(final_data)

In [31]:
silhouette = evaluator.evaluate(pdt)
print(silhouette)

0.6018627534901196


In [32]:
centers = kmm.clusterCenters()
centers

[array([ 4.87257659, 10.88120146, 37.27692543, 12.3410157 ,  8.55443412,
         1.81649011, 10.32998598]),
 array([ 6.31670546, 12.37109759, 37.39491396, 13.91155062,  9.748067  ,
         2.39849968, 12.2661748 ]),
 array([ 4.06105916, 10.13979506, 35.80536984, 11.82133095,  7.50395937,
         3.27184732, 10.42126018])]

In [35]:
pdt.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   70|
|         2|   65|
|         0|   75|
+----------+-----+

