# Module 3 Clustering

In [1]:
# Obtain the Spark session used by every later cell (getOrCreate reuses an
# existing session when one is already running).
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('abc')
    .getOrCreate()
)

In [2]:
from pyspark.ml.clustering import BisectingKMeans, KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

In [3]:
# Load the iris CSV: the first line supplies the column names and the column
# types are inferred from the data.
iris = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('data/iris.csv')
)

In [15]:
# Preview the loaded rows to sanity-check the column names and parsed values.
iris.show()

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|
|         4.9|        3.1|         1.5|        0.1|Iris-setosa|
|         5.4|        3.7|         1.5|        0.2|Iris-setosa|
|         4.8|        3.4|         1.6|        0.2|Iris-setosa|
|         4.8|        3.0|         1.4| 

In [4]:
# Assembler that combines the four measurement columns into a single vector
# column named 'features'.
v = VectorAssembler(
    inputCols=[
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
    ],
    outputCol='features',
)

In [5]:
# Apply the assembler: iris2 is iris plus the assembled 'features' column.
iris2 = v.transform(iris)

In [17]:
# Spot-check the first five rows, now including the 'features' vector column.
iris2.show(5)

+------------+-----------+------------+-----------+-----------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|    species|         features|
+------------+-----------+------------+-----------+-----------+-----------------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
+------------+-----------+------------+-----------+-----------+-----------------+
only showing top 5 rows



In [6]:
# K-means estimator configured to look for three clusters.
clustering = KMeans(k=3)

In [7]:
# Fix the random seed so the fit below can be reproduced on re-run.
clustering = clustering.setSeed(10)

In [8]:
# Fit the configured KMeans estimator on the assembled data.
model = clustering.fit(iris2)

In [9]:
# Extract the cluster centres from the fitted KMeans model.
centers = model.clusterCenters()

In [10]:
# Display the KMeans centres: one array per cluster, one value per assembled
# input column.
centers

[array([5.006, 3.418, 1.464, 0.244]),
 array([5.9016129 , 2.7483871 , 4.39354839, 1.43387097]),
 array([6.85      , 3.07368421, 5.74210526, 2.07105263])]

## Ex: Hierarchical Clustering

In [11]:
# Bisecting k-means estimator, configured with the same k (3) and seed (10) as
# the KMeans run above so the two sets of centres can be compared directly.
# BisectingKMeans is imported with the other pyspark.ml imports at the top of
# the notebook.
bkmeans = (
    BisectingKMeans()
    .setK(3)
    .setSeed(10)
)


In [12]:
# Fit bisecting k-means on the same assembled data and extract its centres.
# NOTE: this rebinds `model` and `centers`, so the KMeans model and centres
# from the earlier cells are no longer reachable under those names.
model = bkmeans.fit(iris2)
centers = model.clusterCenters()

In [13]:
# Display the bisecting k-means centres for comparison with the KMeans centres
# shown earlier.
centers

[array([5.00566038, 3.36037736, 1.56226415, 0.28867925]),
 array([5.94745763, 2.76610169, 4.45423729, 1.45423729]),
 array([6.85      , 3.07368421, 5.74210526, 2.07105263])]