In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook runs on machines other than the author's; fall back to the
# original local install path when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/yerke/spark-3.3.0-bin-hadoop3'))

In [27]:
import warnings
warnings.filterwarnings('ignore')
from pyspark.sql import SparkSession

In [4]:
# Start (or reuse) the Spark session for this clustering notebook.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [9]:
# Load the seeds CSV; the first line holds column names and column types are
# inferred from the data.
dataset = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../course_data/seeds_dataset.csv')
)

In [10]:
# Print the inferred schema (column names, types, nullability) to confirm the
# CSV was parsed as expected.
dataset.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [11]:
# Peek at the first row as a sanity check on the parsed values.
dataset.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)]

In [26]:
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [14]:
from pyspark.ml.feature import VectorAssembler

In [15]:
# List the column names; every one of them is used as an input feature below.
dataset.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [16]:
# Combine every column of the dataset into a single vector column named
# 'features'.
assembler = VectorAssembler(
    inputCols=dataset.columns,
    outputCol='features',
)

In [18]:
# Append the assembled 'features' vector column to the dataset.
final_data = assembler.transform(dataset)

In [19]:
# Confirm the 'features' vector column was added alongside the original columns.
final_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



In [20]:
from pyspark.ml.feature import StandardScaler

In [31]:
# Scale each feature to unit standard deviation without centering (these are
# the StandardScaler defaults, spelled out here for clarity).
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withStd=True,
    withMean=False,
)

In [32]:
# Fit the StandardScaler on the assembled 'features' column.
scaler_model = scaler.fit(final_data)

In [33]:
# Add the scaled feature vector column. Any existing copy of the scaler's
# output column is dropped first (DataFrame.drop is a no-op when the column is
# absent), so re-running this cell does not fail with "output column already
# exists".
final_data = scaler_model.transform(final_data.drop(scaler_model.getOutputCol()))

In [34]:
# Inspect one row to compare the raw 'features' vector with 'scaledFeatures'.
final_data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

In [35]:
# Bisecting k-means with 3 clusters on the scaled features. The seed is fixed
# so that cluster assignments and centers are reproducible across kernel
# restarts (the PySpark default seed is not guaranteed to be stable between
# Python processes).
kmeans = BisectingKMeans(featuresCol='scaledFeatures', k=3, seed=42)

In [36]:
# Train the bisecting k-means model on the scaled feature vectors.
model = kmeans.fit(final_data)

In [37]:
# Within Set Sum of Squared Errors on the training data.
# BisectingKMeansModel.computeCost() is deprecated since Spark 3.0.0; the
# training summary exposes the same cost for the dataset the model was fit on.
print('WSSSE')
print(model.summary.trainingCost)

WSSSE
458.4231632114777


In [38]:
# Retrieve the cluster centers. The model was fit on 'scaledFeatures', so the
# centers are expressed in the scaled feature space.
centers = model.clusterCenters()

In [39]:
# Display the cluster centers (one array per cluster).
centers

[array([ 4.05829295, 10.14045513, 35.77528896, 11.82997208,  7.4942816 ,
         3.30390703, 10.44460049]),
 array([ 4.76419932, 10.77289423, 37.19598408, 12.24860707,  8.43849984,
         1.86180151, 10.22931219]),
 array([ 6.20884577, 12.25651292, 37.43485358, 13.77282897,  9.67731721,
         2.2989371 , 12.09236686])]

In [44]:
# Assign each row to a cluster and show the first 20 cluster indices.
(
    model.transform(final_data)
    .select('prediction')
    .show(n=20)
)

+----------+
|prediction|
+----------+
|         1|
|         1|
|         1|
|         1|
|         2|
|         1|
|         1|
|         1|
|         2|
|         2|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         2|
|         1|
|         1|
+----------+
only showing top 20 rows

