In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session used by every cell below, registered under the
# application name "cluster".
spark = (
    SparkSession.builder
    .appName("cluster")
    .getOrCreate()
)

In [2]:
# Load the seeds measurements: the first line of the file supplies the column
# names and the column types are inferred from the values.
data = spark.read.csv(
    "seeds_dataset.csv",
    header=True,
    inferSchema=True,
)

In [3]:
# Check the column names and the types that inferSchema assigned to them.
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [4]:
# Peek at the first row to sanity-check the parsed values.
data.head()

Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)

In [7]:
# List the column names; these are the inputs handed to the VectorAssembler.
data.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [5]:
from pyspark.ml.clustering import KMeans

In [10]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

In [9]:
# Pack the seven numeric measurement columns into a single vector column
# named "features", which is what the scaler consumes.
assembler = VectorAssembler(
    inputCols=[
        "area",
        "perimeter",
        "compactness",
        "length_of_kernel",
        "width_of_kernel",
        "asymmetry_coefficient",
        "length_of_groove",
    ],
    outputCol="features",
)

In [11]:
# Add the assembled vector column to `data`.
# Because this cell rebinds `data`, running it a second time used to fail with
# "Output column features already exists". Dropping the assembler's output
# column first (a no-op when the column is absent) makes the cell safe to re-run.
data = assembler.transform(data.drop(assembler.getOutputCol()))

In [13]:
# Scale every feature to unit standard deviation without centering.
# withStd=True / withMean=False are StandardScaler's defaults, spelled out
# here so the choice is visible to the reader.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=False,
)

In [16]:
# Learn the per-feature scaling statistics from the data, then apply them to
# the same data to obtain the "scaledFeatures" column.
scaler_model = scaler.fit(data)
final_data = scaler_model.transform(data)

In [17]:
# Inspect the first row, which now also carries the assembled `features`
# vector and the `scaledFeatures` vector.
final_data.head()

Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))

In [24]:
# K-means with k=3 on the standardized features.
# The seed is pinned so centroid initialisation is repeatable; without it the
# cluster ids, centers and WSSSE reported below can change between sessions.
kmeans = KMeans(featuresCol="scaledFeatures", k=3, seed=42)

In [25]:
# Fit the k-means estimator on the scaled data to obtain the clustering model.
model = kmeans.fit(final_data)

In [26]:
# Report the within-set sum of squared errors (WSSSE) on the training data.
# KMeansModel.computeCost was deprecated in Spark 2.4 and removed in Spark 3.0,
# so use it only where it still exists; otherwise read the equivalent value
# (cost over the training set) from the model's training summary.
print("WSSSE")
if hasattr(model, "computeCost"):
    wssse = model.computeCost(final_data)
else:
    wssse = model.summary.trainingCost
print(wssse)

WSSSE
428.60820118716356


In [28]:
# Print each fitted centroid; the coordinates are in the scaled feature space
# because the model was trained on "scaledFeatures".
centers = model.clusterCenters()
for center in centers:
    print(center)

[ 4.07497225 10.14410142 35.89816849 11.80812742  7.54416916  3.15410901
 10.38031464]
[ 6.35645488 12.40730852 37.41990178 13.93860446  9.7892399   2.41585013
 12.29286107]
[ 4.96198582 10.97871333 37.30930808 12.44647267  8.62880781  1.80061978
 10.41913733]


In [30]:
# Apply the fitted model to the scaled data; the resulting frame includes the
# `prediction` column that is displayed in the next cell.
results = model.transform(final_data)

In [35]:
# Show the cluster id assigned to each row (show() prints only the first 20 rows).
results.select("prediction").show()

+----------+
|prediction|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         1|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         0|
+----------+
only showing top 20 rows

