In [1]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [2]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StandardScaler

In [3]:
# Read the seeds CSV: the first row supplies the column names and Spark infers
# the column types from the data.
df = spark.read.csv(
    '../Arquivos/seeds_dataset.csv',
    header=True,
    inferSchema=True,
)
# Print the inferred schema as a quick check of what was loaded.
df.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [4]:
# Preview the first five rows as a pandas frame.
# limit(5) is applied on the Spark side, so only those rows are converted to
# pandas instead of collecting the entire DataFrame and slicing it afterwards.
df.limit(5).toPandas()

Unnamed: 0,area,perimeter,compactness,length_of_kernel,width_of_kernel,asymmetry_coefficient,length_of_groove
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175


In [5]:
# Display the column names; the assembler cell below passes this same list as its inputs.
df.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [6]:
# Combine every column currently in df into a single vector column named 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=df.columns,
)

In [7]:
# Append the assembled vector column to df.
# Any earlier copy of the assembler's output column is dropped first, so that
# re-running this cell does not fail on an already-existing column. drop() is a
# no-op when the column is absent, so the first run behaves exactly as before.
df = assembler.transform(df.drop(assembler.getOutputCol()))

In [8]:
# Print the schema again to confirm the 'features' vector column was added.
df.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



In [9]:
# Scaler that reads the assembled 'features' vector and writes 'scaledFeatures'.
scaler = StandardScaler(
    outputCol='scaledFeatures',
    inputCol='features',
)

In [10]:
# Fit the scaler on df; the resulting model is applied to the same data in the next cell.
scaler_model = scaler.fit(df)

In [11]:
# Append the scaled feature vector produced by the fitted scaler.
# A previous copy of the scaler's output column is dropped first, which keeps
# this cell re-runnable. drop() ignores the column when it is not present yet,
# so the first run is unchanged.
df = scaler_model.transform(df.drop(scaler.getOutputCol()))

In [12]:
# Preview the first five rows, now including 'features' and 'scaledFeatures'.
# The limit runs in Spark, so only five rows are converted to pandas rather
# than the whole DataFrame.
df.limit(5).toPandas()

Unnamed: 0,area,perimeter,compactness,length_of_kernel,width_of_kernel,asymmetry_coefficient,length_of_groove,features,scaledFeatures
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,"[15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]","[5.244527953320284, 11.363299389287777, 36.860..."
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,"[14.88, 14.57, 0.8811, 5.553999999999999, 3.33...","[5.113930271651758, 11.156554723849252, 37.288..."
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,"[14.29, 14.09, 0.905, 5.291, 3.336999999999999...","[4.911160186955888, 10.789008651958541, 38.299..."
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,"[13.84, 13.94, 0.8955, 5.324, 3.37899999999999...","[4.756505037611581, 10.674150504492696, 37.897..."
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,"[16.14, 14.99, 0.9034, 5.6579999999999995, 3.5...","[5.546964689815818, 11.478157536753622, 38.232..."


In [13]:
# K-means estimator with k=3 clusters, trained on the scaled feature vector.
# The seed is pinned because centroid initialisation is randomised: without it
# the cluster centers, WSSSE and cluster labels can differ from run to run.
# NOTE(review): outputs recorded before the seed was pinned may not match a re-run.
kmeans = KMeans(featuresCol='scaledFeatures', k=3, seed=42)

In [14]:
# Fit k-means on df; the estimator reads the 'scaledFeatures' column configured above.
model = kmeans.fit(df)

In [15]:
# Report the training cost from the model summary, labelled as WSSSE.
print(
    f"WSSSE: {model.summary.trainingCost}"
)

WSSSE: 428.6082011872446


In [16]:
# Cluster centers of the fitted model; the model was trained on 'scaledFeatures',
# so these coordinates are in the scaled feature space, not the raw units.
centers = model.clusterCenters()

In [17]:
# Print the list of center arrays (one array per cluster).
print(centers)

[array([ 4.96198582, 10.97871333, 37.30930808, 12.44647267,  8.62880781,
        1.80061978, 10.41913733]), array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
        2.41585013, 12.29286107]), array([ 4.07497225, 10.14410142, 35.89816849, 11.80812742,  7.54416916,
        3.15410901, 10.38031464])]


In [18]:
# Assign every row to a cluster and show the first rows of the 'prediction' column.
(
    model.transform(df)
    .select('prediction')
    .show()
)

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         1|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         2|
+----------+
only showing top 20 rows

