In [1]:
import findspark
import pyspark

# Leave find() as the cell's last expression so the notebook echoes the located Spark path.
findspark.find()

'C:\\Bigdata\\spark-2.4.5-bin-hadoop2.7'

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Create the session, or reuse an existing one, with master 'local' and app name 'cluster'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('cluster')
    .getOrCreate()
)

In [3]:
# Load the CSV, taking column names from the header row and letting Spark infer column types.
dataset = spark.read.csv(
    "seeds_dataset.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Print the schema as a tree to check the column types chosen by inferSchema.
dataset.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [5]:
# Preview only the first two rows.
dataset.show(n=2)

+-----+---------+-----------+-----------------+---------------+---------------------+----------------+
| area|perimeter|compactness| length_of_kernel|width_of_kernel|asymmetry_coefficient|length_of_groove|
+-----+---------+-----------+-----------------+---------------+---------------------+----------------+
|15.26|    14.84|      0.871|            5.763|          3.312|                2.221|            5.22|
|14.88|    14.57|     0.8811|5.553999999999999|          3.333|                1.018|           4.956|
+-----+---------+-----------+-----------------+---------------+---------------------+----------------+
only showing top 2 rows



In [6]:
# Summary statistics for each column, rendered as a text table.
summary_stats = dataset.describe()
summary_stats.show()

+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+
|summary|              area|         perimeter|         compactness|   length_of_kernel|   width_of_kernel|asymmetry_coefficient|   length_of_groove|
+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+
|  count|               210|               210|                 210|                210|               210|                  210|                210|
|   mean|14.847523809523816|14.559285714285718|  0.8709985714285714|  5.628533333333335| 3.258604761904762|   3.7001999999999997|  5.408071428571429|
| stddev|2.9096994306873647|1.3059587265640225|0.023629416583846364|0.44306347772644983|0.3777144449065867|   1.5035589702547392|0.49148049910240543|
|    min|             10.59|             12.41|              0.8081|              4.899|            

In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
# List the column names; this same list is what a later cell passes to VectorAssembler as inputCols.
dataset.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [9]:
# Combine every column of the dataset into a single vector column named 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=dataset.columns,
)
final_data = assembler.transform(dataset)

In [10]:
from pyspark.ml.feature import StandardScaler

# Standardize using both the standard deviation (withStd) and the mean (withMean),
# reading the 'features' vectors and writing the result to 'scaledFeatures'.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withMean=True,
    withStd=True,
)

In [11]:
# Fit the scaler on the assembled 'features' vectors.
scalerModel = scaler.fit(final_data)
# Drop any 'scaledFeatures' column left by an earlier run of this cell before transforming.
# DataFrame.drop is a no-op when the column is absent, so the first run is unaffected, and
# re-running the cell no longer fails because the output column already exists.
final_data = scalerModel.transform(final_data.drop('scaledFeatures'))

In [12]:
from pyspark.ml.clustering import KMeans

In [13]:
# Cluster the standardized vectors into k=3 groups. The seed is pinned because k-means
# initialization is randomized; without an explicit seed the centers and cluster ids can
# change between kernel sessions, so recorded outputs could not be reproduced.
kmeans = KMeans(featuresCol='scaledFeatures', k=3, seed=1)

In [14]:
# Fit the k-means estimator on final_data; the estimator was configured with featuresCol='scaledFeatures'.
model = kmeans.fit(final_data)

In [15]:
# KMeansModel.computeCost() is deprecated in Spark 2.4 and removed in 3.0. The training
# summary (available since 2.4.0) exposes the k-means cost — the sum of squared distances
# of points to their nearest center — for the data the model was fitted on, which is the
# same DataFrame this cell previously passed to computeCost().
wssse = model.summary.trainingCost

In [16]:
# Print the label and the value with no separator, matching plain string concatenation.
print('Within Set Sum of Squared Errors :', wssse, sep='')

Within Set Sum of Squared Errors :428.60820118724456


In [17]:
# Fetch the fitted cluster centers and print a header followed by one center per print item.
centers = model.clusterCenters()
print('cluster Centers:', *centers, sep='\n')

cluster Centers:
[-1.02779666 -1.00424915 -0.96260496 -0.89554512 -1.08299564  0.693148
 -0.62331915]
[ 1.25368596  1.25895795  0.55912833  1.23493193  1.1620751  -0.04511088
  1.28922727]
[-0.14078309 -0.16963724  0.44853463 -0.25719987  0.00164301 -0.66034122
 -0.58449646]


In [18]:
# Run the fitted model over the data and show only the 'prediction' column.
predictions = model.transform(final_data)
predictions.select('prediction').show()

+----------+
|prediction|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         1|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         0|
+----------+
only showing top 20 rows

