In [4]:
# Kept ahead of the pyspark import on purpose: findspark.init() is given the
# local Spark install directory (presumably so that pyspark becomes importable).
import findspark
findspark.init('/home/ubuntu/spark-2.2.0-bin-hadoop2.7')

from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new one
# under the application name 'kmeans'.
spark = (
    SparkSession.builder
    .appName('kmeans')
    .getOrCreate()
)

# Import Data

In [5]:
# Location of the raw seeds dataset, kept as directory + file name pieces.
folder = '/home/ubuntu/data/raw'
file = '/seeds_dataset.csv'

# First CSV line holds the column names; column types are inferred by Spark.
data = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv(folder + file)
)

In [6]:
# Show the column names and the types Spark inferred for the loaded CSV.
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [16]:
# Peek at the first two records, printed one per line.
preview_rows = data.head(2)
for record in preview_rows:
    print(record)

# Trailing expression: the notebook displays the list of column names.
data.columns

Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)
Row(area=14.88, perimeter=14.57, compactness=0.8811, length_of_kernel=5.553999999999999, width_of_kernel=3.333, asymmetry_coefficient=1.018, length_of_groove=4.956)


['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

# Create Features Column

In [31]:
from pyspark.ml.feature import VectorAssembler

# Every column of `data` is fed into the single vector column 'features'.
input_columns = data.columns
assembler = VectorAssembler(outputCol='features', inputCols=input_columns)

# Append the assembled vector column and confirm it shows up in the schema.
final_data = assembler.transform(data)
final_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



# Scale Data

In [32]:
from pyspark.ml.feature import StandardScaler

# Build the scaler input from `data` instead of re-reading `final_data`.
# Assigning final_data = transform(final_data) made this cell fail on a second
# run, because the output column 'scaledFeatures' would already be present.
assembled_data = assembler.transform(data)

# NOTE(review): no withMean/withStd arguments are passed, so the library
# defaults apply (documented as unit-std scaling without centering) - confirm
# that is the intended preprocessing for the clustering step.
scaler = StandardScaler(inputCol='features',
                        outputCol='scaledFeatures')
scaler_fitted = scaler.fit(assembled_data)

# Same result as before on a first run: the original columns, 'features',
# and the new 'scaledFeatures' vector column.
final_data = scaler_fitted.transform(assembled_data)
final_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)



# Create Cluster Model

In [37]:
from pyspark.ml.clustering import KMeans

# Pin the seed: k-means starts from randomly initialised centers, and without
# an explicit seed the cost, the centers and the cluster ids are not guaranteed
# to repeat after a kernel restart.
k_clf = KMeans(k=3, featuresCol='scaledFeatures', seed=42)
k_fitted = k_clf.fit(final_data)

# Cost of the fitted model on the training data (sum of squared distances of
# points to their nearest center), computed in the scaled feature space.
# NOTE(review): computeCost is deprecated/removed in later Spark releases;
# it is kept here because this notebook targets Spark 2.2.0.
print('WSSE', k_fitted.computeCost(final_data))

# One center per cluster, expressed in 'scaledFeatures' coordinates.
print('Centers', k_fitted.clusterCenters())

WSSE 428.60820118716356
Centers [array([  4.07497225,  10.14410142,  35.89816849,  11.80812742,
         7.54416916,   3.15410901,  10.38031464]), array([  6.35645488,  12.40730852,  37.41990178,  13.93860446,
         9.7892399 ,   2.41585013,  12.29286107]), array([  4.96198582,  10.97871333,  37.30930808,  12.44647267,
         8.62880781,   1.80061978,  10.41913733])]


In [38]:
# Assign each row to a cluster, then display only the cluster ids
# (show() prints the first 20 rows by default).
clustered = k_fitted.transform(final_data)
clustered.select('prediction').show()

+----------+
|prediction|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         1|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         0|
+----------+
only showing top 20 rows

