In [8]:
import pandas as pd

In [9]:
def get_data():
    seeds_dataset = pd.read_csv('https://raw.githubusercontent.com/lettergram/PCA/master/seedAnalysis/seeds_dataset.csv',header=None)
    
    seeds_dataset.columns = ['area','perimeter','compactness','length_of_kernel',
    'width_of_kernel','asymmetry_coefficient',
    'length_of_groove','class']
    seeds_dataset.to_csv('./seeds_dataset.csv',index=None)
    

In [12]:
get_data()

In [30]:
from pyspark.sql import SparkSession
from pyspark import SparkContext,SparkConf

In [3]:
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName('my_first_app_name') \
    .getOrCreate()
df = spark.read.csv('./seeds_dataset.csv',header=True,inferSchema=True)
df.show(5)

+-----+---------+-----------+------------------+------------------+---------------------+----------------+-----+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|class|
+-----+---------+-----------+------------------+------------------+---------------------+----------------+-----+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|            5.22|    1|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|           4.956|    1|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|           4.825|    1|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|           4.805|    1|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|           5.175|    1|
+-----+---------+-----------+------------------+------------------+---------------------+-------

In [4]:
df.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- class: integer (nullable = true)



In [11]:
from pyspark.ml.feature import VectorAssembler 
from pyspark.ml.clustering import KMeans

In [12]:
df.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove',
 'class']

In [15]:
assembler = VectorAssembler(inputCols = df.columns, outputCol = 'features')
final_df = assembler.transform(df)
final_df.show(3)

+-----+---------+-----------+-----------------+------------------+---------------------+----------------+-----+--------------------+
| area|perimeter|compactness| length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|class|            features|
+-----+---------+-----------+-----------------+------------------+---------------------+----------------+-----+--------------------+
|15.26|    14.84|      0.871|            5.763|             3.312|                2.221|            5.22|    1|[15.26,14.84,0.87...|
|14.88|    14.57|     0.8811|5.553999999999999|             3.333|                1.018|           4.956|    1|[14.88,14.57,0.88...|
|14.29|    14.09|      0.905|            5.291|3.3369999999999997|                2.699|           4.825|    1|[14.29,14.09,0.90...|
+-----+---------+-----------+-----------------+------------------+---------------------+----------------+-----+--------------------+
only showing top 3 rows



VectorAssembler_6f1742c56bd8

In [40]:
final_df.take(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, class=1, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22, 1.0]))]

In [42]:
#instantiated kmeans with 3 cluster 
kmeans = KMeans(k=3)

# fitting the model
model = kmeans.fit(final_df)

In [43]:
print('WSSSE:', model.computeCost(final_df))

WSSSE: 625.7199966009451


In [44]:
# cluster centeroids
centers = model.clusterCenters()
print(centers)

[array([11.90906667, 13.25026667,  0.85154933,  5.22233333,  2.86509333,
        4.72218667,  5.09304   ,  2.86666667]), array([18.72180328, 16.29737705,  0.88508689,  6.20893443,  3.72267213,
        3.60359016,  6.06609836,  1.98360656]), array([14.63202703, 14.45324324,  0.8790973 ,  5.56178378,  3.27489189,
        2.74404324,  5.18493243,  1.13513514])]


In [46]:
model.transform(final_df).select('prediction').show()

+----------+
|prediction|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         0|
+----------+
only showing top 20 rows



In [47]:
model.transform(final_df).show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+-----+--------------------+----------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|class|            features|prediction|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+-----+--------------------+----------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|    1|[15.26,14.84,0.87...|         2|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|    1|[14.88,14.57,0.88...|         2|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|    1|[14.29,14.09,0.90...|         2|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|             4.805|    1|[13.

In [49]:
import uuid
uuid.uuid1()

UUID('c2d61724-b28d-11e9-af70-0242ac110002')