In [1]:
import findspark
findspark.init()

In [2]:
import os
import warnings

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler , VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql import SparkSession

warnings.filterwarnings("ignore")

In [3]:
# Create (or reuse) the Spark session for this notebook: application name
# "KMeans", local master with 4 threads.
spark = (
    SparkSession.builder
    .appName("KMeans")
    .master("local[4]")
    .getOrCreate()
)

In [4]:
# Location of the Mall Customers CSV. The MALL_CUSTOMERS_CSV environment
# variable overrides the default, so the notebook can be run on machines
# where the file is not stored under the original absolute path.
csv_path = os.environ.get(
    "MALL_CUSTOMERS_CSV", "/home/taha/Downloads/Mall_Customers.csv"
)

# Read the comma-separated file, treating the first line as the header and
# letting Spark infer the column types.
df = (
    spark.read.format("csv")
    .option("header", True)
    .option("sep", ",")
    .option("inferSchema", True)
    .load(csv_path)
)

In [5]:
# Preview the first five rows. The limit is applied in Spark before the
# conversion, so the whole DataFrame is not collected to the driver just to
# display its head.
df.limit(5).toPandas()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [6]:
# Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Annual Income (k$): integer (nullable = true)
 |-- Spending Score (1-100): integer (nullable = true)



In [7]:
# Summary statistics for every column, converted to pandas for tabular display.
(
    df
    .describe()
    .toPandas()
    .head()
)

Unnamed: 0,summary,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,count,200.0,200,200.0,200.0,200.0
1,mean,100.5,,38.85,60.56,50.2
2,stddev,57.87918451395112,,13.96900733155888,26.26472116527124,25.823521668370173
3,min,1.0,Female,18.0,15.0,1.0
4,max,200.0,Male,70.0,137.0,99.0


In [8]:
# Replacement column names, listed in the order in which they are applied to
# the DataFrame's columns.
title = [
    "CustomerID",
    "Gender",
    "Age",
    "AnnualIncome",
    "SpendingScore",
]

In [9]:
# Rename all columns positionally: the i-th column of df receives the i-th
# name in title.
df2 = df.toDF(*title)

In [10]:
# Preview the first five renamed rows. The limit is applied in Spark before
# the conversion, so the whole DataFrame is not collected to the driver just
# to display its head.
df2.limit(5).toPandas()

Unnamed: 0,CustomerID,Gender,Age,AnnualIncome,SpendingScore
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [11]:
# Pipeline stage that packs the two clustering inputs into one vector column
# named "features".
vector_assembler = VectorAssembler(
    inputCols=["AnnualIncome", "SpendingScore"],
    outputCol="features",
)

In [12]:
# Pipeline stage that scales the assembled "features" vector and writes the
# result to "scaled_features".
standard_scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
)

##### Geçen projede k değerini kendimiz 5 olarak vermiştik. Burada yapacağımız işlem, Silhouette skorunu kullanarak en uygun k değerini bulmaktır.

In [18]:
def runKMeans(df2, k, seed=142):
    """Fit the assemble -> scale -> KMeans pipeline with ``k`` clusters.

    The pipeline is built from the notebook-level ``vector_assembler`` and
    ``standard_scaler`` stages followed by a new KMeans stage that reads the
    ``scaled_features`` column and writes its assignment to ``cluster``.

    Args:
        df2: Spark DataFrame to fit on; it must contain the input columns
            configured on ``vector_assembler``.
        k: Number of clusters passed to KMeans.
        seed: Random seed passed to KMeans. Defaults to 142, the value that
            was previously hard-coded, so existing calls behave as before.

    Returns:
        The fitted pipeline model.
    """
    kmeans_obj = (
        KMeans()
        .setK(k)
        .setSeed(seed)
        .setFeaturesCol("scaled_features")
        .setPredictionCol("cluster")
    )

    pipeline_obj = Pipeline().setStages(
        [vector_assembler, standard_scaler, kmeans_obj]
    )

    return pipeline_obj.fit(df2)

In [16]:
# Now find out which of a range of candidate k values gives the best clustering.

In [20]:
# The evaluator does not depend on k, so it is created once outside the loop
# instead of being rebuilt on every iteration.
evaluator = (
    ClusteringEvaluator()
    .setFeaturesCol("scaled_features")
    .setPredictionCol("cluster")
    .setMetricName("silhouette")
)

# Fit one pipeline per candidate k (2 through 10) and print its silhouette
# score, computed on the same data the model was fitted on.
for k in range(2, 11):
    pipeline_model = runKMeans(df2, k)
    transformed_df = pipeline_model.transform(df2)

    score = evaluator.evaluate(transformed_df)

    print(k, score)

2 0.5389608053072632
3 0.6288672765684974
4 0.657293825903269
5 0.7389823353524685
6 0.7348177297879248
7 0.7273693883434802
8 0.7075606835679795
9 0.673148267261042
10 0.6567557661228416


#### Burada Silhouette skoru 1'e en yakın olan k değeri 5 çıktı; yani en iyi sonucu k = 5 olduğunda alıyoruz. Bir önceki uygulamada doğru bir seçim yapmışız.

# k değerini tekrar 5'e ayarlayalım

In [24]:
# KMeans stage with 5 clusters and a fixed seed; it reads "scaled_features"
# and writes the assigned cluster index to "cluster".
kmeans_obj = KMeans(
    k=5,
    seed=142,
    featuresCol="scaled_features",
    predictionCol="cluster",
)

# Assemble -> scale -> cluster.
pipeline_obj = Pipeline(
    stages=[vector_assembler, standard_scaler, kmeans_obj]
)

# Fit on df2 and append the pipeline's output columns to the same data.
pipeline_model = pipeline_obj.fit(df2)
transformed_df = pipeline_model.transform(df2)

In [25]:
# Show the first five clustered rows; limit(5) already caps the result at
# five rows before it is converted to pandas.
transformed_df.limit(5).toPandas()

Unnamed: 0,CustomerID,Gender,Age,AnnualIncome,SpendingScore,features,scaled_features,cluster
0,1,Male,19,15,39,"[15.0, 39.0]","[0.5711082903036444, 1.510251022337088]",1
1,2,Male,21,15,81,"[15.0, 81.0]","[0.5711082903036444, 3.1366752002385674]",2
2,3,Female,20,16,6,"[16.0, 6.0]","[0.6091821763238874, 0.2323463111287828]",1
3,4,Female,23,16,77,"[16.0, 77.0]","[0.6091821763238874, 2.9817776594860455]",2
4,5,Female,31,17,40,"[17.0, 40.0]","[0.6472560623441304, 1.5489754075252185]",1


In [26]:
# Number of rows assigned to each cluster.
(
    transformed_df
    .groupBy("cluster")
    .count()
    .show()
)

+-------+-----+
|cluster|count|
+-------+-----+
|      1|   23|
|      3|   39|
|      4|   36|
|      2|   22|
|      0|   80|
+-------+-----+

