In [1]:
import os

import findspark

# Point findspark at the Spark installation. Prefer the SPARK_HOME environment
# variable so the notebook is not tied to one machine's directory layout; fall
# back to the original local install path when it is unset or empty.
findspark.init(os.environ.get("SPARK_HOME") or "D:/spark")

In [8]:
# Standard library
import warnings

# PySpark: session entry point, feature transformers, clustering, pipeline
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.clustering import KMeans

# Silence every Python-level warning for the rest of the notebook.
warnings.filterwarnings('ignore')

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("KMeansMallCustomerBasic")
    .getOrCreate()
)

In [4]:
# Load the mall-customer CSV: comma-separated, first row is the header, and
# column types are inferred by Spark rather than declared up front.
df = (
    spark.read
    .format("csv")
    .options(header=True, sep=",", inferSchema=True)
    .load("D:/Datasets/Mall_Customers.csv")
)

In [9]:
# Preview the first four rows as a pandas frame for rich notebook display.
# limit(4) already bounds the row count, so a trailing .head() (default 5 rows)
# would be a no-op and is omitted.
df.limit(4).toPandas()

Unnamed: 0,CustomerID,Gender,Age,AnnualIncome,SpendingScore
0,1,Male,19,15000,39
1,2,Male,21,15000,81
2,3,Female,20,16000,6
3,4,Female,23,16000,77


In [11]:
# Print summary statistics (count, mean, stddev, min, max) for each column of df.
df.describe().show()

+-------+------------------+------+-----------------+------------------+------------------+
|summary|        CustomerID|Gender|              Age|      AnnualIncome|     SpendingScore|
+-------+------------------+------+-----------------+------------------+------------------+
|  count|               200|   200|              200|               200|               200|
|   mean|             100.5|  null|            38.85|           60560.0|              50.2|
| stddev|57.879184513951124|  null|13.96900733155888|26264.721165271247|25.823521668370173|
|    min|                 1|Female|               18|             15000|                 1|
|    max|               200|  Male|               70|            137000|                99|
+-------+------------------+------+-----------------+------------------+------------------+



In [12]:
# Combine the two numeric columns used for clustering into one vector column.
vector_assembler = VectorAssembler(
    inputCols=['AnnualIncome', 'SpendingScore'],
    outputCol='features',
)

In [13]:
# Rescale the assembled vector so both inputs are on a comparable scale before
# clustering; the result is written to 'scaled_features'.
standard_scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_features',
)

In [19]:
# K-means estimator: 5 clusters over the scaled features, with a fixed seed so
# repeated runs start from the same initialization. Cluster ids are written to
# the 'cluster' column.
kmeans_obj = KMeans(
    featuresCol='scaled_features',
    predictionCol='cluster',
    k=5,
    maxIter=40,
    tol=1.0e-5,
    seed=142,
)

In [20]:
# Chain the stages in execution order: assemble -> scale -> cluster.
pipeline_stages = [vector_assembler, standard_scaler, kmeans_obj]
pipeline_obj = Pipeline(stages=pipeline_stages)

In [21]:
# Fit every pipeline stage (assembler, scaler, k-means) on the full customer DataFrame.
pipeline_model = pipeline_obj.fit(df)

In [22]:
# Apply the fitted pipeline to the same DataFrame it was trained on; the result
# carries the added 'features', 'scaled_features' and 'cluster' columns.
transformed_df = pipeline_model.transform(df)

In [23]:
# Preview the first five clustered rows as a pandas frame. limit(5) already
# bounds the row count, so a trailing .head() (default 5 rows) would be a
# no-op and is omitted.
transformed_df.limit(5).toPandas()

Unnamed: 0,CustomerID,Gender,Age,AnnualIncome,SpendingScore,features,scaled_features,cluster
0,1,Male,19,15000,39,"[15000.0, 39.0]","[0.5711082903036442, 1.510251022337088]",1
1,2,Male,21,15000,81,"[15000.0, 81.0]","[0.5711082903036442, 3.1366752002385674]",2
2,3,Female,20,16000,6,"[16000.0, 6.0]","[0.6091821763238872, 0.2323463111287828]",1
3,4,Female,23,16000,77,"[16000.0, 77.0]","[0.6091821763238872, 2.9817776594860455]",2
4,5,Female,31,17000,40,"[17000.0, 40.0]","[0.64725606234413, 1.5489754075252185]",1
