<a href="https://colab.research.google.com/github/varshachawan/SparkML/blob/master/KMeans_BisectingKMeans_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Download the dataset
<b>Dataset location: </b>https://www.kaggle.com/c/3136/download/train.csv

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://mirror.olnevhost.net/pub/apache/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz

In [2]:
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"]="/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"]="/content/spark-2.4.5-bin-hadoop2.7"
!echo $JAVA_HOME
import findspark
findspark.init()

/usr/lib/jvm/java-8-openjdk-amd64


In [0]:
from pyspark.sql import SparkSession

# Create the Spark session used by every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('Examine data about passengers on the Titanic')
    .getOrCreate()
)

# Load the passenger CSV, treating its first line as the column header.
# NOTE(review): the download link in the markdown above names train.csv, while
# this cell reads ./titanic.csv — presumably the file is renamed after download.
rawData = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load('./titanic.csv')
)

In [0]:
# Preview five rows. limit(5) is applied in Spark, so only those rows are
# converted to pandas instead of collecting the whole DataFrame to the driver.
rawData.limit(5).toPandas()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


#### Select the columns we require
Also cast the numeric values as float

In [4]:
from pyspark.sql.functions import col

# Keep only the columns used for clustering. The numeric ones are cast to
# float; Sex and Embarked stay as strings until they are indexed later.
dataset = rawData.select(
    col('Survived').cast('float'),
    col('Pclass').cast('float'),
    col('Sex'),
    col('Age').cast('float'),
    col('Fare').cast('float'),
    col('Embarked'),
)

# Preview five rows. limit(5) is applied in Spark, so only those rows are
# converted to pandas instead of collecting the whole DataFrame to the driver.
dataset.limit(5).toPandas()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0.0,3.0,male,22.0,7.25,S
1,1.0,1.0,female,38.0,71.283302,C
2,1.0,3.0,female,26.0,7.925,S
3,1.0,1.0,female,35.0,53.099998,S
4,0.0,3.0,male,35.0,8.05,S


#### Drop rows containing missing values

In [0]:
# Turn the '?' placeholder into a null, then discard every row that has a
# null in any column.
dataset = (
    dataset
    .replace('?', None)
    .dropna(how='any')
)

#### Define StringIndexers for categorical columns

In [6]:
from pyspark.ml.feature import StringIndexer

# Index each categorical column into a numeric column. Both columns use the
# same settings, so one loop replaces the two copy-pasted indexer calls.
# The second indexer is fitted on the output of the first, as before.
for source_column, indexed_column in (('Sex', 'Gender'), ('Embarked', 'Boarded')):
    indexer = StringIndexer(
        inputCol=source_column,
        outputCol=indexed_column,
        handleInvalid='keep',
    )
    dataset = indexer.fit(dataset).transform(dataset)

# Preview five rows. limit(5) is applied in Spark, so only those rows are
# converted to pandas instead of collecting the whole DataFrame to the driver.
dataset.limit(5).toPandas()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Gender,Boarded
0,0.0,3.0,male,22.0,7.25,S,0.0,0.0
1,1.0,1.0,female,38.0,71.283302,C,1.0,1.0
2,1.0,3.0,female,26.0,7.925,S,1.0,0.0
3,1.0,1.0,female,35.0,53.099998,S,1.0,0.0
4,0.0,3.0,male,35.0,8.05,S,0.0,0.0


#### Drop the redundant columns

In [7]:
# The string columns are superseded by their indexed versions (Gender and
# Boarded), so remove both in a single drop call.
dataset = dataset.drop('Sex', 'Embarked')

# Preview five rows. limit(5) is applied in Spark, so only those rows are
# converted to pandas instead of collecting the whole DataFrame to the driver.
dataset.limit(5).toPandas()

Unnamed: 0,Survived,Pclass,Age,Fare,Gender,Boarded
0,0.0,3.0,22.0,7.25,0.0,0.0
1,1.0,1.0,38.0,71.283302,1.0,1.0
2,1.0,3.0,26.0,7.925,1.0,0.0
3,1.0,1.0,35.0,53.099998,1.0,0.0
4,0.0,3.0,35.0,8.05,0.0,0.0


#### Define the required features to use in the VectorAssembler
Since we are only examining data and not making predictions, we include all columns

In [0]:
# Columns passed to the VectorAssembler, listed in the order they are assembled.
requiredFeatures = ['Survived', 'Pclass', 'Age', 'Fare', 'Gender', 'Boarded']

#### The VectorAssembler vectorises all the features
The transformed data will be used for clustering

In [0]:
from pyspark.ml.feature import VectorAssembler

# Combine the required feature columns into one vector column named 'features'.
assembler = VectorAssembler(
    inputCols=requiredFeatures,
    outputCol='features',
)

#### Transform our dataset for use in our clustering algorithm

In [0]:
# Run the assembler over the dataset; the result keeps the existing columns and
# adds the assembled 'features' vector column used by the clustering models.
transformed_data = assembler.transform(dataset)

In [12]:
# Preview five rows. limit(5) is applied in Spark, so only those rows are
# converted to pandas instead of collecting the whole DataFrame to the driver.
transformed_data.limit(5).toPandas()

Unnamed: 0,Survived,Pclass,Age,Fare,Gender,Boarded,features
0,0.0,3.0,22.0,7.25,0.0,0.0,"[0.0, 3.0, 22.0, 7.25, 0.0, 0.0]"
1,1.0,1.0,38.0,71.283302,1.0,1.0,"[1.0, 1.0, 38.0, 71.2833023071289, 1.0, 1.0]"
2,1.0,3.0,26.0,7.925,1.0,0.0,"[1.0, 3.0, 26.0, 7.925000190734863, 1.0, 0.0]"
3,1.0,1.0,35.0,53.099998,1.0,0.0,"[1.0, 1.0, 35.0, 53.099998474121094, 1.0, 0.0]"
4,0.0,3.0,35.0,8.05,0.0,0.0,"[0.0, 3.0, 35.0, 8.050000190734863, 0.0, 0.0]"


### Define the clustering model
Use K-means clustering
* <b>k: </b>Defines the number of clusters
* <b>seed: </b>This value seeds the random initialisation of the cluster centers. A different value of seed for the same k can result in clusters being defined differently. In order to reproduce similar clusters when re-running the clustering algorithm, use the same values of k and seed

In [0]:
from pyspark.ml.clustering import KMeans,BisectingKMeans

# Configure both estimators first (same seed, k=5 for KMeans and k=6 for
# BisectingKMeans), then fit each one on the assembled feature vectors.
kmeans = KMeans(k=5, seed=3)
Bkmeans = BisectingKMeans(k=6, seed=3)

model = kmeans.fit(transformed_data)
model1 = Bkmeans.fit(transformed_data)


#### Create the clusters using the model

In [0]:
# Apply each fitted model to the same input: clusterdData comes from the KMeans
# model and clusterdData1 from the BisectingKMeans model.
clusterdData, clusterdData1 = (
    fitted.transform(transformed_data) for fitted in (model, model1)
)

In [33]:
# Within Set Sum of Squared Errors for each fitted model, in the order
# (KMeans, BisectingKMeans).
# NOTE(review): computeCost is reportedly deprecated in later Spark releases —
# confirm before moving off the Spark 2.4.5 build this notebook downloads.
wssse, wssse1 = (
    fitted.computeCost(transformed_data) for fitted in (model, model1)
)

# The "mode12" label precedes the BisectingKMeans value (wssse1).
print("Within Set Sum of Squared Errors = ", wssse, "mode12", wssse1)


Within Set Sum of Squared Errors =  234474.9481701427 mode12 335397.25139421254


#### Use ClusteringEvaluator to evaluate the clusters
<b>From Wikipedia: </b>The silhouette value is a measure of how similar an object is to its own cluster (cohesion) compared to other clusters (separation). The silhouette ranges from −1 to +1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. If most objects have a high value, then the clustering configuration is appropriate. If many points have a low or negative value, then the clustering configuration may have too many or too few clusters.

In [34]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Score both clusterings with one default-configured evaluator; the first value
# belongs to the KMeans output and the second to the BisectingKMeans output.
evaluator = ClusteringEvaluator()
silhouette, silhouette1 = (
    evaluator.evaluate(clustered) for clustered in (clusterdData, clusterdData1)
)
print('Silhouette with squared euclidean distance = ', silhouette, silhouette1)

Silhouette with squared euclidean distance =  0.777943052058315 0.5327625141747653


#### View the cluster centers for each of the features

In [15]:
# Fetch the KMeans cluster centers and print the heading followed by one
# center per line (newline-separated in a single print call).
centers = model.clusterCenters()
print('Cluster Centers: ', *centers, sep='\n')

Cluster Centers: 
[ 0.33510638  2.51595745 28.41710993 15.68403444  0.32092199  0.21808511]
[  0.73333333   1.          30.33333333 239.99193726   0.73333333
   0.53333333]
[  0.73333333   1.          32.43066667 131.18388341   0.63333333
   0.5       ]
[1.00000000e+00 1.00000000e+00 3.53333333e+01 5.12329224e+02
 3.33333333e-01 1.00000000e+00]
[ 0.63        1.28       35.44       66.93704395  0.47        0.37      ]


#### View the output of the KMeans model
The prediction field denotes the cluster number

In [0]:
# Preview five rows. limit(5) is applied in Spark, so only those rows are
# converted to pandas instead of collecting the whole DataFrame to the driver.
clusterdData.limit(5).toPandas()

Unnamed: 0,Survived,Pclass,Age,Fare,Gender,Boarded,features,prediction
0,0.0,3.0,22.0,7.25,0.0,0.0,"[0.0, 3.0, 22.0, 7.25, 0.0, 0.0]",0
1,1.0,1.0,38.0,71.283302,1.0,1.0,"[1.0, 1.0, 38.0, 71.2833023071289, 1.0, 1.0]",4
2,1.0,3.0,26.0,7.925,1.0,0.0,"[1.0, 3.0, 26.0, 7.925000190734863, 1.0, 0.0]",0
3,1.0,1.0,35.0,53.099998,1.0,0.0,"[1.0, 1.0, 35.0, 53.099998474121094, 1.0, 0.0]",4
4,0.0,3.0,35.0,8.05,0.0,0.0,"[0.0, 3.0, 35.0, 8.050000190734863, 0.0, 0.0]",0


#### Get the average of each feature in the original data
This is the equivalent of the cluster center when our dataset is one big cluster
* We import all sql functions as we need the avg and count functions among others

In [16]:
from pyspark.sql.functions import *

# Mean of every feature over the whole dataset, returned as a one-row pandas
# frame with the columns in the same order as before.
dataset.select(
    *(avg(feature)
      for feature in ('Survived', 'Pclass', 'Age', 'Fare', 'Gender', 'Boarded'))
).toPandas()

Unnamed: 0,avg(Survived),avg(Pclass),avg(Age),avg(Fare),avg(Gender),avg(Boarded)
0,0.404494,2.240169,29.642093,34.567251,0.363764,0.261236


#### A more intuitive way to view the cluster centers in our clusterdData
* We group by clusterID (prediction) and compute the average of all features
* We do a count of values in each cluster

In [17]:
# Per-cluster mean of each feature plus the number of rows in the cluster,
# ordered by cluster id.
(
    clusterdData
    .groupBy('prediction')
    .agg(
        *[avg(feature)
          for feature in ('Survived', 'Pclass', 'Age', 'Fare', 'Gender', 'Boarded')],
        count('prediction')
    )
    .orderBy('prediction')
    .toPandas()
)

Unnamed: 0,prediction,avg(Survived),avg(Pclass),avg(Age),avg(Fare),avg(Gender),avg(Boarded),count(prediction)
0,0,0.335106,2.515957,28.41711,15.684034,0.320922,0.218085,564
1,1,0.733333,1.0,30.333333,239.991937,0.733333,0.533333,15
2,2,0.733333,1.0,32.430667,131.183883,0.633333,0.5,30
3,3,1.0,1.0,35.333333,512.329224,0.333333,1.0,3
4,4,0.63,1.28,35.44,66.937044,0.47,0.37,100


#### Examine all rows in one of the clusters

In [0]:
# Show every row that the KMeans model assigned to cluster 1.
clusterdData.where(clusterdData['prediction'] == 1).toPandas()

Unnamed: 0,Survived,Pclass,Age,Fare,Gender,Boarded,features,prediction
0,0.0,1.0,19.0,263.0,0.0,0.0,"[0.0, 1.0, 19.0, 263.0, 0.0, 0.0]",1
1,1.0,1.0,23.0,263.0,1.0,0.0,"[1.0, 1.0, 23.0, 263.0, 1.0, 0.0]",1
2,0.0,1.0,24.0,247.520798,0.0,1.0,"[0.0, 1.0, 24.0, 247.5207977294922, 0.0, 1.0]",1
3,1.0,1.0,50.0,247.520798,1.0,1.0,"[1.0, 1.0, 50.0, 247.5207977294922, 1.0, 1.0]",1
4,1.0,1.0,18.0,262.375,1.0,1.0,"[1.0, 1.0, 18.0, 262.375, 1.0, 1.0]",1
5,1.0,1.0,24.0,263.0,1.0,0.0,"[1.0, 1.0, 24.0, 263.0, 1.0, 0.0]",1
6,0.0,1.0,27.0,211.5,0.0,1.0,"[0.0, 1.0, 27.0, 211.5, 0.0, 1.0]",1
7,1.0,1.0,42.0,227.524994,1.0,1.0,"[1.0, 1.0, 42.0, 227.52499389648438, 1.0, 1.0]",1
8,0.0,1.0,64.0,263.0,0.0,0.0,"[0.0, 1.0, 64.0, 263.0, 0.0, 0.0]",1
9,1.0,1.0,15.0,211.337494,1.0,0.0,"[1.0, 1.0, 15.0, 211.33749389648438, 1.0, 0.0]",1
