In [1]:
# Create (or reuse) the SparkSession that every later cell works through
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('Examine Data about passengers on Titanic')
    .getOrCreate()
)

In [2]:
# Evaluate the session object as the last expression so the notebook displays it
spark

In [3]:
# Dataset provenance (shell commands, run outside the notebook):
#   wget www.kaggle.com/c/3136/download/train.csv
#   gsutil cp train.csv gs://dexdebra-123/datasets
#
# Read the CSV file through the SparkSession; the first line of the file
# supplies the column names ('header' option).

csv_reader = spark.read.format('csv').option('header', 'true')
rawData = csv_reader.load('gs://dexdebra-123/datasets/train.csv')

In [4]:
# Preview the first five rows. Limiting on the Spark side before converting
# means only those five rows are collected to the driver, rather than the
# whole DataFrame being turned into pandas just to call head().
rawData.limit(5).toPandas()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [10]:
# Keep only the columns that are useful for clustering.
# The numeric ones are cast to float; 'Sex' and 'Embarked' stay as strings here.
from pyspark.sql.functions import col

float_columns = {'Survived', 'Pclass', 'Age', 'Fare'}

selected_columns = []
for column_name in ('Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked'):
    column = col(column_name)
    if column_name in float_columns:
        column = column.cast('float')
    selected_columns.append(column)

dataset = rawData.select(selected_columns)


In [11]:
# Preview the selected columns; limit(5) keeps the collect to the driver small
dataset.limit(5).toPandas()

# Survived  - Whether the passenger survived the sinking: '0' indicates 'Did Not Survive', '1' indicates 'Did Survive'
# Pclass    - Class the passenger travelled in: 1st, 2nd or 3rd class
# Sex       - Sex of the passenger, male or female
# Age       - Age of the passenger
# Fare      - Amount paid by the passenger
# Embarked  - Place where the passenger boarded

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0.0,3.0,male,22.0,7.25,S
1,1.0,1.0,female,38.0,71.283302,C
2,1.0,3.0,female,26.0,7.925,S
3,1.0,1.0,female,35.0,53.099998,S
4,0.0,3.0,male,35.0,8.05,S


In [12]:
# Report the row count before cleaning (no value replacement happens in this cell)
print('Count Before: ' , dataset.count())

# Drop every row that has a null value in any column (how='any')
# NOTE(review): presumably the nulls come from empty fields in the CSV — confirm
dataset=dataset.dropna(how='any')
print('Count After: ' , dataset.count())

('Count Before: ', 891)
('Count After: ', 712)


In [14]:
# Convert categorical values in string form to numeric form using StringIndexer.
#
# There are 2 columns with categorical data, 'Sex' and 'Embarked'.
# Their numeric encodings are written to 'Gender' and 'Boarded' respectively.

from pyspark.ml.feature import StringIndexer

# One loop instead of two copy-pasted indexer stanzas; the order of the pairs
# matches the original order in which the columns were indexed.
for source_column, indexed_column in [('Sex', 'Gender'), ('Embarked', 'Boarded')]:
    indexer = StringIndexer(
        inputCol=source_column,
        outputCol=indexed_column,
        handleInvalid='keep'
    )
    # fit() learns the label-to-index mapping; transform() appends the indexed column
    dataset = indexer.fit(dataset).transform(dataset)

# Preview the result; limit(5) avoids collecting the whole DataFrame to the driver
dataset.limit(5).toPandas()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Gender,Boarded
0,0.0,3.0,male,22.0,7.25,S,0.0,0.0
1,1.0,1.0,female,38.0,71.283302,C,1.0,1.0
2,1.0,3.0,female,26.0,7.925,S,1.0,0.0
3,1.0,1.0,female,35.0,53.099998,S,1.0,0.0
4,0.0,3.0,male,35.0,8.05,S,0.0,0.0


In [15]:
# The original string columns are no longer needed now that their numeric
# encodings ('Gender' and 'Boarded') exist, so remove both in one call
dataset = dataset.drop('Sex', 'Embarked')

In [16]:
# Preview the cleaned data; limit(5) keeps the collect to the driver small
dataset.limit(5).toPandas()
# All column values are numeric,
# which means they are ready to be fed into an ML algorithm

Unnamed: 0,Survived,Pclass,Age,Fare,Gender,Boarded
0,0.0,3.0,22.0,7.25,0.0,0.0
1,1.0,1.0,38.0,71.283302,1.0,1.0
2,1.0,3.0,26.0,7.925,1.0,0.0
3,1.0,1.0,35.0,53.099998,1.0,0.0
4,0.0,3.0,35.0,8.05,0.0,0.0


In [17]:
# Names of the columns that feed the K-means clustering algorithm.
# The order here is the order of the values inside each assembled feature vector.

requiredFeatures = [
    'Survived', 'Pclass', 'Age',
    'Fare', 'Gender', 'Boarded',
]

# This list is handed to VectorAssembler, which assembles the columns into a single column

In [19]:
# Combine the individual feature columns into a single vector column.
#
# VectorAssembler is a Transformer: it takes a DataFrame as input and returns a
# new DataFrame with an extra column ('features') holding all the feature values.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=requiredFeatures,
    outputCol='features'
)

transformed_data = assembler.transform(dataset)


In [34]:
# Import the estimator for k-means clustering
from pyspark.ml.clustering import KMeans

# k    : number of clusters
# seed : random seed used when the initial cluster centers are chosen

# Earlier configuration, kept for reference:
#kmeans=KMeans(k=5,seed=1)

kmeans=KMeans(k=8,seed=8)

# NOTE(review): an earlier note here said "8 clusters and a seed of '3'", but the code sets seed=8 — confirm which seed produced the recorded outputs


In [35]:
# Train the K-means clustering model by calling fit() on transformed_data
# NOTE(review): the features are not scaled before clustering, so a wide-ranging column such as Fare may dominate the distance — confirm this is intended
model=kmeans.fit(transformed_data)

In [36]:
# Call model.transform() to get the clustered results in a DataFrame.
# Note that this is the same transformed_data the model was trained on, not a separate test set.
clusteredData=model.transform(transformed_data)

In [37]:
# Evaluate how well the underlying data is clustered
from pyspark.ml.evaluation import ClusteringEvaluator

# Created with default settings; the score itself is computed in the next cell
evaluator=ClusteringEvaluator()


In [38]:
# silhouette: how similar every point is to the other points in its own cluster
# compared with points in other clusters; a value of '1' is ideal

silhouette=evaluator.evaluate(clusteredData)
print('Silhouette with squared euclidean distance = ',silhouette)
# The value is not too bad:
# with 8 clusters, it is 0.5988 now

('Silhouette with squared euclidean distance = ', 0.5988875582419804)


In [44]:
# Fetch the center of every cluster found by the model and print them one per line
centers = model.clusterCenters()

print('Cluster Centers')
for cluster_center in centers:
    print(cluster_center)

# Each cluster center is an array whose length equals the number of features
# in our training data, which in our case is '6'

Cluster Centers
[ 0.2741433   2.73520249 25.43613707 10.3686904   0.28660436  0.21806854]
[1.00000000e+00 1.00000000e+00 3.53333333e+01 5.12329224e+02
 3.33333333e-01 1.00000000e+00]
[ 0.31818182  2.06493506 45.92857143 20.65706167  0.28571429  0.20779221]
[  0.73333333   1.          30.33333333 239.99193726   0.73333333
   0.53333333]
[ 0.61333333  1.22666667 37.77333333 64.12494776  0.44        0.34666667]
[  0.75         1.          33.995      147.02447414   0.75
   0.3125    ]
[ 0.55208333  2.51041667  9.87760417 27.59748246  0.48958333  0.22916667]
[ 0.8125      1.         30.65625    98.57369447  0.59375     0.625     ]


In [45]:
# Examine clusteredData; limit(5) avoids collecting the whole DataFrame to the driver
clusteredData.limit(5).toPandas()

# Notice that the transform method has added a 'prediction' column to the DataFrame,
# in addition to the original columns which contain the individual features and
# the 'features' column which contains all features grouped together.
# The 'prediction' column contains the cluster number associated with each record

Unnamed: 0,Survived,Pclass,Age,Fare,Gender,Boarded,features,prediction
0,0.0,3.0,22.0,7.25,0.0,0.0,"[0.0, 3.0, 22.0, 7.25, 0.0, 0.0]",0
1,1.0,1.0,38.0,71.283302,1.0,1.0,"[1.0, 1.0, 38.0, 71.2833023071289, 1.0, 1.0]",4
2,1.0,3.0,26.0,7.925,1.0,0.0,"[1.0, 3.0, 26.0, 7.925000190734863, 1.0, 0.0]",0
3,1.0,1.0,35.0,53.099998,1.0,0.0,"[1.0, 1.0, 35.0, 53.099998474121094, 1.0, 0.0]",4
4,0.0,3.0,35.0,8.05,0.0,0.0,"[0.0, 3.0, 35.0, 8.050000190734863, 0.0, 0.0]",0


In [46]:
# Examine the dataset as a whole before examining the individual clusters:
# find the average value of every column across the entire dataset.

# Import only the functions that are needed. A wildcard import of
# pyspark.sql.functions shadows Python builtins such as sum, min, max and round.
# 'count' is not used in this cell; it is imported for the per-cluster
# aggregation cell that follows.
from pyspark.sql.functions import avg, count

dataset.select(
        avg('Survived'),
        avg('Pclass'),
        avg('Age'),
        avg('Fare'),
        avg('Gender'),
        avg('Boarded')
).toPandas()

# These dataset-wide averages are the baseline against which the
# average values of the individual clusters are compared

Unnamed: 0,avg(Survived),avg(Pclass),avg(Age),avg(Fare),avg(Gender),avg(Boarded)
0,0.404494,2.240169,29.642093,34.567251,0.363764,0.261236


In [43]:
# Per-cluster summary.
# The 'prediction' column holds the cluster assigned to every record, so grouping
# by it gives one group per cluster. For each cluster we compute the average of
# every feature plus the number of data points it contains, and show the rows
# ordered by cluster number.

records_by_cluster = clusteredData.groupBy('prediction')

cluster_summary = records_by_cluster.agg(
    avg('Survived'),
    avg('Pclass'),
    avg('Age'),
    avg('Fare'),
    avg('Gender'),
    avg('Boarded'),
    count('prediction')
)

cluster_summary.orderBy('prediction').toPandas()



Unnamed: 0,prediction,avg(Survived),avg(Pclass),avg(Age),avg(Fare),avg(Gender),avg(Boarded),count(prediction)
0,0,0.274143,2.735202,25.436137,10.36869,0.286604,0.218069,321
1,1,1.0,1.0,35.333333,512.329224,0.333333,1.0,3
2,2,0.318182,2.064935,45.928571,20.657062,0.285714,0.207792,154
3,3,0.733333,1.0,30.333333,239.991937,0.733333,0.533333,15
4,4,0.613333,1.226667,37.773333,64.124948,0.44,0.346667,75
5,5,0.75,1.0,33.995,147.024474,0.75,0.3125,16
6,6,0.552083,2.510417,9.877604,27.597482,0.489583,0.229167,96
7,7,0.8125,1.0,30.65625,98.573694,0.59375,0.625,32


In [None]:
# Analysis

# The average values of the features within each cluster are the same as the Cluster Centers,
# because K-means computes each Cluster Center as the average of all points within the cluster

# Let us consider the very first cluster, cluster '0'.
# Compare its value against entire dataset
# The survival rate for cluster '0' passengers is only about 27%, whereas the average survival rate is about 40%
# Most of the passengers in Cluster '0' did not survive the sinking

# Look at the average of the Pclass column.
# Notice that the passengers from Cluster '0' are more likely to belong to 3rd class.
# The average class value is pretty high: it is 2.74, compared with 2.24 for the entire dataset
# Passengers in Cluster '0' are more likely to be in 2nd class or 3rd class.
# This is also clear from the average fare they paid.
# The average passenger fare is around '10.4'. Across the entire dataset the average fare is 34.5
# Average of the Gender column: most of them are male. The average is closer to '0', where '0' represents 'MALE' and
# '1' is FEMALE

# Similarly we can do for other clusters.
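# For cluster 1, the survival rate is very high (every passenger survived).
# The Pclass in this cluster is First Class.
# The fare paid is also very high
# Most of the passengers are male: the average of the Gender column is 0.33, so only one of the three is female.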





In [50]:
# Keep only the records assigned to one cluster (cluster '1') and show all of them

cluster_one = clusteredData.filter(clusteredData['prediction'] == 1)
cluster_one.toPandas()


Unnamed: 0,Survived,Pclass,Age,Fare,Gender,Boarded,features,prediction
0,1.0,1.0,35.0,512.329224,1.0,1.0,"[1.0, 1.0, 35.0, 512.3292236328125, 1.0, 1.0]",1
1,1.0,1.0,36.0,512.329224,0.0,1.0,"[1.0, 1.0, 36.0, 512.3292236328125, 0.0, 1.0]",1
2,1.0,1.0,35.0,512.329224,0.0,1.0,"[1.0, 1.0, 35.0, 512.3292236328125, 0.0, 1.0]",1
