# Clustering Objective

    Maximize intra-cluster similarity
    Minimize inter-cluster similarity

# Mean-shift clustering 

* Computationally very intensive
* O(N^2) in number of data points
* Copes better with outliers



In [1]:
import pandas as pd

In [2]:
titanic_data = pd.read_csv(".//data//titanic.csv", quotechar ='"')
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Features to Drop

        PassengerID, Name, Ticket, Cabin
 * These features are too specific to individual passengers to be useful when looking for patterns

In [4]:
titanic_data.drop(['PassengerId','Name','Ticket','Cabin'], 'columns', inplace = True)
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Converting categorical varible to Label/one hot encoding form

In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
titanic_data['Sex'] = le.fit_transform(titanic_data['Sex'].astype(str))
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,S
1,1,1,0,38.0,1,0,71.2833,C
2,1,3,0,26.0,0,0,7.925,S
3,1,1,0,35.0,1,0,53.1,S
4,0,3,1,35.0,0,0,8.05,S


In [8]:
titanic_data = pd.get_dummies(titanic_data, columns=['Embarked'])
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


In [9]:
titanic_data[titanic_data.isnull().any(axis=1)]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
5,0,3,1,,0,0,8.4583,0,1,0
17,1,2,1,,0,0,13.0000,0,0,1
19,1,3,0,,0,0,7.2250,1,0,0
26,0,3,1,,0,0,7.2250,1,0,0
28,1,3,0,,0,0,7.8792,0,1,0
...,...,...,...,...,...,...,...,...,...,...
859,0,3,1,,0,0,7.2292,1,0,0
863,0,3,0,,8,2,69.5500,0,0,1
868,0,3,1,,0,0,9.5000,0,0,1
878,0,3,1,,0,0,7.8958,0,0,1


In [10]:
titanic_data = titanic_data.dropna()

In [14]:
titanic_data.describe(include='all')

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
count,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0
mean,0.406162,2.236695,0.634454,29.699118,0.512605,0.431373,34.694514,0.182073,0.039216,0.77591
std,0.49146,0.83825,0.481921,14.526497,0.929783,0.853289,52.91893,0.386175,0.194244,0.417274
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,20.125,0.0,0.0,8.05,0.0,0.0,1.0
50%,0.0,2.0,1.0,28.0,0.0,0.0,15.7417,0.0,0.0,1.0
75%,1.0,3.0,1.0,38.0,1.0,1.0,33.375,0.0,0.0,1.0
max,1.0,3.0,1.0,80.0,5.0,6.0,512.3292,1.0,1.0,1.0


      The shape of the kernel- smaller values are tall skinny kernels, larger values are short fat kernels

In [16]:
from sklearn.cluster import MeanShift
analyzer = MeanShift(bandwidth=50)
analyzer.fit(titanic_data)

MeanShift(bandwidth=50, bin_seeding=False, cluster_all=True, max_iter=300,
          min_bin_freq=1, n_jobs=None, seeds=None)

# estimate_bandwidth

A helper function to help estimate a good value for the bandwidth based on the data
    Runs in quadratic (N^2) time - where N can be specified as an input argument (n_samples)

In [17]:
from sklearn.cluster import estimate_bandwidth
estimate_bandwidth(titanic_data)

30.44675914497196

In [19]:
labels = analyzer.labels_

In [20]:
import numpy as np
np.unique(labels)

array([0, 1, 2], dtype=int64)

In [22]:
import numpy as np
titanic_data['cluster_group'] = np.nan
data_length = len(titanic_data)
for i in range(data_length):
    titanic_data.iloc[i, titanic_data.columns.get_loc('cluster_group')]= labels[i]

In [23]:
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,cluster_group
0,0,3,1,22.0,1,0,7.25,0,0,1,0.0
1,1,1,0,38.0,1,0,71.2833,1,0,0,0.0
2,1,3,0,26.0,0,0,7.925,0,0,1,0.0
3,1,1,0,35.0,1,0,53.1,0,0,1,0.0
4,0,3,1,35.0,0,0,8.05,0,0,1,0.0


In [24]:
titanic_data.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,cluster_group
count,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0
mean,0.406162,2.236695,0.634454,29.699118,0.512605,0.431373,34.694514,0.182073,0.039216,0.77591,0.051821
std,0.49146,0.83825,0.481921,14.526497,0.929783,0.853289,52.91893,0.386175,0.194244,0.417274,0.24004
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,20.125,0.0,0.0,8.05,0.0,0.0,1.0,0.0
50%,0.0,2.0,1.0,28.0,0.0,0.0,15.7417,0.0,0.0,1.0,0.0
75%,1.0,3.0,1.0,38.0,1.0,1.0,33.375,0.0,0.0,1.0,0.0
max,1.0,3.0,1.0,80.0,5.0,6.0,512.3292,1.0,1.0,1.0,2.0


# Examine Mean Cluster

In [25]:
titanic_cluster_data = titanic_data.groupby(['cluster_group']).mean()
titanic_cluster_data

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,0.388235,2.298529,0.651471,29.559191,0.505882,0.404412,25.415625,0.167647,0.041176,0.788235
1.0,0.741935,1.0,0.258065,32.223226,0.709677,1.032258,192.008732,0.419355,0.0,0.580645
2.0,1.0,1.0,0.666667,35.333333,0.0,0.333333,512.3292,1.0,0.0,0.0


# To find out samples of data in each cluster
    create count column and fill the cluster_group.size in this

In [26]:
titanic_cluster_data['Counts'] = pd.Series(titanic_data.groupby(['cluster_group']).size())
titanic_cluster_data

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Counts
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.0,0.388235,2.298529,0.651471,29.559191,0.505882,0.404412,25.415625,0.167647,0.041176,0.788235,680
1.0,0.741935,1.0,0.258065,32.223226,0.709677,1.032258,192.008732,0.419355,0.0,0.580645,31
2.0,1.0,1.0,0.666667,35.333333,0.0,0.333333,512.3292,1.0,0.0,0.0,3


# More detailed information about one of the clusters

In [27]:
# cluster 1

titanic_data[titanic_data['cluster_group']==1].describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,cluster_group
count,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0
mean,0.741935,1.0,0.258065,32.223226,0.709677,1.032258,192.008732,0.419355,0.0,0.580645,1.0
std,0.444803,0.0,0.444803,15.327994,1.006431,1.016001,50.203716,0.50161,0.0,0.50161,0.0
min,0.0,1.0,0.0,0.92,0.0,0.0,133.65,0.0,0.0,0.0,1.0
25%,0.5,1.0,0.0,22.0,0.0,0.0,151.55,0.0,0.0,0.0,1.0
50%,1.0,1.0,0.0,31.0,0.0,1.0,164.8667,0.0,0.0,1.0,1.0
75%,1.0,1.0,0.5,41.5,1.0,2.0,237.5229,1.0,0.0,1.0,1.0
max,1.0,1.0,1.0,64.0,3.0,4.0,263.0,1.0,0.0,1.0,1.0


In [28]:
titanic_data[titanic_data['cluster_group']==1]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,cluster_group
27,0,1,1,19.0,3,2,263.0,0,0,1,1.0
88,1,1,0,23.0,3,2,263.0,0,0,1,1.0
118,0,1,1,24.0,0,1,247.5208,1,0,0,1.0
195,1,1,0,58.0,0,0,146.5208,1,0,0,1.0
268,1,1,0,58.0,0,1,153.4625,0,0,1,1.0
269,1,1,0,35.0,0,0,135.6333,0,0,1,1.0
297,0,1,0,2.0,1,2,151.55,0,0,1,1.0
299,1,1,0,50.0,0,1,247.5208,1,0,0,1.0
305,1,1,1,0.92,1,2,151.55,0,0,1,1.0
311,1,1,0,18.0,2,2,262.375,1,0,0,1.0
