In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans

In [2]:
# Read the training table from a CSV in the current working directory
# and display the resulting frame as the cell output.
train_path = "train.csv"
data = pd.read_csv(train_path)
data

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Summarise the loaded frame (column dtypes and non-null counts) to spot
# columns that contain missing values before imputing.
data.info()

In [None]:
# Per-column number of missing cells, and the same figure expressed as a
# percentage of the total row count.
missing_value = data.isnull().sum()
percent_missing = missing_value * 100 / len(data)

# Combine both measures into one table, ordered by ascending share of
# missing values, and show it as the cell output.
missing_value_df = pd.DataFrame(
    {'count_missing': missing_value, 'percent_missing': percent_missing}
).sort_values('percent_missing')
missing_value_df

In [None]:
# Numeric gap filling: NaNs in "Age" are replaced by the column mean.
age_cols = ["Age"]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(data[age_cols])
data[age_cols] = imputer.transform(data[age_cols])

# Categorical gap filling: NaNs in "Embarked" are replaced by the most
# frequent value of that column.
port_cols = ["Embarked"]
imputer2 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer2.fit(data[port_cols])
data[port_cols] = imputer2.transform(data[port_cols])

In [None]:
# Build the clustering feature frame: discard the listed columns, then
# one-hot encode the categorical ones.
dropped_cols = ["Cabin", "Ticket", "Name", "PassengerId"]
categorical_cols = ["Pclass", "Sex", "Embarked"]

data2 = (
    data
    .drop(columns=dropped_cols)
    .pipe(pd.get_dummies, columns=categorical_cols)
)
data2

In [None]:
# Elbow search: fit k-means for every k in 1..max_n_clusters and record
# the square root of each fit's inertia.
max_n_clusters = 15
k_values = range(1, max_n_clusters + 1)

inertia = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=1)
    kmeans.fit(data2)
    inertia.append(np.sqrt(kmeans.inertia_))

# Plot the recorded values against k; the bend of the curve suggests a
# reasonable number of clusters.
plt.plot(k_values, inertia, marker='s')
plt.xlabel('k')
plt.ylabel('$J(C_k)$');


In [None]:
# Final clustering: assign every row of the feature frame to one of six
# k-means clusters and store the label next to the original columns.
#
# random_state is fixed (same seed as the elbow search) because k-means
# initialisation is random: without a seed the cluster numbering can change
# on every run, while the plots and the written conclusions refer to
# clusters by their number.
km = KMeans(n_clusters=6, random_state=1)
clusters = km.fit_predict(data2)
data['clusters'] = clusters

# Cluster sizes, shown as the cell output.
data["clusters"].value_counts()

# Alternative experiment (disabled): density-based clustering with DBSCAN.
# from sklearn.cluster import DBSCAN
# from sklearn.decomposition import PCA
# db = DBSCAN(eps=10.5, min_samples=5).fit(data2)
# data['clusters'] = db.labels_
# data["clusters"].value_counts()

In [None]:
# Scatter of Age against Fare with one fixed color per cluster label.
# Alternative: sns.scatterplot(data=data, x="Age", y="Fare", hue="clusters")
plt.figure(figsize=[16, 10])

# One color per cluster label 0..5; the list length must cover every label
# produced by the clustering step.
colors = ['red', 'blue', 'green', 'black', 'yellow', 'pink']
plt.scatter(data['Age'], data['Fare'], c=data['clusters'].apply(lambda x: colors[x]))

# Label the axes so the figure can be read on its own; the trailing
# semicolon suppresses the text repr of the last call.
plt.xlabel('Age')
plt.ylabel('Fare');


In [None]:
# Fare per "Embarked" value; points are colored and dodged by cluster label.
fig, ax = plt.subplots(figsize=(16, 10))
sns.stripplot(
    data=data, x="Embarked", y="Fare", hue="clusters",
    jitter=0.4, size=10, dodge=True, ax=ax,
)




In [None]:
# Fare per "Sex" value; points are colored and dodged by cluster label.
fig, ax = plt.subplots(figsize=(16, 10))
sns.stripplot(
    data=data, x="Sex", y="Fare", hue="clusters",
    jitter=0.4, size=10, dodge=True, ax=ax,
)

In [None]:
# Fare per "Parch" value; points are colored and dodged by cluster label.
fig, ax = plt.subplots(figsize=(16, 10))
sns.stripplot(
    data=data, x="Parch", y="Fare", hue="clusters",
    jitter=0.4, size=10, dodge=True, ax=ax,
)

In [None]:
# Fare per "SibSp" value; points are colored and dodged by cluster label.
fig, ax = plt.subplots(figsize=(16, 10))
sns.stripplot(
    data=data, x="SibSp", y="Fare", hue="clusters",
    jitter=0.4, size=10, dodge=True, ax=ax,
)

In [None]:
# Fare per "Pclass" value; points are colored and dodged by cluster label.
fig, ax = plt.subplots(figsize=(16, 10))
sns.stripplot(
    data=data, x="Pclass", y="Fare", hue="clusters",
    jitter=0.4, size=10, dodge=True, ax=ax,
)

In [None]:
# Fare per "Survived" value; points are colored and dodged by cluster label.
fig, ax = plt.subplots(figsize=(16, 10))
sns.stripplot(
    data=data, x="Survived", y="Fare", hue="clusters",
    jitter=0.4, size=10, dodge=True, ax=ax,
)

# Conclusion

### Cluster 0

- most bought the lowest-priced 1st class tickets
- 65% survived
- embarked in either Southampton or Cherbourg
- everyone has at least 1 family member on board

### Cluster 1

- mostly men
- with 3rd class tickets
- embarked in Southampton
- with the lowest ticket prices
- most likely did not survive

### Cluster 2

- most bought very expensive 1st class tickets (probably at the last moment)
- everyone has at least 1 family member on board
- about 65% survived

### Cluster 3

- 100% survived
- the most expensive tickets
- embarked in Cherbourg
- no one has siblings

To sum up: a passenger who embarked in Southampton or Queenstown most likely had the cheapest 3rd class ticket, was a man with about a 50% chance of having a family member on board, and had only about a 30% chance of surviving. The best survival chances (~65%) belonged to 1st class passengers with expensive tickets, regardless of gender or the number of family members on board. Passengers who bought the most expensive tickets and embarked in Cherbourg had an almost 100% chance of surviving.