In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.graph_objs as go
import plotly .offline as offline
import plotly.figure_factory as ff

In [6]:
# Importing dataset and examining it
# NOTE: this is an absolute, machine-specific location; point DATA_PATH at
# your own copy of Titanic.csv when running the notebook elsewhere.
DATA_PATH = '/Users/ayushvarhadi/Desktop/Assignment/Titanic.csv'
dataset = pd.read_csv(DATA_PATH)
print(dataset.head())      # first five rows
print(dataset.shape)       # (rows, columns)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
dataset.info()
print(dataset.describe())  # summary statistics of the numeric columns

   Survived  Pclass     Sex   Age     Fare Embarked
0         0       3    male  22.0   7.2500        S
1         1       1  female  38.0  71.2833        C
2         1       3  female  26.0   7.9250        S
3         1       1  female  35.0  53.1000        S
4         0       3    male  35.0   8.0500        S
(712, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  712 non-null    int64  
 1   Pclass    712 non-null    int64  
 2   Sex       712 non-null    object 
 3   Age       712 non-null    float64
 4   Fare      712 non-null    float64
 5   Embarked  712 non-null    object 
dtypes: float64(2), int64(2), object(2)
memory usage: 33.5+ KB
None
         Survived      Pclass         Age        Fare
count  712.000000  712.000000  712.000000  712.000000
mean     0.404494    2.240169   29.642093   34.567251
std      0.491139    0.836854   1

In [7]:
# Plotting Correlation Heatmap
# At this point Sex and Embarked are still text columns. Newer pandas releases
# no longer drop non-numeric columns inside DataFrame.corr() (they raise
# instead), so restrict the frame to numeric columns explicitly; this works on
# old and new pandas alike and yields the same matrix the old default produced.
corrs = dataset.select_dtypes(include='number').corr()
figure = ff.create_annotated_heatmap(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    annotation_text=corrs.round(2).values,  # cell labels rounded to 2 decimals
    showscale=True)
# Writes the figure to corrheatmap.html
offline.plot(figure,filename='corrheatmap.html')

'corrheatmap.html'

In [8]:
# Converting Categorical features into Numerical features
# Encode Sex as male -> 1, female -> 0. The guard makes the cell safe to
# re-run: pushing an already-encoded (numeric) column through the mapping
# would match none of the dict keys and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(dataset['Sex']):
    dataset['Sex'] = dataset['Sex'].map({'male':1, 'female':0})

# Defining Child & Adult
def converter(column, child_max_age=13):
    """Flag a single age value as child (1) or adult (0).

    Parameters
    ----------
    column : number
        One age value; the notebook applies this function element-wise to
        the Age column.
    child_max_age : number, default 13
        Highest age (inclusive) that is still counted as a child.

    Returns
    -------
    int
        1 when ``column <= child_max_age``, otherwise 0. A NaN age compares
        False against the threshold and is therefore reported as 0.
    """
    return 1 if column <= child_max_age else 0

# Replace the Age column with the child/adult flag from converter
# (1 = child, 0 = adult). NOTE: this overwrites the raw ages in place, so the
# cell should be run only once per fresh load of the dataset.
dataset['Age'] = dataset['Age'].apply(converter)
print(dataset.head())
# DataFrame.info() prints its own report and returns None; wrapping it in
# print() would emit an extra "None" line.
dataset.info()


   Survived  Pclass  Sex  Age     Fare Embarked
0         0       3    1    0   7.2500        S
1         1       1    0    0  71.2833        C
2         1       3    0    0   7.9250        S
3         1       1    0    0  53.1000        S
4         0       3    1    0   8.0500        S
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  712 non-null    int64  
 1   Pclass    712 non-null    int64  
 2   Sex       712 non-null    int64  
 3   Age       712 non-null    int64  
 4   Fare      712 non-null    float64
 5   Embarked  712 non-null    object 
dtypes: float64(1), int64(4), object(1)
memory usage: 33.5+ KB
None


In [9]:
# Separate the target column from the predictor columns
Y = dataset['Survived'] # Labels: the survival indicator
X = dataset.drop(columns=['Survived', 'Embarked', 'Fare']) # Features: all remaining columns
# Sanity check: container types first, then shapes (one item per line)
print(type(X), type(Y), sep='\n')
print(X.shape, Y.shape, sep='\n')

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(712, 3)
(712,)


In [10]:
# Standardize the features: learn per-column mean/std from X, then rescale X
# so that every feature ends up with mean 0 and variance 1
feature_scaler = StandardScaler().fit(X)
X_scaled = feature_scaler.transform(X)


In [11]:
# Reduce the scaled features to two principal components for plotting
pca = PCA(n_components = 2).fit(X_scaled)
x_pca = pca.transform(X_scaled)  # coordinates of each row in component space
# Report how much of the total variance the two retained components capture
print("Variance explained by each of the n_components: ", pca.explained_variance_ratio_)
print("Total variance explained by the n_components: ", sum(pca.explained_variance_ratio_))

Variance explained by each of the n_components:  [0.3937737  0.35985149]
Total variance explained by the n_components:  0.7536251924038924


In [12]:
# Per-row values shown in the hover labels; kept as module-level lists because
# the t-SNE plotting cell further down reuses them.
survival, age, sex, pclass = (
    list(dataset[col]) for col in ('Survived', 'Age', 'Sex', 'Pclass')
)

# One marker per row, positioned by its two principal components and
# coloured by the label series Y.
data = [
    go.Scatter(
        x=x_pca[:, 0],
        y=x_pca[:, 1],
        mode='markers',
        marker={'color': Y, 'colorscale': 'Rainbow', 'opacity': 0.5},
        text=[
            f'Survived: {a}; Age: {b}; Sex:{c}, Pclass:{d}'
            for a, b, c, d in zip(survival, age, sex, pclass)
        ],
        hoverinfo='text',
    )
]

layout = go.Layout(
    title='PCA Dimensionality Reduction',
    width=700,
    height=700,
    xaxis={'title': 'First Principal Component'},
    yaxis={'title': 'Second Principal Component'},
)
fig = go.Figure(data=data, layout=layout)
# Writes the figure to pca.html
offline.plot(fig, filename='pca.html')


'pca.html'

In [21]:
# Implementing t-SNE to visualize dataset
from inspect import signature

# The iteration-count keyword was renamed from n_iter to max_iter in newer
# scikit-learn releases; use whichever name the installed TSNE accepts so the
# cell runs on both old and new versions.
tsne_iter_kwarg = 'max_iter' if 'max_iter' in signature(TSNE).parameters else 'n_iter'
# t-SNE is stochastic: without a fixed random_state every run produces a
# different embedding, so the saved plot could not be reproduced.
tsne = TSNE(n_components = 2, perplexity = 5, random_state = 42,
            **{tsne_iter_kwarg: 2000})
x_tsne = tsne.fit_transform(X_scaled)

# One marker per row, positioned by its 2-D embedding and coloured by the
# label series Y; hover text reuses the survival/age/sex/pclass lists built
# in the PCA plotting cell.
data = [go.Scatter(x=x_tsne[:,0], y=x_tsne[:,1], mode='markers',
                    marker = dict(color=Y, colorscale='Rainbow', opacity=0.5),
                    text=[f'Survived: {a}; Age: {b}; Sex:{c}, Pclass:{d}' for a,b,c,d in zip(survival,age,sex,pclass)],
                    hoverinfo='text')]

layout = go.Layout(title = 't-SNE Dimensionality Reduction', width = 700, height = 700,
                    xaxis = dict(title='First Dimension'),
                    yaxis = dict(title='Second Dimension'))
fig = go.Figure(data=data, layout=layout)
# Writes the figure to t-SNE.html
offline.plot(fig,filename='t-SNE.html')


't-SNE.html'