In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [0]:
# Load the Titanic training set from a CSV in the current working directory.
df = pd.read_csv(filepath_or_buffer='titanic_train.csv')

In [0]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [0]:
df.shape

(891, 12)

In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [0]:
# Remove the PassengerId and Name columns from the working frame.
# In-place, so running this cell a second time raises KeyError.
df.drop(columns=['PassengerId', 'Name'], inplace=True)

In [0]:
df['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [0]:
df.groupby(['Sex','Survived'])['Survived'].count()

Sex     Survived
female  0            81
        1           233
male    0           468
        1           109
Name: Survived, dtype: int64

In [0]:
df['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [0]:
df.groupby(['Pclass','Survived'])['Survived'].count()

Pclass  Survived
1       0            80
        1           136
2       0            97
        1            87
3       0           372
        1           119
Name: Survived, dtype: int64

In [0]:
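# Median imputation of Age is disabled; missing ages are instead labelled
# 'unknown' by the binning step below.
#df['Age'].fillna(df['Age'].median(),inplace=True)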

## Binning the age variable

In [0]:
def age(x):
    """Map a numeric age to the label of its ten-year bin.

    Bins are open on the left and closed on the right: (0, 10] -> '0-10',
    (10, 20] -> '11-20', ... (60, 70] -> '61-70'. Anything greater than 70
    is 'above 70'. Values that match no bin -- zero, negatives and NaN
    (every comparison with NaN is False) -- are labelled 'unknown'.
    """
    # (inclusive upper edge, label) pairs in ascending order; each bin's
    # exclusive lower edge is the previous bin's upper edge.
    bin_edges = (
        (10, '0-10'),
        (20, '11-20'),
        (30, '21-30'),
        (40, '31-40'),
        (50, '41-50'),
        (60, '51-60'),
        (70, '61-70'),
    )

    lower = 0
    for upper, label in bin_edges:
        if lower < x <= upper:
            return label
        lower = upper

    if x > 70:
        return 'above 70'
    return 'unknown'

In [0]:
# Overwrite the numeric Age column with the bin label returned by age();
# missing ages (NaN) come back as 'unknown'.
# NOTE: not re-runnable -- a second run would feed the string labels back into
# age(), whose numeric comparisons then fail (TypeError under Python 3).
df['Age']=df['Age'].apply(age)

In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Sex           891 non-null object
Age           891 non-null object
SibSp         891 non-null int64
Parch         891 non-null int64
Ticket        891 non-null object
Fare          891 non-null float64
Cabin         204 non-null object
Embarked      891 non-null object
Binned_Age    891 non-null object
dtypes: float64(1), int64(4), object(6)
memory usage: 76.6+ KB


In [0]:
# Fill the missing Embarked values (889 of 891 rows are populated) with the
# most frequent port.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection: that chained in-place form
# emits a FutureWarning on pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is in effect.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [0]:
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Ticket        0
Fare          0
Embarked      0
Binned_Age    0
dtype: int64

In [0]:
# Drop the columns that are not used as model features.
# - 'Cabin' is mostly missing (204 of 891 values) and does not appear in any
#   frame displayed after this point, but no other visible cell removes it,
#   so it is dropped here explicitly.
# - 'Binned_Age' is not created by any visible cell (the bins are written
#   straight into 'Age'); errors='ignore' keeps a fresh "Restart & Run All"
#   from raising KeyError while still removing the column if it exists.
df.drop(columns=['Ticket', 'Cabin', 'Binned_Age'], errors='ignore', inplace=True)

In [0]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,21-30,1,0,7.25,S
1,1,1,female,31-40,1,0,71.2833,C
2,1,3,female,21-30,0,0,7.925,S
3,1,1,female,31-40,1,0,53.1,S
4,0,3,male,31-40,0,0,8.05,S


In [0]:
# Cast the integer Pclass codes to a categorical dtype so that the later
# pd.get_dummies call one-hot encodes them (Pclass_2, Pclass_3) instead of
# passing the integer column through unchanged.
df['Pclass']=df['Pclass'].astype('category')

In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null category
Sex         891 non-null object
Age         891 non-null object
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null object
dtypes: category(1), float64(1), int64(3), object(3)
memory usage: 49.8+ KB


### Segregating the categorical and continuous features

In [0]:
# Columns that are one-hot encoded before modelling.
encoded_cols = [
    'Pclass',
    'Sex',
    'Age',
    'Embarked',
]

# Numeric columns that are used without encoding.
continuous_cols = [
    'SibSp',
    'Parch',
    'Fare',
]

### One-hot-encoding of categorical columns

In [0]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, so k levels produce k-1 indicator columns.
encoded_DF = pd.get_dummies(data=df[encoded_cols], drop_first=True)

In [0]:
# Select the numeric feature columns, all rows.
continuous_Df = df.loc[:, continuous_cols]

### Concatenating encoded and continuous columns

In [0]:
# Place the indicator columns and the numeric columns side by side; rows are
# aligned on the index the two frames share.
modellingDataset = pd.concat(objs=[encoded_DF, continuous_Df], axis='columns')

In [0]:
modellingDataset.head()

Unnamed: 0,Pclass_2,Pclass_3,Sex_male,Age_11-20,Age_21-30,Age_31-40,Age_41-50,Age_51-60,Age_61-70,Age_above 70,Age_unknown,Embarked_Q,Embarked_S,SibSp,Parch,Fare
0,0,1,1,0,1,0,0,0,0,0,0,0,1,1,0,7.25
1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,71.2833
2,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,7.925
3,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,53.1
4,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,8.05


In [0]:
from sklearn.preprocessing import StandardScaler

In [0]:
# Create the scaler with default settings; nothing is fitted in this cell.
sc_x = StandardScaler()

In [0]:
# Standardise every column of the modelling frame (indicator columns
# included) to zero mean and unit variance. fit_transform returns a NumPy
# array, not a DataFrame.
# Previously this cell only displayed `scaled_modelDF`, which no cell ever
# assigned, so a fresh "Restart & Run All" stopped here with NameError.
scaled_modelDF = sc_x.fit_transform(modellingDataset)
scaled_modelDF

array([[-0.51015154,  0.90258736,  0.73769513, ...,  0.43279337,
        -0.47367361, -0.50244517],
       [-0.51015154, -1.10792599, -1.35557354, ...,  0.43279337,
        -0.47367361,  0.78684529],
       [-0.51015154,  0.90258736, -1.35557354, ..., -0.4745452 ,
        -0.47367361, -0.48885426],
       ...,
       [-0.51015154,  0.90258736, -1.35557354, ...,  0.43279337,
         2.00893337, -0.17626324],
       [-0.51015154, -1.10792599,  0.73769513, ..., -0.4745452 ,
        -0.47367361, -0.04438104],
       [-0.51015154,  0.90258736,  0.73769513, ..., -0.4745452 ,
        -0.47367361, -0.49237783]])