# Titanic dataset cleaning and preprocessing


![Taitanic](https://resize.indiatvnews.com/en/resize/newbucket/715_-/2017/04/76abd9aa85d89cd53c5297129ea57cee-1492247929.jpg)


In [85]:
# import first
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# change the style from the very beging
plt.style.use('ggplot')
%matplotlib inline

### First let's import our dataset and take a look to it.

In [86]:
dataset = pd.read_csv("titanic.csv")
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [87]:
dataset.describe()


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [51]:
corrs = dataset.corr()
corrs

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


## now we need to seperate the dependant and independent variables

In [52]:
dataset.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [53]:
dataset.drop(['Ticket', 'PassengerId','Cabin','Name'], inplace=True, axis=1)
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [54]:
features_matrix = dataset.drop('Survived', axis=1)
features_matrix


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [55]:
goal_vector = dataset['Survived']
goal_vector

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

## Handling the missing data
### we have two options : 
###    1. remove the row 
###    2. replace it by the mean 


In [56]:
features_matrix.isna().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [57]:
features_matrix.shape

(891, 7)

In [58]:
###  replace it by the mean 
features_matrix=features_matrix.fillna(features_matrix.mean()) 

###  remove the row 
features_matrix=features_matrix.fillna('None')
index=features_matrix.loc[features_matrix['Embarked'] == 'None'].index
features_matrix.drop(index, inplace=True, axis=0)
goal_vector.drop(index, inplace=True, axis=0)



In [59]:
features_matrix.shape

(889, 7)

In [60]:
features_matrix.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

## Handling Categorical Data 
### models prefer to deal with numbers rather than words if possible, that's what we are going to do now 

In [61]:
# import the needed librarys
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
features_matrix['Sex'] = encoder.fit_transform(features_matrix['Sex'])
features_matrix['Embarked'] = encoder.fit_transform(features_matrix['Embarked'])


features_matrix

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.000000,1,0,7.2500,2
1,1,0,38.000000,1,0,71.2833,0
2,3,0,26.000000,0,0,7.9250,2
3,1,0,35.000000,1,0,53.1000,2
4,3,1,35.000000,0,0,8.0500,2
...,...,...,...,...,...,...,...
886,2,1,27.000000,0,0,13.0000,2
887,1,0,19.000000,0,0,30.0000,2
888,3,0,29.699118,1,2,23.4500,2
889,1,1,26.000000,0,0,30.0000,0


In [62]:
from sklearn.preprocessing import OneHotEncoder
hot_encoder=OneHotEncoder()
hot_encoder.fit_transform


<bound method OneHotEncoder.fit_transform of OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)>

In [63]:
# import the oneHotEncoder class
from sklearn.preprocessing import OneHotEncoder
oneHotEncoder = OneHotEncoder(categorical_features=[0])
features_matrix = oneHotEncoder.fit_transform(features_matrix).toarray()
features_matrix[:, :3]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [64]:
features_matrix

array([[ 0.    ,  0.    ,  1.    , ...,  0.    ,  7.25  ,  2.    ],
       [ 1.    ,  0.    ,  0.    , ...,  0.    , 71.2833,  0.    ],
       [ 0.    ,  0.    ,  1.    , ...,  0.    ,  7.925 ,  2.    ],
       ...,
       [ 0.    ,  0.    ,  1.    , ...,  2.    , 23.45  ,  2.    ],
       [ 1.    ,  0.    ,  0.    , ...,  0.    , 30.    ,  0.    ],
       [ 0.    ,  0.    ,  1.    , ...,  0.    ,  7.75  ,  1.    ]])

In [65]:
goal_vector = encoder.fit_transform(goal_vector)


In [66]:
from sklearn.preprocessing import OneHotEncoder
hot_encoder=OneHotEncoder()
hot_encoder.fit_transform


<bound method OneHotEncoder.fit_transform of OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)>

In [67]:
# import the oneHotEncoder class
from sklearn.preprocessing import OneHotEncoder
oneHotEncoder = OneHotEncoder(categorical_features=[0])
features_matrix = oneHotEncoder.fit_transform(features_matrix).toarray()
features_matrix

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[ 1.    ,  0.    ,  0.    , ...,  0.    ,  7.25  ,  2.    ],
       [ 0.    ,  1.    ,  0.    , ...,  0.    , 71.2833,  0.    ],
       [ 1.    ,  0.    ,  0.    , ...,  0.    ,  7.925 ,  2.    ],
       ...,
       [ 1.    ,  0.    ,  0.    , ...,  2.    , 23.45  ,  2.    ],
       [ 0.    ,  1.    ,  0.    , ...,  0.    , 30.    ,  0.    ],
       [ 1.    ,  0.    ,  0.    , ...,  0.    ,  7.75  ,  1.    ]])

## Now we need to split our data to train and test data ..

In [68]:
# import the modules 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_matrix, goal_vector, test_size = 0.2, random_state = 0)


## It's better to scale/standardize our features value, so that they have the same scale

# try with standard scaler

In [81]:
# import the library we need from our beloved sklearn !
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


In [82]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [84]:
print(mean_squared_error(y_train, model.predict(x_train)))
model.score(x_train, y_train)

0.1350523976237901


0.420479017277258

In [72]:
x_test = scaler.transform(x_test)
print(mean_squared_error(y_test, model.predict(x_test)))
model.score(x_test, y_test)

1.9036693050336126e+23


-7.868996511504891e+23

# try with Min-Max Scaler

In [73]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_matrix, goal_vector, test_size=.2, random_state=0)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)

In [74]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [75]:
print(mean_squared_error(y_train, model.predict(x_train)))
model.score(x_train, y_train)

0.13106460007937146


0.44110480398889956

In [76]:
x_test = scaler.transform(x_test)
print(mean_squared_error(y_test, model.predict(x_test)))
model.score(x_test, y_test)

0.19285114159744776


0.2028316281313065

# try without any scaler

In [77]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_matrix, goal_vector, test_size=.3, random_state=0)

In [78]:
model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [79]:
print(mean_squared_error(y_train, model.predict(x_train)))
model.score(x_train, y_train)

0.13457404128321182


0.4225316838086276

In [80]:
print(mean_squared_error(y_test, model.predict(x_test)))
model.score(x_test, y_test)

0.1634183536030067


0.32542379791518583