# Reading the dataset

In [4]:
import pandas as pd
import numpy as np

In [33]:
# Load the passenger records from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="titanic.csv")

In [34]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Dropping the columns that are not required.

In [35]:
# Columns excluded from the working frame; they are dropped in the next cell.
columns_to_be_removed = ["Name", "Ticket", "Cabin"]

In [36]:
# Drop the unwanted columns. `columns=` is used instead of the positional
# axis argument (`drop(labels, 1)`), which newer pandas releases reject.
data = data.drop(columns=columns_to_be_removed)

In [37]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [38]:
def missing_values_count(frame=None):
    """
    Compute the percentage of missing values in every column.

    :param frame: DataFrame to inspect. When omitted, the notebook-level
        ``data`` frame is used, which keeps existing zero-argument calls working.
    :return: Series indexed by column name holding the share of null
        entries in that column, expressed as a percentage (0-100).
    """
    # Resolve the target lazily so the global is only touched when no frame is given.
    target = data if frame is None else frame
    missing_perc = (target.isnull().sum() / len(target)) * 100
    return missing_perc

In [39]:
# Percentage of missing values per column, measured before any imputation.
missing_perc = missing_values_count()

In [40]:
missing_perc

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Fare            0.000000
Embarked        0.224467
dtype: float64

In [41]:
def replace_missing_values(column_name, method, frame=None):
    """
    Fill the missing values of one column in place.

    :param column_name: Name of the column with missing values
    :param method: Either a callable that receives the column and returns the
        fill value (e.g. ``np.mean``), or a literal value to fill with (e.g. ``"S"``).
    :param frame: DataFrame to modify. When omitted, the notebook-level
        ``data`` frame is used, which keeps existing two-argument calls working.
    :Result:
        Missing values will be replaced. Any error is reported with a printed
        message and the column is left unchanged; nothing is raised.
    """
    try:
        target = data if frame is None else frame
        column = target[column_name]
        # Decide by callability instead of "call it and see if it raises":
        # a callable that fails must surface its own error rather than being
        # retried as a literal fill value (which would pass the function
        # object itself to fillna).
        fill_value = method(column) if callable(method) else method
        target[column_name] = column.fillna(fill_value)
    except Exception as ex:
        print("There is an {} exception".format(ex))

In [42]:
# Impute missing ages with the mean of the observed ages
# (the filled rows show up as 29.699118 in later previews).
replace_missing_values("Age", np.mean)

In [43]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [44]:
data["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [45]:
# Fill the missing ports with "S", the most frequent value in the counts above.
replace_missing_values("Embarked", "S")

In [46]:
data.Embarked.value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

In [47]:
# Recompute the per-column missing percentages after imputing Age and Embarked.
missing_perc = missing_values_count()

In [48]:
missing_perc

PassengerId    0.0
Survived       0.0
Pclass         0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Fare           0.0
Embarked       0.0
dtype: float64

# Encoding the columns

In [49]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


### Columns to be encoded: Sex and Embarked

In [50]:
# Keep an un-encoded copy: the label encoding below overwrites columns of
# `data`, while the one-hot encoding further down is applied to `df`.
df = data.copy()

In [51]:
from sklearn.preprocessing import LabelEncoder

# Single encoder instance; it is fitted here on Sex and fitted again on
# Embarked in the next cell.
le = LabelEncoder()

# Replace the Sex strings with integer codes directly on `data`
# (the preview below shows female -> 0, male -> 1).
data["Sex"] = le.fit_transform(data["Sex"])

In [52]:
# Replace the Embarked strings with integer codes directly on `data`
# (the preview below shows C -> 0 and S -> 2).
# NOTE(review): this re-fits the same `le` that encoded Sex, so the encoder
# presumably only remembers the Embarked categories afterwards — use a separate
# encoder if the Sex mapping ever needs to be inverted.
data["Embarked"] = le.fit_transform(data["Embarked"])

In [53]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.925,2
3,4,1,1,0,35.0,1,0,53.1,2
4,5,0,3,1,35.0,0,0,8.05,2


In [54]:
# One-hot encode Sex and Embarked on the un-encoded copy.
# - `columns=` pins exactly which columns are expanded, so the two prefixes
#   always line up with them regardless of any other text columns in `df`.
# - `dtype=np.uint8` keeps the indicators as 0/1 integers on every pandas
#   version (pandas 2.0 switched the default to bool).
# - The prefixes keep their trailing underscore, so the generated names stay
#   `Sex__female`, `Embarked__C`, ... exactly as before.
df = pd.get_dummies(
    df,
    columns=["Sex", "Embarked"],
    prefix=["Sex_", "Embarked_"],
    dtype=np.uint8,
)

In [55]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex__female,Sex__male,Embarked__C,Embarked__Q,Embarked__S
0,1,0,3,22.000000,1,0,7.2500,0,1,0,0,1
1,2,1,1,38.000000,1,0,71.2833,1,0,1,0,0
2,3,1,3,26.000000,0,0,7.9250,1,0,0,0,1
3,4,1,1,35.000000,1,0,53.1000,1,0,0,0,1
4,5,0,3,35.000000,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.000000,0,0,13.0000,0,1,0,0,1
887,888,1,1,19.000000,0,0,30.0000,1,0,0,0,1
888,889,0,3,29.699118,1,2,23.4500,1,0,0,0,1
889,890,1,1,26.000000,0,0,30.0000,0,1,1,0,0


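# Re-sampling

The target classes are imbalanced (549 non-survivors vs. 342 survivors), so the minority class is oversampled with SMOTE until both classes have the same number of rows.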

In [56]:
data["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [57]:
# Separate the features from the target. `columns=` replaces the positional
# axis argument (`drop("Survived", 1)`), which newer pandas releases reject.
X = df.drop(columns="Survived")
# Double brackets keep y as a one-column DataFrame rather than a Series.
y = df[["Survived"]]

In [58]:
y.tail(50)

Unnamed: 0,Survived
841,0
842,1
843,0
844,0
845,0
846,0
847,0
848,0
849,1
850,0


In [59]:
print(X.shape, y.shape)

(891, 11) (891, 1)


In [63]:
from imblearn.over_sampling import SMOTE

# SMOTE draws random synthetic samples; fixing the seed makes the resampled
# data identical on every Restart & Run All.
sm = SMOTE(random_state=42)

# Oversample the minority class; X and y are rebound to the resampled data.
X, y = sm.fit_resample(X, y)
print(X.shape, y.shape)

(1098, 11) (1098, 1)


In [61]:
# NOTE(review): the recorded output of this cell predates the SMOTE cell above
# (execution count 61 vs. 63) and still shows the unbalanced 549/342 split,
# although y has 1098 rows after resampling. Re-run the notebook top to bottom
# to refresh it.
y.Survived.value_counts()

0    549
1    342
Name: Survived, dtype: int64