### Importing Libraries, Methods and Modules 

In [139]:
import pandas as pd 
from matplotlib import pyplot as plt
import numpy as np 
from sklearn.tree import DecisionTreeClassifier
import re

### Reading the dataset

In [110]:
#Importing the training dataset
train_df = pd.read_csv("train.csv")

#Importing the testing dataset
test_df = pd.read_csv("test.csv")

In [111]:
#Printing the features of the dataset 
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [112]:
#Printing the first 5 rows of the dataset
train_df.head(n=5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [113]:
#Counting the total number of NaN values are present in the data set without ordering it in ascending order.
total= train_df.isnull().sum().sort_values(ascending=False)

#Getting the percentage of each column's null values
percent_1= train_df.isnull().sum()/train_df.isnull().count()*100
percent_2= (round(percent_1, 1)).sort_values(ascending=False)

#Creating a new dataframe with the total null values and their percentages
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(5)

Unnamed: 0,Total,%
Cabin,687,77.1
Age,177,19.9
Embarked,2,0.2
Fare,0,0.0
Ticket,0,0.0


By doing the above, we get a heads up as to what columns have to be worked on to remove null values.

In [114]:
#Printing the names of all the columns in the dataframe
train_df.columns.values

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

Deleting the column PassengerId since it is similar to serial number and is not in any way required to determine survival probability.

In [115]:
train_df=train_df.drop(["PassengerId"], axis=1)

We have to deal with Cabin (687), Embarked (2) and Age (177) since only these columns consist of null values based on the above analysis. 

A cabin number looks like ‘C123’ and the letter refers to the deck. Therefore we’re going to create a new feature that contains only a person's deck. After that, we will convert the feature into a numeric variable. The missing values will be converted to zero. 

In [116]:
#Mapping the values letter wise to respective numbers
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
data = [train_df, test_df]
for dataset in data:
    #Filling all the null values with "U0"
    dataset['Cabin'] = dataset['Cabin'].fillna("U0")
    dataset['Deck'] = dataset['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
    #Mapping the alphabets with the numbers and placing it into deck
    dataset['Deck'] = dataset['Deck'].map(deck)
    dataset['Deck'] = dataset['Deck'].fillna(0)
    dataset['Deck'] = dataset['Deck'].astype(int)
# we can now drop the cabin feature
train_df = train_df.drop(['Cabin'], axis=1)
test_df = test_df.drop(['Cabin'], axis=1)


In [137]:
#Printing the first five values for reference
train_df.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,Age_Class,relatives,not_alone,Fare_Per_Person
0,0,3,0,2,1,0,7,0,8,1,6,1,0,3
1,1,1,1,5,1,0,71,1,3,3,5,1,0,35
2,1,3,1,3,0,0,7,0,8,2,9,0,1,7
3,1,1,1,5,1,0,53,0,3,3,5,1,0,26
4,0,3,0,5,0,0,8,0,8,1,15,0,1,8


### Getting rid of the null values in the Age column.

For this, we replace all the NaN values in the Age column with a random value that lies within the range of the mean and the standard deviation. 

In [118]:

data = [train_df, test_df]

for dataset in data:
    #Calculate the mean and the standard deviation.
    mean = train_df["Age"].mean()
    std = test_df["Age"].std()
    is_null = dataset["Age"].isnull().sum()
    # compute random numbers between the mean, std and is_null.
    rand_age = np.random.randint(mean - std, mean + std, size = is_null)
    # fill NaN values in Age column with random values generated.
    age_slice = dataset["Age"].copy()
    age_slice[np.isnan(age_slice)] = rand_age
    dataset["Age"] = age_slice
    dataset["Age"] = train_df["Age"].astype(int)
#Check if there are any other null values left in the column.
train_df["Age"].isnull().sum()

0

### Getting rid of the null values in the Embarked column.

Since the Embarked feature has only 2 missing values, we will just fill these with the most common one.

In [119]:
#To check the features of the column.
train_df['Embarked'].describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [120]:
#Since the most frequent value in the column is given by 'top'
common_value = 'S'
data = [train_df, test_df]

for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].fillna(common_value)

In [121]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       891 non-null    int64  
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Embarked  891 non-null    object 
 10  Deck      891 non-null    int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 76.7+ KB


Fare: Converting “Fare” from float to int64, using the “astype()” function pandas provides:

In [122]:
data = [train_df, test_df]

for dataset in data:
    dataset['Fare'] = dataset['Fare'].fillna(0)
    dataset['Fare'] = dataset['Fare'].astype(int)

### Converting the Name column to numeric.

In [123]:
data = [train_df, test_df]
#Mapping values
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in data:
    # extract titles
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    # replace titles with a more common title or as Rare
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr',\
                                            'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    # convert titles into numbers
    dataset['Title'] = dataset['Title'].map(titles)
    # filling NaN with 0, to get safe
    dataset['Title'] = dataset['Title'].fillna(0)
train_df = train_df.drop(['Name'], axis=1)
test_df = test_df.drop(['Name'], axis=1)

### Converting the Sex column to numeric.

In [124]:
genders={"male":0, "female":1}
data=[train_df, test_df]
for dataset in data:
    dataset['Sex']=dataset['Sex'].map(genders)

In [125]:
train_df['Ticket'].describe()

count        891
unique       681
top       347082
freq           7
Name: Ticket, dtype: object

Since the Ticket attribute has 681 unique tickets, it will be a bit tricky to convert them into useful categories. So we will drop it from the dataset.

In [126]:
train_df['Ticket'].describe()
train_df = train_df.drop(['Ticket'], axis=1)
test_df = test_df.drop(['Ticket'], axis=1)

### Converting the Embarked column into numeric.

In [127]:
ports = {"S": 0, "C": 1, "Q": 2}
data = [train_df, test_df]

for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].map(ports)

Now, create categories within the feature of Age. First, convert it from float into integer. Then create the new ‘AgeGroup” variable, by categorizing every age into a specific group. Ensure to not have too many values in a single category.

In [128]:
dataset['Age']

0      22
1      38
2      26
3      35
4      35
       ..
413    21
414    44
415    28
416    34
417    18
Name: Age, Length: 418, dtype: int64

In [129]:
data = [train_df, test_df]
for dataset in data:
    dataset['Age'] = dataset['Age'].astype(int)
    #The conditions with which the age groups are divided.
    dataset.loc[ dataset['Age'] <= 11, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 11) & (dataset['Age'] <= 18), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 18) & (dataset['Age'] <= 22), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 22) & (dataset['Age'] <= 27), 'Age'] = 3
    dataset.loc[(dataset['Age'] > 27) & (dataset['Age'] <= 33), 'Age'] = 4
    dataset.loc[(dataset['Age'] > 33) & (dataset['Age'] <= 40), 'Age'] = 5
    dataset.loc[(dataset['Age'] > 40) & (dataset['Age'] <= 66), 'Age'] = 6
    dataset.loc[ dataset['Age'] > 66, 'Age'] = 6

# let's see how it's distributed train_df['Age'].value_counts()

In [130]:
#Printing the first 10 values.
train_df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title
0,0,3,0,2,1,0,7,0,8,1
1,1,1,1,5,1,0,71,1,3,3
2,1,3,1,3,0,0,7,0,8,2
3,1,1,1,5,1,0,53,0,3,3
4,0,3,0,5,0,0,8,0,8,1
5,0,3,0,1,0,0,8,2,8,1
6,0,1,0,6,0,0,51,0,5,1
7,0,3,0,0,3,1,21,0,8,4
8,1,3,1,3,0,2,11,0,8,3
9,1,2,1,1,1,0,30,1,8,3


### Inserting two new fields.

Age time class

In [131]:
for dataset in data:
    dataset['Age_Class']= dataset['Age']* dataset['Pclass']

Fare per person

In [132]:
for dataset in data:
    dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
    dataset.loc[dataset['relatives'] > 0, 'not_alone'] = 0
    dataset.loc[dataset['relatives'] == 0, 'not_alone'] = 1
    dataset['not_alone'] = dataset['not_alone'].astype(int)

for dataset in data:
    dataset['Fare_Per_Person'] = dataset['Fare']/(dataset['relatives']+1)
    dataset['Fare_Per_Person'] = dataset['Fare_Per_Person'].astype(int)

In [133]:
# Let's take a last look at the training set, before we start training the models.
train_df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,Age_Class,relatives,not_alone,Fare_Per_Person
0,0,3,0,2,1,0,7,0,8,1,6,1,0,3
1,1,1,1,5,1,0,71,1,3,3,5,1,0,35
2,1,3,1,3,0,0,7,0,8,2,9,0,1,7
3,1,1,1,5,1,0,53,0,3,3,5,1,0,26
4,0,3,0,5,0,0,8,0,8,1,15,0,1,8
5,0,3,0,1,0,0,8,2,8,1,3,0,1,8
6,0,1,0,6,0,0,51,0,5,1,6,0,1,51
7,0,3,0,0,3,1,21,0,8,4,0,4,0,4
8,1,3,1,3,0,2,11,0,8,3,9,2,0,3
9,1,2,1,1,1,0,30,1,8,3,2,1,0,15


In [134]:
#Deleting the survived column from the training data
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop("PassengerId", axis=1).copy()

### Applying the decision tree.

In [135]:
#Create tree object
decision_tree = DecisionTreeClassifier() 

#Train DT based on scaled training set
decision_tree.fit(X_train, Y_train) 

# Predicting the Test set results
Y_pred = decision_tree.predict(X_test) 

acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)

In [138]:
print('The accuracy of the Decision Tree classifier on test data is {:.2f}'.format(acc_decision_tree))

The accuracy of the Decision Tree classifier on test data is 94.39
