From the titanic data, we will see who survived and who did not.

In [1]:
import pandas as pd
import numpy as np

In [2]:
df= pd.read_csv("D:/iAnalyst/Machine Learning/Decision Tree/Titanic/titanic_train.csv")

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data Cleaning

Very important way of dealing with null values= #https://www.analyticsvidhya.com/blog/2021/05/dealing-with-missing-values-in-python-a-complete-guide/

In [4]:
df.info()  

#Another way of finding missing values  

print(df.isnull().sum())    #Age and Cabin has missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [5]:
##deleting non required variables. Also Cabin as it has a lot of null values

todrop= ['Name','PassengerId','Ticket','Cabin']  
df= df.drop(todrop,axis=1)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [6]:
#filling the missing values by their mode

df['Embarked']=df['Embarked'].fillna(df['Embarked'].mode())
df.Embarked.isnull().count

<bound method Series.count of 0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Embarked, Length: 891, dtype: bool>

In [7]:
#filling the missing ages by their median

df['Age']=df['Age'].fillna(df['Age'].median())  
#df.info()

## Converting Categorical Features 

We'll need to convert categorical features to dummy variables using pandas! Otherwise our machine learning algorithm won't be able to directly take in those features as inputs.

In [8]:
#Using Label encoding
from sklearn.preprocessing import LabelEncoder

labEn= LabelEncoder()
df['Sex'] = labEn.fit_transform(df['Sex'])  
#df['Sex']  #converted to 0= female, 1=male

df.Embarked =labEn.fit_transform(df['Embarked'])
df.Embarked

0      2
1      0
2      2
3      2
4      2
      ..
886    2
887    2
888    2
889    0
890    1
Name: Embarked, Length: 891, dtype: int32

In [9]:
df.shape

(891, 8)

# Training the model

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import auc, roc_auc_score, accuracy_score, classification_report
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,RepeatedStratifiedKFold #use it while doing grid search

In [11]:
#Try using Grid search. It uses all the possible combination of parameters. a dict needs to be created first- check google
#dtree= DecisionTreeClassifier()
#GridSearchCV(dtree,cv=10)

In [12]:
X= df.drop('Survived', axis=1)
y= df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20,random_state=101)

In [13]:
#We will use Grid search for hyperparameter tuning. #can use this of random forest as well as all other classifiers
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [19]:
dtree= DecisionTreeClassifier()
params = [{'max_depth': list(range(10, 20)), 'max_features': list(range(0,14)), 'min_samples_leaf': list(range(2,20))}]

clf = GridSearchCV(dtree, params, cv = 5, scoring='accuracy')
clf.fit(X_train, y_train)
#acc.append(roc_auc_score(y_test, y_pred))
#print(acc)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid=[{'max_depth': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
                          'max_features': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                           12, 13],
                          'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                               12, 13, 14, 15, 16, 17, 18,
                                               19]}],
             scoring='accuracy')

In [20]:
print(clf.best_params_)

#We are getting mazimum score when leaf size is around 10

{'max_depth': 17, 'max_features': 5, 'min_samples_leaf': 12}


In [21]:
print(clf.best_score_)

0.8230276765488034


In [22]:
dtree= DecisionTreeClassifier(max_depth=24 , max_features=4 ,min_samples_leaf=11,random_state=101)
dtree.fit(X_train,y_train)
y_pred = dtree.predict(X_test)
accuracy_score(y_test, y_pred)

0.8044692737430168

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.89      0.83        99
           1       0.84      0.70      0.76        80

    accuracy                           0.80       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.81      0.80      0.80       179



In [24]:
roc_auc_score(y_test, y_pred) 

#The roc auc score is close to the accuracy, that means data is not much imbalanced

0.7944444444444444

# Now trying randomized CV using random forest

# Building model on the validation dataset

In [48]:
#Performing the same data cleaning and preprocessing techniques as for train data.
t= pd.read_csv("D:/iAnalyst/Machine Learning/Decision Tree/Task/titanic_test.csv")
test= t.copy()

todrop= ['Name','PassengerId','Ticket','Cabin']  
test= test.drop(todrop,axis=1)
test.head()

test['Embarked']=test['Embarked'].fillna(test['Embarked'].mode())
test.Embarked.isnull().count

test['Age']=test['Age'].fillna(test['Age'].median())
test['Fare']=test['Fare'].fillna(test['Fare'].mean())

test['Sex'] = labEn.fit_transform(test['Sex']) 
#print(X_test['Sex'])

test.Embarked =labEn.fit_transform(test['Embarked'])

In [49]:
#test.isnull().sum()

In [50]:
pred = dtree.predict(test)

In [51]:
t['Survived']= pred

In [52]:
output= t[['PassengerId','Survived']]
output.to_csv('Submission.csv', index=None)