In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Load seaborn's "titanic" example dataset into a DataFrame.
titanic = sns.load_dataset("titanic")

In [7]:
# Column dtypes and non-null counts; info() prints its report directly.
titanic.info()

# Only the final expression of a cell is rendered automatically, so the
# per-column null counts have to be displayed explicitly or they are lost.
# `display` is provided by the IPython/Jupyter kernel.
display(titanic.isnull().sum())

# First rows, rendered as the cell's output.
titanic.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
# Columns used as model inputs.
features = [
    "pclass",
    "sex",
    "fare",
    "embarked",
    "age",
]

# Label column, kept as a one-element list (so indexing a frame with it
# selects a single-column DataFrame rather than a Series).
target = [
    "survived",
]

In [None]:
# Fill in the missing values of the "age" and "embarked" columns.
from sklearn.impute import SimpleImputer

# NOTE(review): both imputers are fitted on the full frame, i.e. before the
# train/test split performed later in the notebook -- confirm this is intended.

# "age": replace missing entries with the column median.
imp_median = SimpleImputer(strategy="median")
imp_median.fit(titanic[["age"]])
titanic[["age"]] = imp_median.transform(titanic[["age"]])

# "embarked": replace missing entries with the most frequent value.
imp_freq = SimpleImputer(strategy="most_frequent")
imp_freq.fit(titanic[["embarked"]])
titanic[["embarked"]] = imp_freq.transform(titanic[["embarked"]])


## Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Use one encoder per column. Refitting a single shared encoder on the second
# column overwrites its fitted classes, so the integer -> label mapping for
# "sex" could no longer be recovered (e.g. via inverse_transform).
sex_encoder = LabelEncoder()
titanic["sex"] = sex_encoder.fit_transform(titanic["sex"])

embarked_encoder = LabelEncoder()
titanic["embarked"] = embarked_encoder.fit_transform(titanic["embarked"])

# Backward-compatible alias: `le` used to end this cell fitted on "embarked".
le = embarked_encoder

titanic.head()

In [None]:
# Feature matrix and label frame selected by the column lists defined earlier.
X = titanic[features]
y = titanic[target]

# Hold out 20% of the rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Decision Tree model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Seed the estimator as well: without random_state the fitted tree (and the
# accuracy printed below) can change between runs. 42 matches the seed used
# for the split above.
model = DecisionTreeClassifier(random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(" accuracy score : ", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.tree import plot_tree

# Draw the top levels of the fitted baseline tree.
plt.figure(figsize=(18, 10))
plot_tree(
    model,
    # Pass a plain list: some scikit-learn releases reject a pandas Index here.
    feature_names=list(X.columns),
    class_names=["Died", "survived"],
    filled=True,
    max_depth=3,  # limits only the drawing depth, not the fitted tree
)

plt.tight_layout()
# Render explicitly, consistent with the other plotting cells.
plt.show()


## Decision Tree with Pre-pruning

In [None]:
# Pre-pruning by depth: fit one tree per depth limit and report test accuracy.
max_depth = [2, 3, 4, 5, 6, 7, 8]

for depth in max_depth:
    # Fixed seed so accuracy differences come from the depth limit alone,
    # not from run-to-run randomness in the tree builder.
    model = DecisionTreeClassifier(max_depth=depth, random_state=42)
    model.fit(X_train, y_train)

    acc = model.score(X_test, y_test)
    print(f"for depth {depth} ,accuracy={acc}")

    # Visualise one representative tree only (depth 4).
    if depth == 4:
        plt.figure(figsize=(18, 10))
        plot_tree(
            model,
            # Plain list: some scikit-learn releases reject a pandas Index.
            feature_names=list(X.columns),
            class_names=["Died", "survived"],
            filled=True,
        )

        plt.tight_layout()
        plt.show()


In [None]:
# Pre-pruning by min_samples_split: fit one tree per threshold and report
# test accuracy.
min_samples = [5, 10, 15, 20, 25, 30]

for sample in min_samples:
    # Fixed seed so accuracy differences come from the threshold alone,
    # not from run-to-run randomness in the tree builder.
    model = DecisionTreeClassifier(min_samples_split=sample, random_state=42)
    model.fit(X_train, y_train)

    acc = model.score(X_test, y_test)
    print(f"for min_samples{sample},accuracy={acc}")

    # Visualise one representative tree only (min_samples_split=10).
    if sample == 10:
        plt.figure(figsize=(18, 10))
        plot_tree(
            model,
            # Plain list: some scikit-learn releases reject a pandas Index.
            feature_names=list(X.columns),
            class_names=["Died", "survived"],
            filled=True,
        )

        plt.tight_layout()
        plt.show()


## Decision Tree with Post-pruning

In [14]:
# Unpruned reference tree for cost-complexity pruning.
full_tree = DecisionTreeClassifier(random_state=42)
# Fit the tree created on the line above; the original call fitted the
# unrelated `model` left over from the pre-pruning cells.
full_tree.fit(X_train, y_train)

In [21]:
# Cost-complexity pruning path of the unpruned tree on the training data.
path = full_tree.cost_complexity_pruning_path(X_train, y_train)
# Candidate alphas from that path.
ccp_alpha = path.ccp_alphas
# Print the variable actually defined above; `ccp_alphas` does not exist on a
# fresh kernel and raised NameError.
print(ccp_alpha)

[0.00000000e+00 0.00000000e+00 6.68806849e-05 1.45921494e-04
 1.84428555e-04 2.00642055e-04 2.34082397e-04 2.34082397e-04
 3.51123596e-04 4.68164794e-04 4.68164794e-04 5.61797753e-04
 6.24219725e-04 6.67991230e-04 7.02247191e-04 7.02247191e-04
 8.19288390e-04 8.19288390e-04 8.32292967e-04 8.42696629e-04
 8.42696629e-04 8.42696629e-04 8.94231048e-04 8.99610781e-04
 9.24769963e-04 9.36329588e-04 9.36329588e-04 9.36329588e-04
 9.88347898e-04 1.00253471e-03 1.05337079e-03 1.07400895e-03
 1.08038029e-03 1.11865144e-03 1.12359551e-03 1.12359551e-03
 1.13139825e-03 1.17041199e-03 1.18841832e-03 1.22566125e-03
 1.22893258e-03 1.22893258e-03 1.24361593e-03 1.24843945e-03
 1.27565833e-03 1.30996111e-03 1.33761370e-03 1.37044603e-03
 1.46301498e-03 1.47927070e-03 1.51142557e-03 1.54072312e-03
 1.56675436e-03 1.60434338e-03 1.66892062e-03 1.76144762e-03
 1.86751451e-03 2.10674157e-03 2.34082397e-03 2.41226470e-03
 2.64273538e-03 2.69412911e-03 3.43770913e-03 3.56327570e-03
 5.47433606e-03 9.055400

In [28]:
#train model for all alphas
tree=[]

for alpha in ccp_alpha:
    full_tree=DecisionTreeClassifier(random_state=42,ccp_alpha=alpha)
    model.fit(X_train,y_train)
    acc=model.score(X_test,y_test)

    tree.append((model,alpha))

In [23]:
# Scan the (estimator, alpha) pairs and remember the alpha whose tree scores
# highest on the held-out split. On ties the earliest pair wins, because only
# a strictly higher score replaces the current best.
# NOTE(review): the score is measured on X_test, so best_acc is selected on
# the same data it is reported on.
best_acc = 0
best_alpha = 0

for model, alpha in tree:
    curr_acc = model.score(X_test, y_test)

    # Not an improvement: keep the current best.
    if curr_acc <= best_acc:
        continue

    best_acc = curr_acc
    best_alpha = alpha

In [27]:
# Show the alpha selected by the search in the previous cell.
# NOTE(review): the saved output for this cell (about 0.79) is far larger than
# the alphas printed from the pruning path; it looks like stale output.
# Re-run the notebook from a fresh kernel to confirm.
best_alpha

0.7932960893854749