### Let's try to visualize a more complex dataset, for example, the Titanic-dataset

<b>We already processed this data in the Introduction to Data Analytics course.</b>

#### NOTE: the first part is just loading the data, handling missing values etc. The actual decision tree part starts at the X/y-split phase

In [20]:
# import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier 
from sklearn import tree
import pandas as pd
import dtreeviz

In [21]:
# read the Titanic passenger data from the CSV-file in the working directory
titanic_csv_path = "Titanic-Dataset.csv"
df = pd.read_csv(titanic_csv_path)

In [22]:
# preview the first five rows (five is the default row count of head())
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [23]:
# remove: PassengerId, Name, Ticket
# remove Cabin, because it's missing in 80% of the cases (and it's also just an identifier)
# remove the 2 rows with a missing Embarked value (city of departure)

In [24]:
# drop the identifier-like columns and the mostly missing Cabin column
columns_to_drop = ["PassengerId", "Name", "Ticket", "Cabin"]
df = df.drop(columns=columns_to_drop)

In [25]:
# check which columns are left after the drop (first five rows)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [26]:
# Encode Sex as integers with LabelEncoder: the two categories become 0 and 1
# (in the preview further down, female shows up as 0 and male as 1).
# factorize in pandas works too, but only one column at a time
from sklearn.preprocessing import LabelEncoder

variables = ['Sex']
encoder = LabelEncoder()
for column in variables:
    # fit_transform learns the categories of the column and returns their integer codes
    df[column] = encoder.fit_transform(df[column])

In [27]:
# count the missing values in each column
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [28]:
# keep only the rows that have an Embarked value (removes the two rows where it is NaN)
has_embarked = df['Embarked'].notna()
df = df[has_embarked]

In [29]:
# count the missing values again: Embarked should be at 0 now, only Age is left
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

In [30]:
# get the average age for each ticket class (Pclass)
ages_by_class = df.groupby('Pclass')['Age']
ages_by_class.mean()

Pclass
1    38.105543
2    29.877630
3    25.140620
Name: Age, dtype: float64

In [31]:
# A very good example of a case where category-based
# imputation with averages is better than just using the
# average of the whole dataset
def impute_age(row):
    """Return the age for one passenger row, filling in a missing value.

    A missing age is replaced by the rounded-down average age of the
    passenger's Pclass: 38 for class 1, 29 for class 2 and 25 for any
    other class value. An age that is present is returned unchanged.
    """
    age, pclass = row['Age'], row['Pclass']

    if not pd.isnull(age):
        # age is known => leave it as it is
        return age

    # age is missing => look up the class average; everything other
    # than class 1 or 2 falls back to the class 3 value
    class_average_age = {1: 38, 2: 29}
    return class_average_age.get(pclass, 25)

In [32]:
# fill in the missing ages row by row with the class-based values from impute_age
imputed_age = df.apply(impute_age, axis="columns")
df['Age'] = imputed_age

In [33]:
# One-hot encoding: this makes a separate 0/1 (no/yes) dummy column
# for each value of the variable, here one for each city (3 cities)
from sklearn.preprocessing import OneHotEncoder

variables = ['Embarked']

# dense output, returned as a pandas DataFrame so that the dummy columns
# come back with names (Embarked_C, ...) and can be attached to df directly
encoder = OneHotEncoder(sparse_output=False)
encoder.set_output(transform="pandas")
one_hot_encoded = encoder.fit_transform(df[variables]).astype(int)

# replace the original column(s) with the dummy columns
df_without_originals = df.drop(columns=variables)
df = pd.concat([df_without_originals, one_hot_encoded], axis=1)

In [34]:
# small optimization: drop one of the dummy columns (here the last one, Embarked_S).
# see the linear regression materials for why we can do this
# Embarked_C and Embarked_Q remain: a row with 0 in both of them means Embarked_S
df = df.drop(columns=["Embarked_S"])

In [35]:
# all columns are numeric and without missing values now => good to go!
df.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q
0,0,3,1,22.0,1,0,7.25,0,0
1,1,1,0,38.0,1,0,71.2833,1,0
2,1,3,0,26.0,0,0,7.925,0,0
3,1,1,0,35.0,1,0,53.1,0,0
4,0,3,1,35.0,0,0,8.05,0,0


## X/y -split => Decision Tree

In [36]:
# X/y-split: y is the target variable, X is every other column
target_column = "Survived"
y = df[target_column]
X = df.drop(columns=[target_column])

In [37]:
# create a decision tree classifier for the visualization
# and train the model with our data.
# random_state is fixed so that the fitted tree (and every visualization
# made from it) is the same on each run: scikit-learn permutes the features
# at every split, so ties between equally good splits could otherwise be
# resolved differently from one run to the next
clf = DecisionTreeClassifier(random_state=42)

# NOTE: fit() returns the classifier itself, so model and clf are the same object
model = clf.fit(X, y)

In [38]:
# visualize the decision tree
from sklearn.tree import export_graphviz
import subprocess

# save the decision tree visualization into an svg-file

# NOTE! Always modify the class_names to match your data's TARGET VARIABLE OPTIONS
# (given in ascending order of the class values: here 0 = "No", 1 = "Yes")
export_graphviz(clf, feature_names=X.columns, class_names=["No", "Yes"],
                filled=True, rounded=True, node_ids=True, out_file="titanic_tree.dot")

# convert the DOT-file into an SVG-file (which is supported by many tools).
# This runs the external "dot" program, so it has to be installed and on the PATH.
# check=True raises an error if the conversion fails, instead of carrying on
# with a missing or outdated SVG-file; the return code (0 = success) is
# still shown as the output of the cell
subprocess.run(["dot", "-Tsvg", "titanic_tree.dot", "-o", "dt_titanic_test_sk.svg"],
               check=True).returncode

0

### Version 2: you can also limit the DEPTH of the tree that is drawn (max_depth); the trained tree itself stays the same

In [39]:
# visualize the decision tree, but draw only its top levels
from sklearn.tree import export_graphviz
import subprocess

# save the decision tree visualization into an svg-file.
# NOTE: max_depth here only limits how deep the tree is DRAWN;
# clf itself is still the fully grown tree
export_graphviz(clf, feature_names=X.columns, class_names=["No", "Yes"],
                filled=True, rounded=True, node_ids=True, out_file="titanic_tree_limited.dot",
                max_depth=4)

# convert the DOT-file into an SVG-file (which is supported by many tools).
# This runs the external "dot" program, so it has to be installed and on the PATH.
# check=True raises an error if the conversion fails, instead of carrying on
# with a missing or outdated SVG-file; the return code (0 = success) is
# still shown as the output of the cell
subprocess.run(["dot", "-Tsvg", "titanic_tree_limited.dot", "-o", "dt_titanic_test_sk_limited.svg"],
               check=True).returncode

0

In [40]:
# pip install dtreeviz
import dtreeviz

# wrap the fitted classifier together with its training data for dtreeviz
viz_model = dtreeviz.model(
    clf,
    X_train=X,
    y_train=y,
    feature_names=X.columns,
    target_name="Decision",
    class_names=["No", "Yes"],
)

# render the tree and save it into an SVG-file;
# a large decision tree is easiest to view by opening
# the file with a web browser
tree_view = viz_model.view(scale=1.5)
tree_view.save("dt_titanic_tree.svg")



In [41]:
# in order to limit the depth of the dtreeviz visualization,
# you have to create a separate classifier with a max depth

# create a decision tree classifier for the visualization
# and train the model with our data.
# random_state is fixed so that the same tree is produced on every run
# (scikit-learn permutes the features at every split, so ties between
# equally good splits could otherwise be resolved differently)
clf_limited = DecisionTreeClassifier(max_depth=4, random_state=42)

# NOTE: fit() returns the classifier itself, so model_limited and clf_limited
# are the same object
model_limited = clf_limited.fit(X, y)

# dtreeviz is already imported in the first code cell (pip install dtreeviz)
viz_model = dtreeviz.model(clf_limited,
                           X_train=X, y_train=y,
                           feature_names=X.columns,
                           target_name="Decision",
                           class_names=["No", "Yes"])

# render the tree and save it into an SVG-file;
# if the decision tree is large, view the file with a web browser
viz_model.view(scale=1.5).save("dt_titanic_tree_limited.svg")

