\setcounter{secnumdepth}{0}

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib as mpl
import matplotlib.pyplot as plt 

In [2]:
df = pd.read_excel('titanic.xls')
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [3]:
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

### 2.(i)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

#from sklearn.impute import SimpleImputer
# # Impute missing values
# imputer = SimpleImputer(strategy='mean') 
# X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)


X = df.drop(['survived','name', 'home.dest','cabin','boat','body'], axis=1)
y = df['survived']

# Step 1: Identify string columns
string_columns = X.columns[X.dtypes == 'object']
# X = X.drop(string_columns, axis=1)

# Step 2: Convert string columns to categorical
for col in string_columns:
    X[col] = pd.Categorical(X[col])

# Step 3: Create dummy columns
X = pd.get_dummies(X, columns=string_columns, prefix=string_columns,drop_first=True)

X_train = X.iloc[:1100,:]
y_train = y.iloc[:1100]
X_test = X.iloc[1100:,:]
y_test = y.iloc[1100:]

depths = [1, 2, 3, 4, 5, 6, 7, 8]

for depth in depths:
    dt_model = DecisionTreeClassifier(criterion='entropy', max_depth=depth, random_state=0)
    dt_model.fit(X_train, y_train)
    train_accuracy = dt_model.score(X_train, y_train)
    print(f"Depth {depth} - Training Accuracy: {train_accuracy}")



ValueError: Input X contains NaN.
DecisionTreeClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
dt_model_depth1 = DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=42)
dt_model_depth1.fit(X_train, y_train)

plt.figure(figsize=(8, 6))
plot_tree(dt_model_depth1, filled=True, feature_names=list(X.columns), class_names=['Not Survived', 'Survived'])
plt.title('Decision Tree (Depth=1)')
plt.tight_layout()
plt.show()

dt_model_depth2 = DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=42)
dt_model_depth2.fit(X_train, y_train)

plt.figure(figsize=(8, 6))
plot_tree(dt_model_depth2, filled=True, feature_names=list(X.columns), class_names=['Not Survived', 'Survived'])
plt.title('Decision Tree (Depth=2)')
plt.tight_layout()
plt.show()

According to the model with depth = 1, females (sex_male ≤ 0.5) have a higher chance of survival. the depth 2 tree provides a more refined classification by considering an additional feature. For females, regardless of the ticket number, the prediction is survival. For males, the ticket number can alter the prediction from non-survival to survival.

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import zero_one_loss
depths = [1, 2, 3, 4, 5, 6, 7, 8]
test_errors = []

for depth in depths:
    dt_model = DecisionTreeClassifier(criterion='entropy', max_depth=depth, random_state=42)
    dt_model.fit(X_train, y_train)

    y_pred = dt_model.predict(X_test)

    test_error = zero_one_loss(y_test, y_pred)
    test_errors.append(test_error)

plt.figure(figsize=(8, 6))
plt.plot(depths, test_errors, marker='o')
plt.title('Test Error vs. Depth')
plt.xlabel('Depth')
plt.ylabel('Test Error')
plt.show()

best_depth = depths[test_errors.index(min(test_errors))]
print(f"Best depth with minimum test error: {best_depth}")

The results don't change much, the best depth is 8.