In [2]:
# Import packages 
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
%matplotlib inline


In [3]:
titanic_df = pd.read_csv("titanic.csv")
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
# You can also drop whichever other columns you'd like here
titanic_df.drop("Cabin", axis=1, inplace=True)

### One-Hot Encoding
One-hot encoding is a technique used to ensure that categorical variables are better represented in the machine. Let's take a look at the "Sex" column

In [6]:
titanic_df["Sex"].unique()

array(['male', 'female'], dtype=object)

Machine Learning classifiers don't know how to handle strings. As a result, you need to convert it into a categorical representation. There are two main ways to go about this:

Label Encoding: Assigning, for example, 0 for "male" and 1 for "female". The problem here is it intrinsically makes one category "larger than" the other category.

One-hot encoding: Assigning, for example, [1, 0] for "male" and [0, 1] for female. In this case, you have an array of size (n_categories,) and you represent a 1 in the correct index, and 0 elsewhere. In Pandas, this would show as extra columns. For example, rather than having a "Sex" column, it would be a "Sex_male" and "Sex_female" column. Then, if the person is male, it would simply show as a 1 in the "Sex_male" column and a 0 in the "Sex_female" column.

There is a nice and easy method that does this in pandas: get_dummies()

In [7]:
titanic_df = pd.get_dummies(titanic_df, prefix="Sex", columns=["Sex"])
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Embarked,Sex_female,Sex_male
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,S,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C,True,False
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,S,True,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,S,True,False
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,S,False,True


Now, we do the same to the "Embarked" column.

In [8]:
titanic_df = pd.get_dummies(titanic_df, prefix="Embarked", columns=["Embarked"])
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,False,True,False,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,True,False,True,False,False
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,True,False,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,True,False,False,False,True
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,False,True,False,False,True


Create a Bagged, Random Forest, and Boosted tree for the titanic dataset in the same way that you created a regular Classification Tree.

In [9]:
print('\nNull Values in Training \n{}'.format(titanic_df.isnull().sum()))
# print('\nNull Values in Testing \n{}'.format(titanic_df.isnull().sum()))

print('\nDuplicated values in train {}'.format(titanic_df.duplicated().sum()))
# print('Duplicated values in test {}'.format(titanic_df.duplicated().sum()))


Null Values in Training 
PassengerId      0
Survived         0
Pclass           0
Name             0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Sex_female       0
Sex_male         0
Embarked_C       0
Embarked_Q       0
Embarked_S       0
dtype: int64

Duplicated values in train 0


In [10]:
# Drop rows with NaN values in 'Age'
titanic_df.dropna(subset=['Age'], inplace=True)

# Display Age and other columns for null values.
print('\nNull Values in Training \n{}'.format(titanic_df.isnull().sum()))


Null Values in Training 
PassengerId    0
Survived       0
Pclass         0
Name           0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Sex_female     0
Sex_male       0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
dtype: int64


In [11]:
# Select relevant features and target
features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
target = 'Survived'

# Split the data into training, development, and test sets.
train_df, test_df = train_test_split(titanic_df, test_size=0.2, random_state=42)
dev_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

# Display the shapes of the datasets
print(f"Training set shape: {train_df.shape}")
print(f"Development set shape: {dev_df.shape}")
print(f"Test set shape: {test_df.shape}")

# Split data into features (X) and target (y)
X_train = train_df[features]
y_train = train_df[target]
X_dev = dev_df[features]
y_dev = dev_df[target]
X_test = test_df[features]
y_test = test_df[target]

# Initialize models
bagging_model = BaggingClassifier(base_estimator=None, n_estimators=100, random_state=42)
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
boosted_tree_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Fit models to training data
bagging_model.fit(X_train, y_train)
random_forest_model.fit(X_train, y_train)
boosted_tree_model.fit(X_train, y_train)


Training set shape: (571, 14)
Development set shape: (71, 14)
Test set shape: (72, 14)




From the Random Forest model, we will determine which of the features is the one that contributes the most to predicting whether a passenger survives or not.<br>
We will examine the feature importance scores provided by the model.

In [12]:
# Get feature importance scores from the Random Forest model
feature_importances = random_forest_model.feature_importances_

# Create a dictionary to pair features with their importance scores
feature_importance_dict = dict(zip(features, feature_importances))

# Sort the features based on their importance scores in descending order
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Display the sorted feature importance
print("Feature Importance:")
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")


Feature Importance:
Age: 0.27484997302928105
Fare: 0.23409032243796835
Sex_male: 0.1635178293964902
Sex_female: 0.13415222366607651
Pclass: 0.0855321380207055
SibSp: 0.04103556058185477
Parch: 0.036579537105091584
Embarked_C: 0.0151580335334631
Embarked_S: 0.011009724821406013
Embarked_Q: 0.004074657407663029


<b>Age seems to be the feature with the most importance contributing to the survival of a passenger.</b>

Tuning the parameters n_estimators and max_depth for the Random Forest model.

We will go through a range of values for these parameters and find the combination that performs the best.

In [13]:
# Define the hyperparameter values to try
n_estimators_values = [50, 100, 150]
max_depth_values = [None, 10, 20, 30]

best_accuracy = 0.0
best_n_estimators = None
best_max_depth = None

# Iterate over hyperparameter values
for n_estimators in n_estimators_values:
    for max_depth in max_depth_values:
        # Create Random Forest model with current hyperparameters
        random_forest_model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        
        # Fit the model on the training data
        random_forest_model.fit(X_train, y_train)
        
        # Evaluate the model on the development set
        dev_accuracy = random_forest_model.score(X_dev, y_dev)
        
        # Print or store the results
        print(f"n_estimators={n_estimators}, max_depth={max_depth}, Accuracy on Development Set: {dev_accuracy:.4f}")
        
        # Check if this set of hyperparameters gives better accuracy
        if dev_accuracy > best_accuracy:
            best_accuracy = dev_accuracy
            best_n_estimators = n_estimators
            best_max_depth = max_depth

# Print the best hyperparameters found
print("\nBest Hyperparameters:")
print(f"n_estimators={best_n_estimators}, max_depth={best_max_depth}, Best Accuracy on Development Set: {best_accuracy:.4f}")

# Train the final model with the best hyperparameters on the full training set
final_model = RandomForestClassifier(n_estimators=best_n_estimators, max_depth=best_max_depth, random_state=42)
final_model.fit(X_train, y_train)

n_estimators=50, max_depth=None, Accuracy on Development Set: 0.8028
n_estimators=50, max_depth=10, Accuracy on Development Set: 0.8169
n_estimators=50, max_depth=20, Accuracy on Development Set: 0.8028
n_estimators=50, max_depth=30, Accuracy on Development Set: 0.8028
n_estimators=100, max_depth=None, Accuracy on Development Set: 0.8028
n_estimators=100, max_depth=10, Accuracy on Development Set: 0.8028
n_estimators=100, max_depth=20, Accuracy on Development Set: 0.8169
n_estimators=100, max_depth=30, Accuracy on Development Set: 0.8028
n_estimators=150, max_depth=None, Accuracy on Development Set: 0.8028
n_estimators=150, max_depth=10, Accuracy on Development Set: 0.8028
n_estimators=150, max_depth=20, Accuracy on Development Set: 0.8028
n_estimators=150, max_depth=30, Accuracy on Development Set: 0.8028

Best Hyperparameters:
n_estimators=50, max_depth=10, Best Accuracy on Development Set: 0.8169


<b>Given the outcome below, the best model would be the Random Forest with the previous result after tuning the n_estimators = 50 and max_depth = 10.<b>

In [14]:
# Predictions on development set
bagging_dev_predictions = bagging_model.predict(X_dev)
random_forest_dev_predictions = random_forest_model.predict(X_dev)
boosted_tree_dev_predictions = boosted_tree_model.predict(X_dev)

# Calculate accuracy for each model on the development set
bagging_dev_accuracy = accuracy_score(y_dev, bagging_dev_predictions)
random_forest_dev_accuracy = accuracy_score(y_dev, random_forest_dev_predictions)
boosted_tree_dev_accuracy = accuracy_score(y_dev, boosted_tree_dev_predictions)

# Report accuracy for each model
print(f"Bagging Model Accuracy on Development Set: {bagging_dev_accuracy:.4f}")
print(f"Random Forest Model Accuracy on Development Set: {random_forest_dev_accuracy:.4f}")
print(f"Boosted Tree Model Accuracy on Development Set: {boosted_tree_dev_accuracy:.4f}")

# Identify the best-performing model
best_model = max([
    ("Bagging", bagging_dev_accuracy),
    ("Random Forest", random_forest_dev_accuracy),
    ("Boosted Tree", boosted_tree_dev_accuracy)
], key=lambda x: x[1])

# Report the best-performing model and its hyperparameters
print(f"\nBest Performing Model: {best_model[0]}")
print(f"Best Accuracy on Development Set: {best_model[1]:.4f}")
if best_model[0] == "Bagging":
    print(f"Hyperparameters: n_estimators={bagging_model.n_estimators}")
elif best_model[0] == "Random Forest":
    print(f"Hyperparameters: n_estimators={random_forest_model.n_estimators}, max_depth={random_forest_model.max_depth}")
elif best_model[0] == "Boosted Tree":
    print(f"Hyperparameters: n_estimators={boosted_tree_model.n_estimators}, max_depth={boosted_tree_model.max_depth}")

Bagging Model Accuracy on Development Set: 0.8028
Random Forest Model Accuracy on Development Set: 0.8028
Boosted Tree Model Accuracy on Development Set: 0.8028

Best Performing Model: Bagging
Best Accuracy on Development Set: 0.8028
Hyperparameters: n_estimators=100
