In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
import time # to measure how long the models take
from sklearn import datasets
import seaborn as sns
from scipy.stats import mode


In [2]:
# read the Titanic data set from seaborn
data = sns.load_dataset('titanic')

In [3]:
# drop columns
df = data.drop(['deck','class','who','adult_male','embark_town','alive','alone'], axis=1)

In [4]:
# fill in missing values for Age
df['age'] = df['age'].fillna(df.groupby('sex')['age'].transform('mean'))

In [5]:
# add total family size
df['fam'] = df['parch'] + df['sibsp']

In [6]:
df = pd.get_dummies(df,columns=['sex','embarked','pclass'],dtype=int)

In [7]:
y = df["survived"]
X = df.drop("survived", axis=1)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [9]:
# Get the list of all column names
all_columns = df.columns.to_list()

# Randomly select 3 columns with replacement
random_selections = [np.random.choice(all_columns, 3, replace=True).tolist() for _ in range(10)]

for idx, selection in enumerate(random_selections, 1):
    print(f"Selection {idx}: {selection}")

Selection 1: ['sibsp', 'parch', 'fare']
Selection 2: ['pclass_3', 'pclass_3', 'embarked_C']
Selection 3: ['sibsp', 'embarked_S', 'sex_female']
Selection 4: ['embarked_C', 'age', 'age']
Selection 5: ['embarked_Q', 'pclass_3', 'embarked_C']
Selection 6: ['fare', 'embarked_S', 'sex_male']
Selection 7: ['survived', 'embarked_C', 'embarked_Q']
Selection 8: ['survived', 'pclass_2', 'embarked_C']
Selection 9: ['pclass_1', 'embarked_C', 'fam']
Selection 10: ['survived', 'embarked_Q', 'survived']


In [10]:
def build_decision_trees(X_train, y_train, num_models=10, num_columns=3):
    """
    Build decision trees using random subsets of columns.
    
    Parameters:
    - X_train: Training feature data.
    - y_train: Training target data.
    - num_models: Number of decision tree models to train.
    - num_columns: Number of random columns to select for each tree.
    
    Returns:
    - A list of trained decision tree models and their selected column sets.
    """
    all_columns = X_train.columns.to_list()
    models = []
    
    for _ in range(num_models):
        # Randomly select `num_columns` columns
        selected_columns = np.random.choice(all_columns, num_columns, replace=False).tolist()
        
        # Subset the training data for the selected columns
        X_train_subset = X_train[selected_columns]
        
        # Build and train the decision tree classifier
        tree = DecisionTreeClassifier(random_state=42)
        tree.fit(X_train_subset, y_train)
        
        # Store the model and the selected columns
        models.append((tree, selected_columns))
    
    return models

In [11]:
models = build_decision_trees(X_train, y_train, num_models=50, num_columns=3)


In [12]:
models

[(DecisionTreeClassifier(random_state=42),
  ['embarked_S', 'pclass_2', 'sibsp']),
 (DecisionTreeClassifier(random_state=42), ['age', 'sibsp', 'pclass_1']),
 (DecisionTreeClassifier(random_state=42), ['fam', 'sex_male', 'sex_female']),
 (DecisionTreeClassifier(random_state=42),
  ['embarked_S', 'embarked_C', 'sibsp']),
 (DecisionTreeClassifier(random_state=42),
  ['sex_male', 'parch', 'sex_female']),
 (DecisionTreeClassifier(random_state=42),
  ['pclass_3', 'embarked_S', 'embarked_C']),
 (DecisionTreeClassifier(random_state=42),
  ['pclass_3', 'sex_female', 'parch']),
 (DecisionTreeClassifier(random_state=42),
  ['pclass_3', 'pclass_1', 'sex_male']),
 (DecisionTreeClassifier(random_state=42), ['sex_male', 'fam', 'sex_female']),
 (DecisionTreeClassifier(random_state=42), ['parch', 'age', 'embarked_C']),
 (DecisionTreeClassifier(random_state=42),
  ['sex_male', 'parch', 'embarked_S']),
 (DecisionTreeClassifier(random_state=42), ['parch', 'embarked_S', 'fam']),
 (DecisionTreeClassifier(ra

In [13]:
def measure_accuracy(models, X_train, y_train):
    """
    Measure accuracy of the models on the training data.
    
    Parameters:
    - models: List of models and their selected columns.
    - X_train: Training feature data.
    - y_train: True training labels.
    
    Returns:
    - Overall accuracy of the ensemble model.
    """
    # Store predictions for each model
    predictions = []
    
    for tree, selected_columns in models:
        # Use the selected columns to predict on the training data
        X_train_subset = X_train[selected_columns]
        pred = tree.predict(X_train_subset)
        predictions.append(pred)
    
    # Transpose predictions to get predictions for each sample across all models
    predictions = np.array(predictions).T
    
    # Majority vote across the 10 models for each sample
    final_predictions, _ = mode(predictions, axis=1)
    final_predictions = final_predictions.flatten()  # Convert to 1D array
    
    # Measure accuracy
    accuracy = accuracy_score(y_train, final_predictions)
    return accuracy

# Measure accuracy on the training data
train_accuracy = measure_accuracy(models, X_train, y_train)
print(f"Training Accuracy: {train_accuracy:.2f}")


Training Accuracy: 0.84


In [14]:
def majority_vote_prediction(models, X_test):
    """
    Apply the trained models on the test data and perform majority vote for final prediction.
    
    Parameters:
    - models: List of trained models and their selected columns.
    - X_test: Test feature data.
    
    Returns:
    - Final predictions based on majority voting.
    """
    # Store predictions for each model
    predictions = []
    
    for tree, selected_columns in models:
        # Use the selected columns to predict on the test data
        X_test_subset = X_test[selected_columns]
        pred = tree.predict(X_test_subset)
        predictions.append(pred)
    
    # Transpose predictions to get predictions for each sample across all models
    predictions = np.array(predictions).T
    
    # Majority vote across the models for each test sample
    final_predictions, _ = mode(predictions, axis=1)
    final_predictions = final_predictions.flatten()  # Convert to 1D array
    
    return final_predictions

# Make predictions on the test data
final_test_predictions = majority_vote_prediction(models, X_test)

# Measure accuracy on the test data
test_accuracy = accuracy_score(y_test, final_test_predictions)
print(f"Test Accuracy: {test_accuracy:.2f}")


Test Accuracy: 0.81
