In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns
import numpy as np
import random

class DecisionTree:
    """Binary-split decision-tree classifier operating on pandas DataFrames.

    Every internal node tests ``row[feature] <= split``; a leaf stores the
    most frequent label of the training rows that reached it.  After ``fit``
    the tree is available in ``self.tree`` as nested dicts with the keys
    ``'feature'``, ``'split'``, ``'left'`` and ``'right'`` (a leaf is a bare
    label value).

    Args:
        criterion: Impurity measure, ``'entropy'`` or ``'gini'``; any other
            value selects the misclassification rate.
        max_depth: Maximum number of split levels, or ``None`` for no limit.
        min_samples_split: A node with fewer rows than this becomes a leaf.
        min_samples_leaf: Minimum number of rows each child of a split must
            receive.
        selected_features: Optional iterable restricting which columns may be
            split on.  Entries may be column labels or integer column
            positions; ``None`` allows every column.
    """

    def __init__(self, criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1,selected_features=None):
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.tree = None
        self.selected_features = selected_features

    def fit(self, X, y):
        """Grow the tree from the feature frame ``X`` and the labels ``y``.

        ``y`` is attached to a copy of ``X`` as a column named ``'label'``,
        so ``X`` itself is left unmodified.

        Raises:
            ValueError: If ``X`` has no rows, or ``selected_features`` names
                a column that does not exist.
        """
        if len(X) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        dataset = X.copy()
        dataset['label'] = y
        self.tree = self._build_tree(dataset, self.max_depth)

    def _build_tree(self, dataset, depth):
        """Recursively build the subtree for ``dataset``.

        ``depth`` is the number of split levels still allowed; ``None`` means
        unlimited.  Returns either a leaf label or a node dict.
        """
        labels = dataset['label']
        if depth == 0 or labels.nunique() == 1 or len(dataset) < self.min_samples_split:
            return labels.value_counts().idxmax()

        best_feature, best_split = self._find_best_split(dataset)
        if best_feature is None:
            # No candidate split improves impurity: stop here.
            return labels.value_counts().idxmax()

        left_split = dataset[dataset[best_feature] <= best_split]
        right_split = dataset[dataset[best_feature] > best_split]

        if len(left_split) < self.min_samples_leaf or len(right_split) < self.min_samples_leaf:
            return labels.value_counts().idxmax()

        # ``None - 1`` would raise TypeError, so an unlimited depth stays None.
        next_depth = None if depth is None else depth - 1
        return {
            'feature': best_feature,
            'split': best_split,
            'left': self._build_tree(left_split, next_depth),
            'right': self._build_tree(right_split, next_depth)
        }

    def _candidate_features(self, dataset):
        """Return the column labels that may be used for splitting."""
        columns = list(dataset.columns.drop('label'))
        if self.selected_features is None:
            return columns

        chosen = []
        for entry in self.selected_features:
            if entry in columns:
                name = entry
            elif isinstance(entry, (int, np.integer)) and 0 <= entry < len(columns):
                # Integer that is not itself a column label: treat as position.
                name = columns[entry]
            else:
                raise ValueError(f"Unknown feature {entry!r} in selected_features")
            if name not in chosen:
                chosen.append(name)
        return chosen

    def _impurity(self, labels):
        """Evaluate the configured impurity criterion on ``labels``."""
        if self.criterion == 'entropy':
            return self._calculate_entropy(labels)
        if self.criterion == 'gini':
            return self._calculate_gini(labels)
        return self._calculate_misclassification(labels)

    def _find_best_split(self, dataset):
        """Search all candidate features and thresholds for the best split.

        Returns:
            ``(feature, threshold)`` with the largest strictly positive
            impurity reduction, or ``(None, None)`` when no split helps.
        """
        best_feature, best_split, best_info_gain = None, None, 0

        labels = dataset['label']
        n_rows = len(dataset)
        parent_impurity = self._impurity(labels)

        for feature in self._candidate_features(dataset):
            values = dataset[feature]
            for split_value in values.unique():
                # Only the labels are needed to score a split, so avoid
                # slicing the whole frame for every threshold.
                left_labels = labels[values <= split_value]
                right_labels = labels[values > split_value]

                if len(left_labels) < self.min_samples_leaf or len(right_labels) < self.min_samples_leaf:
                    continue

                child_impurity = (len(left_labels) * self._impurity(left_labels) +
                                  len(right_labels) * self._impurity(right_labels)) / n_rows
                info_gain = parent_impurity - child_impurity

                if info_gain > best_info_gain:
                    best_feature, best_split, best_info_gain = feature, split_value, info_gain

        return best_feature, best_split

    @staticmethod
    def _class_probabilities(labels):
        """Return the relative frequency of every label that actually occurs."""
        counts = labels.value_counts()
        # Categorical labels report unused categories with a count of zero;
        # dropping them avoids 0 * log2(0) = NaN in the entropy.
        counts = counts[counts > 0]
        return counts / len(labels)

    @staticmethod
    def _calculate_entropy(labels):
        """Shannon entropy (base 2) of ``labels``; 0 for an empty input."""
        if len(labels) == 0:
            return 0

        p = DecisionTree._class_probabilities(labels)
        return -sum(p * np.log2(p))

    @staticmethod
    def _calculate_gini(labels):
        """Gini impurity of ``labels``; 0 for an empty input."""
        if len(labels) == 0:
            return 0

        p = DecisionTree._class_probabilities(labels)
        return 1 - sum(p**2)

    @staticmethod
    def _calculate_misclassification(labels):
        """Misclassification rate ``1 - max(p)`` of ``labels``; 0 if empty."""
        if len(labels) == 0:
            return 0

        p = DecisionTree._class_probabilities(labels)
        if len(p) == 0:
            return 0
        return 1 - max(p)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Returns:
            A Series of predicted labels indexed like ``X``.
        """
        if len(X) == 0:
            # DataFrame.apply on an empty frame may hand back a DataFrame.
            return pd.Series([], index=X.index, dtype=object)
        return X.apply(self._predict_row, axis=1)

    def _predict_row(self, row):
        """Walk the tree for a single row and return the leaf label."""
        node = self.tree
        while isinstance(node, dict):
            if row[node['feature']] <= node['split']:
                node = node['left']
            else:
                node = node['right']
        return node


class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers with majority voting.

    Args:
        n_trees: Number of trees to grow.
        min_features: Number of feature columns sampled (without replacement)
            for each tree.
        max_depth: ``max_depth`` forwarded to every tree.
        min_samples_split: ``min_samples_split`` forwarded to every tree.
        min_samples_leaf: ``min_samples_leaf`` forwarded to every tree.
    """

    def __init__(self, n_trees, min_features, max_depth=3, min_samples_split=2, min_samples_leaf=1):
        self.n_trees = n_trees
        self.min_features = min_features
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.trees = []

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on a bootstrap sample of ``(X, y)``.

        ``X`` must be a DataFrame; ``y`` may be any array-like with one label
        per row of ``X``.  Trees from a previous ``fit`` call are discarded.
        """
        num_features = X.shape[1]
        num_samples = len(X)
        labels = np.asarray(y)

        # Start from scratch so refitting does not keep trees of an old fit.
        self.trees = []
        for _ in range(self.n_trees):
            # Resample the data with replacement
            indices = np.random.randint(num_samples, size=num_samples)
            X_sample = X.iloc[indices]
            # Give the labels exactly the sampled frame's index so that the
            # tree attaches them row by row.
            y_sample = pd.Series(labels[indices], index=X_sample.index)

            # Random feature selection (column positions)
            selected_features = random.sample(range(num_features), self.min_features)

            # Create and train a tree with the selected features
            tree = DecisionTree(max_depth=self.max_depth,
                                min_samples_split=self.min_samples_split,
                                min_samples_leaf=self.min_samples_leaf,
                                selected_features=selected_features)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        """Return the majority-vote label of the trees for every row of ``X``.

        Ties go to the smallest label, which for 0/1 labels matches rounding
        the mean vote half-to-even as done previously.

        Raises:
            RuntimeError: If no tree has been trained yet.
        """
        if not self.trees:
            raise RuntimeError("RandomForest.predict called before fit")

        # Shape (n_trees, n_samples): one row of votes per tree.
        votes = np.array([np.asarray(tree.predict(X)) for tree in self.trees])
        winners = []
        for sample_votes in votes.T:
            # np.unique sorts the labels, so argmax picks the smallest label
            # among equally frequent ones.
            labels, counts = np.unique(sample_votes, return_counts=True)
            winners.append(labels[np.argmax(counts)])
        return np.array(winners)

class AdaBoost:
    """AdaBoost for binary labels using depth-1 ``DecisionTree`` stumps.

    ``DecisionTree`` has no sample-weight support, so each round trains its
    stump on a resample of the training set drawn with the current sample
    weights as probabilities.  The two label values are mapped to -1/+1
    internally and mapped back in ``predict``.

    Args:
        n_clfs: Number of boosting rounds (weak classifiers).
    """

    def __init__(self, n_clfs):
        self.n_clfs = n_clfs
        self.clfs = []
        self.clf_weights = []
        # Sorted distinct label values seen by fit; classes_[-1] is "positive".
        self.classes_ = None

    def fit(self, X, y):
        """Train ``n_clfs`` weighted stumps on the DataFrame ``X`` and labels ``y``.

        Raises:
            ValueError: If ``y`` contains more than two distinct labels.
        """
        labels = np.asarray(y)
        classes = np.unique(labels)
        if len(classes) > 2:
            raise ValueError("AdaBoost supports only binary labels")

        # Reset state so that refitting does not keep old classifiers.
        self.classes_ = classes
        self.clfs = []
        self.clf_weights = []

        n_samples = len(X)
        weights = np.ones(n_samples) / n_samples
        eps = 1e-10  # keeps log((1 - error) / error) finite

        for _ in range(self.n_clfs):
            # Draw the round's training set according to the sample weights
            # so that previously misclassified rows are emphasised.
            picks = np.random.choice(n_samples, size=n_samples, p=weights)
            X_round = X.iloc[picks]
            y_round = pd.Series(labels[picks], index=X_round.index)

            # Create and train a classifier
            clf = DecisionTree(max_depth=1)
            clf.fit(X_round, y_round)

            # Weighted error on the full training set (weights sum to 1)
            miss = np.asarray(clf.predict(X)) != labels
            error = float(np.sum(weights[miss]))
            error = min(max(error, eps), 1 - eps)

            # Classifier weight
            clf_weight = 0.5 * np.log((1 - error) / error)

            # Raise the weight of misses, lower the weight of hits
            weights = weights * np.exp(np.where(miss, clf_weight, -clf_weight))
            weights = weights / np.sum(weights)

            # Save the classifier and its weight
            self.clfs.append(clf)
            self.clf_weights.append(clf_weight)

    def predict(self, X):
        """Return the weighted-vote label for every row of ``X``.

        Each classifier votes +weight for the positive class and -weight for
        the other one; a non-positive total yields the negative class.

        Raises:
            RuntimeError: If no classifier has been trained yet.
        """
        if not self.clfs or self.classes_ is None:
            raise RuntimeError("AdaBoost.predict called before fit")

        negative, positive = self.classes_[0], self.classes_[-1]
        score = 0.0
        for clf, weight in zip(self.clfs, self.clf_weights):
            signed_votes = np.where(np.asarray(clf.predict(X)) == positive, 1.0, -1.0)
            score = score + weight * signed_votes
        return np.where(score > 0, positive, negative)



# Load seaborn's 'titanic' example dataset; this raw table is cleaned by
# preprocess_data() below before it is handed to any model.
data = sns.load_dataset('titanic')

# Function for data preprocessing
def preprocess_data(data):
    """Clean the Titanic frame in place and return it.

    Steps: drop the columns ``deck``, ``embark_town``, ``alive``,
    ``adult_male``, ``who`` and ``class``; encode ``sex`` (male=0, female=1)
    and ``embarked`` (S=0, C=1, Q=2) numerically; fill missing ``age`` with
    the column mean and missing ``embarked`` with the column mode; finally
    drop rows that still lack any of the core feature values.

    Args:
        data: DataFrame with the seaborn Titanic columns.  It is modified in
            place, so calling this twice on the same frame is not supported.

    Returns:
        The same (mutated) DataFrame object.
    """
    data.drop(['deck', 'embark_town', 'alive', 'adult_male', 'who', 'class'], axis=1, inplace=True)
    data['sex'] = data['sex'].map({'male': 0, 'female': 1})
    data['embarked'] = data['embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on data[col]: that chained form acts on a temporary object and does not
    # reliably update the frame under pandas Copy-on-Write.
    data['age'] = data['age'].fillna(data['age'].mean())
    embarked_mode = data['embarked'].mode()
    if not embarked_mode.empty:  # empty when every value is missing
        data['embarked'] = data['embarked'].fillna(embarked_mode[0])

    data.dropna(subset=['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'], inplace=True)
    return data

# Function to split data into features (X) and labels (y)
def split_data(data, target='survived'):
    """Separate a frame into its feature columns and its label column.

    Args:
        data: DataFrame containing the ``target`` column; it is not modified.
        target: Name of the label column (defaults to ``'survived'``).

    Returns:
        ``(X, y)`` where ``X`` is ``data`` without the target column and
        ``y`` is the target column as a Series.
    """
    X = data.drop(columns=target)
    y = data[target]
    return X, y

# Single seed shared by the train/test split and the models' random sampling,
# so that the reported accuracies are the same on every run.
SEED = 48

# Preprocess the data
data = preprocess_data(data)

# Split the data into features (X) and labels (y)
X, y = split_data(data)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# RandomForest.fit draws bootstrap rows with np.random and feature subsets
# with random, so both generators are seeded before any model is trained.
random.seed(SEED)
np.random.seed(SEED)

# Create and train the RandomForest model
rf = RandomForest(n_trees=10, min_features=6)
rf.fit(X_train, y_train)


# Fit the Decision Tree model on the training data
dt = DecisionTree(criterion='gini', max_depth=5, min_samples_split=10, min_samples_leaf=5)
dt.fit(X_train, y_train)

# Predict on the test set with the Decision Tree
y_pred = dt.predict(X_test)

# Predict with the RandomForest model
y_pred_rf = rf.predict(X_test)

# Preview the training data under each model's heading; all three models are
# trained on the same X_train, so the three previews are identical.
print("Decision Tree Training Data:")
print(X_train.head())
print("=================================")
print("Random Forest Training Data:")
print(X_train.head())
print("=================================")
print("AdaBoost Training Data:")
print(X_train.head())
print("=================================")


print('\n The Accuracy data table ')
# Print the Decision Tree accuracy
print(f'1).Decission Tree Accuracy: {accuracy_score(y_test, y_pred)}')

# Print the RandomForest accuracy
print(f'2).RandomForest accuracy: {accuracy_score(y_test, y_pred_rf)}')

# Create and train the AdaBoost model
ab = AdaBoost(n_clfs=10)
ab.fit(X_train, y_train)

# Predict with the AdaBoost model
y_pred_ab = ab.predict(X_test)

# Print the AdaBoost accuracy
print(f'3).AdaBoost accuracy: {accuracy_score(y_test, y_pred_ab)}')



Decision Tree Training Data:
     pclass  sex        age  sibsp  parch     fare  embarked  alone
182       3    0   9.000000      4      2  31.3875       0.0  False
105       3    0  28.000000      0      0   7.8958       0.0   True
846       3    0  29.699118      8      2  69.5500       0.0  False
112       3    0  22.000000      0      0   8.0500       0.0   True
432       2    1  42.000000      1      0  26.0000       0.0  False
Random Forest Training Data:
     pclass  sex        age  sibsp  parch     fare  embarked  alone
182       3    0   9.000000      4      2  31.3875       0.0  False
105       3    0  28.000000      0      0   7.8958       0.0   True
846       3    0  29.699118      8      2  69.5500       0.0  False
112       3    0  22.000000      0      0   8.0500       0.0   True
432       2    1  42.000000      1      0  26.0000       0.0  False
AdaBoost Training Data:
     pclass  sex        age  sibsp  parch     fare  embarked  alone
182       3    0   9.000000      4