# Assignment 1

## Data Preprocessing
### Importing data sets


In [55]:
import pandas as pd
import numpy as np

#Reading csv-files into variables as DataFrames
wp_raw = pd.read_csv('website-phishing.csv')
bcp_raw = pd.read_csv('bcp.csv')
ar_raw = pd.read_csv('arrhythmia.csv')

# Replace ? with NaN
wp_raw.replace('?', np.nan, inplace=True)
bcp_raw.replace('?', np.nan, inplace=True)
ar_raw.replace('?', np.nan, inplace=True)


#Convert all values to float to find mean
wp_raw = wp_raw.astype(float)
bcp_raw = bcp_raw.astype(float)
ar_raw = ar_raw.astype(float)


#Fillin NaNs with mean of column
wp = wp_raw.fillna(wp_raw.mean())
bcp = bcp_raw.fillna(bcp_raw.mean())
ar = ar_raw.fillna(ar_raw.mean())

#Check for missing values
wp_missing_values = wp.isna().sum().sum()
bcp_missing_values = bcp.isna().sum().sum()
ar_missing_values = ar.isna().sum().sum()

print("Missing values in WP :", wp_missing_values)
print("Missing values in BCP:", bcp_missing_values)
print("Missing values in AR:", ar_missing_values)


Missing values in WP : 0
Missing values in BCP: 0
Missing values in AR: 0


### Splitting data sets
In this section I will split the data sets into training and testing set.

In [51]:
from sklearn.model_selection import train_test_split

#Define feature matrix and target variable
X1 = wp.drop(columns=["  Class "])
y1 = wp["  Class "]

X2 = bcp.drop(columns=["Class"])
y2 = bcp["Class"] 

X3 = ar.drop(columns=["class"])
y3 = ar["class"]

#Define training and test set
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=42)

print(y1_train)


480      1.0
10812   -1.0
4064     1.0
8225     1.0
9432    -1.0
        ... 
5734    -1.0
5191    -1.0
5390    -1.0
860      1.0
7270    -1.0
Name:   Class , Length: 8844, dtype: float64


## Implementation of classifiers

### Entropy
To compare the impurity before and after a split, I need a function for calculating entropy. Simply explained, entropy is a way of measuring disorder, and is used to measure the randomness in a set. The formula for entropy is: 
$$
E(S) = \sum_{i=1}^{c} - p_i \log_2(p_i)
$$

Where S is the startset, and $p_i$ is the propability of value i. To avoid logarithm of zero, $1e-9$ is added to $p_i$. This addition is minor, but will make avoid numeric instability. 

In [56]:
#Calculate entropy
import numpy as np

def entropy(S):
    p_i = S.value_counts()/S.shape[0]
    e = np.sum(-p_i * np.log2(p_i + 1e-9))
    return e

### Information Gain
To evaluate the splits, I will calculate the entropy before and anfter the split. The difference will give me a measure of the information gain of the split. If the entropy is lower (less randomness in the subsets) after the split, the information gain will be positive. I want to choose the split that provides the highest information gain. The formula for information gain is: 
$$
InformationGain= Entropy(S) - \sum_{k} \frac{|k|}{|S|}Entropy(k)
$$

Where S is the start set, and $k_i$ is the i-th subset. The fraction factor is weighting the entropy of each of the subsets based on the number of instances in the subset, compared with the total number of instances.

In [57]:
#Calculate information gain
def information_gain(S, splits):
    #Entropy of start set
    S_entropy = entropy(S)

    #Entropy of split
    split_entropy = 0
    tot_samples = len(S)
    for split in splits:
        split_weight = len(split) / tot_samples
        split_entropy += split_weight * entropy(split)
    
    ig = S_entropy - split_entropy
    return ig

## Implementing Classifiers

In [None]:
'''class DecisionTree:
    def __init__(self, max_depth = None, max_split_per_feature = 10):
        self.max_depth = max_depth
        self.max_split_per_feature = max_split_per_feature

    def make_split(self, X, y, feature_ids, thresholds):
        splits = [] 
        for threshold in thresholds:
            index = np.where(X[:,feature_ids] <= threshold)[0] #Split on threshold
            splits.append(index, y[index]) #Add index and label thats lower than threshold
        return splits 
    
    def make_split(self, X, y, feature_ids, thresholds):
        left_indices = np.where(X[:, feature_ids] <= thresholds)[0]
        right_indices = np.where(X[:, feature_ids] > thresholds)[0]
        return (left_indices, y[left_indices]), (right_indices, y[right_indices])

    def best_split(self, X, y):
        best_feature_ids, best_threshold, best_ig = None, None, -1
        for feature_ids in range(X.shape[1]):

            #Unique values in the feature
            n_values = np.unique(X[:, feature_ids])

            #Find thresholds
            if len(n_values) > self.max_split_per_feature:
                #Create possible thresholds. Eaqualy distributed between min and max values found in the feature-column
                thresholds = np.linspace(min(n_values), max(n_values), self.max_split_per_feature)
            else: 
                thresholds = n_values

            for threshold in thresholds:
                splits = self.make_split(X,y, feature_ids, [threshold])
                ig = information_gain(y, [split[1] for split in splits])
                if ig > best_ig:
                    best_ig = ig
                    best_feature_ids = feature_ids
                    best_threshold = threshold
        return best_feature_ids, best_threshold

    #Grow tree
    def grow_tree(self, X, y, depth = 0):

        #Stopping criteria
        if depth == self.max_depth or len(np.unique(y)) == 1: #Max depth or leaf node
            return np.argmax(np.bincount(y))
        
        best_feature_ids, best_threshold = self.best_split(X,y)
        if best_feature_ids is None: #No ig in split
            return np.argmax(np.bincount(y))
        
        #Recursivly grow tree
        splits = self.make_split(X, y, best_feature_ids, [best_threshold])
        branches = []
        for split in splits:
            if len(splits[0]) == 0:
                branches.append(np.argmax(np.bincount(y)))
            else: 
                branches.append(self.grow_tree(X[split[0], split[1], depth + 1]))
        return (best_feature_ids, best_threshold, branches)
    
    #Fit model
    def fit(self, X_train, y_train):
        self.tree = self.grow_tree(X_train.to_numpy(), y_train)

    #Predict instance
    def pred_instance(self, x, tree):
        if isinstance(tree, int):
            return tree
        feature_ids, threshold, branches = tree
        for i, branch in enumerate(branches):
            if x[feature_ids] <= threshold[i]:
                return self.pred_instance(x, branch)
        return self.pred_instance(x, branches[-1])

    #Predictions
    def predict(self, X):
        predictions = []
        for x in X.to_numpy():
            predictions.append(self.pred_instance(x, self.tree))
        return np.array(predictions)
'''

class DecisionTree:
    def __init__(self, max_depth=None, max_split_per_feature=10):
        self.max_depth = max_depth
        self.max_split_per_feature = max_split_per_feature
        self.tree = None

    def make_split(self, X, y, feature_index, threshold):
        left_indices = np.where(X[:, feature_index] <= threshold)[0]
        right_indices = np.where(X[:, feature_index] > threshold)[0]
        return (left_indices, y[left_indices]), (right_indices, y[right_indices])

    def best_split(self, X, y):
        best_feature_index, best_threshold, best_ig = None, None, -1
        for feature_index in range(X.shape[1]):
            unique_values = np.unique(X[:, feature_index])
            if len(unique_values) > self.max_split_per_feature:
                thresholds = np.linspace(min(unique_values), max(unique_values), self.max_split_per_feature)
            else:
                thresholds = unique_values[:-1]
            for threshold in thresholds:
                left_split, right_split = self.make_split(X, y, feature_index, threshold)
                ig = information_gain(y, [left_split[1], right_split[1]])
                if ig > best_ig:
                    best_ig = ig
                    best_feature_index = feature_index
                    best_threshold = threshold
        return best_feature_index, best_threshold

    def grow_tree(self, X, y, depth=0):
        if self.max_depth is not None and depth >= self.max_depth:
            return np.argmax(np.bincount(y))
        if len(np.unique(y)) == 1:
            return y[0]
        best_feature_index, best_threshold = self.best_split(X, y)
        if best_feature_index is None:
            return np.argmax(np.bincount(y))
        left_split, right_split = self.make_split(X, y, best_feature_index, best_threshold)
        left_branch = self.grow_tree(X[left_split[0]], left_split[1], depth + 1)
        right_branch = self.grow_tree(X[right_split[0]], right_split[1], depth + 1)
        return (best_feature_index, best_threshold, left_branch, right_branch)

    def fit(self, X_train, y_train):
        self.tree = self.grow_tree(X_train.to_numpy(), y_train)

    def pred_instance(self, x, tree):
        if isinstance(tree, int):
            return tree
        feature_index, threshold, left_branch, right_branch = tree

        #Check that x is numerical
        numeric_variable = True if x.dtypes != 'O' else False
        if numeric_variable:
            if x[feature_index] <= threshold:
                return self.pred_instance(x, left_branch)
            else:
                return self.pred_instance(x, right_branch)

    def predict(self, X):
        predictions = []
        for x in X.to_numpy():
            predictions.append(self.pred_instance(x, self.tree))
        return np.array(predictions)


In [64]:
from sklearn.metrics import accuracy_score

tree = DecisionTree(max_depth=3, max_split_per_feature=10)

# Step 3: Train the decision tree model
tree.fit(X1_train, y1_train)

# Step 4: Use the trained model to make predictions
predictions = tree.predict(X1_test)

# Step 5: Calculate the accuracy
accuracy = accuracy_score(y1_test, predictions)
print("Accuracy:", accuracy)

KeyError: '[3, 35, 39, 70, 76, 88, 93, 96, 101, 103, 106, 107, 119, 131, 167, 173, 185, 218, 245, 248, 251, 252, 263, 267, 291, 299, 311, 316, 318, 335, 376, 379, 398, 407, 408, 410, 415, 416, 426, 434, 439, 440, 457, 465, 469, 483, 486, 513, 533, 577, 590, 605, 608, 621, 623, 624, 676, 708, 710, 738, 747, 761, 764, 777, 782, 794, 836, 839, 852, 856, 881, 883, 914, 932, 937, 958, 970, 971, 1010, 1026, 1039, 1053, 1056, 1087, 1107, 1123, 1142, 1145, 1156, 1157, 1178, 1180, 1183, 1188, 1190, 1195, 1206, 1217, 1245, 1247, 1261, 1277, 1281, 1282, 1297, 1320, 1347, 1351, 1360, 1373, 1402, 1413, 1427, 1453, 1454, 1488, 1496, 1533, 1566, 1579, 1593, 1597, 1608, 1660, 1670, 1684, 1688, 1692, 1703, 1740, 1783, 1785, 1793, 1803, 1807, 1835, 1839, 1851, 1864, 1886, 1915, 1952, 1953, 1964, 1971, 2012, 2020, 2039, 2107, 2108, 2109, 2124, 2128, 2138, 2140, 2150, 2152, 2168, 2174, 2184, 2191, 2210, 2221, 2232, 2236, 2239, 2245, 2254, 2271, 2301, 2315, 2317, 2319, 2348, 2372, 2389, 2390, 2391, 2392, 2405, 2407, 2417, 2425, 2459, 2464, 2473, 2498, 2531, 2534, 2588, 2595, 2613, 2614, 2629, 2648, 2678, 2680, 2685, 2709, 2751, 2754, 2791, 2815, 2818, 2865, 2874, 2894, 2909, 2922, 2929, 2930, 2932, 2947, 2960, 2996, 3000, 3006, 3011, 3050, 3058, 3060, 3061, 3078, 3082, 3107, 3125, 3131, 3133, 3146, 3160, 3184, 3204, 3206, 3207, 3219, 3231, 3244, 3251, 3264, 3265, 3274, 3288, 3295, 3307, 3315, 3316, 3329, 3352, 3369, 3378, 3393, 3396, 3413, 3418, 3424, 3464, 3465, 3469, 3509, 3527, 3538, 3544, 3570, 3573, 3574, 3614, 3621, 3647, 3656, 3669, 3679, 3680, 3689, 3693, 3753, 3755, 3781, 3833, 3834, 3838, 3842, 3884, 3891, 3921, 3933, 3946, 3948, 3952, 3958, 3971, 3979, 4001, 4004, 4010, 4023, 4026, 4028, 4031, 4043, 4047, 4053, 4139, 4148, 4189, 4193, 4217, 4245, 4253, 4256, 4264, 4279, 4291, 4315, 4329, 4337, 4341, 4342, 4344, 4352, 4366, 4373, 4374, 4376, 4377, 4385, 4397, 4401, 4403, 4411, 4417, 4421, 4457, 4474, 4504, 4525, 4580, 4605, 4607, 4614, 4618, 4640, 4685, 4710, 4718, 4740, 4755, 4768, 4782, 4791, 4793, 4808, 4828, 4850, 4852, 4885, 4889, 4918, 4921, 4934, 4957, 4982, 4989, 5006, 5007, 5009, 5011, 5026, 5027, 5030, 5034, 5082, 5098, 5114, 5123, 5133, 5155, 5165, 5210, 5245, 5254, 5265, 5302, 5303, 5306, 5350, 5358, 5369, 5389, 5391, 5397, 5399, 5412, 5414, 5425, 5440, 5456, 5474, 5484, 5489, 5496, 5500, 5501, 5514, 5515, 5516, 5532, 5553, 5559, 5579, 5609, 5629, 5632, 5640, 5651, 5674, 5683, 5689, 5697, 5703, 5719, 5729, 5730, 5731, 5736, 5747, 5748, 5756, 5773, 5798, 5807, 5825, 5829, 5849, 5856, 5865, 5874, 5885, 5931, 5932, 5937, 5960, 5968, 5998, 6013, 6070, 6080, 6085, 6109, 6133, 6135, 6141, 6211, 6227, 6286, 6319, 6330, 6355, 6369, 6390, 6408, 6434, 6444, 6448, 6462, 6472, 6478, 6492, 6512, 6536, 6553, 6558, 6568, 6582, 6592, 6596, 6636, 6651, 6658, 6662, 6663, 6664, 6671, 6686, 6690, 6691, 6697, 6713, 6774, 6778, 6791, 6800, 6802, 6803, 6805, 6849, 6865, 6885, 6912, 6915, 6958, 6992, 7018, 7025, 7036, 7047, 7048, 7058, 7067, 7072, 7074, 7095, 7108, 7145, 7200, 7203, 7207, 7212, 7222, 7223, 7234, 7235, 7244, 7254, 7258, 7268, 7286, 7299, 7302, 7315, 7319, 7382, 7386, 7388, 7464, 7474, 7495, 7497, 7499, 7522, 7533, 7538, 7562, 7589, 7607, 7619, 7623, 7632, 7642, 7671, 7681, 7712, 7731, 7748, 7752, 7754, 7755, 7760, 7772, 7784, 7799, 7859, 7905, 7923, 7932, 7936, 7942, 7969, 8013, 8014, 8015, 8043, 8062, 8063, 8072, 8079, 8081, 8093, 8141, 8145, 8149, 8163, 8176, 8184, 8196, 8205, 8217, 8243, 8246, 8262, 8274, 8283, 8304, 8328, 8353, 8358, 8361, 8367, 8371, 8379, 8382, 8405, 8426, 8430, 8431, 8437, 8475, 8483, 8517, 8532, 8558, 8565, 8605, 8617, 8621, 8622, 8630, 8651, 8658, 8672, 8683, 8709, 8710, 8721, 8742, 8747, 8751, 8759, 8767, 8797, 8802, 8822, 8834] not in index'

# NOTES


In [None]:
class XDecisionStump:
    def __init__(self):
        self.feature_index = None
        self.threshold = None
        self.prediction = None

    def split_tree(self, X, y):
        best_feature_index, best_threshold = get_best_split(X, y)
        if best_feature_index is None:
            return y.mode()[0]  # Majority class in leaf node
        bol = X.iloc[:, best_feature_index] <= best_threshold
        y_left = y[bol]
        y_right = y[~bol]
        if y_left.empty or y_right.empty:
            return y.mode()[0]  # Majority class in leaf node
        return (best_feature_index, best_threshold, y_left, y_right)
    
    def get_acc(y, left, right):
        acc = -1
        return acc
    
    def get_best_accuracy(self, X,y):
        best_acc = -1
        best_threshold = None
        best_feature_index = None

        for feature_id in range (0, X.shape[1]):
            thresholds = np.unique(X.iloc[: ,feature_id].tolist())
            for threshold_value in thresholds:
                left, right = split(X,y,feature_id, threshold_value)
                acc = self.get_acc(y,left,right)
                if acc > best_acc:
                    best_acc, best_threshold,best_feature_index = acc, threshold_value, feature_id

        return best_feature_index, best_threshold
    
    def fit(self, X, y):
        best_feature_index, best_threshold = self.get_best_accuracy(X,y)
        bol = X.iloc[:, best_feature_index] <= best_threshold
        y_left = y[bol]
        y_right = y[~bol]
        return (best_feature_index, best_threshold, y_left, y_right)

    def fit(self, X, y):
        best_score = float('inf')
        for feature_index in range(X.shape[1]):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                y_left = y[X[:, feature_index] < threshold]
                y_right = y[X[:, feature_index] >= threshold]
                score = len(y_left) * self._impurity(y_left) + len(y_right) * self._impurity(y_right)
                if score < best_score:
                    best_score = score
                    self.feature_index = feature_index
                    self.threshold = threshold
                    self.prediction = np.round(y.mean())

    def predict(self, X):
        return np.where(X[:, self.feature_index] < self.threshold, self.prediction, 1 - self.prediction)

    def _impurity(self, y):
        p = np.mean(y)
        return p * (1 - p)

In [None]:
class XDecisionStump:
    def __init__(self):
        self.feature_index = None
        self.threshold = None
        self.prediction = None

    def fit(self, X, y):
        best_accuracy = 0

        # Iterate over each feature and threshold to find the best split
        for feature_index in range(X.shape[1]):
            thresholds = np.unique(X[:, feature_index])

            for threshold in thresholds:
                # Make predictions based on the current feature and threshold
                predictions = np.where(X[:, feature_index] <= threshold, 1, -1)

                # Calculate accuracy
                accuracy = np.mean(predictions == y)

                # Update the best split if accuracy improves
                if accuracy > best_accuracy:
                    self.feature_index = feature_index
                    self.threshold = threshold
                    self.prediction = 1 if np.mean(y) >= 0.5 else -1
                    best_accuracy = accuracy

    def pred_instance(self, x, tree):
        if isinstance(tree, int) or isinstance(tree, float):
            return tree  # Return the predicted class directly
        feature_index, threshold, left_branch, right_branch = tree
        if float(x[feature_index]) <= float(threshold):
            return self.pred_instance(x, left_branch)
        else:
            return self.pred_instance(x, right_branch)

    def predict(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        X = X.astype(float) 
        predictions = []
        for _, x in X.iterrows():
            predictions.append(self.pred_instance(x, self.tree))
        return np.array(predictions)


In [None]:
class xDecisionStump:
    def __init__(self):
        self.feature_index = None
        self.threshold = 0

    def fit(self, X, y):
        best_accuracy = 0

        # Iterate over each feature and threshold to find the best split
        for feature_index in range(X.shape[1]):
            thresholds = np.unique(X[:, feature_index])

            for threshold in thresholds:
                # Make predictions based on the current feature and threshold
                predictions = np.where(X[:, feature_index] <= threshold, 1, -1)

                # Calculate accuracy
                accuracy = np.mean(predictions == y)

                # Update the best split if accuracy improves
                if accuracy > best_accuracy:
                    self.feature_index = feature_index
                    self.threshold = threshold
                    best_accuracy = accuracy

    def predict_instance(self, x):
        if float(x[self.feature_index]) <= float(self.threshold):
            return 1
        else:
            return -1

        
    
    def xpredict(self, X):
        predictions = []
        for _, x in X.iterrows():
            predictions.append(self.predict_instance(x))
        return np.array(predictions)

    def predict(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        predictions = []
        for _, x in X.iterrows():
            predictions.append(self.predict_instance(x))
        return np.array(predictions)
    

    def xpred_instance(self, x, tree):
        if isinstance(tree, int) or isinstance(tree, float):
            return tree  # Return the predicted class directly
        feature_index, threshold, left_branch, right_branch = tree
        if float(x[feature_index]) <= float(threshold):
            return self.pred_instance(x, left_branch)
        else:
            return self.pred_instance(x, right_branch)



In [None]:
#Decision stump
stump1 = DecisionStump()
stump2 = DecisionStump()
stump3 = DecisionStump()

# Fit the models
stump1.fit(X1_train.values, y1_train.values)
stump2.fit(X2_train.values, y2_train.values)
stump3.fit(X3_train.values, y3_train.values)

#Accuracy
y1_pred = stump1.predict(X1_test.values)
y2_pred = stump2.predict(X2_test.values)
y3_pred = stump3.predict(X3_test.values)

def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

acc1 = accuracy(y1_test, y1_pred)
acc2 = accuracy(y2_test, y2_pred)
acc3 = accuracy(y3_test, y3_pred)

print("Accuracy webiste-phising:", acc1)
print("Accuracy bcp:", acc2)
print("Accuracy: arrhythmia", acc3)


In [None]:
class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def grow_tree(self, X, y, depth=0):
        # Stopping criteria
        if depth == self.max_depth or len(np.unique(y)) == 1:
            return y.mode()[0]  # Majority class in leaf node
        best_feature_index, best_threshold = get_best_split(X, y)
        if best_feature_index is None:
            return y.mode()[0]  # Majority class in leaf node
        bol = X.iloc[:, best_feature_index] <= best_threshold
        y_left = y[bol]
        y_right = y[~bol]
        if y_left.empty or y_right.empty:
            return y.mode()[0]  # Majority class in leaf node
        left_tree = self.grow_tree(X[bol], y_left, depth + 1)
        right_tree = self.grow_tree(X[~bol], y_right, depth + 1)
        return (best_feature_index, best_threshold, left_tree, right_tree)

    def fit(self, X, y):
        self.classes = np.unique(y)
        self.tree = {}
        for cls in self.classes:
            self.tree[cls] = self.grow_tree(X, y)

    def predict_instance(self, x, tree):
        if isinstance(tree, int) or isinstance(tree, float):
            return tree  # Return the predicted class directly
        feature_index, threshold, left_branch, right_branch = tree
        if float(x[feature_index]) <= float(threshold):
            return self.predict_instance(x, left_branch)
        else:
            return self.predict_instance(x, right_branch)

    def predict(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        predictions = []
        for _, x in X.iterrows():
            class_predictions = {}
            for cls in self.classes:
                class_predictions[cls] = self.predict_instance(x, self.tree[cls])
            predictions.append(max(class_predictions, key=class_predictions.get))
        return np.array(predictions)

# Decision Tree
tree1 = DecisionTree(max_depth=2)
tree2 = DecisionTree(max_depth=2)
tree3 = DecisionTree(max_depth=2)

# Fit the models
tree1.fit(X1_train, y1_train)
tree2.fit(X2_train, y2_train)
tree3.fit(X3_train, y3_train)

# Accuracy
y1_pred = tree1.predict(X1_test)
y2_pred = tree2.predict(X2_test)
y3_pred = tree3.predict(X3_test)

acc1 = accuracy(y1_test, y1_pred)
acc2 = accuracy(y2_test, y2_pred)
acc3 = accuracy(y3_test, y3_pred)

print("Accuracy webiste-phising:", acc1)
print("Accuracy bcp:", acc2)
print("Accuracy: arrhythmia", acc3)


In [None]:
class PrunedTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def grow_tree(self, X, y, depth=0):
        if depth == self.max_depth or len(np.unique(y)) == 1:
            return y.mode()[0]  # Majority class in leaf node
        best_feature_index, best_threshold = get_best_split(X, y)
        if best_feature_index is None:
            return y.mode()[0]  # Majority class in leaf node
        bol = X.iloc[:, best_feature_index] <= best_threshold
        y_left = y[bol]
        y_right = y[~bol]
        if y_left.empty or y_right.empty:
            return y.mode()[0]  # Majority class in leaf node
        left_tree = self.grow_tree(X[bol], y_left, depth + 1)
        right_tree = self.grow_tree(X[~bol], y_right, depth + 1)
        return (best_feature_index, best_threshold, left_tree, right_tree)

    def fit(self, X, y, X_val=None, y_val=None):
        self.tree = self.grow_tree(X, y)
        if X_val is not None and y_val is not None:
            self.post_prune(X_val, y_val)

    def post_prune(self, X_val, y_val):
        self.prune_node(X_val, y_val, self.tree)

    def prune_node(self, X_val, y_val, node):
        if isinstance(node, int):
            return node  # Leaf node, no pruning

        feature_index, threshold, left_branch, right_branch = node

        # Recursively prune left and right branches
        left_branch = self.prune_node(X_val, y_val, left_branch)
        right_branch = self.prune_node(X_val, y_val, right_branch)

        # Evaluate performance before and after pruning
        pred_before_prune = self.predict(X_val, node)
        pred_after_prune = self.predict(X_val, (feature_index, threshold))

        # Calculate error before and after pruning
        error_before_prune = np.sum(pred_before_prune != y_val) / len(y_val)
        error_after_prune = np.sum(pred_after_prune != y_val) / len(y_val)

        # Prune node if performance improves or does not significantly worsen
        if error_after_prune <= error_before_prune:
            return (feature_index, threshold, None, None)  # Prune node
        else:
            return node  # Keep node
    
    def predict(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        X = X.astype(float) 
        predictions = []
        for _, x in X.iterrows():
            predictions.append(self.pred_instance(x, self.tree))
        return np.array(predictions)
    
    def pred_instance(self, x, tree):
        if isinstance(tree, int) or isinstance(tree, float):
            return tree  # Return the predicted class directly
        feature_index, threshold, left_branch, right_branch = tree
        if float(x[feature_index]) <= float(threshold):
            return self.pred_instance(x, left_branch)
        else:
            return self.pred_instance(x, right_branch)
        
#Pruned Tree
pruned1 = PrunedTree(max_depth=5)
pruned2 = PrunedTree(max_depth=5)
pruned3 = PrunedTree(max_depth=5)

#Fit the models
pruned1.fit(X1_train, y1_train)
pruned2.fit(X2_train, y2_train)
pruned3.fit(X3_train, y3_train)

#Accuracy
y1_pred = pruned1.predict(X1_test)
y2_pred = pruned2.predict(X2_test)
y3_pred = pruned3.predict(X3_test)

acc1 = accuracy(y1_test, y1_pred)
acc2 = accuracy(y2_test, y2_pred)
acc3 = accuracy(y3_test, y3_pred)

print("Accuracy webiste-phising:", acc1)
print("Accuracy bcp:", acc2)
print("Accuracy: arrhythmia", acc3)


In [None]:
class DecisionStump:
    def __init__(self):
        self.feature_index = None
        self.threshold = None
        self.class_labels = None

    def fit(self, X, y):
        self.classes = np.unique(y)
        best_accuracy = 0

        # Iterate over each feature and threshold to find the best split
        for feature_index in range(X.shape[1]):
            thresholds = np.unique(X[:, feature_index])

            for threshold in thresholds:

                predictions = np.where(X[:, feature_index] <= threshold, -y, y)

                # Overall accuracy
                accuracy = np.mean(predictions == y)

                # Update the best split if accuracy improves
                if accuracy > best_accuracy:
                    self.feature_index = feature_index
                    self.threshold = threshold
                    best_accuracy = accuracy


    def predict_instance(self, x):
        if float(x[self.feature_index]) <= float(self.threshold):
            return self.class_labels[0]
        else:
            return self.class_labels[1]

    def predict(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        predictions = []
        for _, x in X.iterrows():
            predictions.append(self.predict_instance(x))
        return np.array(predictions)

#Decision stump
stump1 = DecisionStump()
stump2 = DecisionStump()
stump3 = DecisionStump()

# Fit the models
stump1.fit(X1_train.values, y1_train.values)
stump2.fit(X2_train.values, y2_train.values)
stump3.fit(X3_train.values, y3_train.values)

# Set class labels
stump1.class_labels = np.unique(y1_train.values)
stump2.class_labels = np.unique(y2_train.values)
stump3.class_labels = np.unique(y3_train.values)

# Accuracy
y1_pred = stump1.predict(X1_test.values)
y2_pred = stump2.predict(X2_test.values)
y3_pred = stump3.predict(X3_test.values)

def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

acc1_stump = accuracy(y1_test, y1_pred)
acc2_stump = accuracy(y2_test, y2_pred)
acc3_stump = accuracy(y3_test, y3_pred)

print("Accuracy webiste-phising:", acc1_stump)
print("Accuracy bcp:", acc2_stump)
print("Accuracy: arrhythmia", acc3_stump)


In [None]:
class DecisionTree:    
    def __init__(self):
        self.tree = None
        self.best_feature_index = None
        self.threshold = None
        self.class_labels = None

    def grow_tree(self, X, y, depth=0):
            
            #Stopping criteria. Stop if leaf node
            if len(np.unique(y)) == 1:
                return y.mode()[0]  # Majority class in leaf node
            
            #Get the split with the highest information gain
            self.best_feature_index, self.best_threshold = get_best_split(X, y)

            #If no split provides information gain -> leaf node
            if self.best_feature_index is None:
                return y.mode()[0]  # Majority class in leaf node
            
            #Split the date into left and right node based on best feature and corresponding threshold
            bol = X.iloc[:, self.best_feature_index] <= self.best_threshold
            y_left = y[bol]
            y_right = y[~bol]

            #If all samples are in one node -> leaf node
            if y_left.empty or y_right.empty:
                return y.mode()[0]  # Majority class in leaf node
            
            #If not, grow three further
            left_tree = self.grow_tree(X[bol], y_left, depth + 1)
            right_tree = self.grow_tree(X[~bol], y_right, depth + 1)

            #Return splitting feature, threshold, left and right node
            return (self.best_feature_index, self.best_threshold, left_tree, right_tree)

    #Fit tree to data
    def fit(self, X, y):
        self.tree = self.grow_tree(X, y)

    #Predict one of the two first classes
    def xpredict_instance(self, x):
        if float(x[self.feature_index]) <= float(self.threshold):
            return self.class_labels[0]
        else:
            return self.class_labels[1]
        
    def pred_instance(self, x, tree):
        if isinstance(tree, int) or isinstance(tree, float):
            return tree  # Return the predicted class directly
        feature_index, threshold, left_branch, right_branch = tree
        if float(x[feature_index]) <= float(threshold):
            return self.pred_instance(x, left_branch)
        else:
            return self.pred_instance(x, right_branch)

    def xpredict(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        predictions = []
        for _, x in X.iterrows():
            predictions.append(self.predict_instance(x))
        return np.array(predictions)
    
    def pred_instance(self, x, tree):
        if isinstance(tree, int) or isinstance(tree, float):
            return tree  # Return the predicted class directly
        feature_index, threshold, left_branch, right_branch = tree
        if float(x[feature_index]) <= float(threshold):
            return self.pred_instance(x, left_branch)
        else:
            return self.pred_instance(x, right_branch)

    def predict(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        X = X.astype(float) 
        predictions = []
        for _, x in X.iterrows():
            predictions.append(self.pred_instance(x, self.tree))
        return np.array(predictions)
#########################################################################################

class xDecisionTree:    
    def __init__(self):
        self.tree = None

    def grow_tree(self, X, y, depth=0):
            
            #Stopping criteria
            if len(np.unique(y)) == 1:
                return y.mode()[0]  # Majority class in leaf node
            
            best_feature_index, best_threshold = get_best_split(X, y)
            if best_feature_index is None:
                return y.mode()[0]  # Majority class in leaf node
            bol = X.iloc[:, best_feature_index] <= best_threshold
            y_left = y[bol]
            y_right = y[~bol]
            if y_left.empty or y_right.empty:
                return y.mode()[0]  # Majority class in leaf node
            left_tree = self.grow_tree(X[bol], y_left, depth + 1)
            right_tree = self.grow_tree(X[~bol], y_right, depth + 1)
            return (best_feature_index, best_threshold, left_tree, right_tree)

    def fit(self, X, y):
        self.tree = self.grow_tree(X, y)

    def pred_instance(self, x, tree):
        if isinstance(tree, int) or isinstance(tree, float):
            return tree  # Return the predicted class directly
        feature_index, threshold, left_branch, right_branch = tree
        if float(x[feature_index]) <= float(threshold):
            return self.pred_instance(x, left_branch)
        else:
            return self.pred_instance(x, right_branch)

    def predict(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        X = X.astype(float) 
        predictions = []
        for _, x in X.iterrows():
            predictions.append(self.pred_instance(x, self.tree))
        return np.array(predictions)
        

In [None]:
'''
#Webiste-phising
tree1 = DecisionTree()   
tree1.fit(X1_train, y1_train)
y1_pred = tree1.predict(X1_test)
acc1_tree = np.mean(y1_test == y1_pred)
print("Accuracy webiste-phising:", acc1_tree)

#BCP
tree2 = DecisionTree()
tree2.fit(X2_train, y2_train)
y2_pred = tree2.predict(X2_test)
acc2_tree = np.mean(y2_test == y2_pred)
print("Accuracy bcp:", acc2_tree)

#Arrhythmia
tree3 = DecisionTree()
tree3.fit(X3_train, y3_train)
y3_pred = tree3.predict(X3_test)
acc3_tree = np.mean(y3_test == y3_pred)
print("Accuracy: arrhythmia", acc3_tree)
'''

'''
# Pruned Tree
pruned1 = PrunedTree(max_depth=5, min_samples_split=5)
pruned2 = PrunedTree(max_depth=5, min_samples_split=5)
pruned3 = PrunedTree(max_depth=5, min_samples_split=5)

# Fit the models
pruned1.fit(X1_train, y1_train)
pruned2.fit(X2_train, y2_train)
pruned3.fit(X3_train, y3_train)

# Accuracy
y1_pred = pruned1.predict(X1_test)
y2_pred = pruned2.predict(X2_test)
y3_pred = pruned3.predict(X3_test)

acc1_pruned = accuracy(y1_test, y1_pred)
acc2_pruned = accuracy(y2_test, y2_pred)
acc3_pruned = accuracy(y3_test, y3_pred)

print("Accuracy webiste-phising:", acc1_pruned)
print("Accuracy bcp:", acc2_pruned)
print("Accuracy: arrhythmia", acc3_pruned)'''

'''class PrunedTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def grow_tree(self, X, y, depth=0):
        if depth == self.max_depth or len(np.unique(y)) == 1:
            return y.mode()[0]  # Majority class in leaf node
        best_feature_index, best_threshold = get_best_split(X, y)
        if best_feature_index is None:
            return y.mode()[0]  # Majority class in leaf node
        bol = X.iloc[:, best_feature_index] <= best_threshold
        y_left = y[bol]
        y_right = y[~bol]
        if y_left.empty or y_right.empty:
            return y.mode()[0]  # Majority class in leaf node
        left_tree = self.grow_tree(X[bol], y_left, depth + 1)
        right_tree = self.grow_tree(X[~bol], y_right, depth + 1)
        return (best_feature_index, best_threshold, left_tree, right_tree)

    def fit(self, X, y, X_val=None, y_val=None):
        self.tree = self.grow_tree(X, y)
        if X_val is not None and y_val is not None:
            self.post_prune(X_val, y_val)

    def post_prune(self, X_val, y_val):
        self.prune_node(X_val, y_val, self.tree)

    def prune_node(self, X_val, y_val, node):
        if isinstance(node, int):
            return node  # Leaf node, no pruning

        feature_index, threshold, left_branch, right_branch = node

        # Recursively prune left and right branches
        left_branch = self.prune_node(X_val, y_val, left_branch)
        right_branch = self.prune_node(X_val, y_val, right_branch)

        # Evaluate performance before and after pruning
        pred_before_prune = self.predict(X_val, node)
        pred_after_prune = self.predict(X_val, (feature_index, threshold))

        # Calculate error before and after pruning
        error_before_prune = np.sum(pred_before_prune != y_val) / len(y_val)
        error_after_prune = np.sum(pred_after_prune != y_val) / len(y_val)

        # Prune node if performance improves or does not significantly worsen
        if error_after_prune <= error_before_prune:
            return (feature_index, threshold, None, None)  # Prune node
        else:
            return node  # Keep node
    
    def predict(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        X = X.astype(float) 
        predictions = []
        for _, x in X.iterrows():
            predictions.append(self.pred_instance(x, self.tree))
        return np.array(predictions)
    
    def pred_instance(self, x, tree):
        if isinstance(tree, int) or isinstance(tree, float):
            return tree  # Return the predicted class directly
        feature_index, threshold, left_branch, right_branch = tree
        if float(x[feature_index]) <= float(threshold):
            return self.pred_instance(x, left_branch)
        else:
            return self.pred_instance(x, right_branch)
        
#Pruned Tree
pruned1 = PrunedTree(max_depth=5)
pruned2 = PrunedTree(max_depth=5)
pruned3 = PrunedTree(max_depth=5)

#Fit the models
pruned1.fit(X1_train, y1_train)
pruned2.fit(X2_train, y2_train)
pruned3.fit(X3_train, y3_train)

#Accuracy
y1_pred = pruned1.predict(X1_test)
y2_pred = pruned2.predict(X2_test)
y3_pred = pruned3.predict(X3_test)

acc1 = np.mean(y1_test == y1_pred)
acc2 = np.mean(y2_test == y2_pred)
acc3 = np.mean(y3_test == y3_pred)

print("Accuracy webiste-phising:", acc1)
print("Accuracy bcp:", acc2)
print("Accuracy: arrhythmia", acc3)
'''

In [None]:
class DecisionTree:    
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def grow_tree(self, X, y, depth=0):
            
            #Stopping criteria
            if depth == self.max_depth or len(np.unique(y)) == 1:
                return y.mode()[0]  # Majority class in leaf node
            best_feature_index, best_threshold = get_best_split(X, y)
            if best_feature_index is None:
                return y.mode()[0]  # Majority class in leaf node
            bol = X.iloc[:, best_feature_index] <= best_threshold
            y_left = y[bol]
            y_right = y[~bol]
            if y_left.empty or y_right.empty:
                return y.mode()[0]  # Majority class in leaf node
            left_tree = self.grow_tree(X[bol], y_left, depth + 1)
            right_tree = self.grow_tree(X[~bol], y_right, depth + 1)
            return (best_feature_index, best_threshold, left_tree, right_tree)

    def fit(self, X, y):
        self.tree = self.grow_tree(X, y)

    def predict(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        predictions = []
        for _, x in X.iterrows():
            predictions.append(self.pred_instance(x, self.tree))
        return np.array(predictions)

    def pred_instance(self, x, tree):
        if isinstance(tree, np.int64):
            return tree  # Return the predicted class directly
        feature_index, threshold, left_branch, right_branch = tree
        if float(x[feature_index]) <= float(threshold):
            return self.pred_instance(x, left_branch)
        else:
            return self.pred_instance(x, right_branch)

#Webiste-phising
tree1 = DecisionTree()   
tree1.fit(X1_train, y1_train)
y1_pred = tree1.predict(X1_test)
acc1_tree = np.mean(y1_test == y1_pred)
print("Accuracy webiste-phising:", acc1_tree)

#BCP
tree2 = DecisionTree()
tree2.fit(X2_train, y2_train)
y2_pred = tree2.predict(X2_test)
acc2_tree = np.mean(y2_test == y2_pred)
print("Accuracy bcp:", acc2_tree)

#Arrhythmia
tree3 = DecisionTree()
tree3.fit(X3_train, y3_train)
y3_pred = tree3.predict(X3_test)
acc3_tree = np.mean(y3_test == y3_pred)
print("Accuracy: arrhythmia", acc3_tree)


In [None]:
'''class PrunedTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def grow_tree(self, X, y, depth=0):
        if len(np.unique(y)) == 1 or (self.max_depth is not None and depth == self.max_depth):
            return y.mode()[0]  # Majority class in leaf node

        best_feature_index, best_threshold = get_best_split(X, y)
        if best_feature_index is None:
            return y.mode()[0]  # Majority class in leaf node

        left_indices = X.iloc[:, best_feature_index] <= best_threshold
        right_indices = ~left_indices

        left_tree = self.grow_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self.grow_tree(X[right_indices], y[right_indices], depth + 1)

        return (best_feature_index, best_threshold, left_tree, right_tree)

    def fit(self, X, y):
        self.tree = self.grow_tree(X, y)

    def predict(self, X):
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        predictions = []
        for _, x in X.iterrows():
            predictions.append(self.pred_instance(x, self.tree))
        return np.array(predictions)

    def pred_instance(self, x, tree):
        if isinstance(tree, np.int64):
            return tree  # Return the predicted class directly
        feature_index, threshold, left_branch, right_branch = tree
        if float(x[feature_index]) <= float(threshold):
            return self.pred_instance(x, left_branch)
        else:
            return self.pred_instance(x, right_branch)

# Pruned Tree
pruned1 = PrunedTree(max_depth=5)
pruned1.fit(X1_train, y1_train)
y1_pred = pruned1.predict(X1_test.values)
acc1_pruned = np.mean(y1_test == y1_pred)
print("Accuracy webiste-phising:", acc1_pruned)


pruned2 = PrunedTree(max_depth=5)
pruned2.fit(X2_train, y2_train)
y2_pred = pruned2.predict(X2_test)
acc2_pruned = np.mean(y2_test == y2_pred)
print("Accuracy bcp:", acc2_pruned)


pruned3 = PrunedTree(max_depth=5)
pruned3.fit(X3_train, y3_train)
y3_pred = pruned3.predict(X3_test)
acc3_pruned = np.mean(y3_test == y3_pred)
print("Accuracy: arrhythmia", acc3_pruned)
'''