In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from openml import tasks, runs
from sklearn import neighbors
import math
import openml
from collections import Counter
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import datasets
from sklearn.metrics import accuracy_score

In [2]:
# Read the OpenML API key from the environment instead of committing it to the
# notebook. Export OPENML_APIKEY before starting the kernel; when the variable
# is unset, openml's existing configuration is left untouched.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and rotated.
_openml_apikey = os.environ.get("OPENML_APIKEY")
if _openml_apikey:
    openml.config.apikey = _openml_apikey

In [None]:
# Decision tree (original)

class Node:
    """
    One vertex of a decision tree.

    An internal node carries the split test (``feature`` / ``value``) and its
    two children (``left`` / ``right``); a leaf carries only ``outcome``.
    Every attribute defaults to None.
    """
    def __init__(self, feature=None, value=None, left=None, right=None, outcome=None):
        # Split test stored on internal nodes.
        self.feature, self.value = feature, value
        # Child subtrees.
        self.left, self.right = left, right
        # Predicted label stored on leaves.
        self.outcome = outcome

class DecisionTree:
    """
    A binary decision tree classifier that splits on equality tests and picks
    splits by information gain (entropy reduction).

    Every internal node stores a (feature, value) pair: rows for which
    ``row[feature] == value`` go to the left child, all other rows go right.

    Parameters:
        max_depth: depth at which nodes are forced to be leaves (root is depth 0).
        min_samples_split: nodes holding fewer rows than this become leaves.
        min_impurity: nodes whose label entropy is below this become leaves.
    """
    def __init__(self, max_depth=float("inf"), min_samples_split=2, min_impurity=1e-7):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.root = None  # populated by fit()

    def fit(self, X, y):
        """
        Build the tree from a feature DataFrame X and a label Series y.

        X and y are joined column-wise with pd.concat, so they are aligned on
        their index. The label becomes the last column, which is where every
        other method of this class looks for it.
        """
        data = pd.concat([X, y], axis=1)
        self.root = self._build_tree(data)

    def predict(self, X):
        """
        Predict a label for every row of the DataFrame X.

        fit() must have been called first, otherwise the tree has no root.
        """
        return X.apply(self._predict_row, axis=1)

    def _build_tree(self, data, depth=0):
        """
        Recursively build the subtree for `data` and return its root Node.
        """
        # Stop when the depth limit is reached (>= also covers non-integer limits).
        if depth >= self.max_depth:
            return Node(outcome=self._most_common_outcome(data))

        # Stop when there are too few rows left to split.
        if len(data) < self.min_samples_split:
            return Node(outcome=self._most_common_outcome(data))

        # Stop when the labels are (almost) pure.
        if self._entropy(data) < self.min_impurity:
            return Node(outcome=self._most_common_outcome(data))

        # Select the best feature/value pair to split on.
        best_feature, best_value = self._best_feature_to_split(data)

        # No candidate split separates the rows (for example every feature is
        # constant while the labels differ): fall back to a majority leaf
        # instead of trying to split on feature None.
        if best_feature is None:
            return Node(outcome=self._most_common_outcome(data))

        left_data, right_data = self._split_data(data, best_feature, best_value)

        # Recursively build the left and right subtrees.
        left_subtree = self._build_tree(left_data, depth + 1)
        right_subtree = self._build_tree(right_data, depth + 1)

        return Node(feature=best_feature, value=best_value, left=left_subtree, right=right_subtree)

    def _predict_row(self, row):
        """
        Walk the tree from the root for a single row and return the outcome
        of the leaf that is reached.
        """
        node = self.root
        # A node is a leaf exactly when it carries an outcome.
        while node.outcome is None:
            if row[node.feature] == node.value:
                node = node.left
            else:
                node = node.right
        return node.outcome

    def _entropy(self, data):
        """
        Return the Shannon entropy (base 2) of the label column (last column).
        """
        num_samples = len(data)
        value_counts = data.iloc[:, -1].value_counts()
        probabilities = value_counts / num_samples
        # Classes with a zero count (e.g. unused categories of a categorical
        # label) contribute nothing; drop them so log2(0) is never evaluated.
        probabilities = probabilities[probabilities > 0]
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy

    def _best_feature_to_split(self, data):
        """
        Return the (feature, value) pair with the highest information gain.

        Every distinct value of every feature column is tried as an equality
        split. Returns (None, None) when every candidate leaves one side empty.
        """
        best_gain = -1
        best_feature = None
        best_value = None
        entropy = self._entropy(data)

        for feature in data.columns[:-1]:
            for value in data[feature].unique():
                left_data, right_data = self._split_data(data, feature, value)

                # A split that leaves one side empty separates nothing.
                if len(left_data) == 0 or len(right_data) == 0:
                    continue

                gain = self._information_gain(data, left_data, right_data, entropy)

                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_value = value

        return best_feature, best_value

    def _information_gain(self, data, left_data, right_data, entropy):
        """
        Return the parent `entropy` minus the size-weighted entropy of the
        two children.
        """
        p = len(left_data) / len(data)
        gain = entropy - p * self._entropy(left_data) - (1 - p) * self._entropy(right_data)
        return gain

    def _split_data(self, data, feature, value):
        """
        Split `data` into rows where `feature` equals `value` (left) and the
        remaining rows (right). Both parts get a fresh 0..n-1 index.
        """
        left_data = data[data[feature] == value].reset_index(drop=True)
        right_data = data[data[feature] != value].reset_index(drop=True)
        return left_data, right_data

    def _most_common_outcome(self, data):
        """
        Return the most frequent label in the last column of `data`.
        """
        outcome_counts = data.iloc[:, -1].value_counts()
        most_common_outcome = outcome_counts.index[0]
        return most_common_outcome

In [83]:
class Node:
    """
    One vertex of a decision tree.

    An internal node carries the split test (``feature`` / ``value``) and its
    two children (``left`` / ``right``); a leaf carries only ``outcome``.
    Every attribute defaults to None.
    """
    # NOTE(review): an identical Node class is defined in an earlier cell;
    # running this cell rebinds the name to this definition.
    def __init__(self, feature=None, value=None, left=None, right=None, outcome=None):
        # Split test stored on internal nodes.
        self.feature, self.value = feature, value
        # Child subtrees.
        self.left, self.right = left, right
        # Predicted label stored on leaves.
        self.outcome = outcome


class DecisionTree2:
    """
    A binary decision tree classifier that splits on equality tests and picks
    splits by the reduction in mean absolute deviation (MAD) of the label.

    Every internal node stores a (feature, value) pair: rows for which
    ``row[feature] == value`` go to the left child, all other rows go right.
    The split criterion takes the mean of the label column, so labels must be
    numeric.

    Parameters:
        max_depth: depth at which nodes are forced to be leaves (root is depth 0).
        min_samples_split: nodes holding fewer rows than this become leaves.
        min_impurity: nodes whose label entropy is below this become leaves.
    """
    def __init__(self, max_depth=float("inf"), min_samples_split=2, min_impurity=1e-7):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.root = None  # populated by fit()

    def fit(self, X, y):
        """
        Build the tree from a feature DataFrame X and a label Series y.

        X and y are joined column-wise with pd.concat, so they are aligned on
        their index. The label becomes the last column, which is where every
        other method of this class looks for it.
        """
        data = pd.concat([X, y], axis=1)
        self.root = self._build_tree(data)

    def predict(self, X):
        """
        Predict a label for every row of the DataFrame X.

        fit() must have been called first, otherwise the tree has no root.
        """
        return X.apply(self._predict_row, axis=1)

    def _build_tree(self, data, depth=0):
        """
        Recursively build the subtree for `data` and return its root Node.
        """
        # Stop when the depth limit is reached (>= also covers non-integer limits).
        if depth >= self.max_depth:
            return Node(outcome=self._most_common_outcome(data))

        # Stop when there are too few rows left to split.
        if len(data) < self.min_samples_split:
            return Node(outcome=self._most_common_outcome(data))

        # Stop when the labels are (almost) pure.
        if self._entropy(data) < self.min_impurity:
            return Node(outcome=self._most_common_outcome(data))

        # Select the best feature/value pair to split on.
        best_feature, best_value = self._best_feature_to_split(data)

        # No candidate split was accepted (for example every feature is
        # constant while the labels differ): fall back to a majority leaf
        # instead of trying to split on feature None.
        if best_feature is None:
            return Node(outcome=self._most_common_outcome(data))

        left_data, right_data = self._split_data(data, best_feature, best_value)

        # Recursively build the left and right subtrees.
        left_subtree = self._build_tree(left_data, depth + 1)
        right_subtree = self._build_tree(right_data, depth + 1)

        return Node(feature=best_feature, value=best_value, left=left_subtree, right=right_subtree)

    def _predict_row(self, row):
        """
        Walk the tree from the root for a single row and return the outcome
        of the leaf that is reached.
        """
        node = self.root
        # A node is a leaf exactly when it carries an outcome.
        while node.outcome is None:
            if row[node.feature] == node.value:
                node = node.left
            else:
                node = node.right
        return node.outcome

    def criteria(self, data):
        """
        Return the mean absolute deviation of the label column (last column)
        around its mean.
        """
        target = data.iloc[:, -1]
        mean = target.mean()
        return np.abs(target - mean).mean()

    def _best_feature_to_split(self, data):
        """
        Return the (feature, value) pair with the largest MAD reduction.

        Every distinct value of every feature column is tried as an equality
        split. Returns (None, None) when no candidate is accepted, e.g. when
        every candidate leaves one side empty.
        """
        best_reduction = -1
        best_feature = None
        best_value = None

        # MAD of the unsplit data; loop-invariant.
        criteria_full = self.criteria(data)
        total = len(data)

        for feature in data.columns[:-1]:
            for value in data[feature].unique():
                left_data, right_data = self._split_data(data, feature, value)

                # A split that leaves one side empty separates nothing.
                if len(left_data) == 0 or len(right_data) == 0:
                    continue

                # Parent MAD minus the size-weighted MAD of the two children.
                criteria_left = self.criteria(left_data)
                criteria_right = self.criteria(right_data)
                criteria_reduction = criteria_full - (len(left_data) / total * criteria_left) - (len(right_data) / total * criteria_right)

                if criteria_reduction > best_reduction:
                    best_reduction = criteria_reduction
                    best_feature = feature
                    best_value = value

        return best_feature, best_value

    def _split_data(self, data, feature, value):
        """
        Split `data` into rows where `feature` equals `value` (left) and the
        remaining rows (right). The original index is kept.
        """
        left_data = data[data[feature] == value]
        right_data = data[data[feature] != value]
        return left_data, right_data

    def _most_common_outcome(self, data):
        """
        Return the most frequent label in the last column of `data`.
        """
        return data.iloc[:, -1].mode()[0]

    def _entropy(self, data):
        """
        Return the Shannon entropy (base 2) of the label column (last column).
        """
        target = data.iloc[:, -1]
        p = target.value_counts(normalize=True)
        # Classes with a zero count (e.g. unused categories of a categorical
        # label) contribute nothing; drop them so log2(0) is never evaluated.
        p = p[p > 0]
        entropy = -(p * np.log2(p)).sum()
        return entropy

    def print_tree(self, node=None, depth=0):
        """
        Print the tree, indenting each level by four spaces.

        Called without `node`, printing starts at the root, so the tree must
        have been fitted first.
        """
        if node is None:
            node = self.root

        if node.outcome is not None:
            print("    " * depth + "Outcome:", node.outcome)
        else:
            print("    " * depth + "Feature:", node.feature)
            print("    " * depth + "Value:", node.value)
            self.print_tree(node.left, depth+1)
            self.print_tree(node.right, depth+1)


In [84]:
# Dataset 1: fetch OpenML dataset 29 and display its summary.

dataset = openml.datasets.get_dataset(29)
dataset

OpenML Dataset
Name..........: credit-approval
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:21:38
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/29/credit-approval.arff
OpenML URL....: https://www.openml.org/d/29
# of features.: 16
# of instances: 690

In [85]:
# get_data() is unpacked as a 4-tuple; only the first element is kept as `df`.
# NOTE(review): binding `_` here shadows IPython's last-output variable.
df, _, _, _ = dataset.get_data()
df.tail()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0.0,f,g,260.0,0.0,-
686,a,22.67,0.75,u,g,c,v,2.0,f,t,2.0,t,g,200.0,394.0,-
687,a,25.25,13.5,y,p,ff,ff,2.0,f,t,1.0,t,g,200.0,1.0,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0.0,f,g,280.0,750.0,-
689,b,35.0,3.375,u,g,c,h,8.29,f,f,0.0,t,g,0.0,0.0,-


In [86]:
# Encode the target column in place: "-" becomes 0 and "+" becomes 1.
df["class"] = df["class"].replace(["-", "+"], [0, 1])

df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1.0,f,g,202.0,0.0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6.0,f,g,43.0,560.0,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0.0,f,g,280.0,824.0,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5.0,t,g,100.0,3.0,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0.0,f,s,120.0,0.0,1


In [87]:
# Feature matrix: every column except the target.
X = df.drop(columns="class")
X.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1.0,f,g,202.0,0.0
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6.0,f,g,43.0,560.0
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0.0,f,g,280.0,824.0
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5.0,t,g,100.0,3.0
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0.0,f,s,120.0,0.0


In [88]:
# Target vector: the encoded class column cast to integers.
y = df["class"].astype("int")
y.head()

0    1
1    1
2    1
3    1
4    1
Name: class, dtype: int32

In [89]:
# Hold out 30% of the rows for testing; random_state is pinned so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [90]:
# Fit the MAD-criterion tree (DecisionTree2), capped at depth 5, and predict on the held-out rows.
clf2 = DecisionTree2(max_depth=5)
clf2.fit(X_train, y_train)
predictions2 = clf2.predict(X_test)

In [81]:
# Fit the entropy-based tree (DecisionTree) with the same depth cap for comparison.
# NOTE(review): this cell's execution count (81) is lower than the cell above (90);
# re-run the notebook top to bottom to confirm the reported numbers.
clf = DecisionTree(max_depth=5)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

In [91]:
# Test-set accuracy of DecisionTree2 (MAD criterion).
accuracy_score(y_test, predictions2)

0.8599033816425121

In [82]:
# Test-set accuracy of DecisionTree (entropy criterion).
accuracy_score(y_test, predictions)

0.8454106280193237

In [None]:
# Datasets (OpenML IDs)

# 29: credit-approval