In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from openml import tasks, runs
from sklearn import neighbors
import math
import openml
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import accuracy_score

In [2]:
# Credentials must not live in the notebook: the key is taken from the
# OPENML_APIKEY environment variable.  When the variable is unset, whatever
# openml already has configured is kept as-is.
# NOTE(review): a key that was previously committed here should be treated as
# leaked and regenerated in the OpenML account settings.
openml.config.apikey = os.environ.get("OPENML_APIKEY", openml.config.apikey)

In [3]:
# Decision tree (original, entropy / information-gain based)

class Node:
    """
    One vertex of a binary decision tree.

    An internal vertex stores the split test (``feature``/``value``) and its
    two children; a leaf stores only the predicted ``outcome``.  Every field
    defaults to ``None``.
    """
    def __init__(self, feature=None, value=None, left=None, right=None, outcome=None):
        # Split test used by internal vertices.
        self.feature, self.value = feature, value
        # Child sub-trees.
        self.left, self.right = left, right
        # Prediction carried by a leaf.
        self.outcome = outcome

class DecisionTree:
    """
    Binary decision-tree classifier built on equality tests.

    Each internal node tests ``row[feature] == value``: rows that match go to
    the left child, all other rows go to the right child.  Splits are picked
    greedily by maximising the entropy-based information gain of the label,
    which is taken to be the last column of the working table.

    Parameters
    ----------
    max_depth : int or float
        Depth at which a branch is turned into a leaf (default: unlimited).
    min_samples_split : int
        A node holding fewer rows than this becomes a leaf.
    min_impurity : float
        A node whose label entropy is below this value becomes a leaf.
    """
    def __init__(self, max_depth=float("inf"), min_samples_split=2, min_impurity=1e-7):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.root = None  # populated by fit()

    def fit(self, X, y):
        """
        Build the tree from the feature table ``X`` and the labels ``y``.

        ``X`` and ``y`` are joined column-wise with ``pd.concat`` so that the
        label ends up as the last column of the table handed to the builder.
        """
        data = pd.concat([X, y], axis=1)
        self.root = self._build_tree(data)

    def predict(self, X):
        """
        Return one predicted label per row of ``X``.
        """
        return X.apply(lambda row: self._predict_row(row), axis=1)

    def _build_tree(self, data, depth=0):
        """
        Recursively build the (sub)tree for ``data`` and return its root Node.

        A leaf carrying the most common label is returned when the depth
        limit is hit, when there are too few rows, when the labels are
        (nearly) pure, or when no split separates the rows.
        """
        # Check if we have reached the maximum depth
        if depth == self.max_depth:
            return Node(outcome=self._most_common_outcome(data))

        # Check if there are too few samples left to split
        if len(data) < self.min_samples_split:
            return Node(outcome=self._most_common_outcome(data))

        # Check if the data is pure (all labels are the same)
        if self._entropy(data) < self.min_impurity:
            return Node(outcome=self._most_common_outcome(data))

        # Select the best feature to split the data
        best_feature, best_value = self._best_feature_to_split(data)

        # No usable split exists: every candidate leaves one side empty, e.g.
        # when all remaining rows share the same feature values but carry
        # different labels.  Emit a majority leaf instead of attempting to
        # split on ``None`` (which would raise a KeyError).
        if best_feature is None:
            return Node(outcome=self._most_common_outcome(data))

        # Split the data based on the best feature and value
        left_data, right_data = self._split_data(data, best_feature, best_value)

        # Recursively build the left and right subtrees
        left_subtree = self._build_tree(left_data, depth+1)
        right_subtree = self._build_tree(right_data, depth+1)

        # Create a new node to represent the best feature and value
        return Node(feature=best_feature, value=best_value, left=left_subtree, right=right_subtree)

    def _predict_row(self, row):
        """
        Walk the tree from the root for a single row and return the label of
        the leaf that is reached.
        """
        node = self.root
        # Leaves are the nodes whose outcome is set; compare against None so
        # that a falsy label such as 0 is still recognised as a leaf.
        while node.outcome is None:
            if row[node.feature] == node.value:
                node = node.left
            else:
                node = node.right
        return node.outcome

    def _entropy(self, data):
        """
        Return the base-2 entropy of the label (last) column of ``data``.
        """
        num_samples = len(data)
        value_counts = data.iloc[:, -1].value_counts()
        probabilities = value_counts / num_samples
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy

    def _best_feature_to_split(self, data):
        """
        Return the ``(feature, value)`` pair with the highest information gain.

        Every distinct value of every feature column is tried as an equality
        test.  Candidates that leave one side empty are ignored.  Returns
        ``(None, None)`` when no candidate remains.
        """
        best_gain = -1
        best_feature = None
        best_value = None
        entropy = self._entropy(data)

        # The last column holds the label, so it is excluded from the search.
        for feature in data.columns[:-1]:
            values = data[feature].unique()

            for value in values:
                left_data, right_data = self._split_data(data, feature, value)

                # A split that does not separate anything is useless.
                if len(left_data) == 0 or len(right_data) == 0:
                    continue

                gain = self._information_gain(data, left_data, right_data, entropy)

                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_value = value

        return best_feature, best_value

    def _information_gain(self, data, left_data, right_data, entropy):
        """
        Return the parent ``entropy`` minus the size-weighted entropy of the
        two children.
        """
        p = len(left_data) / len(data)
        gain = entropy - p*self._entropy(left_data) - (1-p)*self._entropy(right_data)
        return gain

    def _split_data(self, data, feature, value):
        """
        Split ``data`` into the rows where ``feature == value`` (left) and the
        remaining rows (right).  Both parts get a fresh 0..n-1 index.
        """
        left_data = data[data[feature] == value].reset_index(drop=True)
        right_data = data[data[feature] != value].reset_index(drop=True)
        return left_data, right_data

    def _most_common_outcome(self, data):
        """
        Return the most frequent label in ``data`` (first entry of
        ``value_counts``).
        """
        outcome_counts = data.iloc[:, -1].value_counts()
        most_common_outcome = outcome_counts.index[0]
        return most_common_outcome

In [4]:
class Node:
    """
    One vertex of a binary decision tree.

    An internal vertex stores the split test (``feature``/``value``) and its
    two children; a leaf stores only the predicted ``outcome``.  Every field
    defaults to ``None``.
    """
    # NOTE: this re-declares the Node class from the previous cell with the
    # same fields.
    def __init__(self, feature=None, value=None, left=None, right=None, outcome=None):
        # Split test used by internal vertices.
        self.feature, self.value = feature, value
        # Child sub-trees.
        self.left, self.right = left, right
        # Prediction carried by a leaf.
        self.outcome = outcome


class DecisionTree2:
    """
    Binary decision-tree classifier that chooses splits by MAD reduction.

    Each internal node tests ``row[feature] == value``: rows that match go to
    the left child, all other rows go to the right child.  Splits are picked
    greedily by the largest reduction of the mean absolute deviation (MAD) of
    the label, which is taken to be the last column of the working table.
    Because the criterion calls ``mean()`` on the label, labels must be
    numeric.  Entropy is still used for the purity stopping rule.

    Parameters
    ----------
    max_depth : int or float
        Depth at which a branch is turned into a leaf (default: unlimited).
    min_samples_split : int
        A node holding fewer rows than this becomes a leaf.
    min_impurity : float
        A node whose label entropy is below this value becomes a leaf.
    """
    def __init__(self, max_depth=float("inf"), min_samples_split=2, min_impurity=1e-7):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.root = None  # populated by fit()

    def fit(self, X, y):
        """
        Build the tree from the feature table ``X`` and the labels ``y``.

        ``X`` and ``y`` are joined column-wise with ``pd.concat`` so that the
        label ends up as the last column of the table handed to the builder.
        """
        data = pd.concat([X, y], axis=1)
        self.root = self._build_tree(data)

    def predict(self, X):
        """
        Return one predicted label per row of ``X``.
        """
        return X.apply(lambda row: self._predict_row(row), axis=1)

    def _build_tree(self, data, depth=0):
        """
        Recursively build the (sub)tree for ``data`` and return its root Node.

        A leaf carrying the most common label is returned when the depth
        limit is hit, when there are too few rows, when the labels are
        (nearly) pure, or when no acceptable split is found.
        """
        # Check if we have reached the maximum depth
        if depth == self.max_depth:
            return Node(outcome=self._most_common_outcome(data))

        # Check if there are too few samples left to split
        if len(data) < self.min_samples_split:
            return Node(outcome=self._most_common_outcome(data))

        # Check if the data is pure (all labels are the same)
        if self._entropy(data) < self.min_impurity:
            return Node(outcome=self._most_common_outcome(data))

        # Select the best feature to split the data
        best_feature, best_value = self._best_feature_to_split(data)

        # No acceptable split was found, e.g. every candidate leaves one side
        # empty because all remaining rows share the same feature values.
        # Emit a majority leaf instead of attempting to split on ``None``
        # (which would raise a KeyError).
        if best_feature is None:
            return Node(outcome=self._most_common_outcome(data))

        # Split the data based on the best feature and value
        left_data, right_data = self._split_data(data, best_feature, best_value)

        # Recursively build the left and right subtrees
        left_subtree = self._build_tree(left_data, depth+1)
        right_subtree = self._build_tree(right_data, depth+1)

        # Create a new node to represent the best feature and value
        return Node(feature=best_feature, value=best_value, left=left_subtree, right=right_subtree)

    def _predict_row(self, row):
        """
        Walk the tree from the root for a single row and return the label of
        the leaf that is reached.
        """
        node = self.root
        # Leaves are the nodes whose outcome is set; compare against None so
        # that a falsy label such as 0 is still recognised as a leaf.
        while node.outcome is None:
            if row[node.feature] == node.value:
                node = node.left
            else:
                node = node.right
        return node.outcome

    def criteria(self, data):
        """
        Return the mean absolute deviation (around the mean) of the label
        (last) column of ``data``.
        """
        target = data.iloc[:, -1]
        mean = target.mean()
        return np.abs(target - mean).mean()

    def _best_feature_to_split(self, data):
        """
        Return the ``(feature, value)`` pair with the largest MAD reduction.

        Every distinct value of every feature column is tried as an equality
        test.  Candidates that leave one side empty are ignored.  Returns
        ``(None, None)`` when no candidate beats the initial threshold of -1.
        """
        best_reduction = -1
        best_feature = None
        best_value = None

        # Calculate the MAD of the entire dataset
        criteria_full = self.criteria(data)
        n_rows = len(data)

        # Loop through each feature and value (the last column is the label)
        for feature in data.columns[:-1]:
            for value in data[feature].unique():
                # Split the data based on the feature and value
                left_data, right_data = self._split_data(data, feature, value)

                # Skip if the split produced empty data
                if len(left_data) == 0 or len(right_data) == 0:
                    continue

                # Parent MAD minus the size-weighted MAD of both children
                criteria_left = self.criteria(left_data)
                criteria_right = self.criteria(right_data)
                criteria_reduction = criteria_full - (len(left_data) / n_rows * criteria_left) - (len(right_data) / n_rows * criteria_right)

                # Update the best feature and value if the MAD reduction is higher
                if criteria_reduction > best_reduction:
                    best_reduction = criteria_reduction
                    best_feature = feature
                    best_value = value

        return best_feature, best_value

    def _split_data(self, data, feature, value):
        """
        Split ``data`` into the rows where ``feature == value`` (left) and the
        remaining rows (right).  The original index labels are kept.
        """
        left_data = data[data[feature] == value]
        right_data = data[data[feature] != value]
        return left_data, right_data

    def _most_common_outcome(self, data):
        """
        Return the most frequent label in ``data`` (first entry of ``mode()``).
        """
        return data.iloc[:, -1].mode()[0]

    def _entropy(self, data):
        """
        Return the base-2 entropy of the label (last) column of ``data``.
        """
        target = data.iloc[:, -1]
        p = target.value_counts(normalize=True)
        entropy = -(p * np.log2(p)).sum()
        return entropy

    def print_tree(self, node=None, depth=0):
        """
        Print the tree, one indentation level (four spaces) per depth.

        With ``node`` omitted the printout starts at the root.
        """
        if node is None:
            node = self.root

        if node.outcome is not None:
            print("    " * depth + "Outcome:", node.outcome)
        else:
            print("    " * depth + "Feature:", node.feature)
            print("    " * depth + "Value:", node.value)
            self.print_tree(node.left, depth+1)
            self.print_tree(node.right, depth+1)


In [None]:
# Decision tree (copy of the original entropy-based tree, named DecisionTree3)

class Node:
    """
    One vertex of a binary decision tree.

    An internal vertex stores the split test (``feature``/``value``) and its
    two children; a leaf stores only the predicted ``outcome``.  Every field
    defaults to ``None``.
    """
    # NOTE: this re-declares the Node class from the earlier cells with the
    # same fields.
    def __init__(self, feature=None, value=None, left=None, right=None, outcome=None):
        # Split test used by internal vertices.
        self.feature, self.value = feature, value
        # Child sub-trees.
        self.left, self.right = left, right
        # Prediction carried by a leaf.
        self.outcome = outcome

class DecisionTree3:
    """
    Binary decision-tree classifier built on equality tests.

    Same algorithm as the ``DecisionTree`` class defined earlier in this
    notebook.  Each internal node tests ``row[feature] == value``: rows that
    match go to the left child, all other rows go to the right child.  Splits
    are picked greedily by maximising the entropy-based information gain of
    the label, which is taken to be the last column of the working table.

    Parameters
    ----------
    max_depth : int or float
        Depth at which a branch is turned into a leaf (default: unlimited).
    min_samples_split : int
        A node holding fewer rows than this becomes a leaf.
    min_impurity : float
        A node whose label entropy is below this value becomes a leaf.
    """
    def __init__(self, max_depth=float("inf"), min_samples_split=2, min_impurity=1e-7):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.root = None  # populated by fit()

    def fit(self, X, y):
        """
        Build the tree from the feature table ``X`` and the labels ``y``.

        ``X`` and ``y`` are joined column-wise with ``pd.concat`` so that the
        label ends up as the last column of the table handed to the builder.
        """
        data = pd.concat([X, y], axis=1)
        self.root = self._build_tree(data)

    def predict(self, X):
        """
        Return one predicted label per row of ``X``.
        """
        return X.apply(lambda row: self._predict_row(row), axis=1)

    def _build_tree(self, data, depth=0):
        """
        Recursively build the (sub)tree for ``data`` and return its root Node.

        A leaf carrying the most common label is returned when the depth
        limit is hit, when there are too few rows, when the labels are
        (nearly) pure, or when no split separates the rows.
        """
        # Check if we have reached the maximum depth
        if depth == self.max_depth:
            return Node(outcome=self._most_common_outcome(data))

        # Check if there are too few samples left to split
        if len(data) < self.min_samples_split:
            return Node(outcome=self._most_common_outcome(data))

        # Check if the data is pure (all labels are the same)
        if self._entropy(data) < self.min_impurity:
            return Node(outcome=self._most_common_outcome(data))

        # Select the best feature to split the data
        best_feature, best_value = self._best_feature_to_split(data)

        # No usable split exists: every candidate leaves one side empty, e.g.
        # when all remaining rows share the same feature values but carry
        # different labels.  Emit a majority leaf instead of attempting to
        # split on ``None`` (which would raise a KeyError).
        if best_feature is None:
            return Node(outcome=self._most_common_outcome(data))

        # Split the data based on the best feature and value
        left_data, right_data = self._split_data(data, best_feature, best_value)

        # Recursively build the left and right subtrees
        left_subtree = self._build_tree(left_data, depth+1)
        right_subtree = self._build_tree(right_data, depth+1)

        # Create a new node to represent the best feature and value
        return Node(feature=best_feature, value=best_value, left=left_subtree, right=right_subtree)

    def _predict_row(self, row):
        """
        Walk the tree from the root for a single row and return the label of
        the leaf that is reached.
        """
        node = self.root
        # Leaves are the nodes whose outcome is set; compare against None so
        # that a falsy label such as 0 is still recognised as a leaf.
        while node.outcome is None:
            if row[node.feature] == node.value:
                node = node.left
            else:
                node = node.right
        return node.outcome

    def _entropy(self, data):
        """
        Return the base-2 entropy of the label (last) column of ``data``.
        """
        num_samples = len(data)
        value_counts = data.iloc[:, -1].value_counts()
        probabilities = value_counts / num_samples
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy

    def _best_feature_to_split(self, data):
        """
        Return the ``(feature, value)`` pair with the highest information gain.

        Every distinct value of every feature column is tried as an equality
        test.  Candidates that leave one side empty are ignored.  Returns
        ``(None, None)`` when no candidate remains.
        """
        best_gain = -1
        best_feature = None
        best_value = None
        entropy = self._entropy(data)

        # The last column holds the label, so it is excluded from the search.
        for feature in data.columns[:-1]:
            values = data[feature].unique()

            for value in values:
                left_data, right_data = self._split_data(data, feature, value)

                # A split that does not separate anything is useless.
                if len(left_data) == 0 or len(right_data) == 0:
                    continue

                gain = self._information_gain(data, left_data, right_data, entropy)

                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_value = value

        return best_feature, best_value

    def _information_gain(self, data, left_data, right_data, entropy):
        """
        Return the parent ``entropy`` minus the size-weighted entropy of the
        two children.
        """
        p = len(left_data) / len(data)
        gain = entropy - p*self._entropy(left_data) - (1-p)*self._entropy(right_data)
        return gain

    def _split_data(self, data, feature, value):
        """
        Split ``data`` into the rows where ``feature == value`` (left) and the
        remaining rows (right).  Both parts get a fresh 0..n-1 index.
        """
        left_data = data[data[feature] == value].reset_index(drop=True)
        right_data = data[data[feature] != value].reset_index(drop=True)
        return left_data, right_data

    def _most_common_outcome(self, data):
        """
        Return the most frequent label in ``data`` (first entry of
        ``value_counts``).
        """
        outcome_counts = data.iloc[:, -1].value_counts()
        most_common_outcome = outcome_counts.index[0]
        return most_common_outcome

In [5]:
def accuracy(X, y, max_depth=5, test_size=0.3, random_state=123):
    """
    Fit both tree variants on one train/test split and print their accuracy.

    Parameters
    ----------
    X : feature table, forwarded to ``train_test_split`` and to the trees.
    y : labels belonging to ``X``.
    max_depth : depth limit given to both trees (default 5).
    test_size : held-out share forwarded to ``train_test_split`` (default 0.3).
    random_state : seed forwarded to ``train_test_split`` (default 123).

    Returns
    -------
    None.  The two accuracy scores are printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Original tree (information gain)
    tree1 = DecisionTree(max_depth=max_depth)
    tree1.fit(X_train, y_train)
    predictions1 = tree1.predict(X_test)

    # Our tree (MAD reduction)
    tree2 = DecisionTree2(max_depth=max_depth)
    tree2.fit(X_train, y_train)
    predictions2 = tree2.predict(X_test)

    # Both scores are reported only after both models finished.
    print(f"Accuracy original tree: {accuracy_score(y_test, predictions1)}")
    print(f"Accuracy our tree: {accuracy_score(y_test, predictions2)}")
     

In [6]:
# Dataset 1: OpenML dataset id 29 (credit-approval).

# Fetch the dataset object; the bare expression displays its summary.
dataset = openml.datasets.get_dataset(29)
dataset

OpenML Dataset
Name..........: credit-approval
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:21:38
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/29/credit-approval.arff
OpenML URL....: https://www.openml.org/d/29
# of features.: 16
# of instances: 690

In [12]:
# get_data() is unpacked as a 4-tuple; only the first element (the data table) is kept.
df, _, _, _ = dataset.get_data()
df

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1.0,f,g,202.0,0.0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6.0,f,g,43.0,560.0,+
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0.0,f,g,280.0,824.0,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5.0,t,g,100.0,3.0,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0.0,f,s,120.0,0.0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0.0,f,g,260.0,0.0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2.0,t,g,200.0,394.0,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1.0,t,g,200.0,1.0,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0.0,f,g,280.0,750.0,-


In [13]:
# Recode the "class" labels from "-"/"+" to 0/1 (overwrites the column in df).
df["class"] = df["class"].replace(["-", "+"], [0, 1])
df

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1.0,f,g,202.0,0.0,1
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6.0,f,g,43.0,560.0,1
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0.0,f,g,280.0,824.0,1
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5.0,t,g,100.0,3.0,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0.0,f,s,120.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0.0,f,g,260.0,0.0,0
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2.0,t,g,200.0,394.0,0
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1.0,t,g,200.0,1.0,0
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0.0,f,g,280.0,750.0,0


In [16]:
# Number of rows per distinct value in the last column of df.
df.iloc[:, -1].value_counts()

0    383
1    307
Name: class, dtype: int64

In [94]:
# Features: every column except the label.  Label: "class" cast to integers.
X = df.drop(columns="class")
y = df["class"].astype("int")
accuracy(X, y)

Accuracy original tree: 0.8454106280193237
Accuracy our tree: 0.8599033816425121


In [95]:
# Dataset 2: OpenML dataset id 28 (optdigits).

# Fetch the dataset object; the bare expression displays its summary.
dataset2 = openml.datasets.get_dataset(28)
dataset2

OpenML Dataset
Name..........: optdigits
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:21:34
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/28/optdigits.arff
OpenML URL....: https://www.openml.org/d/28
# of features.: 65
# of instances: 5620

In [96]:
# get_data() is unpacked as a 4-tuple; only the first element (the data table) is kept.
df2, _, _, _ = dataset2.get_data()
df2

Unnamed: 0,input1,input2,input3,input4,input5,input6,input7,input8,input9,input10,...,input56,input57,input58,input59,input60,input61,input62,input63,input64,class
0,0.0,1.0,6.0,15.0,12.0,1.0,0.0,0.0,0.0,7.0,...,0.0,0.0,0.0,6.0,14.0,7.0,1.0,0.0,0.0,0
1,0.0,0.0,10.0,16.0,6.0,0.0,0.0,0.0,0.0,7.0,...,0.0,0.0,0.0,10.0,16.0,15.0,3.0,0.0,0.0,0
2,0.0,0.0,8.0,15.0,16.0,13.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,9.0,14.0,0.0,0.0,0.0,0.0,7
3,0.0,0.0,0.0,3.0,11.0,16.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,15.0,2.0,0.0,0.0,4
4,0.0,0.0,5.0,14.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,12.0,14.0,7.0,0.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5615,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0,9
5616,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0,0
5617,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0,8
5618,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0,9


In [97]:
# Features: every column except the label.  Label: "class" cast to integers.
X = df2.drop(columns="class")
y = df2["class"].astype("int")
accuracy(X, y)

Accuracy original tree: 0.7176749703440095
Accuracy our tree: 0.4489916963226572


In [98]:
# Dataset 3: OpenML dataset id 31 (credit-g).

# Fetch the dataset object; the bare expression displays its summary.
dataset3 = openml.datasets.get_dataset(31)
dataset3

OpenML Dataset
Name..........: credit-g
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:21:47
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/31/credit-g.arff
OpenML URL....: https://www.openml.org/d/31
# of features.: 21
# of instances: 1000

In [99]:
# get_data() is unpacked as a 4-tuple; only the first element (the data table) is kept.
df3, _, _, _ = dataset3.get_data()
df3

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,existing paid,radio/tv,5951.0,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,bad
2,no checking,12,critical/other existing credit,education,2096.0,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,<0,42,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,<0,24,delayed previously,new car,4870.0,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking,12,existing paid,furniture/equipment,1736.0,<100,4<=X<7,3,female div/dep/mar,none,...,real estate,31,none,own,1,unskilled resident,1,none,yes,good
996,<0,30,existing paid,used car,3857.0,<100,1<=X<4,4,male div/sep,none,...,life insurance,40,none,own,1,high qualif/self emp/mgmt,1,yes,yes,good
997,no checking,12,existing paid,radio/tv,804.0,<100,>=7,4,male single,none,...,car,38,none,own,1,skilled,1,none,yes,good
998,<0,45,existing paid,radio/tv,1845.0,<100,1<=X<4,4,male single,none,...,no known property,23,none,for free,1,skilled,1,yes,yes,bad


In [100]:
# Recode the "class" labels from "bad"/"good" to 0/1 (overwrites the column in df3).
df3["class"] = df3["class"].replace(["bad", "good"], [0, 1])
df3

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,1
1,0<=X<200,48,existing paid,radio/tv,5951.0,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,0
2,no checking,12,critical/other existing credit,education,2096.0,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,1
3,<0,42,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,1
4,<0,24,delayed previously,new car,4870.0,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking,12,existing paid,furniture/equipment,1736.0,<100,4<=X<7,3,female div/dep/mar,none,...,real estate,31,none,own,1,unskilled resident,1,none,yes,1
996,<0,30,existing paid,used car,3857.0,<100,1<=X<4,4,male div/sep,none,...,life insurance,40,none,own,1,high qualif/self emp/mgmt,1,yes,yes,1
997,no checking,12,existing paid,radio/tv,804.0,<100,>=7,4,male single,none,...,car,38,none,own,1,skilled,1,none,yes,1
998,<0,45,existing paid,radio/tv,1845.0,<100,1<=X<4,4,male single,none,...,no known property,23,none,for free,1,skilled,1,yes,yes,0


In [102]:
# Features: every column except the label.  Label: "class" cast to integers.
X = df3.drop(columns="class")
y = df3["class"].astype("int")
accuracy(X, y)

Accuracy original tree: 0.6833333333333333
Accuracy our tree: 0.68


In [103]:
# Dataset 4: OpenML dataset id 37 (diabetes).

# Fetch the dataset object; the bare expression displays its summary.
dataset4 = openml.datasets.get_dataset(37)
dataset4

OpenML Dataset
Name..........: diabetes
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:22:13
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/37/diabetes.arff
OpenML URL....: https://www.openml.org/d/37
# of features.: 9
# of instances: 768

In [104]:
# get_data() is unpacked as a 4-tuple; only the first element (the data table) is kept.
df4, _, _, _ = dataset4.get_data()
df4

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,tested_positive
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,tested_negative
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,tested_positive
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,tested_negative
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,tested_positive
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0,tested_negative
764,2.0,122.0,70.0,27.0,0.0,36.8,0.340,27.0,tested_negative
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0,tested_negative
766,1.0,126.0,60.0,0.0,0.0,30.1,0.349,47.0,tested_positive


In [105]:
# Recode the "class" labels from "tested_negative"/"tested_positive" to 0/1
# (overwrites the column in df4).
df4["class"] = df4["class"].replace(["tested_negative", "tested_positive"], [0, 1])
df4

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0,0
764,2.0,122.0,70.0,27.0,0.0,36.8,0.340,27.0,0
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0,0
766,1.0,126.0,60.0,0.0,0.0,30.1,0.349,47.0,1


In [106]:
# Features: every column except the label.  Label: "class" cast to integers.
X = df4.drop(columns="class")
y = df4["class"].astype("int")
accuracy(X, y)

Accuracy original tree: 0.6147186147186147
Accuracy our tree: 0.6147186147186147


In [132]:
# Dataset 5: OpenML dataset id 22 (mfeat-zernike).
# NOTE(review): this cell previously carried a bare "43" note, which does not
# match the id loaded below (22) — confirm which dataset was intended.
dataset5 = openml.datasets.get_dataset(22)
dataset5

OpenML Dataset
Name..........: mfeat-zernike
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:21:00
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/22/mfeat-zernike.arff
OpenML URL....: https://www.openml.org/d/22
# of features.: 48
# of instances: 2000

In [133]:
# get_data() is unpacked as a 4-tuple; only the first element (the data table) is kept.
df5, _, _, _ = dataset5.get_data()
df5

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att39,att40,att41,att42,att43,att44,att45,att46,att47,class
0,0.011033,0.831466,15.351804,75.806559,171.554214,490.156556,206.416027,0.122135,2.601646,11.472709,...,33.810340,9.858915,1.399891,148.138058,326.239452,9.711070,20.007248,47.032578,539.208457,1
1,0.038271,1.166746,10.526913,42.369276,85.187116,420.360566,253.569574,0.033657,0.390566,11.700830,...,35.400531,70.681899,6.674412,155.135985,377.832675,8.140633,44.536711,46.338954,518.496567,1
2,0.042698,1.225007,8.273804,31.744786,54.448177,404.103204,389.980746,0.041733,0.937399,11.629045,...,19.477230,30.093590,7.858211,150.126419,419.565747,4.530921,26.292170,44.574822,549.912691,1
3,0.032418,1.638247,19.205283,51.196682,57.181760,429.052011,256.174645,0.073624,1.973268,13.057108,...,14.179518,30.564085,7.097728,173.840759,441.350376,3.706023,13.432311,51.739930,574.887814,1
4,0.015866,0.611561,8.627839,37.325052,48.509025,459.909634,238.572767,0.046477,1.117292,10.012169,...,8.705403,30.242473,9.015714,167.021185,332.479997,1.806273,23.689300,50.407770,492.227513,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.018696,0.060507,4.189839,20.072573,136.863862,364.365568,83.535937,0.027723,0.175661,6.677457,...,37.077931,229.878330,6.376995,144.500902,393.876660,8.672527,152.791625,43.559482,501.537544,10
1996,0.025601,0.373184,8.199696,36.255929,63.651474,217.184626,107.002051,0.053221,1.002008,12.029996,...,8.122498,224.796064,7.282318,187.335473,353.453408,1.389084,161.716981,53.922259,482.428339,10
1997,0.024194,0.785229,9.852438,47.772260,71.338044,381.053077,76.972270,0.051574,1.860790,21.352962,...,23.227692,122.971707,6.250161,124.337857,359.585224,4.643621,101.472040,37.013903,430.544358,10
1998,0.010046,0.238167,1.285469,11.191213,81.659420,414.859326,38.144435,0.034859,1.212122,16.690799,...,69.833696,175.418772,3.563793,140.067305,486.688709,16.109266,126.684436,41.154583,571.426195,10


In [None]:
# Features: every column except the label.  Label: "class" cast to integers.
X = df5.drop(columns="class")
y = df5["class"].astype("int")
accuracy(X, y)