In [316]:
import os

import numpy as np
import openml
import pandas as pd
from openml import tasks, runs
from sklearn import neighbors
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [293]:
# The OpenML API key is taken from the OPENML_APIKEY environment variable so that
# no credential is stored in the notebook. When the variable is unset, the key
# openml already has configured is kept unchanged.
# NOTE(review): the key that was previously committed in this cell should be
# treated as leaked and regenerated on openml.org.
openml.config.apikey = os.environ.get("OPENML_APIKEY", openml.config.apikey)

In [294]:
#ORIGINAL DECISION TREE

class Node:
    """
    One vertex of a binary decision tree.

    An internal vertex stores the split test (``feature`` / ``value``) together
    with its two children; a leaf stores only ``outcome``. Every field defaults
    to ``None``.
    """
    def __init__(self, feature=None, value=None, left=None, right=None, outcome=None):
        # Split test evaluated at this vertex (left as None on leaves).
        self.feature, self.value = feature, value
        # Subtrees followed when the test matches (left) or does not (right).
        self.left, self.right = left, right
        # Predicted label; a non-None outcome is what marks a leaf.
        self.outcome = outcome

class DecisionTree:
    """
    Binary decision tree classifier that ranks splits by information gain.

    Every internal node tests ``row[feature] == value``: rows that match go to
    the left child, all other rows go to the right child. The label column is
    always taken to be the last column of the working frame built in ``fit``.

    Parameters
    ----------
    max_depth : int
        Depth at which recursion stops and a leaf is emitted.
    min_samples_split : int, default 2
        Subsets with fewer rows than this become leaves.
    min_impurity : float, default 1e-7
        Subsets whose label entropy is below this become leaves.
    verbose : bool, default True
        When true, print the chosen feature/value after every split search
        (this is the output the tree has always produced).
    """
    def __init__(self, max_depth, min_samples_split=2, min_impurity=1e-7, verbose=True):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.verbose = verbose
        self.root = None

    def fit(self, X, y):
        """
        Grow the tree from the feature frame ``X`` and the labels ``y``.

        ``X`` and ``y`` are concatenated column-wise, which places the labels in
        the last column; every helper below reads the labels from there.
        """
        data = pd.concat([X, y], axis=1)
        self.root = self._build_tree(data)

    def predict(self, X):
        """
        Return the predicted label for every row of ``X``.
        """
        return X.apply(self._predict_row, axis=1)

    def _build_tree(self, data, depth=0):
        """
        Recursive function that builds the decision tree.

        Returns a leaf ``Node`` when a stopping rule fires or when no split
        separates the rows; otherwise returns an internal ``Node`` whose
        children are built from the two sides of the best split.
        """
        # Stopping rules, checked in order: depth limit, too few rows,
        # labels already (almost) pure.
        if (
            depth == self.max_depth
            or len(data) < self.min_samples_split
            or self._entropy(data) < self.min_impurity
        ):
            return Node(outcome=self._most_common_outcome(data))

        # Select the best feature to split the data
        best_feature, best_value = self._best_feature_to_split(data)

        # No candidate put rows on both sides (e.g. every feature is constant
        # while the labels still differ). Splitting on None would raise a
        # KeyError, so fall back to a majority-vote leaf.
        if best_feature is None:
            return Node(outcome=self._most_common_outcome(data))

        # Split the data based on the best feature and value
        left_data, right_data = self._split_data(data, best_feature, best_value)

        # Recursively build the left and right subtrees
        left_subtree = self._build_tree(left_data, depth + 1)
        right_subtree = self._build_tree(right_data, depth + 1)

        return Node(feature=best_feature, value=best_value, left=left_subtree, right=right_subtree)

    def _predict_row(self, row):
        """
        Walk from the root to a leaf for a single row and return the leaf's label.
        """
        node = self.root
        # Internal nodes are the ones without an outcome.
        while node.outcome is None:
            if row[node.feature] == node.value:
                node = node.left
            else:
                node = node.right
        return node.outcome

    def _entropy(self, data):
        """
        Calculate the Shannon entropy (base 2) of the label column of ``data``.
        """
        counts = data.iloc[:, -1].value_counts()
        # value_counts() reports unobserved categories of a categorical column
        # with a count of 0; 0 * log2(0) evaluates to NaN, so drop them first.
        probabilities = counts[counts > 0] / len(data)
        return -np.sum(probabilities * np.log2(probabilities))

    def _best_feature_to_split(self, data):
        """
        Select the (feature, value) equality test with the highest information gain.

        Returns ``(None, None)`` when no candidate leaves rows on both sides.
        """
        best_gain = -1
        best_feature = None
        best_value = None
        entropy = self._entropy(data)

        for feature in data.columns[:-1]:
            for value in data[feature].unique():
                left_data, right_data = self._split_data(data, feature, value)

                # A split that leaves one side empty separates nothing.
                if len(left_data) == 0 or len(right_data) == 0:
                    continue

                gain = self._information_gain(data, left_data, right_data, entropy)

                # Strict comparison: the first candidate reaching a gain wins ties.
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_value = value

        if self.verbose:
            print("best feature: ", best_feature, " best value: ", best_value)
        return best_feature, best_value

    def _information_gain(self, data, left_data, right_data, entropy):
        """
        Calculate the information gain from splitting the data into two groups.

        ``entropy`` is the entropy of ``data`` before the split; the two sides
        are weighted by their share of the rows.
        """
        p = len(left_data) / len(data)
        gain = entropy - p * self._entropy(left_data) - (1 - p) * self._entropy(right_data)
        return gain

    def _split_data(self, data, feature, value):
        """
        Split ``data`` into the rows where ``feature == value`` (left) and the
        remaining rows (right); both parts get a fresh 0..n-1 index.
        """
        left_data = data[data[feature] == value].reset_index(drop=True)
        right_data = data[data[feature] != value].reset_index(drop=True)
        return left_data, right_data

    def _most_common_outcome(self, data):
        """
        Return the most common label in the last column of ``data``.
        """
        # value_counts() sorts by descending frequency, so the first index
        # entry is the majority label.
        outcome_counts = data.iloc[:, -1].value_counts()
        return outcome_counts.index[0]

In [364]:
#OUR DECISION TREE

class Node:
    """
    One vertex of a binary decision tree.

    An internal vertex stores the split test (``feature`` / ``value``) together
    with its two children; a leaf stores only ``outcome``. Every field defaults
    to ``None``.

    NOTE(review): this repeats the Node class declared in the cell holding the
    original decision tree; consider keeping a single definition.
    """
    def __init__(self, feature=None, value=None, left=None, right=None, outcome=None):
        # Split test evaluated at this vertex (left as None on leaves).
        self.feature, self.value = feature, value
        # Subtrees followed when the test matches (left) or does not (right).
        self.left, self.right = left, right
        # Predicted label; a non-None outcome is what marks a leaf.
        self.outcome = outcome

class DecisionTree2:
    """
    Binary decision tree that ranks splits by the standard deviation of the labels.

    Every internal node tests ``row[feature] == value``: rows that match go to
    the left child, all other rows go to the right child. The split that
    minimises the size-weighted standard deviation of the label column on both
    sides is chosen, so the labels have to be numeric. The label column is
    always taken to be the last column of the working frame built in ``fit``.

    Parameters
    ----------
    max_depth : int
        Depth at which recursion stops and a leaf is emitted.
    min_samples_split : int, default 2
        Subsets with fewer rows than this become leaves.
    min_impurity : float, default 1e-7
        Subsets whose label standard deviation is below this become leaves.
    verbose : bool, default True
        When true, print the chosen feature/value after every split search
        (this is the output the tree has always produced).
    """
    def __init__(self, max_depth, min_samples_split=2, min_impurity=1e-7, verbose=True):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.verbose = verbose
        self.root = None

    def fit(self, X, y):
        """
        Grow the tree from the feature frame ``X`` and the labels ``y``.

        ``X`` and ``y`` are concatenated column-wise, which places the labels in
        the last column; every helper below reads the labels from there.
        """
        data = pd.concat([X, y], axis=1)
        self.root = self._build_tree(data)

    def predict(self, X):
        """
        Return the predicted label for every row of ``X``.
        """
        return X.apply(self._predict_row, axis=1)

    def _std(self, data):
        """
        Calculate the population standard deviation (ddof=0) of the label column.

        ddof=0 matches the estimator used when scoring splits in
        ``_std_deviation`` and, unlike the pandas default (ddof=1), yields 0
        rather than NaN for a single-row subset.
        """
        return data.iloc[:, -1].std(ddof=0)

    def _build_tree(self, data, depth=0):
        """
        Recursive function that builds the decision tree.

        Returns a leaf ``Node`` when a stopping rule fires or when no split
        separates the rows; otherwise returns an internal ``Node`` whose
        children are built from the two sides of the best split.
        """
        # Stopping rules, checked in order: depth limit, too few rows,
        # labels already (almost) constant.
        if (
            depth == self.max_depth
            or len(data) < self.min_samples_split
            or self._std(data) < self.min_impurity
        ):
            return Node(outcome=self._most_common_outcome(data))

        # Select the best feature to split the data
        best_feature, best_value = self._best_feature_to_split(data)

        # No candidate put rows on both sides (e.g. every feature is constant
        # while the labels still differ). Splitting on None would raise a
        # KeyError, so fall back to a majority-vote leaf.
        if best_feature is None:
            return Node(outcome=self._most_common_outcome(data))

        # Split the data based on the best feature and value
        left_data, right_data = self._split_data(data, best_feature, best_value)

        # Recursively build the left and right subtrees
        left_subtree = self._build_tree(left_data, depth + 1)
        right_subtree = self._build_tree(right_data, depth + 1)

        return Node(feature=best_feature, value=best_value, left=left_subtree, right=right_subtree)

    def _predict_row(self, row):
        """
        Walk from the root to a leaf for a single row and return the leaf's label.
        """
        node = self.root
        # Internal nodes are the ones without an outcome.
        while node.outcome is None:
            if row[node.feature] == node.value:
                node = node.left
            else:
                node = node.right
        return node.outcome

    def _best_feature_to_split(self, data):
        """
        Select the (feature, value) equality test with the lowest weighted
        label standard deviation.

        Returns ``(None, None)`` when no candidate leaves rows on both sides.
        """
        best_std = float("inf")
        best_feature = None
        best_value = None

        for feature in data.columns[:-1]:
            for value in data[feature].unique():
                left_data, right_data = self._split_data(data, feature, value)

                # A split that leaves one side empty separates nothing.
                if len(left_data) == 0 or len(right_data) == 0:
                    continue

                std = self._std_deviation(left_data, right_data)

                # Strict comparison: the first candidate reaching a score wins ties.
                if std < best_std:
                    best_std = std
                    best_feature = feature
                    best_value = value

        if self.verbose:
            print("best feature: ", best_feature, " best value: ", best_value)
        return best_feature, best_value

    def _std_deviation(self, left_data, right_data):
        """
        Return the size-weighted average of the population standard deviations
        (ddof=0) of the label column on the two sides of a split.
        """
        n_left, n_right = len(left_data), len(right_data)
        total = n_left + n_right
        left_std = left_data.iloc[:, -1].std(ddof=0)
        right_std = right_data.iloc[:, -1].std(ddof=0)
        return (n_left / total) * left_std + (n_right / total) * right_std

    def _split_data(self, data, feature, value):
        """
        Split ``data`` into the rows where ``feature == value`` (left) and the
        remaining rows (right); both parts get a fresh 0..n-1 index.
        """
        left_data = data[data[feature] == value].reset_index(drop=True)
        right_data = data[data[feature] != value].reset_index(drop=True)
        return left_data, right_data

    def _most_common_outcome(self, data):
        """
        Return the most common label in the last column of ``data``.
        """
        # value_counts() sorts by descending frequency, so the first index
        # entry is the majority label.
        outcome_counts = data.iloc[:, -1].value_counts()
        return outcome_counts.index[0]

In [365]:
def accuracy(X, y, depth, test_size=0.3, random_state=123):
    """
    Train both tree variants on the same split and print their test accuracy.

    Parameters
    ----------
    X : feature frame passed to ``train_test_split`` and to the trees.
    y : labels aligned with ``X``.
    depth : ``max_depth`` given to both ``DecisionTree`` and ``DecisionTree2``.
    test_size : share of the rows held out for testing (default 0.3, the value
        that used to be hard-coded).
    random_state : seed of the split (default 123, the value that used to be
        hard-coded), so both trees always see identical train/test rows.

    Returns
    -------
    None. The chosen splits and the two accuracies are printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Original tree (information gain)
    print("ORIGINAL TREE SPLITS")
    original_tree = DecisionTree(max_depth=depth)
    original_tree.fit(X_train, y_train)
    original_predictions = original_tree.predict(X_test)

    print()

    # Our tree (standard deviation)
    print("OUR TREE SPLITS")
    our_tree = DecisionTree2(max_depth=depth)
    our_tree.fit(X_train, y_train)
    our_predictions = our_tree.predict(X_test)

    print()
    print(f"Accuracy original tree: {accuracy_score(y_test, original_predictions)}")
    print(f"Accuracy our tree: {accuracy_score(y_test, our_predictions)}")

In [330]:
# DATASET 1: OpenML task 9971 (dataset "ilpd", 2 classes, target column "Class")

task1 = tasks.get_task(9971)
task1

OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 9971
Task URL.............: https://www.openml.org/t/9971
Estimation Procedure.: crossvalidation
Target Feature.......: Class
# of Classes.........: 2
Cost Matrix..........: Available

In [331]:
# Fetch the dataset that task1 is defined on; the bare name below displays its summary.
dataset1 = task1.get_dataset()
dataset1

OpenML Dataset
Name..........: ilpd
Version.......: 1
Format........: ARFF
Upload Date...: 2015-05-22 22:40:56
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/1590565/ilpd.arff
OpenML URL....: https://www.openml.org/d/1480
# of features.: 11
# of instances: 583

In [332]:
# get_data() is unpacked as a 4-tuple; only the first element (the DataFrame
# displayed below, which still contains the "Class" column) is kept.
df1, _, _, _ = dataset1.get_data()
df1

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,Class
0,65,Female,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.90,1
1,62,Male,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.00,1
4,72,Male,3.9,2.0,195.0,27.0,59.0,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,Male,0.5,0.1,500.0,20.0,34.0,5.9,1.6,0.37,2
579,40,Male,0.6,0.1,98.0,35.0,31.0,6.0,3.2,1.10,1
580,52,Male,0.8,0.2,245.0,48.0,49.0,6.4,3.2,1.00,1
581,31,Male,1.3,0.5,184.0,29.0,32.0,6.8,3.4,1.00,1


In [333]:
# Separate the integer-coded target from the predictors, then compare both trees at depth 5.
y = df1["Class"].astype("int")
X = df1.drop(columns="Class")
accuracy(X, y, 5)

ORIGINAL TREE SPLITS
best feature:  V4  best value:  0.2
best feature:  V6  best value:  25.0
best feature:  V7  best value:  23.0
best feature:  V1  best value:  28
best feature:  V6  best value:  21.0
best feature:  V6  best value:  20.0
best feature:  V7  best value:  21.0
best feature:  V3  best value:  2.4
best feature:  V5  best value:  178.0
best feature:  V4  best value:  0.6
best feature:  V3  best value:  1.3
best feature:  V3  best value:  1.2

OUR TREE SPLITS
best feature:  V4  best value:  0.2
best feature:  V6  best value:  25.0
best feature:  V6  best value:  20.0
best feature:  V1  best value:  31
best feature:  V1  best value:  51
best feature:  V1  best value:  40
best feature:  V1  best value:  32
best feature:  V7  best value:  21.0
best feature:  V3  best value:  2.4
best feature:  V5  best value:  178.0

Accuracy original tree: 0.6857142857142857
Accuracy our tree: 0.7314285714285714


In [335]:
# DATASET 2: OpenML task 3 (dataset "kr-vs-kp", 2 classes, target column "class")

task2 = tasks.get_task(3)
task2

OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 3
Task URL.............: https://www.openml.org/t/3
Estimation Procedure.: crossvalidation
Target Feature.......: class
# of Classes.........: 2
Cost Matrix..........: Available

In [336]:
# Fetch the dataset that task2 is defined on; the bare name below displays its summary.
dataset2 = task2.get_dataset()
dataset2

OpenML Dataset
Name..........: kr-vs-kp
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:19:28
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/3/kr-vs-kp.arff
OpenML URL....: https://www.openml.org/d/3
# of features.: 37
# of instances: 3196

In [337]:
# get_data() is unpacked as a 4-tuple; only the first element (the DataFrame
# displayed below, which still contains the "class" column) is kept.
df2, _, _, _ = dataset2.get_data()
df2

Unnamed: 0,bkblk,bknwy,bkon8,bkona,bkspr,bkxbq,bkxcr,bkxwp,blxwp,bxqsq,...,spcop,stlmt,thrsk,wkcti,wkna8,wknck,wkovl,wkpos,wtoeg,class
0,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
1,f,f,f,f,t,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
2,f,f,f,f,t,f,t,f,f,f,...,f,f,f,f,f,f,t,t,n,won
3,f,f,f,f,f,f,f,f,t,f,...,f,f,f,f,f,f,t,t,n,won
4,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3191,t,f,f,f,f,f,t,f,f,f,...,f,t,f,f,t,f,t,f,n,nowin
3192,t,f,f,f,f,f,t,f,f,f,...,f,t,f,f,t,f,t,f,n,nowin
3193,t,f,f,f,f,f,t,f,f,f,...,f,t,f,f,t,f,t,f,n,nowin
3194,t,f,t,f,f,f,t,f,f,f,...,f,t,f,f,t,f,f,f,n,nowin


In [338]:
# Encode the class labels as integers. Series.map is used instead of
# Series.replace because replace on a categorical column is deprecated in
# pandas >= 2.2 (the column is presumably categorical as loaded from OpenML;
# map works for object columns as well). Labels missing from the mapping are
# passed through unchanged, so re-running this cell is harmless.
class_codes2 = {"nowin": 0, "won": 1}
df2["class"] = df2["class"].map(lambda label: class_codes2.get(label, label))
df2

Unnamed: 0,bkblk,bknwy,bkon8,bkona,bkspr,bkxbq,bkxcr,bkxwp,blxwp,bxqsq,...,spcop,stlmt,thrsk,wkcti,wkna8,wknck,wkovl,wkpos,wtoeg,class
0,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,1
1,f,f,f,f,t,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,1
2,f,f,f,f,t,f,t,f,f,f,...,f,f,f,f,f,f,t,t,n,1
3,f,f,f,f,f,f,f,f,t,f,...,f,f,f,f,f,f,t,t,n,1
4,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3191,t,f,f,f,f,f,t,f,f,f,...,f,t,f,f,t,f,t,f,n,0
3192,t,f,f,f,f,f,t,f,f,f,...,f,t,f,f,t,f,t,f,n,0
3193,t,f,f,f,f,f,t,f,f,f,...,f,t,f,f,t,f,t,f,n,0
3194,t,f,t,f,f,f,t,f,f,f,...,f,t,f,f,t,f,f,f,n,0


In [339]:
# Separate the integer-coded target from the predictors, then compare both trees at depth 5.
y = df2["class"].astype("int")
X = df2.drop(columns="class")
accuracy(X, y, 5)

ORIGINAL TREE SPLITS
best feature:  rimmx  best value:  f
best feature:  bxqsq  best value:  f
best feature:  wknck  best value:  f
best feature:  wkna8  best value:  t
best feature:  bknwy  best value:  f
best feature:  bkxbq  best value:  t
best feature:  wkovl  best value:  t
best feature:  bknwy  best value:  t
best feature:  bkxcr  best value:  t

OUR TREE SPLITS
best feature:  rimmx  best value:  f
best feature:  bxqsq  best value:  f
best feature:  wknck  best value:  f
best feature:  bkxbq  best value:  f
best feature:  wkpos  best value:  f
best feature:  bkxcr  best value:  f
best feature:  wkovl  best value:  t

Accuracy original tree: 0.948905109489051
Accuracy our tree: 0.9520333680917622


In [340]:
# DATASET 3: OpenML task 29 (dataset "credit-approval", 2 classes, target column "class")

task3 = tasks.get_task(29)
task3

OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 29
Task URL.............: https://www.openml.org/t/29
Estimation Procedure.: crossvalidation
Target Feature.......: class
# of Classes.........: 2
Cost Matrix..........: Available

In [341]:
# Fetch the dataset that task3 is defined on; the bare name below displays its summary.
dataset3 = task3.get_dataset()
dataset3

OpenML Dataset
Name..........: credit-approval
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:21:38
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/29/credit-approval.arff
OpenML URL....: https://www.openml.org/d/29
# of features.: 16
# of instances: 690

In [342]:
# get_data() is unpacked as a 4-tuple; only the first element (the DataFrame
# displayed below, which still contains the "class" column) is kept.
df3, _, _, _ = dataset3.get_data()
df3

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1.0,f,g,202.0,0.0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6.0,f,g,43.0,560.0,+
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0.0,f,g,280.0,824.0,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5.0,t,g,100.0,3.0,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0.0,f,s,120.0,0.0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0.0,f,g,260.0,0.0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2.0,t,g,200.0,394.0,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1.0,t,g,200.0,1.0,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0.0,f,g,280.0,750.0,-


In [343]:
# Encode the class labels as integers. Series.map is used instead of
# Series.replace because replace on a categorical column is deprecated in
# pandas >= 2.2 (the column is presumably categorical as loaded from OpenML;
# map works for object columns as well). Labels missing from the mapping are
# passed through unchanged, so re-running this cell is harmless.
class_codes3 = {"-": 0, "+": 1}
df3["class"] = df3["class"].map(lambda label: class_codes3.get(label, label))
df3

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1.0,f,g,202.0,0.0,1
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6.0,f,g,43.0,560.0,1
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0.0,f,g,280.0,824.0,1
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5.0,t,g,100.0,3.0,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0.0,f,s,120.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0.0,f,g,260.0,0.0,0
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2.0,t,g,200.0,394.0,0
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1.0,t,g,200.0,1.0,0
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0.0,f,g,280.0,750.0,0


In [344]:
# Separate the integer-coded target from the predictors, then compare both trees at depth 5.
y = df3["class"].astype("int")
X = df3.drop(columns="class")
accuracy(X, y, 5)

ORIGINAL TREE SPLITS
best feature:  A9  best value:  f
best feature:  A13  best value:  g
best feature:  A3  best value:  0.125
best feature:  A6  best value:  k
best feature:  A2  best value:  16.5
best feature:  A2  best value:  18.0
best feature:  A2  best value:  16.17
best feature:  A7  best value:  v
best feature:  A2  best value:  41.0
best feature:  A6  best value:  i
best feature:  A6  best value:  k
best feature:  A10  best value:  f
best feature:  A8  best value:  0.04
best feature:  A7  best value:  h
best feature:  A6  best value:  c
best feature:  A14  best value:  0.0
best feature:  A15  best value:  5.0
best feature:  A14  best value:  0.0
best feature:  A11  best value:  3.0

OUR TREE SPLITS
best feature:  A9  best value:  f
best feature:  A13  best value:  g
best feature:  A3  best value:  0.125
best feature:  A6  best value:  k
best feature:  A2  best value:  16.5
best feature:  A4  best value:  u
best feature:  A10  best value:  t
best feature:  A7  best value:  v
b

In [345]:
# DATASET 4: OpenML task 9964 (dataset "semeion", 10 classes, target column "Class")

task4 = tasks.get_task(9964)
task4

OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 9964
Task URL.............: https://www.openml.org/t/9964
Estimation Procedure.: crossvalidation
Target Feature.......: Class
# of Classes.........: 10
Cost Matrix..........: Available

In [346]:
# Fetch the dataset that task4 is defined on; the bare name below displays its summary.
dataset4 = task4.get_dataset()
dataset4

OpenML Dataset
Name..........: semeion
Version.......: 1
Format........: ARFF
Upload Date...: 2015-05-25 22:22:34
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/1592293/semeion.arff
OpenML URL....: https://www.openml.org/d/1501
# of features.: 257
# of instances: 1593

In [347]:
# get_data() is unpacked as a 4-tuple; only the first element (the DataFrame
# displayed below, which still contains the "Class" column) is kept.
df4, _, _, _ = dataset4.get_data()
df4

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V248,V249,V250,V251,V252,V253,V254,V255,V256,Class
0,0,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1588,0,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,2
1589,0,0,0,0,0,0,1,1,1,1,...,1,1,1,1,1,1,0,0,0,2
1590,0,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,2
1591,0,0,0,1,1,1,1,1,1,1,...,1,1,1,0,0,0,0,0,0,2


In [348]:
# Separate the integer-coded target from the predictors, then compare both trees at depth 5.
# accuracy() only prints its results and returns nothing, so its result is not
# bound to a name (the former `tree2 = ...` merely stored None).
X = df4.drop("Class", axis="columns")
y = df4["Class"]
y = y.astype("int")
accuracy(X, y, 5)

ORIGINAL TREE SPLITS
best feature:  V162  best value:  0
best feature:  V82  best value:  0
best feature:  V191  best value:  0
best feature:  V238  best value:  0
best feature:  V1  best value:  1
best feature:  V8  best value:  1
best feature:  V182  best value:  0
best feature:  V68  best value:  0
best feature:  V158  best value:  1
best feature:  V63  best value:  0
best feature:  V12  best value:  1
best feature:  V44  best value:  0
best feature:  V191  best value:  0
best feature:  V145  best value:  1
best feature:  V93  best value:  0
best feature:  V181  best value:  1
best feature:  V79  best value:  0
best feature:  V229  best value:  0
best feature:  V119  best value:  0
best feature:  V146  best value:  0
best feature:  V196  best value:  1
best feature:  V104  best value:  0
best feature:  V61  best value:  1
best feature:  V7  best value:  1
best feature:  V127  best value:  1
best feature:  V168  best value:  0
best feature:  V16  best value:  0
best feature:  V121  b

In [349]:
# DATASET 5: OpenML task 146821 (dataset "car", 4 classes, target column "class")

task5 = tasks.get_task(146821)
task5

OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 146821
Task URL.............: https://www.openml.org/t/146821
Estimation Procedure.: crossvalidation
Target Feature.......: class
# of Classes.........: 4
Cost Matrix..........: Available

In [350]:
# Fetch the dataset that task5 is defined on; the bare name below displays its summary.
dataset5 = task5.get_dataset()
dataset5

OpenML Dataset
Name..........: car
Version.......: 3
Format........: ARFF
Upload Date...: 2017-11-30 20:27:42
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/18116966/car.arff
OpenML URL....: https://www.openml.org/d/40975
# of features.: 7
# of instances: 1728

In [351]:
# get_data() is unpacked as a 4-tuple; only the first element (the DataFrame
# displayed below, which still contains the "class" column) is kept.
df5, _, _, _ = dataset5.get_data()
df5

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [352]:
# Encode the class labels as integers. Series.map is used instead of
# Series.replace because replace on a categorical column is deprecated in
# pandas >= 2.2 (the column is presumably categorical as loaded from OpenML;
# map works for object columns as well). Labels missing from the mapping are
# passed through unchanged, so re-running this cell is harmless.
class_codes5 = {"unacc": 1, "acc": 2, "good": 3, "vgood": 4}
df5["class"] = df5["class"].map(lambda label: class_codes5.get(label, label))
df5

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,1
1,vhigh,vhigh,2,2,small,med,1
2,vhigh,vhigh,2,2,small,high,1
3,vhigh,vhigh,2,2,med,low,1
4,vhigh,vhigh,2,2,med,med,1
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,3
1724,low,low,5more,more,med,high,4
1725,low,low,5more,more,big,low,1
1726,low,low,5more,more,big,med,3


In [353]:
# Separate the integer-coded target from the predictors, then compare both trees at depth 5.
y = df5["class"].astype("int")
X = df5.drop(columns="class")
accuracy(X, y, 5)

ORIGINAL TREE SPLITS
best feature:  persons  best value:  2
best feature:  safety  best value:  low
best feature:  maint  best value:  vhigh
best feature:  buying  best value:  high
best feature:  buying  best value:  vhigh
best feature:  buying  best value:  low
best feature:  safety  best value:  med
best feature:  buying  best value:  med

OUR TREE SPLITS
best feature:  persons  best value:  2
best feature:  safety  best value:  low
best feature:  maint  best value:  vhigh
best feature:  buying  best value:  high
best feature:  buying  best value:  vhigh
best feature:  buying  best value:  low
best feature:  safety  best value:  med
best feature:  buying  best value:  med

Accuracy original tree: 0.8439306358381503
Accuracy our tree: 0.8439306358381503


In [354]:
# DATASET 6: OpenML task 53 (dataset "vehicle", 4 classes, target column "Class")

task6 = tasks.get_task(53)
task6

OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 53
Task URL.............: https://www.openml.org/t/53
Estimation Procedure.: crossvalidation
Target Feature.......: Class
# of Classes.........: 4
Cost Matrix..........: Available

In [355]:
# Fetch the dataset that task6 is defined on; the bare name below displays its summary.
dataset6 = task6.get_dataset()
dataset6

OpenML Dataset
Name..........: vehicle
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:23:10
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/54/vehicle.arff
OpenML URL....: https://www.openml.org/d/54
# of features.: 19
# of instances: 846

In [356]:
# get_data() is unpacked as a 4-tuple; only the first element (the DataFrame
# displayed below, which still contains the "Class" column) is kept.
df6, _, _, _ = dataset6.get_data()
df6

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE_CIRCULARITY,RADIUS_RATIO,PR.AXIS_ASPECT_RATIO,MAX.LENGTH_ASPECT_RATIO,SCATTER_RATIO,ELONGATEDNESS,PR.AXIS_RECTANGULARITY,MAX.LENGTH_RECTANGULARITY,SCALED_VARIANCE_MAJOR,SCALED_VARIANCE_MINOR,SCALED_RADIUS_OF_GYRATION,SKEWNESS_ABOUT_MAJOR,SKEWNESS_ABOUT_MINOR,KURTOSIS_ABOUT_MAJOR,KURTOSIS_ABOUT_MINOR,HOLLOWS_RATIO,Class
0,95,48,83,178.0,72,10,162.0,42,20,159,176.0,379.0,184.0,70,6,16,187,197,van
1,91,41,84,141.0,57,9,149.0,45,19,143,170.0,330.0,158.0,72,9,14,189,199,van
2,104,50,106,209.0,66,10,207.0,32,23,158,223.0,635.0,220.0,73,14,9,188,196,saab
3,93,41,82,159.0,63,9,144.0,46,19,143,160.0,309.0,127.0,63,6,10,199,207,van
4,85,44,70,205.0,103,52,149.0,45,19,144,241.0,325.0,188.0,127,9,11,180,183,bus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,93,39,87,183.0,64,8,169.0,40,20,134,200.0,422.0,149.0,72,7,25,188,195,saab
842,89,46,84,163.0,66,11,159.0,43,20,159,173.0,368.0,176.0,72,1,20,186,197,van
843,106,54,101,222.0,67,12,222.0,30,25,173,228.0,721.0,200.0,70,3,4,187,201,saab
844,86,36,78,146.0,58,7,135.0,50,18,124,155.0,270.0,148.0,66,0,25,190,195,saab


In [357]:
# Encode the class labels as integers. Series.map is used instead of
# Series.replace because replace on a categorical column is deprecated in
# pandas >= 2.2 (the column is presumably categorical as loaded from OpenML;
# map works for object columns as well). Labels missing from the mapping are
# passed through unchanged, so re-running this cell is harmless.
class_codes6 = {"opel": 1, "saab": 2, "bus": 3, "van": 4}
df6["Class"] = df6["Class"].map(lambda label: class_codes6.get(label, label))
df6

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE_CIRCULARITY,RADIUS_RATIO,PR.AXIS_ASPECT_RATIO,MAX.LENGTH_ASPECT_RATIO,SCATTER_RATIO,ELONGATEDNESS,PR.AXIS_RECTANGULARITY,MAX.LENGTH_RECTANGULARITY,SCALED_VARIANCE_MAJOR,SCALED_VARIANCE_MINOR,SCALED_RADIUS_OF_GYRATION,SKEWNESS_ABOUT_MAJOR,SKEWNESS_ABOUT_MINOR,KURTOSIS_ABOUT_MAJOR,KURTOSIS_ABOUT_MINOR,HOLLOWS_RATIO,Class
0,95,48,83,178.0,72,10,162.0,42,20,159,176.0,379.0,184.0,70,6,16,187,197,4
1,91,41,84,141.0,57,9,149.0,45,19,143,170.0,330.0,158.0,72,9,14,189,199,4
2,104,50,106,209.0,66,10,207.0,32,23,158,223.0,635.0,220.0,73,14,9,188,196,2
3,93,41,82,159.0,63,9,144.0,46,19,143,160.0,309.0,127.0,63,6,10,199,207,4
4,85,44,70,205.0,103,52,149.0,45,19,144,241.0,325.0,188.0,127,9,11,180,183,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,93,39,87,183.0,64,8,169.0,40,20,134,200.0,422.0,149.0,72,7,25,188,195,2
842,89,46,84,163.0,66,11,159.0,43,20,159,173.0,368.0,176.0,72,1,20,186,197,4
843,106,54,101,222.0,67,12,222.0,30,25,173,228.0,721.0,200.0,70,3,4,187,201,2
844,86,36,78,146.0,58,7,135.0,50,18,124,155.0,270.0,148.0,66,0,25,190,195,2


In [358]:
# Separate the integer-coded target from the predictors, then compare both trees at depth 5.
y = df6["Class"].astype("int")
X = df6.drop(columns="Class")
accuracy(X, y, 5)

ORIGINAL TREE SPLITS
best feature:  PR.AXIS_RECTANGULARITY  best value:  19
best feature:  MAX.LENGTH_ASPECT_RATIO  best value:  9
best feature:  MAX.LENGTH_ASPECT_RATIO  best value:  10
best feature:  CIRCULARITY  best value:  38
best feature:  SKEWNESS_ABOUT_MINOR  best value:  0
best feature:  DISTANCE_CIRCULARITY  best value:  77
best feature:  ELONGATEDNESS  best value:  43
best feature:  CIRCULARITY  best value:  39
best feature:  CIRCULARITY  best value:  37
best feature:  PR.AXIS_RECTANGULARITY  best value:  18
best feature:  DISTANCE_CIRCULARITY  best value:  70
best feature:  SCATTER_RATIO  best value:  143.0
best feature:  MAX.LENGTH_ASPECT_RATIO  best value:  8
best feature:  PR.AXIS_RECTANGULARITY  best value:  17
best feature:  PR.AXIS_ASPECT_RATIO  best value:  57
best feature:  MAX.LENGTH_ASPECT_RATIO  best value:  6

OUR TREE SPLITS
best feature:  PR.AXIS_RECTANGULARITY  best value:  24
best feature:  HOLLOWS_RATIO  best value:  189
best feature:  DISTANCE_CIRCULARITY 

In [359]:
# DATASET 7: OpenML task 3549 (dataset "analcatdata_authorship", 4 classes, target column "Author")

task7 = tasks.get_task(3549)
task7

OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 3549
Task URL.............: https://www.openml.org/t/3549
Estimation Procedure.: crossvalidation
Target Feature.......: Author
# of Classes.........: 4
Cost Matrix..........: Available

In [360]:
# Fetch the dataset that task7 is defined on; the bare name below displays its summary.
dataset7 = task7.get_dataset()
dataset7

OpenML Dataset
Name..........: analcatdata_authorship
Version.......: 1
Format........: ARFF
Upload Date...: 2014-09-28 23:51:06
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/52570/analcatdata_authorship.arff
OpenML URL....: https://www.openml.org/d/458
# of features.: 71
# of instances: 841

In [361]:
# get_data() is unpacked as a 4-tuple; only the first element (the DataFrame
# displayed below, which still contains the "Author" column) is kept.
df7, _, _, _ = dataset7.get_data()
df7

Unnamed: 0,a,all,also,an,and,any,are,as,at,be,...,what,when,which,who,will,with,would,your,BookID,Author
0,46,12,0,3,66,9,4,16,13,13,...,7,5,6,8,4,9,1,0,1,Austen
1,35,10,0,7,44,4,3,18,16,9,...,5,7,7,3,5,14,8,0,1,Austen
2,46,2,0,3,40,1,13,11,9,23,...,10,4,6,4,5,15,3,9,1,Austen
3,40,7,0,4,64,3,3,20,13,20,...,3,6,10,5,3,22,4,3,1,Austen
4,29,5,0,6,52,5,14,17,6,16,...,8,4,13,2,4,21,10,0,1,Austen
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
836,32,4,0,6,33,0,7,8,4,18,...,13,2,3,3,11,17,5,10,12,Shakespeare
837,16,5,0,5,49,1,6,10,3,24,...,6,5,6,0,11,20,2,7,12,Shakespeare
838,22,15,0,3,48,0,9,10,2,13,...,16,2,2,0,12,15,1,10,12,Shakespeare
839,25,4,0,8,59,3,6,7,3,13,...,11,2,2,2,22,23,4,5,12,Shakespeare


In [362]:
# Encode the author labels as integers. Series.map is used instead of
# Series.replace because replace on a categorical column is deprecated in
# pandas >= 2.2 (the column is presumably categorical as loaded from OpenML;
# map works for object columns as well). Labels missing from the mapping are
# passed through unchanged, so re-running this cell is harmless.
author_codes7 = {"Austen": 1, "London": 2, "Milton": 3, "Shakespeare": 4}
df7["Author"] = df7["Author"].map(lambda label: author_codes7.get(label, label))
df7

Unnamed: 0,a,all,also,an,and,any,are,as,at,be,...,what,when,which,who,will,with,would,your,BookID,Author
0,46,12,0,3,66,9,4,16,13,13,...,7,5,6,8,4,9,1,0,1,1
1,35,10,0,7,44,4,3,18,16,9,...,5,7,7,3,5,14,8,0,1,1
2,46,2,0,3,40,1,13,11,9,23,...,10,4,6,4,5,15,3,9,1,1
3,40,7,0,4,64,3,3,20,13,20,...,3,6,10,5,3,22,4,3,1,1
4,29,5,0,6,52,5,14,17,6,16,...,8,4,13,2,4,21,10,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
836,32,4,0,6,33,0,7,8,4,18,...,13,2,3,3,11,17,5,10,12,4
837,16,5,0,5,49,1,6,10,3,24,...,6,5,6,0,11,20,2,7,12,4
838,22,15,0,3,48,0,9,10,2,13,...,16,2,2,0,12,15,1,10,12,4
839,25,4,0,8,59,3,6,7,3,13,...,11,2,2,2,22,23,4,5,12,4


In [363]:
# Separate the integer-coded target from the predictors (BookID is dropped along
# with the target), then compare both trees at depth 5.
y = df7["Author"].astype("int")
X = df7.drop(columns=["BookID", "Author"])
accuracy(X, y, 5)

ORIGINAL TREE SPLITS
best feature:  may  best value:  0
best feature:  should  best value:  0
best feature:  be  best value:  17
best feature:  an  best value:  0
best feature:  a  best value:  17
best feature:  now  best value:  9
best feature:  must  best value:  0
best feature:  a  best value:  55
best feature:  its  best value:  0
best feature:  only  best value:  0
best feature:  be  best value:  7
best feature:  its  best value:  0
best feature:  has  best value:  0
best feature:  your  best value:  0
best feature:  but  best value:  7
best feature:  any  best value:  0
best feature:  been  best value:  0
best feature:  a  best value:  43
best feature:  only  best value:  0
best feature:  her  best value:  0
best feature:  an  best value:  2
best feature:  only  best value:  0
best feature:  should  best value:  3
best feature:  be  best value:  6

OUR TREE SPLITS
best feature:  its  best value:  0
best feature:  been  best value:  0
best feature:  upon  best value:  0
best featu