In [169]:
import pandas as pd
import numpy as np
import random

# **1) Model creation**

In [172]:
def create_node():
    """
    Build a fresh, empty tree node represented as a dictionary.

    Keys of the returned dictionary:

    feature_index: index of the feature this node splits on
    threshold: value the feature is compared against
    is_final: True when the node is a leaf
    label: label returned when the node is a leaf
    child_true: sub-tree followed when the split condition holds
    child_false: sub-tree followed otherwise
    indicies: row indices of X_train that reach this node
    current_depth: depth of the node in the tree
    """
    # Every call builds new containers, so nodes never share their
    # 'child_true' / 'child_false' / 'indicies' objects.
    return dict(
        feature_index=None,
        threshold=None,
        is_final=False,
        label=None,
        child_true={},
        child_false={},
        indicies=[],
        current_depth=None,
    )


def get_proportions(y, nb_labels):
    """
    Compute the share of each label in ``y``.

    y: 1-D collection of integer labels (list, tuple, numpy array, pandas Series...)
    nb_labels: number of labels to report; label ``i`` is counted for ``i`` in ``range(nb_labels)``

    Returns a list of ``nb_labels`` floats where entry ``i`` is the fraction of
    elements of ``y`` equal to ``i``, or an empty list when ``y`` is empty.
    """
    # Coerce to an array so that `y == i` is always an element-wise comparison;
    # on a plain Python list it would be a single scalar `False`.
    y = np.asarray(y)
    n = len(y)

    if n == 0:
        return []

    return [int(np.count_nonzero(y == i)) / n for i in range(nb_labels)]


def compute_criterion(type='gini', y=[], nb_labels=2):
    """
    Compute the impurity of a set of labels.

    type: 'gini' for the Gini impurity, 'entropy' or 'log_loss' for the Shannon
          entropy (computed with the natural logarithm)
    y: 1-D collection of integer labels in ``range(nb_labels)``
    nb_labels: number of distinct labels to account for

    Returns the impurity as a number; 0 when ``y`` is empty.

    Raises ValueError when ``type`` is not a supported criterion.
    """
    # Validate up front so an unknown criterion is reported even when `y` is empty.
    if type not in ('gini', 'entropy', 'log_loss'):
        raise ValueError("The criterion you specified does not exist or is not implemented yet")

    # Work on an array so that plain lists are counted element-wise by get_proportions.
    y = np.asarray(y)
    if len(y) == 0:
        return 0

    proportions = get_proportions(y, nb_labels)

    if type == 'gini':
        return sum(p * (1 - p) for p in proportions)

    # Entropy: absent labels (p == 0) contribute nothing and would break the log.
    return -sum(p * np.log(p) for p in proportions if p != 0)

In [210]:
class DecisionTreeClassifier:
    """
    Decision tree classifier stored as nested node dictionaries (see create_node).

    Labels must be integers in ``0 .. n_classes - 1``. Each internal node sends a
    sample to ``child_true`` when ``x[feature_index] < threshold`` and to
    ``child_false`` otherwise.

    criterion: impurity measure passed to compute_criterion ('gini', 'entropy' or 'log_loss')
    max_depth: maximum number of split levels below the root (1 gives a single
               split); None grows the tree until no further split is possible
    min_samples_split: a node holding fewer samples than this becomes a leaf
    max_features: number of features drawn at random for each split. None uses
                  all of them, an int is an absolute count, a float in (0, 1] is
                  a ratio, 'sqrt' / 'auto' and 'log2' use the matching function
                  of the number of features
    """

    def __init__(self, criterion='gini', max_depth=None, min_samples_split=2, max_features=None):
        self.tree = {}
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features

    def __call__(self, X):
        """Shortcut for ``predict(X)``."""
        return self.predict(X)

    def compute_max_features(self, nb_features):
        """
        Resolve ``self.max_features`` into a concrete number of features.

        The result is stored in ``self.max_features_`` and returned;
        ``self.max_features`` itself is left untouched so the estimator can be
        fitted again on data with a different number of columns.

        Raises ValueError when the setting is out of range or not recognised.
        """
        spec = self.max_features

        if spec is None:
            resolved = nb_features

        elif isinstance(spec, (int, np.integer)):
            if not 0 < spec <= nb_features:
                raise ValueError("The number of features to consider when splitting must be contained between 0 and the number of features of X_train")
            resolved = int(spec)

        elif isinstance(spec, (float, np.floating)):
            # Validate the ratio itself, before it is clamped to at least one feature.
            if not 0 < spec <= 1:
                raise ValueError("The ratio of features to consider when splitting must be contained between 0 and 1")
            resolved = max(1, int(spec * nb_features))

        elif spec == "sqrt" or spec == "auto":
            # Cast to int: the value is later used as a sample size.
            resolved = max(1, int(np.sqrt(nb_features)))

        elif spec == "log2":
            resolved = max(1, int(np.log2(nb_features)))

        else:
            raise ValueError("max_features must be None, an int, a float, 'sqrt', 'log2' or 'auto'")

        self.max_features_ = resolved
        return resolved

    @staticmethod
    def _make_leaf(node, y_node, nb_labels):
        """Turn ``node`` into a leaf predicting the most frequent label of ``y_node``."""
        node['is_final'] = True
        node['label'] = np.argmax(get_proportions(y_node, nb_labels=nb_labels))

    def _best_split(self, X_node, y_node, features_to_check, nb_labels):
        """
        Return the ``(feature_index, threshold)`` with the lowest weighted impurity
        among ``features_to_check``, or ``(None, None)`` when no split is possible.
        """
        nb_samples = len(y_node)
        min_criterion = np.inf
        best_feature = None
        best_threshold = None

        for i in features_to_check:
            column = X_node[:, i]

            # np.unique returns sorted values. The smallest one is skipped because
            # no sample is strictly below it.
            for value in np.unique(column)[1:]:
                is_lower = column < value
                y_lower = y_node[is_lower]
                y_greater = y_node[~is_lower]

                nb_samples_lower = len(y_lower)
                nb_samples_greater = len(y_greater)

                # A threshold that leaves one side empty separates nothing
                # (e.g. a NaN threshold, for which every comparison is False).
                if nb_samples_lower == 0 or nb_samples_greater == 0:
                    continue

                criterion_lower = compute_criterion(self.criterion, y_lower, nb_labels=nb_labels)
                criterion_greater = compute_criterion(self.criterion, y_greater, nb_labels=nb_labels)

                # Impurity of the split: children impurities weighted by their size.
                criterion_split = (nb_samples_lower / nb_samples * criterion_lower
                                   + nb_samples_greater / nb_samples * criterion_greater)

                if criterion_split < min_criterion:
                    min_criterion = criterion_split
                    best_feature = i
                    best_threshold = value

        return best_feature, best_threshold

    def fit(self, X, y):
        """
        Grow the tree on the training data and store its root in ``self.tree``.

        X: 2-D array-like of shape (n_samples, n_features), e.g. a DataFrame
        y: 1-D array-like of integer labels in ``0 .. n_classes - 1``, e.g. a Series
        """
        X = np.asarray(X)
        y = np.asarray(y)

        n, nb_features = X.shape
        # Labels start at 0, so the number of labels is the largest label plus one.
        nb_labels = int(np.max(y)) + 1

        max_features = self.compute_max_features(nb_features)

        # The tree starts as a single node holding every sample.
        root = create_node()
        root['indicies'] = np.arange(n)
        root['current_depth'] = 1

        # Stack of nodes that still have to be processed.
        to_compute = [root]

        while to_compute:
            node = to_compute.pop()

            # Rows of the dataset that reach this node.
            indicies = node['indicies']
            X_node = X[indicies]
            y_node = y[indicies]

            # A node holding a single class is a leaf.
            if len(np.unique(y_node)) == 1:
                node['is_final'] = True
                node['label'] = y_node[0]
                continue

            # The root has current_depth 1, so a node whose current_depth exceeds
            # max_depth already sits max_depth split levels below the root.
            depth_reached = (self.max_depth is not None) and (node['current_depth'] > self.max_depth)

            if len(indicies) < self.min_samples_split or depth_reached:
                self._make_leaf(node, y_node, nb_labels)
                continue

            # Random subset of features examined for this split.
            features_to_check = random.sample(range(nb_features), max_features)
            feature_index, threshold = self._best_split(X_node, y_node, features_to_check, nb_labels)

            # No candidate feature could separate the samples.
            if feature_index is None:
                self._make_leaf(node, y_node, nb_labels)
                continue

            node['feature_index'] = feature_index
            node['threshold'] = threshold
            node['is_final'] = False

            # Same rule as predict_one: "< threshold" goes to child_true,
            # everything else goes to child_false.
            goes_true = X_node[:, feature_index] < threshold
            new_depth = node['current_depth'] + 1

            child_true = create_node()
            child_true['indicies'] = indicies[goes_true]
            child_true['current_depth'] = new_depth

            child_false = create_node()
            child_false['indicies'] = indicies[~goes_true]
            child_false['current_depth'] = new_depth

            node['child_true'] = child_true
            node['child_false'] = child_false

            # The children are finalised (leaf or further split) when popped.
            to_compute.append(child_true)
            to_compute.append(child_false)

        self.tree = root

    def predict(self, X):
        """Return a numpy array with the predicted label of every row of ``X``."""
        X = np.asarray(X)
        return np.array([self.predict_one(row) for row in X])

    def predict_one(self, x):
        """Walk the fitted tree from the root and return the label of the leaf reached by ``x``."""
        current_node = self.tree

        while not current_node['is_final']:
            threshold = current_node['threshold']
            feature_index = current_node['feature_index']
            if x[feature_index] < threshold:
                current_node = current_node['child_true']
            else:
                current_node = current_node['child_false']

        return current_node['label']
        

# **2) Load dataset**

Load the SDSS17 stellar classification dataset (from Kaggle), whose `class` column labels each observed object as `GALAXY`, `QSO` or `STAR`.

In [211]:
# Dataset to download from https://www.kaggle.com/datasets/fedesoriano/stellar-classification-dataset-sdss17
df = pd.read_csv(filepath_or_buffer="star_classification.csv")
df.head()

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.1522e+18,GALAXY,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842


In [212]:
# Drop the object identifier column. errors='ignore' keeps the cell re-runnable
# once the column is already gone.
df = df.drop(columns=['obj_ID'], errors='ignore')

# Integer code assigned to each class name.
dico = {'GALAXY': 0, 'QSO': 1, 'STAR': 2}

# Encode only while the column still holds the class names, so that running the
# cell a second time does not fail on the already-encoded integers.
if not pd.api.types.is_numeric_dtype(df['class']):
    df['class'] = df['class'].apply(lambda s: dico[s])

# **3) Test**

In [213]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Separate the integer-encoded target from the feature columns.
y = df['class']
X = df.drop(columns=['class'])

# Keep 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a tree with max_depth=1 that considers 3 randomly chosen features per split.
model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=1,
    min_samples_split=2,
    max_features=3,
)
model.fit(X_train, y_train)

In [196]:
# accuracy_score takes (y_true, y_pred): the ground truth goes first.
preds = model.predict(X_train)
print("Training score : ", accuracy_score(y_train, preds))

preds = model.predict(X_test)
print("Testing score : ", accuracy_score(y_test, preds))

0.9814814814814815
0.42857142857142855
