# Дерево решений

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# 1. Загрузка и предобработка данных

In [2]:
# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv('Heart_Disease_and_Hospitals.csv')

# Encode gender as a binary flag: 1 for 'Male', 0 for every other value.
df['gender'] = (df['gender'] == 'Male').astype('int64')

# Feature matrix and target column used by both tree implementations.
feature_columns = ['age', 'blood_pressure', 'cholesterol', 'bmi', 'glucose_level', 'gender']
X = df[feature_columns]
y = df['heart_disease']

# 70/30 train/test split with a fixed seed so the run is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# The hand-written tree works on plain NumPy arrays rather than pandas objects.
train_data, test_data = X_train.values, X_test.values
train_labels, test_labels = y_train.values, y_test.values

# 2. Реализация самописного дерева решений

In [3]:
class Node:
    """Internal node of the decision tree: a split rule plus its two subtrees."""

    def __init__(self, index, t, true_branch, false_branch):
        # Split rule: the feature at column `index` is compared with threshold `t`.
        self.index, self.t = index, t
        # Subtrees taken when the split condition is true / false respectively.
        self.true_branch, self.false_branch = true_branch, false_branch

class Leaf:
    """Terminal node: keeps the samples that reached it and a majority-vote prediction."""

    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
        # Computed once at construction; classification just reads this attribute.
        self.prediction = self.predict()

    def predict(self):
        """Return the most frequent label among ``self.labels``.

        Ties are resolved in favour of the smallest label, so the result does
        not depend on the order in which rows arrived at the leaf. This is the
        same "lowest class wins" rule documented for scikit-learn's
        ``DecisionTreeClassifier.predict``.

        Raises:
            ValueError: if the leaf holds no labels.
        """
        # len() works for both lists and NumPy arrays (truthiness does not).
        if len(self.labels) == 0:
            raise ValueError("Cannot compute a prediction for a leaf with no labels")

        counts = {}
        for label in self.labels:
            counts[label] = counts.get(label, 0) + 1

        top_count = max(counts.values())
        # Among equally frequent labels pick the smallest one (deterministic).
        return min(label for label, count in counts.items() if count == top_count)

def gini(labels):
    """Return the Gini impurity ``1 - sum(p_k ** 2)`` of a collection of labels.

    ``p_k`` is the share of samples belonging to class ``k``. An empty
    collection contains no mixed classes and is therefore reported as pure
    (impurity 0.0) instead of the maximal value 1.0.
    """
    n_samples = len(labels)
    if n_samples == 0:
        return 0.0
    # Only the per-class counts are needed; the class values themselves are not.
    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / n_samples
    return 1 - np.sum(probabilities**2)

def gain(left_labels, right_labels, root_gini):
    """Return the impurity reduction achieved by a split.

    The children's Gini impurities are averaged, weighted by the share of
    samples each child receives, and subtracted from the parent's impurity
    ``root_gini``.
    """
    n_left = len(left_labels)
    n_total = n_left + len(right_labels)
    left_share = n_left / n_total
    children_gini = left_share * gini(left_labels) + (1 - left_share) * gini(right_labels)
    return root_gini - children_gini

def split(data, labels, column_index, t):
    """Partition samples into two branches by comparing one feature to a threshold.

    Rows whose value in column ``column_index`` is ``<= t`` go to the first
    (left) part, rows with a value ``> t`` go to the second (right) part.

    Returns:
        ``(left_data, right_data, left_labels, right_labels)``.
    """
    feature = data[:, column_index]
    # Two explicit comparisons (not a mask and its negation): a row that
    # satisfies neither, e.g. a NaN value, ends up in neither part.
    goes_left = feature <= t
    goes_right = feature > t
    return data[goes_left], data[goes_right], labels[goes_left], labels[goes_right]


def find_best_split(data, labels, min_samples_leaf):
    """Exhaustively search for the split with the largest Gini gain.

    For every feature column the candidate thresholds are the midpoints
    between consecutive distinct values. A candidate is skipped when either
    side would receive fewer than ``min_samples_leaf`` samples.

    Returns:
        ``(best_gain, best_t, best_index)``; ``(0, None, None)`` when no
        admissible split yields a strictly positive gain.
    """
    parent_impurity = gini(labels)
    best = (0, None, None)  # (gain, threshold, feature index)

    for feature_idx in range(data.shape[1]):
        distinct_values = np.unique(data[:, feature_idx])
        midpoints = (distinct_values[:-1] + distinct_values[1:]) / 2

        for candidate in midpoints:
            _, _, left_labels, right_labels = split(data, labels, feature_idx, candidate)

            # Reject splits that would create an undersized leaf.
            if min(len(left_labels), len(right_labels)) < min_samples_leaf:
                continue

            candidate_gain = gain(left_labels, right_labels, parent_impurity)
            # Strict comparison: on equal gain the earliest candidate is kept.
            if candidate_gain > best[0]:
                best = (candidate_gain, candidate, feature_idx)

    return best

def build_tree(data, labels, max_depth=5, min_samples_leaf=5, current_depth=0):
    """Recursively grow a decision tree and return its root.

    Growth stops with a ``Leaf`` when ``current_depth`` reaches ``max_depth``
    or when ``find_best_split`` finds no split with positive gain; otherwise a
    ``Node`` is created and both halves of the data are grown one level deeper.
    """
    if current_depth >= max_depth:
        return Leaf(data, labels)

    best_gain, threshold, feature_idx = find_best_split(data, labels, min_samples_leaf)
    if feature_idx is None or best_gain == 0:
        return Leaf(data, labels)

    left_data, right_data, left_labels, right_labels = split(data, labels, feature_idx, threshold)
    next_depth = current_depth + 1
    return Node(
        feature_idx,
        threshold,
        build_tree(left_data, left_labels, max_depth, min_samples_leaf, next_depth),
        build_tree(right_data, right_labels, max_depth, min_samples_leaf, next_depth),
    )

def classify_object(obj, node):
    """Return the prediction of the leaf that ``obj`` reaches, starting from ``node``.

    At each internal node the true branch is followed when the sample's
    feature value is ``<=`` the node threshold, the false branch otherwise.
    """
    current = node
    while not isinstance(current, Leaf):
        if obj[current.index] <= current.t:
            current = current.true_branch
        else:
            current = current.false_branch
    return current.prediction

def predict(data, tree):
    """Classify every sample in ``data`` with ``tree`` and return the labels as a list."""
    return [classify_object(sample, tree) for sample in data]


def accuracy_metric(actual, predicted):
    """Return the percentage (0-100) of positions where ``predicted`` equals ``actual``.

    Raises:
        ValueError: if ``actual`` and ``predicted`` do not have the same shape.
    """
    actual_arr = np.asarray(actual)
    predicted_arr = np.asarray(predicted)
    # Without this check NumPy may broadcast the comparison (e.g. a length-1
    # input against a longer one) and silently produce a meaningless score.
    if actual_arr.shape != predicted_arr.shape:
        raise ValueError(
            "actual and predicted must have the same shape, got {} and {}".format(
                actual_arr.shape, predicted_arr.shape
            )
        )
    correct_predictions = np.sum(actual_arr == predicted_arr)
    return correct_predictions / len(actual) * 100.0

# 3. Обучение, оценка и сравнение моделей

In [4]:
# Both trees get the same hyperparameters so their predictions are comparable.
tree_params = {'max_depth': 3, 'min_samples_leaf': 5}

# Hand-written tree: fit on the training split, score on the test split.
my_tree = build_tree(train_data, train_labels, **tree_params)
my_predictions = predict(test_data, my_tree)
my_accuracy = accuracy_metric(test_labels, my_predictions)
print("Точность самописного дерева: {:.4f}%".format(my_accuracy))

# Reference tree from scikit-learn trained on the same data.
sk_tree = DecisionTreeClassifier(criterion='gini', random_state=42, **tree_params)
sk_tree.fit(train_data, train_labels)
sk_predictions = sk_tree.predict(test_data)
sk_accuracy = accuracy_score(test_labels, sk_predictions) * 100
print("Точность дерева из sklearn: {:.4f}%".format(sk_accuracy))

# Report whether the two models agree on every test sample.
predictions_match = np.array_equal(my_predictions, sk_predictions)
print("Результаты совпадают!" if predictions_match else "Результаты не совпадают.")

Точность самописного дерева: 88.2333%
Точность дерева из sklearn: 88.2333%
Результаты совпадают!
