# Лабораторная работа 3 (DecisionTree)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from utils import regression_cross_validate, display_metrics_table, classification_cross_validate, display_metrics_classification_table
import warnings
warnings.filterwarnings("ignore")

### Regression

#### 1. Обработка данных

In [2]:
# Load the laptop price dataset (path is relative to the notebook's working
# directory) and preview its first rows.
df = pd.read_csv('data/laptop_prices.csv')
df.head()

Unnamed: 0,Company,Product,TypeName,Inches,Ram,OS,Weight,Price_euros,Screen,ScreenW,...,RetinaDisplay,CPU_company,CPU_freq,CPU_model,PrimaryStorage,SecondaryStorage,PrimaryStorageType,SecondaryStorageType,GPU_company,GPU_model
0,Apple,MacBook Pro,Ultrabook,13.3,8,macOS,1.37,1339.69,Standard,2560,...,Yes,Intel,2.3,Core i5,128,0,SSD,No,Intel,Iris Plus Graphics 640
1,Apple,Macbook Air,Ultrabook,13.3,8,macOS,1.34,898.94,Standard,1440,...,No,Intel,1.8,Core i5,128,0,Flash Storage,No,Intel,HD Graphics 6000
2,HP,250 G6,Notebook,15.6,8,No OS,1.86,575.0,Full HD,1920,...,No,Intel,2.5,Core i5 7200U,256,0,SSD,No,Intel,HD Graphics 620
3,Apple,MacBook Pro,Ultrabook,15.4,16,macOS,1.83,2537.45,Standard,2880,...,Yes,Intel,2.7,Core i7,512,0,SSD,No,AMD,Radeon Pro 455
4,Apple,MacBook Pro,Ultrabook,13.3,8,macOS,1.37,1803.6,Standard,2560,...,Yes,Intel,3.1,Core i5,256,0,SSD,No,Intel,Iris Plus Graphics 650


In [3]:
# Separate the regression target (the `Price_euros` column) from the predictors.
features, target = df.drop(columns=['Price_euros']), df['Price_euros']

numerical_features = features.select_dtypes(include=[np.number]).columns  # Numeric features
categorical_features = features.select_dtypes(exclude=[np.number]).columns  # Categorical features

# Hold-out split with train_test_split's default test size; the seed fixes the partition.
train_features, test_features, train_target, test_target = train_test_split(features, target, random_state=42)

# Despite the variable name `le`, this is an OrdinalEncoder: every categorical
# column is mapped to integer codes, and categories not seen during fit get -1.
le = OrdinalEncoder(handle_unknown='use_encoded_value',
                    unknown_value=-1)

# Fit the encoder on the training part only and reuse it for the test part.
train_features[categorical_features] = le.fit_transform(train_features[categorical_features])
test_features[categorical_features] = le.transform(test_features[categorical_features])

# Fill missing values with the most frequent value learned from the training part.
# Rebuilding the DataFrames from the imputer output gives them a fresh 0..n-1
# index, while train_target / test_target keep the index produced by the split.
imputer = SimpleImputer(strategy='most_frequent') 
train_features = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns)
test_features = pd.DataFrame(imputer.transform(test_features), columns=test_features.columns)

#### 2. Построение бейзлайна 

Для оценки модели будем использовать кросс-валидацию, которая даёт более надёжную оценку метрик, чем единственное разбиение на обучающую и тестовую выборки.

In [4]:
# 5-fold cross-validation of an untuned sklearn tree on the training part
# (helper imported from utils); the result is shown as a metrics table.
metrics = regression_cross_validate(DecisionTreeRegressor, train_features.to_numpy(), train_target.to_numpy(), n_folds=5, random_state=42)
display_metrics_table(*metrics)

# Fit one baseline tree on the whole training part and score it on the hold-out test set.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(train_features, train_target)

predicted_target = dt.predict(test_features)

# Metrics
mse = mean_squared_error(test_target, predicted_target)
mae = mean_absolute_error(test_target, predicted_target)
r2 = r2_score(test_target, predicted_target)

# Print the metrics
print("\n=== Результаты на Тесте ===")
print(f"Среднеквадратичная ошибка (MSE): {mse:.2f}")
print(f"Средняя абсолютная ошибка (MAE): {mae:.2f}")
print(f"Коэффициент детерминации (R^2): {r2:.2f}")

| Metric   |          Mean |       Std Dev |
|:---------|--------------:|--------------:|
| MAE      |    227.449    |    13.0951    |
| MSE      | 132840        | 30014.5       |
| R2       |      0.728708 |     0.0612653 |

=== Результаты на Тесте ===
Среднеквадратичная ошибка (MSE): 145584.74
Средняя абсолютная ошибка (MAE): 247.89
Коэффициент детерминации (R^2): 0.70


Как можно увидеть, значение метрики $R^2$ на тестовой выборке около 0.70; это означает, что около 70% дисперсии целевой переменной объясняется моделью.

Сформулируем несколько гипотез, которые могут помочь улучшить качество модели

1) Поменять Encoder категориальных признаков с `OrdinalEncoder` на `OneHotEncoder`
2) Отмасштабировать численные признаки
3) Добавить параметр глубины в дереве

In [5]:
# Re-encode the categorical columns as one-hot indicators.
# At this point the categorical columns already hold the integer codes written
# by the OrdinalEncoder in the preprocessing cell, so the one-hot categories are
# those codes. The encoder is fitted on the training part only.
onehot = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')

encoded_train_data = onehot.fit_transform(train_features[categorical_features])
encoded_test_data = onehot.transform(test_features[categorical_features])

# Training part: drop the ordinal-coded columns and append the one-hot columns.
encoded_df = pd.DataFrame(encoded_train_data, columns=onehot.get_feature_names_out(categorical_features))
updated_train_features = train_features.drop(columns=categorical_features).reset_index(drop=True)
updated_train_features = pd.concat([updated_train_features, encoded_df], axis=1)

# Test part: the same transformation with the already fitted encoder.
encoded_df = pd.DataFrame(encoded_test_data, columns=onehot.get_feature_names_out(categorical_features))
updated_test_features = test_features.drop(columns=categorical_features).reset_index(drop=True)
updated_test_features = pd.concat([updated_test_features, encoded_df], axis=1)

# NOTE(review): feature scaling is disabled for the regression experiment, so the
# reported "improved" results use one-hot encoding and max_depth only.
# scaler = StandardScaler()
# updated_train_features[numerical_features] = scaler.fit_transform(train_features[numerical_features])
# updated_test_features[numerical_features] = scaler.transform(test_features[numerical_features])

In [6]:
# 5-fold cross-validation of a depth-limited sklearn tree on the one-hot encoded training part.
metrics = regression_cross_validate(DecisionTreeRegressor, updated_train_features.to_numpy(), train_target.to_numpy(), n_folds=5, max_depth=10, random_state=42)
display_metrics_table(*metrics)

# Fit the same configuration on the whole training part and score it on the hold-out test set.
dt = DecisionTreeRegressor(max_depth=10, random_state=42)
dt.fit(updated_train_features, train_target)

predicted_target = dt.predict(updated_test_features)

# Metrics
mse = mean_squared_error(test_target, predicted_target)
mae = mean_absolute_error(test_target, predicted_target)
r2 = r2_score(test_target, predicted_target)

# Print the metrics
print("\n=== Результаты на Тесте ===")
print(f"Среднеквадратичная ошибка (MSE): {mse:.2f}")
print(f"Средняя абсолютная ошибка (MAE): {mae:.2f}")
print(f"Коэффициент детерминации (R^2): {r2:.2f}")

| Metric   |          Mean |      Std Dev |
|:---------|--------------:|-------------:|
| MAE      |    230.343    |    16.5754   |
| MSE      | 145682        | 28143.9      |
| R2       |      0.702678 |     0.054038 |

=== Результаты на Тесте ===
Среднеквадратичная ошибка (MSE): 118536.23
Средняя абсолютная ошибка (MAE): 228.46
Коэффициент детерминации (R^2): 0.76


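На кросс-валидации средние метрики практически не изменились и даже немного ухудшились ($R^2$: 0.73 → 0.70, MAE: 227 → 230). На тестовой выборке, напротив, качество заметно выросло ($R^2$: 0.70 → 0.76, MSE: 145584.74 → 118536.23). Таким образом, эффект проведённых модификаций подтверждается на тесте, но не на кросс-валидации, и его следует считать умеренным.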

#### 4. Реализация своего класса

In [7]:
class MyDecisionTreeRegressor:
    """Minimal CART-style regression tree built on NumPy arrays.

    Every split has the form ``feature <= value`` and is chosen to minimise the
    summed squared deviation of the two children from their own means.  The
    fitted tree is stored in ``self.tree`` as nested dicts with the keys
    ``'feature'``, ``'value'``, ``'left'`` and ``'right'``; a leaf is the mean
    target of the training samples that reached it.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum number of splits on any root-to-leaf path.  ``None`` grows the
        tree until every leaf is pure or cannot be split any further.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, features, target):
        """Grow the tree on the training data.

        Parameters
        ----------
        features : array-like of shape (n_samples, n_features)
        target : array-like of shape (n_samples,)

        Returns
        -------
        self
        """
        # Coerce to plain arrays so DataFrames/Series (whatever their index)
        # are handled positionally, exactly like NumPy input.
        features = np.asarray(features)
        target = np.asarray(target)
        self.tree = self._build_tree(features, target, depth=0)
        return self

    def _build_tree(self, features, target, depth):
        """Recursively build the subtree for the given samples.

        Returns either a leaf (mean of ``target``) or a split-node dict.
        """
        # `is not None` keeps max_depth=0 meaningful (a single leaf) instead of
        # silently treating it as "unlimited".
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or target.size <= 1 or np.all(target == target[0]):
            return np.mean(target)

        best_split = self._find_best_split(features, target)
        if best_split is None:
            # No feature can separate the samples (e.g. identical rows).
            return np.mean(target)

        goes_left = features[:, best_split['feature']] <= best_split['value']
        goes_right = ~goes_left
        return {
            'feature': best_split['feature'],
            'value': best_split['value'],
            'left': self._build_tree(features[goes_left], target[goes_left], depth + 1),
            'right': self._build_tree(features[goes_right], target[goes_right], depth + 1),
        }

    def _find_best_split(self, features, target):
        """Return ``{'feature', 'value'}`` of the lowest-score split, or ``None``.

        ``None`` means that no threshold puts at least one sample on each side.
        """
        best_split = None
        best_score = float('inf')

        for feature in range(features.shape[1]):
            column = features[:, feature]
            # Sorted unique thresholds make tie-breaking well defined (the
            # lowest threshold wins).  The largest value is dropped because
            # `<= max` would send every sample to the left child.
            for value in np.unique(column)[:-1]:
                goes_left = column <= value
                goes_right = ~goes_left
                # Guard against degenerate masks (e.g. NaN thresholds) so the
                # score is never computed on an empty child.
                if not goes_left.any() or not goes_right.any():
                    continue

                score = self._calculate_split_score(target[goes_left], target[goes_right])
                if score < best_score:
                    best_score = score
                    best_split = {'feature': feature, 'value': value}

        return best_split

    def _calculate_split_score(self, left_y, right_y):
        """Sum of squared deviations of both children (variance times size)."""
        left_score = np.var(left_y) * len(left_y)
        right_score = np.var(right_y) * len(right_y)
        return left_score + right_score

    def predict(self, features):
        """Return an array with one prediction per row of ``features``."""
        samples = np.asarray(features)
        return np.array([self._predict_sample(x, self.tree) for x in samples])

    def _predict_sample(self, x, tree):
        """Walk from ``tree`` down to a leaf for a single sample ``x``."""
        node = tree
        while isinstance(node, dict):
            node = node['left'] if x[node['feature']] <= node['value'] else node['right']
        return node

In [8]:
# 5-fold cross-validation of the hand-written tree on the ordinal-encoded training part.
metrics = regression_cross_validate(MyDecisionTreeRegressor, train_features.to_numpy(), train_target.to_numpy(), n_folds=5)
display_metrics_table(*metrics)

# Fit an unrestricted hand-written tree on the whole training part.
# NumPy arrays are passed because the class indexes its inputs with `[:, j]`.
dt = MyDecisionTreeRegressor()
dt.fit(train_features.to_numpy(), train_target.to_numpy())

predicted_target = dt.predict(test_features.to_numpy())

# Metrics
mse = mean_squared_error(test_target, predicted_target)
mae = mean_absolute_error(test_target, predicted_target)
r2 = r2_score(test_target, predicted_target)

# Print the metrics
print("\n=== Результаты на Тесте ===")
print(f"Среднеквадратичная ошибка (MSE): {mse:.2f}")
print(f"Средняя абсолютная ошибка (MAE): {mae:.2f}")
print(f"Коэффициент детерминации (R^2): {r2:.2f}")

| Metric   |          Mean |      Std Dev |
|:---------|--------------:|-------------:|
| MAE      |    237.603    |    13.8225   |
| MSE      | 147960        | 22421.1      |
| R2       |      0.697786 |     0.044956 |

=== Результаты на Тесте ===
Среднеквадратичная ошибка (MSE): 172541.46
Средняя абсолютная ошибка (MAE): 275.09
Коэффициент детерминации (R^2): 0.64


In [9]:
# 5-fold cross-validation of the depth-limited hand-written tree on the one-hot encoded training part.
metrics = regression_cross_validate(MyDecisionTreeRegressor, updated_train_features.to_numpy(), train_target.to_numpy(), n_folds=5, max_depth=10)
display_metrics_table(*metrics)

# Fit the same configuration on the whole training part.
# The target is passed as a NumPy array, like in the cross-validation call above:
# train_target still carries the shuffled index from train_test_split, and the
# hand-written tree indexes its inputs positionally.
dt = MyDecisionTreeRegressor(max_depth=10)
dt.fit(updated_train_features.to_numpy(), train_target.to_numpy())

predicted_target = dt.predict(updated_test_features.to_numpy())

# Metrics
mse = mean_squared_error(test_target, predicted_target)
mae = mean_absolute_error(test_target, predicted_target)
r2 = r2_score(test_target, predicted_target)

# Print the metrics
print("\n=== Результаты на Тесте ===")
print(f"Среднеквадратичная ошибка (MSE): {mse:.2f}")
print(f"Средняя абсолютная ошибка (MAE): {mae:.2f}")
print(f"Коэффициент детерминации (R^2): {r2:.2f}")

| Metric   |         Mean |       Std Dev |
|:---------|-------------:|--------------:|
| MAE      |    229.499   |    17.6742    |
| MSE      | 146707       | 30914.9       |
| R2       |      0.70176 |     0.0511069 |

=== Результаты на Тесте ===
Среднеквадратичная ошибка (MSE): 120889.44
Средняя абсолютная ошибка (MAE): 235.83
Коэффициент детерминации (R^2): 0.75


Результаты демонстрируют, что разработанная собственная реализация модели в целом обеспечивает уровень качества, сопоставимый с результатами, достигаемыми стандартной моделью из библиотеки `sklearn`.

### Classification

#### 1. Обработка данных

In [10]:
# Load the AIDS classification dataset (path is relative to the notebook's
# working directory) and preview its first rows. This rebinds `df`, which held
# the laptop data in the regression part.
df = pd.read_csv('data/AIDS_Classification.csv')
df.head()

Unnamed: 0,time,trt,age,wtkg,hemo,homo,drugs,karnof,oprior,z30,...,str2,strat,symptom,treat,offtrt,cd40,cd420,cd80,cd820,infected
0,948,2,48,89.8128,0,0,0,100,0,0,...,0,1,0,1,0,422,477,566,324,0
1,1002,3,61,49.4424,0,0,0,90,0,1,...,1,3,0,1,0,162,218,392,564,1
2,961,3,45,88.452,0,1,1,90,0,1,...,1,3,0,1,1,326,274,2063,1893,0
3,1166,3,47,85.2768,0,1,0,100,0,1,...,1,3,0,1,0,287,394,1590,966,0
4,1090,0,43,66.6792,0,1,0,100,0,1,...,1,3,0,0,0,504,353,870,782,0


In [11]:
# Separate the class label (the `infected` column) from the predictors.
features, target = df.drop(columns=['infected']), df['infected']

# Hold-out split with train_test_split's default test size; the seed fixes the partition.
train_features, test_features, train_target, test_target = train_test_split(features, target, random_state=42)

num_features = features.select_dtypes(include=[np.number]).columns  # Numeric features
categorical_features = features.select_dtypes(exclude=[np.number]).columns  # Categorical features

# Despite the variable name `le`, this is an OrdinalEncoder: every categorical
# column is mapped to integer codes, and categories not seen during fit get 99.
le = OrdinalEncoder(handle_unknown='use_encoded_value',
                    unknown_value=99)

# Fit the encoder on the training part only and reuse it for the test part.
train_features[categorical_features] = le.fit_transform(train_features[categorical_features])
test_features[categorical_features] = le.transform(test_features[categorical_features])

# Fill missing values with the most frequent value learned from the training part.
# Rebuilding the DataFrames from the imputer output gives them a fresh 0..n-1
# index, while train_target / test_target keep the index produced by the split.
imputer = SimpleImputer(strategy='most_frequent') 
train_features = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns)
test_features = pd.DataFrame(imputer.transform(test_features), columns=test_features.columns)

In [12]:
# 5-fold cross-validation of an untuned sklearn tree on the training part.
# NOTE(review): no seed is passed here because the signature of
# classification_cross_validate is not visible from this notebook — confirm
# whether it accepts one, otherwise this table can change between runs.
metrics = classification_cross_validate(DecisionTreeClassifier, train_features.to_numpy(), train_target.to_numpy(), n_folds=5)
display_metrics_classification_table(*metrics)

# Baseline tree with default hyper-parameters. The fixed seed makes the fitted
# tree (and therefore the test metrics below) reproducible across reruns, in
# line with the seeded regressors in the regression part.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(train_features, train_target)

predicted_target = dt.predict(test_features)

# Class-frequency-weighted averages of the per-class scores.
accuracy = accuracy_score(test_target, predicted_target)
precision = precision_score(test_target, predicted_target, average='weighted')
recall = recall_score(test_target, predicted_target, average='weighted')
f1 = f1_score(test_target, predicted_target, average='weighted')

# Print the results
print("\n=== Результаты на Тесте ===")
print(f"1. Accuracy: {accuracy:.2%}")
print(f"2. Precision: {precision:.2%}")
print(f"3. Recall: {recall:.2%}")
print(f"4. F1-score: {f1:.2%}")

| Metric    |     Mean |   Std Dev |
|:----------|---------:|----------:|
| Accuracy  | 0.865977 | 0.0310933 |
| Precision | 0.8663   | 0.0287761 |
| Recall    | 0.865977 | 0.0310933 |
| F1-score  | 0.865852 | 0.0300451 |

=== Результаты на Тесте ===
1. Accuracy: 82.24%
2. Precision: 82.11%
3. Recall: 82.24%
4. F1-score: 82.17%


Сформулируем несколько гипотез, которые могут помочь улучшить качество модели

1) Поменять Encoder категориальных признаков с `OrdinalEncoder` на `OneHotEncoder`
2) Отмасштабировать численные признаки
3) Добавить параметр глубины в дереве

In [13]:
# Re-encode the categorical columns as one-hot indicators (encoder fitted on the
# training part only). The columns already hold the integer codes written by the
# OrdinalEncoder in the preprocessing cell.
onehot = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')

encoded_train_data = onehot.fit_transform(train_features[categorical_features])
encoded_test_data = onehot.transform(test_features[categorical_features])

# Training part: drop the ordinal-coded columns and append the one-hot columns.
encoded_df = pd.DataFrame(encoded_train_data, columns=onehot.get_feature_names_out(categorical_features))
updated_train_features = train_features.drop(columns=categorical_features).reset_index(drop=True)
updated_train_features = pd.concat([updated_train_features, encoded_df], axis=1)

# Test part: the same transformation with the already fitted encoder.
encoded_df = pd.DataFrame(encoded_test_data, columns=onehot.get_feature_names_out(categorical_features))
updated_test_features = test_features.drop(columns=categorical_features).reset_index(drop=True)
updated_test_features = pd.concat([updated_test_features, encoded_df], axis=1)


# Standardise the numeric columns; the scaler statistics come from the training part only.
scaler = StandardScaler()
updated_train_features[num_features] = scaler.fit_transform(train_features[num_features])
updated_test_features[num_features] = scaler.transform(test_features[num_features])

In [14]:
# 5-fold cross-validation of a depth-limited sklearn tree on the re-encoded, scaled training part.
# NOTE(review): no seed is passed here because the signature of
# classification_cross_validate is not visible from this notebook — confirm
# whether it accepts one, otherwise this table can change between runs.
metrics = classification_cross_validate(DecisionTreeClassifier, updated_train_features.to_numpy(), train_target.to_numpy(), n_folds=5, max_depth=5)
display_metrics_classification_table(*metrics)

# Same configuration fitted on the whole training part. The fixed seed makes
# the fitted tree (and the test metrics below) reproducible across reruns.
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(updated_train_features, train_target)

predicted_target = dt.predict(updated_test_features)

# Class-frequency-weighted averages of the per-class scores.
accuracy = accuracy_score(test_target, predicted_target)
precision = precision_score(test_target, predicted_target, average='weighted')
recall = recall_score(test_target, predicted_target, average='weighted')
f1 = f1_score(test_target, predicted_target, average='weighted')

# Print the results
print("\n=== Результаты на Тесте ===")
print(f"1. Accuracy: {accuracy:.2%}")
print(f"2. Precision: {precision:.2%}")
print(f"3. Recall: {recall:.2%}")
print(f"4. F1-score: {f1:.2%}")

| Metric    |     Mean |   Std Dev |
|:----------|---------:|----------:|
| Accuracy  | 0.875956 | 0.0266113 |
| Precision | 0.87268  | 0.0284218 |
| Recall    | 0.875956 | 0.0266113 |
| F1-score  | 0.871532 | 0.0291601 |

=== Результаты на Тесте ===
1. Accuracy: 86.92%
2. Precision: 86.79%
3. Recall: 86.92%
4. F1-score: 85.86%


In [15]:
class MyDecisionTreeClassifier:
    """Minimal entropy-based classification tree built on NumPy arrays.

    Every split has the form ``feature <= threshold`` and is chosen to maximise
    the information gain.  The fitted tree is stored in ``self.tree`` as nested
    dicts with the keys ``'feature_index'``, ``'threshold'``, ``'left'`` and
    ``'right'``; a leaf is a class label.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum number of splits on any root-to-leaf path.  ``None`` grows the
        tree until every leaf is pure or cannot be split any further.

    Attributes
    ----------
    classes_ : ndarray or None
        Sorted unique labels seen by ``fit``.  Internally the labels are
        replaced by their positions in this array so that ``np.bincount`` works
        for any label type (strings, negative integers, ...).
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None
        self.classes_ = None

    def fit(self, features, target):
        """Grow the tree on the training data.

        Parameters
        ----------
        features : array-like of shape (n_samples, n_features)
        target : array-like of shape (n_samples,)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``target`` is empty.
        """
        # Coerce to plain arrays so DataFrames/Series (whatever their index)
        # are handled positionally, exactly like NumPy input.
        features = np.asarray(features)
        target = np.asarray(target)
        if target.size == 0:
            raise ValueError("Cannot fit a decision tree on an empty training set")

        # Encode labels as 0..k-1; leaves are mapped back to the original labels.
        self.classes_, encoded_target = np.unique(target, return_inverse=True)
        self.tree = self._build_tree(features, encoded_target)
        return self

    def _build_tree(self, features, target, depth=0):
        """Recursively build the subtree for the given samples.

        ``target`` holds encoded labels (indices into ``self.classes_``).
        Returns either a leaf (an original class label) or a split-node dict.
        """
        present_codes = np.unique(target)

        # Stopping conditions
        if len(present_codes) == 1:
            return self.classes_[present_codes[0]]
        # `is not None` keeps max_depth=0 meaningful (a single leaf) instead of
        # silently treating it as "unlimited".
        if self.max_depth is not None and depth >= self.max_depth:
            return self.classes_[self._most_common_class(target)]

        best_split = self._best_split(features, target)
        if not best_split:
            # No threshold separates the samples (e.g. identical rows with
            # different labels): fall back to the majority class.
            return self.classes_[self._most_common_class(target)]

        left_indices = best_split['left_indices']
        right_indices = best_split['right_indices']
        return {
            'feature_index': best_split['feature_index'],
            'threshold': best_split['threshold'],
            'left': self._build_tree(features[left_indices], target[left_indices], depth + 1),
            'right': self._build_tree(features[right_indices], target[right_indices], depth + 1),
        }

    def _best_split(self, features, target):
        """Return the split with the highest information gain.

        The result is a dict with ``'feature_index'``, ``'threshold'``,
        ``'left_indices'`` and ``'right_indices'`` (boolean masks), or an empty
        dict when no threshold puts at least one sample on each side.
        """
        best_info_gain = -float('inf')
        best_split = {}
        # The parent entropy is the same for every candidate; compute it once.
        parent_entropy = self._entropy(target)

        for feature_index in range(features.shape[1]):
            feature_values = features[:, feature_index]
            # The largest value is dropped because `<= max` would send every
            # sample to the left child.
            for threshold in np.unique(feature_values)[:-1]:
                left_indices = feature_values <= threshold
                # Complement mask: every sample lands in exactly one child,
                # matching the if/else routing used at prediction time.
                right_indices = ~left_indices

                if not left_indices.any() or not right_indices.any():
                    continue

                info_gain = self._information_gain(target, left_indices, right_indices, parent_entropy)

                if info_gain > best_info_gain:
                    best_info_gain = info_gain
                    best_split = {
                        'feature_index': feature_index,
                        'threshold': threshold,
                        'left_indices': left_indices,
                        'right_indices': right_indices,
                    }

        return best_split

    def _information_gain(self, target, left_indices, right_indices, parent_entropy=None):
        """Entropy of ``target`` minus the size-weighted entropy of both children.

        ``parent_entropy`` may be supplied by the caller to avoid recomputing it.
        """
        if parent_entropy is None:
            parent_entropy = self._entropy(target)

        left_y = target[left_indices]
        right_y = target[right_indices]

        left_weight = len(left_y) / len(target)
        right_weight = len(right_y) / len(target)

        children_entropy = left_weight * self._entropy(left_y) + right_weight * self._entropy(right_y)
        return parent_entropy - children_entropy

    def _entropy(self, target):
        """Shannon entropy (in bits) of an array of non-negative integer labels."""
        class_counts = np.bincount(target)
        # Drop absent classes so that 0 * log2(0) never has to be evaluated.
        class_counts = class_counts[class_counts > 0]
        probabilities = class_counts / len(target)
        return -np.sum(probabilities * np.log2(probabilities))

    def _most_common_class(self, target):
        """Most frequent non-negative integer label; ties go to the smallest label."""
        return np.bincount(target).argmax()

    def predict(self, features):
        """Return an array with one predicted label per row of ``features``."""
        samples = np.asarray(features)
        predictions = [self._predict_sample(sample, self.tree) for sample in samples]
        return np.array(predictions)

    def _predict_sample(self, sample, tree):
        """Walk from ``tree`` down to a leaf for a single ``sample``."""
        node = tree
        while isinstance(node, dict):
            if sample[node['feature_index']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node

In [16]:
# 5-fold cross-validation of the hand-written tree on the training part.
metrics = classification_cross_validate(MyDecisionTreeClassifier, train_features.to_numpy(), train_target.to_numpy(), n_folds=5)
display_metrics_classification_table(*metrics)

# Fit an unrestricted hand-written tree on the whole training part.
# The target is passed as a NumPy array, like in the cross-validation call above:
# train_target still carries the shuffled index from train_test_split, and the
# hand-written tree indexes its inputs positionally.
dt = MyDecisionTreeClassifier()
dt.fit(train_features.to_numpy(), train_target.to_numpy())

predicted_target = dt.predict(test_features.to_numpy())

# Class-frequency-weighted averages of the per-class scores.
accuracy = accuracy_score(test_target, predicted_target)
precision = precision_score(test_target, predicted_target, average='weighted')
recall = recall_score(test_target, predicted_target, average='weighted')
f1 = f1_score(test_target, predicted_target, average='weighted')

# Print the results
print("\n=== Результаты на Тесте ===")
print(f"1. Accuracy: {accuracy:.2%}")
print(f"2. Precision: {precision:.2%}")
print(f"3. Recall: {recall:.2%}")
print(f"4. F1-score: {f1:.2%}")

| Metric    |     Mean |   Std Dev |
|:----------|---------:|----------:|
| Accuracy  | 0.852889 | 0.0217752 |
| Precision | 0.854719 | 0.0212242 |
| Recall    | 0.852889 | 0.0217752 |
| F1-score  | 0.85327  | 0.0213728 |

=== Результаты на Тесте ===
1. Accuracy: 84.11%
2. Precision: 83.71%
3. Recall: 84.11%
4. F1-score: 83.87%


In [17]:
# 5-fold cross-validation of the depth-limited hand-written tree.
# The "improved" experiment uses the re-encoded, scaled features
# (updated_train_features / updated_test_features), the same inputs as the
# improved sklearn classifier, so that the two rows of the summary are comparable.
metrics = classification_cross_validate(MyDecisionTreeClassifier, updated_train_features.to_numpy(), train_target.to_numpy(), n_folds=5, max_depth=5)
display_metrics_classification_table(*metrics)

# Fit the same configuration on the whole training part. The target is passed
# as a NumPy array because train_target still carries the shuffled index from
# train_test_split and the hand-written tree indexes its inputs positionally.
dt = MyDecisionTreeClassifier(max_depth=5)
dt.fit(updated_train_features.to_numpy(), train_target.to_numpy())

predicted_target = dt.predict(updated_test_features.to_numpy())

# Class-frequency-weighted averages of the per-class scores.
accuracy = accuracy_score(test_target, predicted_target)
precision = precision_score(test_target, predicted_target, average='weighted')
recall = recall_score(test_target, predicted_target, average='weighted')
f1 = f1_score(test_target, predicted_target, average='weighted')

# Print the results
print("\n=== Результаты на Тесте ===")
print(f"1. Accuracy: {accuracy:.2%}")
print(f"2. Precision: {precision:.2%}")
print(f"3. Recall: {recall:.2%}")
print(f"4. F1-score: {f1:.2%}")

| Metric    |     Mean |   Std Dev |
|:----------|---------:|----------:|
| Accuracy  | 0.882179 | 0.01197   |
| Precision | 0.880082 | 0.0124448 |
| Recall    | 0.882179 | 0.01197   |
| F1-score  | 0.879594 | 0.0114833 |

=== Результаты на Тесте ===
1. Accuracy: 87.48%
2. Precision: 87.05%
3. Recall: 87.48%
4. F1-score: 87.08%


Результаты показывают, что собственная реализация модели в среднем такая же по качеству, что и модели из `sklearn`, однако демонстрирует прирост при использовании улучшенного бейзлайна.

### Заключение

Внесённые изменения, включая нормализацию данных и подбор параметра глубины в алгоритме решающих деревьев, улучшают метрики модели. Проведённые эксперименты демонстрируют, что как собственная реализация, так и применение моделей из sklearn, дают схожие результаты.

| Модель                    |      MSE  |        MAE |      $R^2$ |
|:--------------------------|----------:|-----------:|-----------:|
| Sklearn (до улучшения)    | 145584.74 | 247.89     |  0.70      |
| Sklearn (после улучшения) | 118536.23 | 228.46     |  0.76      |
| Собственная имплементация (до улучшения)   | 172541.46 | 275.09   |   0.64     |
| Собственная имплементация (после улучшения)| 120889.44 | 235.83   |  0.75    |

| Модель                    |  Accuracy |  Precision |     Recall |    F1-score |
|:--------------------------|----------:|-----------:|-----------:|-----------:|
| Sklearn (до улучшения)    |   82.24%  |   82.11%   |  82.24%    |  82.17%    |
| Sklearn (после улучшения) |   86.92%  |   86.79%   |  86.92%    |  85.86%    |
| Собственная имплементация (до улучшения)   |   84.11%  |   83.71%   |  84.11%    |  83.87%    |
| Собственная имплементация (после улучшения)|   87.48%  |   87.05%   |  87.48%    |  87.08%    |