# Tree-based models - Exercises

In [1]:
import warnings
warnings.filterwarnings("ignore")

import hashlib

import pandas as pd
import numpy as np

from utils.utils import *

# Exercise 1 - Decision Trees

## 1.1 Gini impurity

Used by the CART algorithm for classification, Gini impurity is an alternative to entropy.

Like entropy, it is a way to measure node homogeneity. Hence, it can be used to identify promising splits.

Take $p$ as the probability of the positive class, i.e., the proportion of positive cases in the set:

$$I_G(p)= 1 - p^2 - (1-p)^2$$

It measures how often a randomly chosen element from the set would be incorrectly labeled if it were labeled at random according to the distribution of labels in the set.

In [2]:
def gini(p):
    """Return the Gini impurity of a two-class node.

    Args:
        p: Probability of the positive class, i.e. the proportion of
            positive cases in the node.

    Returns:
        ``1 - p**2 - (1 - p)**2``: 0 for a pure node (``p`` equal to 0 or 1)
        and 0.5 for an evenly split node.
    """
    positive_term = p ** 2
    negative_term = (1 - p) ** 2
    # Subtract the positive term first, then the negative one: keeping this
    # order keeps floating-point results identical to the direct formula.
    return 1 - positive_term - negative_term
    
    
# Build the exercise dataset. make_data is not defined in this notebook, so it
# presumably comes from the `utils.utils` star import (confirm). Later cells
# read its 'Class', 'Windy', 'Temperature', 'Outlook' and 'Humidity' columns.
data = make_data()

In [3]:
# Grader checks for gini(): each result is converted to a string, hashed with
# SHA-256 and compared against a pinned digest, so the exact string form of
# the returned value must match.
expected_hash_1 = '5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91b46729d73a27fb57e9'
expected_hash_2 = '0061765517977143a7ed83b494b172f02af75de1c3cdde6f969effdef053ee8c'
expected_hash_3 = '683487f42b8f523b6df156fbc42818868de8d2b70ad48f801df4df36d07b40f2'
expected_hash_4 = 'd2cbad71ff333de67d07ec676e352ab7f38248eb69c942950157220607c55e84'

for checked_p, checked_hash in (
    (0, expected_hash_1),
    (1/6, expected_hash_2),
    (1/3, expected_hash_3),
    (1/2, expected_hash_4),
):
    result_text = str(gini(p=checked_p))
    assert hashlib.sha256(result_text.encode('utf-8')).hexdigest() == checked_hash

## 1.2 Applying the Gini impurity

### 1.2.1 Single node

Compute the impurity of a node that includes all instances where $x_i^{Windy}$ is `'true'`.

(Note that `'true'` is a string.)

In [85]:
# single_node_gini = ...
def compute_probability(node):
    """Return the proportion of rows in ``node`` whose 'Class' equals 1.

    Args:
        node: Table-like object exposing ``.shape`` and a 'Class' column
            (the notebook passes row subsets of ``data``).

    Returns:
        Number of rows with ``Class == 1`` divided by the total row count.
    """
    total_rows = node.shape[0]
    is_positive = node['Class'] == 1
    positive_rows = is_positive.sum()
    return positive_rows / total_rows

# Node made of every instance whose 'Windy' value is the string 'true'.
windy_true_node = data[data["Windy"] == "true"]
single_node_gini = gini(compute_probability(windy_true_node))

In [87]:
# Grader check: SHA-256 of the stringified single-node impurity must match.
expected_hash_5 = 'd2cbad71ff333de67d07ec676e352ab7f38248eb69c942950157220607c55e84'
single_node_gini_text = str(single_node_gini)
assert hashlib.sha256(single_node_gini_text.encode('utf-8')).hexdigest() == expected_hash_5

### 1.2.2 Single feature

Write a function to compute the mean impurity of branching on a given feature.

(Hint: compute the impurity of each resulting node and average the results.)

In [125]:
from statistics import mean


def mean_impurity(data, feature_name):
    """Return the unweighted mean Gini impurity of branching on a feature.

    The data is split into one node per distinct non-missing value of
    ``feature_name``; the Gini impurity of each node is computed from its
    proportion of positive cases, and the impurities are averaged with equal
    weight per node (node sizes are deliberately not used as weights).

    Args:
        data: DataFrame containing ``feature_name`` and a 'Class' column.
        feature_name: Name of the column to branch on.

    Returns:
        The arithmetic mean of the per-node Gini impurities.

    Raises:
        ZeroDivisionError: If the feature has no non-missing values.
    """
    # Drop missing values before enumerating branches. `unique()` alone would
    # yield NaN as a "value" whose node is always empty (NaN never compares
    # equal), which turned the whole result into NaN while the divisor
    # (`nunique()`) silently ignored it.
    values = data[feature_name].dropna().unique()

    total_impurity = 0
    # Accumulate with a plain `+=` in order of first appearance so results on
    # data without missing values stay bit-identical to the pinned checks.
    for value in values:
        node = data[data[feature_name] == value]
        total_impurity += gini(compute_probability(node))

    return total_impurity / len(values)

In [129]:
# Grader checks for mean_impurity(): one exact-value comparison, then three
# SHA-256 comparisons on the stringified result.
assert mean_impurity(data, 'Temperature') == 0.4398148148148148

expected_hash_6 = '60bb8f6c52a2aca85b1a3ae08e71f97dd3fa8603b55abefb09ebb0edfe5294d1'
expected_hash_7 = '1bd499c4b5bbae82a4f4a4efe3950b803e7dc64764c56115810e696db1b28afe'
expected_hash_8 = 'ccb9ffbc97edf0207e9b20dd9331f576984b45f178aa756421dd16508dd984fc'

for checked_feature, checked_hash in (
    ('Outlook', expected_hash_6),
    ('Humidity', expected_hash_7),
    ('Windy', expected_hash_8),
):
    impurity_text = str(mean_impurity(data, checked_feature))
    assert hashlib.sha256(impurity_text.encode('utf-8')).hexdigest() == checked_hash

## 1.3 Worst split

Use the `mean_impurity()` function above to identify the **least promising** feature.

(Uncomment the **correct option**.)

In [135]:
# Answer cell: exactly one option is left uncommented; the next cell checks
# the SHA-256 of the chosen string.
# NOTE(review): the least promising feature should be the one with the highest
# mean_impurity(data, ...) value — confirm 'Windy' against the four computed
# values before relying on this answer.
# worst_split = 'Temperature'
# worst_split = 'Outlook'
# worst_split = 'Humidity'
worst_split = 'Windy'

In [136]:
# Grader check: SHA-256 of the selected feature name must match.
expected_hash_9 = 'cdcaccab2e98e50c7fd550e5be166e6c260312708903794361d1815222ed0975'
worst_split_text = str(worst_split)
assert hashlib.sha256(worst_split_text.encode('utf-8')).hexdigest() == expected_hash_9

# Exercise 2 - Random Forests

## 2.1 Bagging

Uncomment the **incorrect statement**.

In [139]:
# Answer cell: the uncommented string is the statement selected as incorrect;
# the next cell checks its SHA-256, so the text must stay exactly as written.
# Rationale: bagging builds its data sets by resampling observations (rows),
# not columns.
# incorrect_statement_1 = 'Bagging helps to deal with overfitting, which is a big risk when using Decision Trees.'
# incorrect_statement_1 = 'Bagging involves averaging the predictions of multiple independent models.'
# incorrect_statement_1 = 'Bagging involves creating multiple data sets by sampling observations.'
incorrect_statement_1 = 'Bagging involves creating multiple data sets by sampling columns.'

In [140]:
# Grader check: SHA-256 of the selected statement must match.
expected_hash_10 = '1fb9060eefe647e998b5868f95d4930673fa3645ed0120176f6867f6c43ecf71'
statement_1_text = str(incorrect_statement_1)
assert hashlib.sha256(statement_1_text.encode('utf-8')).hexdigest() == expected_hash_10

## 2.2 Random forests

Uncomment the **incorrect statement**.

In [141]:
# Answer cell: the uncommented string is the statement selected as incorrect;
# the next cell checks its SHA-256, so the text must stay exactly as written.
# Rationale: the trees of a random forest are trained independently of each
# other, so they can run in parallel rather than sequentially.
# incorrect_statement_2 = 'Random forests rely on bootstrapping to build several training sets.'
# incorrect_statement_2 = 'Random forests rely on random feature selection before each split.'
# incorrect_statement_2 = 'Random forests aggregates the predictions of multiple models running in parallel.'
incorrect_statement_2 = 'Random forests aggregates the predictions of multiple models running sequentially.'

In [142]:
# Grader check: SHA-256 of the selected statement must match.
expected_hash_11 = '3a36af9faaf10138801ffef535b24bfe0a34217160a886dacdb3bd68b7777067'
statement_2_text = str(incorrect_statement_2)
assert hashlib.sha256(statement_2_text.encode('utf-8')).hexdigest() == expected_hash_11

# Exercise 3 - Gradient boosting

Uncomment the **incorrect statement**.

In [143]:
# Answer cell: the uncommented string is the statement selected as incorrect;
# the next cell checks its SHA-256, so the text must stay exactly as written.
# Rationale: each new tree is fitted to the pseudo-residuals left by the
# current ensemble, not to the previous tree's predictions.
# incorrect_statement_3 = 'Gradient boosting fits individual trees sequentially.'
incorrect_statement_3 = 'Gradient boosting fits individual trees on the predictions from the previous tree.'
# incorrect_statement_3 = 'Gradient boosting fits individual trees on the pseudo-residuals of the previous tree.'
# incorrect_statement_3 = 'Gradient boosting can be used to optimize a differentiable cost function.'

In [144]:
# Grader check: SHA-256 of the selected statement must match.
expected_hash_12 = '4000dc46debb11c42c4d44c82eb50120463cad49ae182b5130c9e276104570bd'
statement_3_text = str(incorrect_statement_3)
assert hashlib.sha256(statement_3_text.encode('utf-8')).hexdigest() == expected_hash_12