# Decision tree и Random forest

Реализовать 2 класса DecisionTreeClassifier и RandomForestClassifier

In [13]:
import time

from scipy import optimize

class DecisionTreeClassifier:
    """Binary decision tree classifier for non-negative integer class labels.

    The fitted tree lives in ``self.tree``, a dict keyed by node id. The root
    has id 0 and the children of node ``i`` have ids ``2 * i + 1`` and
    ``2 * i + 2``. Each value is one of:

    * ``(NON_LEAF_TYPE, feature_id, threshold)`` -- samples with
      ``x[feature_id] > threshold`` go to child ``2 * i + 1``, every other
      sample goes to child ``2 * i + 2``;
    * ``(LEAF_TYPE, majority_class, class_probabilities)``.

    Missing values (NaN) in the features are not handled specially.
    """

    NON_LEAF_TYPE = 0
    LEAF_TYPE = 1

    def __init__(self, min_samples_split=2, max_depth=None, sufficient_share=1.0, criterion='gini', max_features=None):
        """Configure the tree.

        Args:
            min_samples_split: every candidate split keeps more than this many
                samples on each side; a node also becomes a leaf when a child
                of the chosen split would hold fewer than twice this many.
            max_depth: depth at which nodes are forced to be leaves
                (``None`` disables the limit).
            sufficient_share: a node becomes a leaf once its majority class
                holds at least this share of its samples.
            criterion: ``'gini'``, ``'entropy'`` or ``'misclass'``.
            max_features: ``'sqrt'``, ``'log2'`` or ``None`` (all features);
                size of the random feature subset examined at every node.

        Raises:
            ValueError: for an unknown ``criterion`` or ``max_features``.
        """
        self.tree = dict()
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.sufficient_share = sufficient_share
        self.num_class = -1

        if criterion == 'gini':
            self.G_function = self.__gini
        elif criterion == 'entropy':
            self.G_function = self.__entropy
        elif criterion == 'misclass':
            self.G_function = self.__misclass
        else:
            raise ValueError('invalid criterion name')

        if max_features == 'sqrt':
            self.get_feature_ids = self.__get_feature_ids_sqrt
        elif max_features == 'log2':
            self.get_feature_ids = self.__get_feature_ids_log2
        elif max_features is None:
            self.get_feature_ids = self.__get_feature_ids_N
        else:
            raise ValueError('invalid max_features name')

    # ------------------------------------------------------------------
    # Impurity criteria. Every criterion receives, for k candidate splits,
    # the (k, n_classes) class counts and the (k, 1) sizes of both sides and
    # returns a length-k array of size-weighted impurities (lower is better).
    # ------------------------------------------------------------------

    def __gini(self, l_c, l_s, r_c, r_s):
        """Size-weighted Gini impurity of both sides of each candidate split."""
        l_s = l_s.astype('float')
        r_s = r_s.astype('float')
        return np.sum(l_c * (l_s - l_c) / l_s, axis=1) + np.sum(r_c * (r_s - r_c) / r_s, axis=1)

    def __side_entropy(self, counts, sizes):
        """Size-weighted entropy of one side, with ``0 * log(0)`` taken as 0."""
        sizes = sizes.astype('float')
        # Replace empty classes by 1 inside the logarithm: their term is
        # multiplied by a zero count anyway, and this avoids log(0) -> NaN.
        safe_counts = np.where(counts > 0, counts, 1.0)
        return -np.sum(counts * (np.log(safe_counts) - np.log(sizes)), axis=1)

    def __entropy(self, l_c, l_s, r_c, r_s):
        """Size-weighted entropy of both sides of each candidate split."""
        return self.__side_entropy(l_c, l_s) + self.__side_entropy(r_c, r_s)

    def __misclass(self, l_c, l_s, r_c, r_s):
        """Number of samples outside the majority class on either side."""
        left_errors = np.ravel(l_s) - np.max(l_c, axis=1)
        right_errors = np.ravel(r_s) - np.max(r_c, axis=1)
        return left_errors + right_errors

    # ------------------------------------------------------------------
    # Feature sub-sampling strategies.
    # ------------------------------------------------------------------

    def __shuffled_subset(self, n_feature, n_selected):
        """Return ``n_selected`` (at least one) random distinct feature ids."""
        feature_ids = list(range(n_feature))
        np.random.shuffle(feature_ids)
        return feature_ids[:max(1, n_selected)]

    def __get_feature_ids_sqrt(self, n_feature):
        return self.__shuffled_subset(n_feature, int(n_feature ** 0.5))

    def __get_feature_ids_log2(self, n_feature):
        n_selected = int(np.log2(n_feature)) if n_feature > 0 else 0
        return self.__shuffled_subset(n_feature, n_selected)

    def __get_feature_ids_N(self, n_feature):
        return list(range(n_feature))

    # ------------------------------------------------------------------
    # Tree construction.
    # ------------------------------------------------------------------

    def __sort_samples(self, x, y):
        """Return the feature column and the labels ordered by feature value."""
        sorted_idx = x.argsort()
        return x[sorted_idx], y[sorted_idx]

    def __div_samples(self, x, y, feature_id, threshold):
        """Split the samples into ``x[:, feature_id] > threshold`` and the rest."""
        left_mask = x[:, feature_id] > threshold
        right_mask = ~left_mask
        return x[left_mask], x[right_mask], y[left_mask], y[right_mask]

    def __find_threshold(self, x, y):
        """Find the best threshold for a single feature column.

        Args:
            x: 1-D array with the values of one feature.
            y: 1-D integer array with the matching class labels.

        Returns:
            ``(impurity, threshold)`` for the best split, or
            ``(float('+inf'), None)`` when the column admits no split.
        """
        sorted_x, sorted_y = self.__sort_samples(x, y)
        n_samples = sorted_y.shape[0]

        # A candidate is identified by the number of samples on its low side;
        # both sides must keep more than min_samples_split samples.
        smallest_low = max(self.min_samples_split + 1, 1)
        largest_low = min(n_samples - self.min_samples_split - 1, n_samples - 1)
        if largest_low < smallest_low:
            return float('+inf'), None

        low_sizes = np.arange(smallest_low, largest_low + 1)
        # Only cut between two different feature values: a threshold between
        # tied values would not reproduce the partition that was scored.
        low_sizes = low_sizes[sorted_x[low_sizes - 1] < sorted_x[low_sizes]]
        if low_sizes.size == 0:
            return float('+inf'), None

        # Cumulative per-class counts over the sorted labels; row i holds the
        # class counts of the first i + 1 samples.
        n_classes = max(self.num_class, int(sorted_y.max()) + 1)
        one_hot_code = np.zeros((n_samples, n_classes))
        one_hot_code[np.arange(n_samples), sorted_y] = 1
        cumulative_counts = np.cumsum(one_hot_code, axis=0)

        l_class_count = cumulative_counts[low_sizes - 1]
        r_class_count = cumulative_counts[-1] - l_class_count
        l_sizes = low_sizes.reshape(-1, 1)
        r_sizes = n_samples - l_sizes

        gs = self.G_function(l_class_count, l_sizes, r_class_count, r_sizes)
        idx = np.argmin(gs)

        left_el_id = low_sizes[idx]
        return gs[idx], (sorted_x[left_el_id - 1] + sorted_x[left_el_id]) / 2.0

    def __store_leaf(self, node_id, probs):
        """Record ``node_id`` as a leaf described by the class counts ``probs``."""
        self.tree[node_id] = (
            self.__class__.LEAF_TYPE,
            np.argmax(probs),
            np.true_divide(probs, np.sum(probs)),
        )

    def __fit_node(self, x, y, node_id, depth, pred_f=-1):
        """Recursively grow the subtree rooted at ``node_id``.

        Args:
            x: 2-D feature matrix of the samples reaching this node.
            y: their integer class labels.
            node_id: id under which the node is stored in ``self.tree``.
            depth: depth of this node (the root has depth 0).
            pred_f: feature used by the parent split; it is skipped here.
        """
        probs = np.bincount(y, minlength=self.num_class)
        leaf_class_idx = np.argmax(probs)
        if depth == self.max_depth or probs[leaf_class_idx] >= self.sufficient_share * y.shape[0]:
            self.__store_leaf(node_id, probs)
            return

        min_g = float('+inf')
        res_feature_id = None
        res_threshold = None
        for feature_id in self.get_feature_ids(x.shape[1]):
            if feature_id == pred_f:
                continue
            g, threshold = self.__find_threshold(x[:, feature_id], y)
            if g < min_g:
                min_g = g
                res_feature_id = feature_id
                res_threshold = threshold

        # No examined feature offers a valid split: stop here instead of
        # splitting on an arbitrary feature/threshold.
        if res_threshold is None:
            self.__store_leaf(node_id, probs)
            return

        left_x, right_x, left_y, right_y = self.__div_samples(x, y, res_feature_id, res_threshold)

        if (left_y.size < 2 * self.min_samples_split) or (right_y.size < 2 * self.min_samples_split):
            self.__store_leaf(node_id, probs)
            return

        self.tree[node_id] = (self.__class__.NON_LEAF_TYPE, res_feature_id, res_threshold)
        self.__fit_node(left_x, left_y, 2 * node_id + 1, depth + 1, res_feature_id)
        self.__fit_node(right_x, right_y, 2 * node_id + 2, depth + 1, res_feature_id)

    def fit(self, x, y):
        """Grow the tree from scratch on features ``x`` and integer labels ``y``.

        Raises:
            ValueError: if ``y`` is empty.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError('cannot fit a tree on an empty sample')
        # Drop nodes left over from a previous fit.
        self.tree = dict()
        # Size the class axis by the largest label so that labels which are
        # not exactly 0..k-1 still index the probability vectors correctly.
        self.num_class = int(np.max(y)) + 1
        self.__fit_node(x, y, 0, 0)

    # ------------------------------------------------------------------
    # Prediction.
    # ------------------------------------------------------------------

    def __find_leaf(self, x, node_id):
        """Walk down from ``node_id`` and return the leaf reached by sample ``x``."""
        node = self.tree[node_id]
        while node[0] == self.__class__.NON_LEAF_TYPE:
            _, feature_id, threshold = node
            if x[feature_id] > threshold:
                node_id = 2 * node_id + 1
            else:
                node_id = 2 * node_id + 2
            node = self.tree[node_id]
        return node

    def __predict_class(self, x, node_id):
        return self.__find_leaf(x, node_id)[1]

    def __predict_probs(self, x, node_id):
        return self.__find_leaf(x, node_id)[2]

    def predict(self, X):
        """Return the predicted class of every row of ``X``."""
        return np.array([self.__predict_class(x, 0) for x in X])

    def predict_probs(self, X):
        """Return the leaf class distribution for every row of ``X``."""
        return np.array([self.__predict_probs(x, 0) for x in X])

    def fit_predict(self, x_train, y_train, predicted_x):
        """Fit on the training data, then predict classes for ``predicted_x``."""
        self.fit(x_train, y_train)
        return self.predict(predicted_x)

In [14]:
class RandomForestClassifier:
    """Majority-vote ensemble of ``DecisionTreeClassifier`` instances."""

    def __init__(self, n_estimators=10, min_samples_split=2, max_depth=None, sufficient_share=1.0, criterion='gini', max_features='sqrt'):
        """Create ``n_estimators`` unfitted trees sharing the same settings.

        Every argument except ``n_estimators`` is forwarded unchanged to
        ``DecisionTreeClassifier``.
        """
        self.trees = list()
        self.n_estimators = n_estimators
        for _ in range(self.n_estimators):
            clf = DecisionTreeClassifier(
                min_samples_split=min_samples_split,
                max_depth=max_depth,
                sufficient_share=sufficient_share,
                criterion=criterion,
                max_features=max_features,
            )
            self.trees.append(clf)

    def fit(self, x, y):
        """Fit every tree on its own random subset of the rows of ``x``/``y``."""
        x = np.asarray(x)
        y = np.asarray(y)
        n_samples = x.shape[0]
        samples = np.arange(n_samples)
        for t in self.trees:
            # Rows are drawn with replacement and then de-duplicated, so each
            # tree sees every selected row exactly once.
            sample_ids = np.unique(np.random.choice(samples, n_samples))
            t.fit(x[sample_ids], y[sample_ids])

    def predict(self, x):
        """Return the most voted class per row (ties go to the lowest label)."""
        # column_stack needs a real sequence; a bare generator is rejected by
        # current NumPy releases.
        predictions = np.column_stack([t.predict(x) for t in self.trees])
        n_classes = np.max(predictions) + 1
        votes = np.apply_along_axis(
            lambda row: np.bincount(row, minlength=n_classes), 1, predictions
        )
        return np.argmax(votes, axis=1)

    def fit_predict(self, x_train, y_train, predicted_x):
        """Fit on the training data, then predict classes for ``predicted_x``."""
        self.fit(x_train, y_train)
        return self.predict(predicted_x)

In [15]:
class GradientBoostingClassifier:
    """Additive ensemble of ``DecisionTreeClassifier`` probability estimates.

    The first tree is fitted on the labels. Every later tree is fitted on the
    arg-max of the current residual (one-hot target minus the weighted sum of
    the previous stages) and weighted by the step size that minimises the
    squared residual.
    """

    def __init__(self, n_estimators=10, min_samples_split=2, max_depth=None, sufficient_share=1.0, criterion='gini', max_features='sqrt'):
        """Create ``n_estimators`` unfitted trees sharing the same settings.

        Every argument except ``n_estimators`` is forwarded unchanged to
        ``DecisionTreeClassifier``.
        """
        self.trees = list()
        self.n_estimators = n_estimators
        for _ in range(self.n_estimators):
            clf = DecisionTreeClassifier(
                min_samples_split=min_samples_split,
                max_depth=max_depth,
                sufficient_share=sufficient_share,
                criterion=criterion,
                max_features=max_features,
            )
            self.trees.append(clf)
        self.alpha_list = None
        self.n_classes_ = None

    def __stage_probs(self, tree, x):
        """Return ``tree.predict_probs(x)`` widened to ``n_classes_`` columns.

        A stage fitted on pseudo-labels that lack the highest classes yields
        fewer probability columns; the absent classes are padded with zeros so
        every stage lines up with the residual matrix.
        """
        probs = np.asarray(tree.predict_probs(x), dtype=float)
        n_classes = getattr(self, 'n_classes_', None)
        if n_classes is not None and probs.shape[1] < n_classes:
            padding = np.zeros((probs.shape[0], n_classes - probs.shape[1]))
            probs = np.hstack([probs, padding])
        return probs

    def fit(self, x, y):
        """Fit all stages on features ``x`` and non-negative integer labels ``y``."""
        y = np.asarray(y)
        # Size the class axis by the largest label so one-hot indexing is
        # valid even when the labels are not exactly 0..k-1.
        self.n_classes_ = int(np.max(y)) + 1
        self.alpha_list = [1]

        err_sum = np.zeros((y.shape[0], self.n_classes_))
        err_sum[np.arange(y.shape[0]), y] = 1

        tree = self.trees[0]
        tree.fit(x, y)
        err_sum -= self.__stage_probs(tree, x)
        new_y = np.argmax(err_sum, axis=1)

        for tree in self.trees[1:]:
            tree.fit(x, new_y)
            res = self.__stage_probs(tree, x)
            # Line search for the step that minimises the squared residual.
            alpha_min = optimize.brent(lambda alpha: np.sum((err_sum - alpha * res) ** 2))
            self.alpha_list.append(alpha_min)
            err_sum -= alpha_min * res
            new_y = np.argmax(err_sum, axis=1)

    def predict(self, x):
        """Return, per row, the class with the largest weighted stage score."""
        res = np.sum(
            [self.alpha_list[idx] * self.__stage_probs(tree, x) for idx, tree in enumerate(self.trees)],
            axis=0,
        )
        return np.argmax(res, axis=1)

    def fit_predict(self, x_train, y_train, predicted_x):
        """Fit on the training data, then predict classes for ``predicted_x``."""
        self.fit(x_train, y_train)
        return self.predict(predicted_x)


In [16]:
import pandas as pd
import numpy as np

In [17]:
import matplotlib.pyplot as plt
%matplotlib inline

In [18]:
# Load the training table from a comma-separated file located relative to
# the current working directory.
df = pd.read_csv('./data/cs-training.csv', sep=',')

In [19]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [37]:
x = df.as_matrix(columns=df.columns[1:])

In [38]:
y = df.as_matrix(columns=df.columns[:1])
y = y.reshape(y.shape[0])

In [39]:
# Choose the estimator to train; the commented-out lines are the alternative
# models defined earlier in this notebook.
# clf = DecisionTreeClassifier(min_samples_split=2)
# clf = RandomForestClassifier(n_estimators=10, min_samples_split=2)
clf = GradientBoostingClassifier(n_estimators=10, min_samples_split=2, max_depth=3)

In [40]:
# Train the selected model on the whole feature matrix and label vector.
clf.fit(x, y)



In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
# 5-fold cross-validation with shuffling. No random_state is passed, so the
# fold assignment is presumably different on every run.
gkf = KFold(n_splits=5, shuffle=True)

# For every fold: refit the same `clf` object on the training part and print
# its accuracy on the held-out part.
for train, test in gkf.split(x, y):
    X_train, y_train = x[train], y[train]
    X_test, y_test = x[test], y[test]
    clf.fit(X_train, y_train)
    print(accuracy_score(y_pred=clf.predict(X_test), y_true=y_test))