# Извлечение признаков, выделяемых деревьями

In [1]:
import pydotplus 
import pandas as pd
import numpy as np

from IPython.display import Image
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv('HR.csv')

target = 'left'
features = [c for c in data if c != target]
print(features)

X, y = data[features], data[target]

['last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'promotion_last_5years']


In [3]:
from sklearn.tree import _tree

def tree_to_code(tree, feature_names, return_class=True):
        
    '''
    Outputs a decision tree model as a Python function
    
    Parameters:
    -----------
    tree: decision tree model
        The decision tree to represent as a function
    feature_names: list
        The feature names of the dataset used for building the decision tree
    return class:
        Return most frequent class rather than number of elements of each class in the node
    '''

    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "    " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print("{}else:".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            value = tree_.value[node][0]
            if return_class and len(value) > 1:
                value = np.argmax(tree_.value[node])

            print("{}return {}".format(indent, value))

    recurse(0, 1)

In [4]:
data_sample = data.sample(200)
model = tree.DecisionTreeClassifier(max_depth=2)
model.fit(data_sample[features], data_sample[target])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [5]:
tree_to_code(model, features)

def tree(last_evaluation, number_project, average_montly_hours, time_spend_company, Work_accident, promotion_last_5years):
    if number_project <= 2.5:
        if last_evaluation <= 0.574999988079071:
            return 1
        else:
            return 0
    else:
        if last_evaluation <= 0.8450000286102295:
            return 0
        else:
            return 0


In [6]:
print('Logistic Regression with base features:', cross_val_score(LogisticRegression(), data[features], data[target]).mean())

Logistic Regression with base features: 0.6287053143962126


In [7]:
data['new_feature_01'] = (data.number_project <= 2.5) & (data.last_evaluation < 0.53)
data['new_feature_02'] = 0 # your code

In [8]:
data[['number_project', 'last_evaluation', 'new_feature_01', 'new_feature_02']][:4]

Unnamed: 0,number_project,last_evaluation,new_feature_01,new_feature_02
0,2,0.53,False,0
1,5,0.86,False,0
2,7,0.88,False,0
3,5,0.87,False,0


In [9]:
new_features = [c for c in data if c != target]
print('Logistic Regression with new features:', cross_val_score(LogisticRegression(), data[new_features], data[target]).mean())

Logistic Regression with new features: 0.6749115556444623


## Опциональная задача
Попробуйте добавить такие признаки, чтобы accuracy линейной модели стало сравнимым с RandomForest на 15 деревьях, то есть дошло до 0.73