# Решающие деревья.

In [None]:
# Импортируем библиотеки
import numpy as np
%pylab inline

In [None]:
# Create a sample dataset with make_circles (1000 points, fixed seed)
from sklearn.datasets import make_circles
X_circles, y_circles = make_circles(
    n_samples=1000,
    factor=0.01,
    noise=0.55,
    random_state=42,
)

In [None]:
# Draw the data: one point per sample, coloured by its class label
from matplotlib.colors import ListedColormap
color_map = ListedColormap(['yellow', 'red'])
plt.figure(figsize=(12, 10))
plt.scatter(
    X_circles[:, 0],
    X_circles[:, 1],
    c=y_circles,
    cmap=color_map,
    s=100,
)

In [None]:
# Create a decision tree classifier with a fixed random seed
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(random_state=42)

In [None]:
# Hold out 20% of the samples for testing. The seed is fixed (like the seed of
# the dataset and of the tree) so that the split, and every score computed
# from it, is the same on each run of the notebook.
from sklearn.model_selection import train_test_split
X_train_circles, X_test_circles, y_train_circles, y_test_circles = train_test_split(
    X_circles,
    y_circles,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the tree on the training part of the split
dtree.fit(X_train_circles, y_train_circles)

In [None]:
# Predict class labels for the held-out test samples
predict = dtree.predict(X_test_circles)

In [None]:
# Measure the quality of the model on the held-out test samples
dtree.score(X_test_circles, y_test_circles)

In [None]:
from sklearn.model_selection import cross_val_score

# Average score of the tree over 5 cross-validation folds of the full dataset
print(cross_val_score(dtree, X_circles, y_circles, cv=5).mean())


In [None]:
from sklearn import metrics
def get_meshgrid(data, step=.05, border=.5):
    """Return coordinate matrices of a regular grid around 2-D points.

    Args:
        data: array indexed as ``data[:, 0]`` / ``data[:, 1]``; only these
            two columns are used.
        step: spacing between neighbouring grid nodes along each axis.
        border: margin added below the minimum and above the maximum of
            each column.

    Returns:
        The result of ``np.meshgrid`` for the x axis (column 0) and the
        y axis (column 1).
    """
    axes = []
    for column in (0, 1):
        values = data[:, column]
        axes.append(np.arange(values.min() - border, values.max() + border, step))
    return np.meshgrid(axes[0], axes[1])

def plot_decision_surface(estimator, train_data, train_labels, test_data, test_labels):
    """Fit ``estimator`` and plot its decision surface with the data on top.

    Two panels are drawn side by side: the training points on the left and
    the test points on the right. Both share the same background, which is
    the estimator's prediction at every node of a regular grid. Each panel's
    title shows the accuracy on the points drawn in it.

    Args:
        estimator: object with ``fit`` and ``predict`` methods. It is fitted
            in place on the training data.
        train_data, test_data: arrays indexed as ``data[:, 0]`` and
            ``data[:, 1]`` (two feature columns are plotted).
        train_labels, test_labels: class labels for the rows above.
            NOTE(review): the two-colour maps assume two classes; confirm
            before using with more.
    """
    # Fit the model; the caller's estimator object is modified.
    estimator.fit(train_data, train_labels)

    plt.figure(figsize=(16, 6))

    light_colors = ListedColormap(['lightyellow', 'lightcoral'])
    colors = ListedColormap(['yellow', 'red'])

    # Build the grid over both sets so the coloured surface also covers test
    # points that lie outside the range of the training points.
    xx, yy = get_meshgrid(np.vstack((train_data, test_data)))
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    mesh_predictions = np.asarray(estimator.predict(grid_points)).reshape(xx.shape)

    panels = (
        ('Train data', train_data, train_labels),
        ('Test data', test_data, test_labels),
    )
    for position, (title, data, labels) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.pcolormesh(xx, yy, mesh_predictions, cmap=light_colors)
        plt.scatter(data[:, 0], data[:, 1], c=labels, s=40, cmap=colors)
        accuracy = metrics.accuracy_score(labels, estimator.predict(data))
        plt.title('{}, accuracy={}'.format(title, accuracy))

In [None]:
# Refit the tree on the training split and plot its decision surface
plot_decision_surface(dtree, X_train_circles,y_train_circles, X_test_circles, y_test_circles)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Same plot for a random forest. The seed is fixed so that the figure and the
# reported accuracies do not change between runs.
plot_decision_surface(
    RandomForestClassifier(random_state=42),
    X_train_circles,
    y_train_circles,
    X_test_circles,
    y_test_circles,
)

In [None]:
# Давайте обсудим, с какими параметрами решающего дерева можно "играться", чтобы улучшить качество алгоритма

In [None]:
# Поиграемся с параметрами решающего дерева

In [None]:
# Instantiate a tree with default settings; as the last expression of the
# cell, the object is what the notebook displays.
DecisionTreeClassifier()

In [None]:
# Decision surface of a tree with a very large depth limit (max_depth=500).
# The seed is fixed, as for `dtree`, so repeated runs give the same figure.
plot_decision_surface(
    DecisionTreeClassifier(max_depth=500, random_state=42),
    X_train_circles,
    y_train_circles,
    X_test_circles,
    y_test_circles,
)

In [None]:
# Самостоятельная работа
# Todo: изучить качество работы алгоритма при изменении max_depth
# ваш код здесь

In [None]:
# Самостоятельная работа
# Todo: изучить качество работы алгоритма при изменении min_samples_leaf
# ваш код здесь


In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
def draw_accuracy_graph(alg, param, grid_list, X, y):
    """Plot train and cross-validation accuracy against one hyperparameter.

    For every value in ``grid_list`` a model ``alg(**{param: value})`` is
    created and scored with 5-fold stratified cross-validation (shuffled,
    fixed seed). The best mean CV accuracy and the value that achieved it are
    printed. Mean train and CV accuracy are then plotted against the values,
    with grey bands of one and two standard deviations around the CV curve.

    Args:
        alg: estimator class (or any callable) accepting ``param`` as a
            keyword argument and returning an object with ``fit`` and
            ``score`` methods.
        param: name of the hyperparameter to vary.
        grid_list: sequence of values to try for ``param``.
        X: feature matrix; anything ``np.asarray`` turns into a 2-D array
            (NumPy array, pandas DataFrame, ...).
        y: target labels; anything ``np.asarray`` turns into a 1-D array.
    """
    # The folds are selected below by positional index arrays, which needs
    # plain NumPy arrays; converting here lets callers pass pandas objects.
    X, y = np.asarray(X), np.asarray(y)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    train_acc = []
    test_acc = []
    for value in grid_list:
        model = alg(**{param: value})
        fold_train_acc = []
        fold_test_acc = []
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model.fit(X_train, y_train)
            fold_train_acc.append(model.score(X_train, y_train))
            fold_test_acc.append(model.score(X_test, y_test))
        train_acc.append(fold_train_acc)
        test_acc.append(fold_test_acc)

    # Rows correspond to grid values, columns to folds.
    train_acc, test_acc = np.asarray(train_acc), np.asarray(test_acc)
    train_mean = train_acc.mean(axis=1)
    test_mean = test_acc.mean(axis=1)
    test_std = test_acc.std(axis=1)

    best = int(np.argmax(test_mean))
    print("Best accuracy on CV is {:.2f}% with {} {}".format(test_mean[best] * 100,
                                                            grid_list[best],
                                                            param))

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(grid_list, train_mean, alpha=0.5, color='blue', label='train')
    ax.plot(grid_list, test_mean, alpha=0.5, color='red', label='cv')
    # Bands of +-1 and +-2 standard deviations of the CV accuracy.
    ax.fill_between(grid_list, test_mean - test_std, test_mean + test_std, color='#888888', alpha=0.4)
    ax.fill_between(grid_list, test_mean - 2 * test_std, test_mean + 2 * test_std, color='#888888', alpha=0.2)
    ax.legend(loc='best')
    ax.set_ylabel("Accuracy")
    ax.set_xlabel(param)

In [None]:
# Accuracy of a decision tree as max_depth goes from 1 to 9
draw_accuracy_graph(DecisionTreeClassifier,'max_depth', list(range(1,10)), X_circles, y_circles)

In [None]:
# Accuracy of a decision tree as min_samples_leaf goes from 1 to 99
draw_accuracy_graph(DecisionTreeClassifier,'min_samples_leaf', list(range(1,100)), X_circles, y_circles)

In [None]:
# Код для изучения точности в зависимости от параметров
# Ваш код здесь

## Что там у «Титаника»

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score

In [None]:
# Load the data from titanic.csv (path is relative to the working directory)
df = pd.read_csv("titanic.csv")

In [None]:
# Preview the first rows of the loaded table
df.head()

In [None]:
# Select only the columns we need
def preprocess_data(data):
    """Build the feature table from the raw passenger data.

    Args:
        data: DataFrame with ``Age``, ``Sex`` and ``Pclass`` columns.

    Returns:
        A new DataFrame with the same index as ``data`` and the columns
        ``Age`` (missing values replaced by the column mean), ``Sex``
        (1 where the value equals ``'male'``, 0 otherwise) and ``Pclass``
        (taken as is).
    """
    # Derive Sex from the column itself so it keeps the index of ``data``.
    # A Series built from a plain list gets a fresh 0..n-1 index and would be
    # misaligned (NaN or wrong rows) when ``data`` has any other index.
    is_male = (data.Sex == 'male').astype('int64')
    return pd.DataFrame({
        'Age': data.Age.fillna(data.Age.mean()),
        'Sex': is_male,
        'Pclass': data.Pclass,
    })

In [None]:
# Ваше решение здесь

In [None]:
# Target vector (the Survived column) and feature table for the Titanic data
y_titanic = np.array(df["Survived"])
X_titanic = preprocess_data(df)