In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

In [2]:
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')
gen_submission = pd.read_csv('./gender_submission.csv')

df_test['Survived'] = gen_submission['Survived']

df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Преобразуем данные, чтобы получить готовый датасет из предыдущей лабораторной.

In [3]:
for df in [df_train, df_test]:
    mean = df['Age'].mean()
    std = df['Age'].std()
    number_of_nulls = df['Age'].isnull().sum()
    random_ages = np.random.randint(mean - std, mean + std, size=number_of_nulls)

    new_ages = df['Age'].copy()
    new_ages[np.isnan(new_ages)] = random_ages
    df['Age'] = new_ages

for df in [df_train, df_test]:
    df['Embarked'] = df['Embarked'].fillna('S')

df_test = df_test[df_test['Fare'].notnull()]

df_train = df_train.drop(columns=['Name', 'PassengerId', 'Ticket', 'Cabin'])
df_test = df_test.drop(columns=['Name', 'PassengerId', 'Ticket', 'Cabin'])

df_train['Relatives'] = df_train['Parch'] + df_train['SibSp']
df_test['Relatives'] = df_test['Parch'] + df_test['SibSp']
df_train = df_train.drop(columns=['SibSp', 'Parch'])
df_test = df_test.drop(columns=['SibSp', 'Parch'])

genders = {'male': 0, 'female': 1}
ports = {"S": 0, "C": 1, "Q": 2}
for df in [df_train, df_test]:
    df['Sex'] = df['Sex'].map(genders)
    df['Embarked'] = df['Embarked'].map(ports)

In [4]:
X_train = df_train.drop(columns=['Survived']).to_numpy()
Y_train = df_train['Survived'].to_numpy()

X_test = df_test.drop(columns=['Survived']).to_numpy()
Y_test = df_test['Survived'].to_numpy()

df_test.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Survived,Relatives
0,3,0,34.5,7.8292,2,0,0
1,3,1,47.0,7.0,0,1,1
2,2,0,62.0,9.6875,2,0,0
3,3,0,27.0,8.6625,0,0,0
4,3,1,22.0,12.2875,0,1,2


Вспомогательные функции для анализа результатов классификации

In [7]:
def confusion_matrix(y_pred, y_test):
    matrix = pd.DataFrame({'actual_1' : [0, 0], 'actual_0': [0, 0]})
    matrix.index = ['predicted_1', 'predicted_0']

    for i in range(len(y_pred)):
        if y_pred[i] == 1 and y_test[i] == 1:
            matrix.loc['predicted_1', 'actual_1'] += 1
        elif y_pred[i] == 1 and y_test[i] == 0:
            matrix.loc['predicted_1', 'actual_0'] += 1
        elif y_pred[i] == 0 and y_test[i] == 1:
            matrix.loc['predicted_0', 'actual_1'] += 1
        else:
            matrix.loc['predicted_0', 'actual_0'] += 1

    return matrix

def metrics(matrix):
    TP = matrix.loc['predicted_1', 'actual_1']
    FP = matrix.loc['predicted_1', 'actual_0']
    FN = matrix.loc['predicted_0', 'actual_1']
    TN = matrix.loc['predicted_0', 'actual_0']

    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)

    return accuracy, precision, recall


## 1. Логистическая регрессия

In [152]:
class my_Logistic_Regression():
    def __init__(self, learning_rate, n_iterations):        
        self.learning_rate = learning_rate        
        self.n_iterations = n_iterations

    def fit(self, X, Y):
        self.X = X.copy()
        self.Y = Y.copy()
        n_samples, n_features = X.shape     
        self.W = np.zeros(n_features)        
        self.b = 0

        # обновление весов с помощью градиентного спуска
        for _ in range(self.n_iterations):
            linear_model = np.dot(self.X, self.W) + self.b          
            y_predicted = 1 / (1 + np.exp(-linear_model))

            dW = (1 / n_samples) * np.dot(X.T, y_predicted - Y)
            db = (1 / n_samples) * np.sum(y_predicted - Y)
            self.W -= self.learning_rate * dW    
            self.b -= self.learning_rate * db  

    def predict(self, X):
        S = np.array(1 / (1 + np.exp(-(np.dot(X, self.W) + self.b))))
        # если сигмойд получается больше, чем 0.5, то предсказываем класс 1
        return np.where(S >= 0.5, 1, 0)

In [153]:
hyperparameters = [
(0.1, 500), 
(0.1, 1000), 
(0.1, 1500),
(0.01, 1000),
(0.01, 2000),
(0.01, 5000),
(0.01, 10000),
(0.001, 1000),
(0.001, 3000),
(0.001, 5000),
(0.001, 10000),
(0.001, 15000),
(0.0001, 2000),
(0.0001, 5000),
(0.0001, 10000),
(0.0001, 15000),
(0.0001, 20000),
(0.0001, 50000)
]

for pair in hyperparameters:
    lr, iters = pair
    my_log_regr = my_Logistic_Regression(learning_rate=lr, n_iterations=iters)
    my_log_regr.fit(X_train, Y_train)
    Y_pred_test = my_log_regr.predict(X_test)
    Y_pred_train = my_log_regr.predict(X_train)
    print(f'Learning rate = {lr}, iterations = {iters}, train accuracy = {np.mean(Y_train == Y_pred_train)}, test accuracy = {np.mean(Y_test == Y_pred_test)}')


Learning rate = 0.1, iterations = 500, train accuracy = 0.7182940516273849, test accuracy = 0.6738609112709832
Learning rate = 0.1, iterations = 1000, train accuracy = 0.7418630751964085, test accuracy = 0.7194244604316546
Learning rate = 0.1, iterations = 1500, train accuracy = 0.7609427609427609, test accuracy = 0.7697841726618705
Learning rate = 0.01, iterations = 1000, train accuracy = 0.6161616161616161, test accuracy = 0.6354916067146283
Learning rate = 0.01, iterations = 2000, train accuracy = 0.6464646464646465, test accuracy = 0.6594724220623501
Learning rate = 0.01, iterations = 5000, train accuracy = 0.7216610549943884, test accuracy = 0.7721822541966427
Learning rate = 0.01, iterations = 10000, train accuracy = 0.7575757575757576, test accuracy = 0.8345323741007195
Learning rate = 0.001, iterations = 1000, train accuracy = 0.6879910213243546, test accuracy = 0.6522781774580336
Learning rate = 0.001, iterations = 3000, train accuracy = 0.6992143658810326, test accuracy = 0.6

Лучшую точность показала модель с learning rate, равным 0.001 и количеством итераций, равным 15000. Посмотрим на результаты классификации для моей модели с такими гиперпараметрами.

In [164]:
mylogit = my_Logistic_Regression(learning_rate=0.001, n_iterations=15000)
mylogit.fit(X_train, Y_train)
Y_pred = mylogit.predict(X_test)

In [165]:
confusion_matrix(Y_pred, Y_test)

Unnamed: 0,actual_1,actual_0
predicted_1,102,16
predicted_0,50,249


In [166]:
metrics_mylogreg = metrics(confusion_matrix(Y_pred, Y_test))
print(f'Accuracy: {metrics_mylogreg[0]}\nPrecision: {metrics_mylogreg[1]}\nRecall: {metrics_mylogreg[2]}')

Accuracy: 0.841726618705036
Precision: 0.864406779661017
Recall: 0.6710526315789473


Мы получили довольно низкий recall. Это значит (и видно из таблицы), что модель предсказала малую долю выживших людей, то есть много выживших она определила как погибших.

Теперь посмотрим на модель из sklearn

In [167]:
logreg_model = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg_model.predict(X_test)

In [168]:
confusion_matrix(Y_pred, Y_test)

Unnamed: 0,actual_1,actual_0
predicted_1,139,15
predicted_0,13,250


In [169]:
metrics_sklogreg = metrics(confusion_matrix(Y_pred, Y_test))
print(f'Accuracy: {metrics_sklogreg[0]}\nPrecision: {metrics_sklogreg[1]}\nRecall: {metrics_sklogreg[2]}')

Accuracy: 0.9328537170263789
Precision: 0.9025974025974026
Recall: 0.9144736842105263


Мы видим, что модель логистический регрессии из sklearn справилась немного лучше, чем реализованная. Причем recall получился немного выше, чем precision.

## 2. SVM

In [23]:
class my_SVM:
    def __init__(self, learning_rate=0.01, lambda_param=0.01, n_iters=10000):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters

    def fit(self, X, y):
        # переименовываем лейблы в -1 и 1
        y_ = np.where(y <= 0, -1, 1)
        n_samples, n_features = X.shape

        self.W = np.zeros(n_features)
        self.b = 0

        # процесс обучения (настройка весов и смещения)
        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y_[idx] * ((x_i @ self.W) - self.b) >= 1
                if condition:
                    self.W -= self.lr * (2 * self.lambda_param * self.W)
                else:
                    self.W -= self.lr * (2 * self.lambda_param * self.W - np.dot(x_i,y_[idx]))
                    self.b -= self.lr * y_[idx]

    def predict(self, X):
        res = np.dot(X, self.W) - self.b
        # предсказываем класс в зависимости от знака
        return np.where(res >= 0, 1, 0)

Попробуем подобрать гиперпараметры. Проведя некоторые тесты, лямбду я взял везде одинаковую, равную 0.01.

In [161]:
hyperparameters = [
(0.01, 0.01, 2000),
(0.01, 0.01, 5000),
(0.001, 0.01, 1000),
(0.001, 0.01, 3000),
(0.001, 0.01, 5000),
(0.001, 0.01, 10000),
(0.001, 0.01, 15000),
(0.0001, 0.01, 5000),
(0.0001, 0.01, 10000),
(0.0001, 0.01, 15000),
]

for h in hyperparameters:
    lr, lambda_param, iters = h
    my_svm = my_SVM(learning_rate=lr, lambda_param=lambda_param, n_iters=iters)
    my_svm.fit(X_train, Y_train)
    print(f'Learning rate = {lr}, lambda = {lambda_param}, iterations = {iters}, train accuracy = {np.mean(Y_train == my_svm.predict(X_train))}, test accuracy = {np.mean(Y_test == my_svm.predict(X_test))}')


Learning rate = 0.01, lambda = 0.01, iterations = 2000, train accuracy = 0.7216610549943884, test accuracy = 0.6906474820143885
Learning rate = 0.01, lambda = 0.01, iterations = 5000, train accuracy = 0.7373737373737373, test accuracy = 0.7338129496402878
Learning rate = 0.001, lambda = 0.01, iterations = 1000, train accuracy = 0.7620650953984287, test accuracy = 0.8369304556354916
Learning rate = 0.001, lambda = 0.01, iterations = 3000, train accuracy = 0.7878787878787878, test accuracy = 0.8585131894484412
Learning rate = 0.001, lambda = 0.01, iterations = 5000, train accuracy = 0.7777777777777778, test accuracy = 0.841726618705036
Learning rate = 0.001, lambda = 0.01, iterations = 10000, train accuracy = 0.7856341189674523, test accuracy = 0.8561151079136691
Learning rate = 0.001, lambda = 0.01, iterations = 15000, train accuracy = 0.7485970819304153, test accuracy = 0.7529976019184652
Learning rate = 0.0001, lambda = 0.01, iterations = 5000, train accuracy = 0.8002244668911336, tes

Рассмотрим модель с параметрами (0.0001, 0.01, 5000).

In [24]:
my_svm = my_SVM(learning_rate=0.0001, lambda_param=0.01, n_iters=5000)
my_svm.fit(X_train, Y_train)
Y_pred = my_svm.predict(X_test)

In [170]:
confusion_matrix(Y_pred, Y_test)

Unnamed: 0,actual_1,actual_0
predicted_1,139,15
predicted_0,13,250


In [171]:
metrics_my_svm = metrics(confusion_matrix(Y_pred, Y_test))
print(f'Accuracy: {metrics_my_svm[0]}\nPrecision: {metrics_my_svm[1]}\nRecall: {metrics_my_svm[2]}')

Accuracy: 0.9328537170263789
Precision: 0.9025974025974026
Recall: 0.9144736842105263


Посмотрим на модель SVM из sklearn

In [8]:
sklearn_svm_model = SVC()
sklearn_svm_model.fit(X_train, Y_train)
Y_pred = sklearn_svm_model.predict(X_test)

In [9]:
confusion_matrix(Y_pred, Y_test)

Unnamed: 0,actual_1,actual_0
predicted_1,41,35
predicted_0,111,230


In [10]:
metrics_sklearn_svm = metrics(confusion_matrix(Y_pred, Y_test))
print(f'Accuracy: {metrics_sklearn_svm[0]}\nPrecision: {metrics_sklearn_svm[1]}\nRecall: {metrics_sklearn_svm[2]}')

Accuracy: 0.6498800959232613
Precision: 0.5394736842105263
Recall: 0.26973684210526316


In [13]:
# С нормализацией данных по столбцам
X_train_normalized = np.apply_along_axis(lambda x: (x-x.mean())/ x.std(), 0, X_train)
X_test_normalized = np.apply_along_axis(lambda x: (x-x.mean())/ x.std(), 0, X_test)

sklearn_svm_model = SVC()
sklearn_svm_model.fit(X_train_normalized, Y_train)
Y_pred = sklearn_svm_model.predict(X_test_normalized)
confusion_matrix(Y_pred, Y_test)

Unnamed: 0,actual_1,actual_0
predicted_1,115,7
predicted_0,37,258


In [14]:
metrics_sklearn_svm = metrics(confusion_matrix(Y_pred, Y_test))
print(f'Accuracy: {metrics_sklearn_svm[0]}\nPrecision: {metrics_sklearn_svm[1]}\nRecall: {metrics_sklearn_svm[2]}')

Accuracy: 0.894484412470024
Precision: 0.9426229508196722
Recall: 0.756578947368421


Мы видим, что нормализация данных сильно улучшила качество классификации. Подберем гиперпараметры с помощью RandomizedSearchCV.

In [19]:
def svc_param_selection(X, y):
    Cs = [0.001, 0.01, 0.1, 1, 10, 100]
    gammas = [0.001, 0.01, 0.1, 1]
    kernels = ['linear', 'rbf']
    param_grid = {'C': Cs, 'gamma' : gammas, 'kernel': kernels}
    search = RandomizedSearchCV(SVC(), param_grid)
    search.fit(X, y)
    search.best_params_
    return search.best_params_

svc_param_selection(X_train_normalized, Y_train)

{'kernel': 'rbf', 'gamma': 0.01, 'C': 100}

In [20]:
sklearn_svm_model = SVC(kernel='rbf', gamma=0.01, C=100)
sklearn_svm_model.fit(X_train_normalized, Y_train)
Y_pred = sklearn_svm_model.predict(X_test_normalized)

In [21]:
confusion_matrix(Y_pred, Y_test)

Unnamed: 0,actual_1,actual_0
predicted_1,135,3
predicted_0,17,262


In [22]:
metrics_sklearn_svm = metrics(confusion_matrix(Y_pred, Y_test))
print(f'Accuracy: {metrics_sklearn_svm[0]}\nPrecision: {metrics_sklearn_svm[1]}\nRecall: {metrics_sklearn_svm[2]}')

Accuracy: 0.9520383693045563
Precision: 0.9782608695652174
Recall: 0.8881578947368421


## 3. Дерево решений

In [179]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
        # для внутренней вершины
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain
        # для листа
        self.value = value


class my_DecisionTreeClassifier():
    def __init__(self, min_samples_split=2, max_depth=2):
        self.root = None

        # гиперпараметры для ограничения построения дерева
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        
    def build_tree(self, dataset, curr_depth=0):
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        
        # рекурсивная процедура разделения узлов, зависит от гиперпараметров
        if num_samples >= self.min_samples_split and curr_depth <= self.max_depth:
            best_split = self.get_best_split(dataset, num_samples, num_features)
            
            # если это условие не выполняется, то мы хотим разделить узел, который и так отвечает за 1 класс
            if best_split['info_gain'] > 0:
                left_subtree = self.build_tree(best_split['dataset_left'], curr_depth + 1)
                right_subtree = self.build_tree(best_split['dataset_right'], curr_depth + 1)
                return Node(best_split['feature_index'], best_split['threshold'], 
                                left_subtree, right_subtree, best_split['info_gain'])
        
        leaf_value = max(list(Y), key=list(Y).count)
        # возвращаем лист
        return Node(value=leaf_value)
    
    def get_best_split(self, dataset, num_samples, num_features):
        ''' Жадно находит лучший способ разделения узла '''
        best_split = {}
        max_info_gain = -float('inf')
        
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            for threshold in np.unique(feature_values):
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                if len(dataset_left) > 0 and len(dataset_right) > 0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    curr_info_gain = self.information_gain(y, left_y, right_y)
                    
                    # если получили лучший вариант разделения, то обновляем старые параметры
                    if curr_info_gain > max_info_gain:
                        best_split['feature_index'] = feature_index
                        best_split['threshold'] = threshold
                        best_split['dataset_left'] = dataset_left
                        best_split['dataset_right'] = dataset_right
                        best_split['info_gain'] = curr_info_gain
                        max_info_gain = curr_info_gain

        return best_split
    

    def split(self, dataset, feature_index, threshold):
        dataset_left = np.array([row for row in dataset if row[feature_index] <= threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index] > threshold])
        return dataset_left, dataset_right
    
    def information_gain(self, parent, l_child, r_child):
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        gain = self.entropy(parent) - (weight_l * self.entropy(l_child) + weight_r * self.entropy(r_child))
        return gain
    
    def entropy(self, y):
        class_labels = np.unique(y)
        entropy = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            entropy += -p_cls * np.log2(p_cls)
        return entropy
        
    def fit(self, X, Y):
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)
    
    def predict(self, X):
        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions
    
    def make_prediction(self, x, tree):
        ''' Предсказание для одного наблюдения '''
        if tree.value != None: 
            return tree.value
        
        feature_val = x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

In [180]:
hyperparams = [(i, j) for i in range(2, 4) for j in range(1, 4)]
results_mytree = []

for h in hyperparams:
    min_samples_split, max_depth = h
    mytree = my_DecisionTreeClassifier(min_samples_split, max_depth)
    mytree.fit(pd.DataFrame(X_train), pd.DataFrame(Y_train))
    results_mytree.append([min_samples_split, max_depth, np.mean(mytree.predict(X_train) == Y_train), np.mean(mytree.predict(X_test) == Y_test)])

results_mytree.sort(key=lambda x: x[3], reverse=True)
results_mytree

[[2, 1, 0.7867564534231201, 1.0],
 [3, 1, 0.7867564534231201, 1.0],
 [2, 2, 0.8226711560044894, 0.9640287769784173],
 [3, 2, 0.8226711560044894, 0.9640287769784173],
 [2, 3, 0.835016835016835, 0.8776978417266187],
 [3, 3, 0.835016835016835, 0.8776978417266187]]

In [181]:
mytree = my_DecisionTreeClassifier(min_samples_split=1, max_depth=1)
mytree.fit(pd.DataFrame(X_train), pd.DataFrame(Y_train))
confusion_matrix(mytree.predict(X_test), Y_test)

Unnamed: 0,actual_1,actual_0
predicted_1,152,0
predicted_0,0,265


In [182]:
metrics_mytree = metrics(confusion_matrix(mytree.predict(X_test), Y_test))
print(f'Accuracy: {metrics_mytree[0]}\nPrecision: {metrics_mytree[1]}\nRecall: {metrics_mytree[2]}')

Accuracy: 1.0
Precision: 1.0
Recall: 1.0


In [184]:
hyperparams = [(i, j) for i in range(2, 15) for j in range(1, 15)]
results_sktree = []

for h in hyperparams:
    min_samples_split, max_depth = h
    sktree = DecisionTreeClassifier(criterion='entropy', min_samples_split=min_samples_split, max_depth=max_depth)
    sktree.fit(pd.DataFrame(X_train), pd.DataFrame(Y_train))
    results_sktree.append([min_samples_split, max_depth, np.mean(sktree.predict(X_train) == Y_train), np.mean(sktree.predict(X_test) == Y_test)])

results_sktree.sort(key=lambda x: x[3], reverse=True)
results_sktree

[[2, 1, 0.7867564534231201, 1.0],
 [3, 1, 0.7867564534231201, 1.0],
 [4, 1, 0.7867564534231201, 1.0],
 [5, 1, 0.7867564534231201, 1.0],
 [6, 1, 0.7867564534231201, 1.0],
 [7, 1, 0.7867564534231201, 1.0],
 [8, 1, 0.7867564534231201, 1.0],
 [9, 1, 0.7867564534231201, 1.0],
 [10, 1, 0.7867564534231201, 1.0],
 [11, 1, 0.7867564534231201, 1.0],
 [12, 1, 0.7867564534231201, 1.0],
 [13, 1, 0.7867564534231201, 1.0],
 [14, 1, 0.7867564534231201, 1.0],
 [5, 3, 0.8226711560044894, 0.9640287769784173],
 [7, 3, 0.8226711560044894, 0.9640287769784173],
 [9, 3, 0.8226711560044894, 0.9640287769784173],
 [10, 3, 0.8226711560044894, 0.9640287769784173],
 [11, 3, 0.8226711560044894, 0.9640287769784173],
 [13, 3, 0.8226711560044894, 0.9640287769784173],
 [14, 3, 0.8226711560044894, 0.9640287769784173],
 [2, 3, 0.8226711560044894, 0.9616306954436451],
 [3, 3, 0.8226711560044894, 0.9616306954436451],
 [4, 3, 0.8226711560044894, 0.9616306954436451],
 [6, 3, 0.8226711560044894, 0.9616306954436451],
 [8, 3, 0.

Получился очень интересный результат: моя модель и модель из sklearn при глубине дерева, равной единице, верно классифицирует все тестовые наблюдения.