In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import *
from sklearn.naive_bayes import *
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn import preprocessing

Загружаем данные

In [60]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [83]:
data_frame = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/adult.csv')
data_frame.head()


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


Подготовим данные

In [84]:
data_frame['workclass'] = data_frame['workclass'].replace('?', np.nan)
data_frame['occupation'] = data_frame['occupation'].replace('?', np.nan)
data_frame['native-country'] = data_frame['native-country'].replace('?', np.nan)
data_frame = data_frame.dropna()

# Избавимся от дубликатов

data_frame = data_frame.drop_duplicates()
data_frame['income']=data_frame['income'].map({'<=50K':0,'>50K':1})
# Избавимся от ненужных данных.
#1. education. Есть замена - education.num
#2. fnlwgt(final weight) - показатель не стандартизирован для разных штатов.
#3. relationship - объединяет в себе признаки 'marital-status' и 'gender'.
#4. Native-country - в основном состоит из значений 'US'.
data_frame = data_frame.drop(['education'], axis=1)
data_frame = data_frame.drop(['fnlwgt'], axis=1)
data_frame = data_frame.drop(['relationship'], axis=1)
data_frame = data_frame.drop(['native-country'], axis=1)
data_frame = data_frame.drop(['occupation'], axis=1)
# Для построения метрических моделей все признаки должны быть числовыми.

race = {'Black' : 0, 'White' : 1, 'Amer-Indian-Eskimo' : 2, 'Asian-Pac-Islander' : 3, 'Other' : 4}
data_frame = data_frame.replace({'race' : race})

workclass = {'Private' : 0, 'Local-gov' : 1, 'Self-emp-not-inc' : 2, 'Federal-gov' : 3, 'State-gov' : 4, 'Self-emp-inc' : 5, 'Without-pay' : 6}
data_frame = data_frame.replace({'workclass' : workclass})

gender = {'Male' : 0, 'Female' : 1}
data_frame = data_frame.replace({'gender' : gender})
# Семейное положение можно разбить на 2 класса.
#  Not married = 0, Married = 1
ms = {'Never-married' : 0, 'Married-civ-spouse' : 1, 'Widowed' : 0, 'Separated' : 0, 'Divorced' : 0,
      'Married-spouse-absent' : 1, 'Married-AF-spouse' : 1}

data_frame = data_frame.replace({'marital-status' : ms})

Нормализуем данные



In [88]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(data_frame.drop("income", axis=1)))
y = data_frame["income"]

Разобьем на обучающую и тестовую

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


Добавим  функцию для оценки качества предсказания.

In [91]:
def score_prediction(y_pred, y_test):
    print("confusion matrix:\n", confusion_matrix(y_test, y_pred))
    print("accuracy:", accuracy_score(y_test, y_pred))
    print("precision:", precision_score(y_test, y_pred))
    print("recall:", recall_score(y_test, y_pred))
    print("f1_score", f1_score(y_test, y_pred))
    


Добавим сигмоиду которая необходима для реализации логистической регрессии.

In [92]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

**Логистическая регрессия**

In [93]:
class MyLogisticRegression:
    def __init__(self, lr, max_iter = 10000):
        self.max_iter = max_iter
        self.lr = lr

    def fit(self, X, y):
        X = X.to_numpy()
        y = y.to_numpy()
        X = np.hstack((np.full((X.shape[0], 1), 1),X))
        weights = np.random.rand(X.shape[1])
        for i in range(self.max_iter):
            old_weights = weights.copy()
            weights -= self.lr * np.dot(X.T, sigmoid(np.dot(X, weights)) - y) / len(X)
        self.weights = weights

    def predict(self, X):
        X = X.to_numpy()
        X = np.hstack((np.full((X.shape[0], 1), 1), X))
        labels = []
        for row in X:
            labels.append(int(sigmoid(np.dot(row, self.weights)) > 0.5))
        return labels

In [94]:
model = LogisticRegression(max_iter=10000)
model.fit(X_train,y_train)
print("Sklearn implementation accuracy {}".format(accuracy_score(model.predict(X_test), y_test)))

Sklearn implementation accuracy 0.837410071942446


In [95]:
max_accuracy = 0
max_params = (0,0)
y_pred_max = 0
for lr, max_iter in [(0.1, 100), (0.01, 100), (0.1, 1000), (0.01, 1000), (0.1, 10000), (0.01, 10000)]:
    model = MyLogisticRegression(lr=lr, max_iter=max_iter)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_pred, y_test)
    if acc > max_accuracy:
        max_params = (lr, max_iter)
        max_accuracy = acc
        y_pred_max = y_pred
print("My implementation: max accuracy with learning rate = {} and max iterations = {}"
      .format(*max_params))
score_prediction(y_pred_max, y_test)

My implementation: max accuracy with learning rate = 0.1 and max iterations = 10000
confusion matrix:
 [[6304  504]
 [ 965 1262]]
accuracy: 0.837410071942446
precision: 0.7146092865232163
recall: 0.5666816344858554
f1_score 0.6321061858251941


**Дерево решений**

In [81]:
class Node:
    def __init__(self, predicted_class):
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None


class MyDecisionTreeClassifier:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        X = X.to_numpy()
        y = y.to_numpy()
        self.n_classes_ = len(set(y))
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)

    def predict(self, X):
        X = X.to_numpy()
        return [self._predict(inputs) for inputs in X]

    def _best_split(self, X, y):
        m = y.size
        if m <= 1:
            return None, None
        num_parent = [np.sum(y == c) for c in range(self.n_classes_)]
        best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)
        best_idx, best_thr = None, None
        for idx in range(self.n_features_):
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
            num_left = [0] * self.n_classes_
            num_right = num_parent.copy()
            for i in range(1, m):
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1
                gini_left = 1.0 - sum(
                    (num_left[x] / i) ** 2 for x in range(self.n_classes_)
                )
                gini_right = 1.0 - sum(
                    (num_right[x] / (m - i)) ** 2 for x in range(self.n_classes_)
                )
                gini = (i * gini_left + (m - i) * gini_right) / m
                if thresholds[i] == thresholds[i - 1]:
                    continue
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2
        return best_idx, best_thr

    def _grow_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in range(self.n_classes_)]
        predicted_class = np.argmax(num_samples_per_class)
        node = Node(predicted_class=predicted_class)
        if depth < self.max_depth:
            idx, thr = self._best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self._grow_tree(X_left, y_left, depth + 1)
                node.right = self._grow_tree(X_right, y_right, depth + 1)
        return node

    def _predict(self, inputs):
        node = self.tree_
        while node.left:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class

In [96]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Sklearn decision tree accuracy {}".format(accuracy_score(y_test, y_pred)))

model = MyDecisionTreeClassifier(max_depth=10)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("My implementation score:")
score_prediction(y_pred, y_test)

Sklearn decision tree accuracy 0.8235749861649142
My implementation score:
confusion matrix:
 [[6470  338]
 [1015 1212]]
accuracy: 0.8502490315439956
precision: 0.7819354838709678
recall: 0.5442299057027391
f1_score 0.6417791898332009


**Метод опорных векторов**

In [97]:
class MySVM:
    def __init__(self, lr, lambda_, iters):
        self.lr = lr
        self.lambda_ = lambda_
        self.iters = iters

    def fit(self, X, y):
        X = X.to_numpy()
        y = y.to_numpy()
        n = X.shape[0]
        self.W = np.zeros(X.shape[1])
        self.b = 0
        y_ = np.where(y <= 0, -1, 1)

        for i in range(self.iters):
            for j, x in enumerate(X):
                if y_[j] * ((x @ self.W) - self.b) >= 1:
                    self.W -= self.lr * (2 * self.lambda_ * self.W)
                else:
                    self.W -= self.lr * (2 * self.lambda_ * self.W - np.dot(x,y_[j]))
                    self.b -= self.lr * y_[j]

    def predict(self, X):
        X = X.to_numpy()
        res = np.dot(X, self.W) - self.b
        y_pred = np.where(res >= 0, 1, 0)
        return y_pred

In [98]:
model = SVC()
model.fit(X_train, y_train)
print("Sklearn SVM accuracy {}".format(accuracy_score(model.predict(X_test), y_test)))

Sklearn SVM accuracy 0.8470392916436081


In [103]:
max_acc = 0
max_params = (0,0,0)
y_pred = 0
for lr_lambda_iters in [(0.1, 0.1, 10), (0.001, 0.001, 10), (0.001, 0.001, 100)]:
    lr, lambda_, iters = lr_lambda_iters
    model = MySVM(lr=lr, lambda_=lambda_, iters=iters)
    model.fit(X_train, y_train)
    y_pred_cur = model.predict(X_test)
    acc = accuracy_score(y_pred_cur, y_test)
    if acc > max_acc:
        max_acc = acc
        y_pred = y_pred_cur
        max_params = lr_lambda_iters
print("Params with max accuracy lr = {}, lambda = {}, iters = {}".format(*max_params))
score_prediction(y_pred, y_test)

Params with max accuracy lr = 0.001, lambda = 0.001, iters = 100
confusion matrix:
 [[6308  500]
 [ 963 1264]]
accuracy: 0.8380741560597675
precision: 0.7165532879818595
recall: 0.56757970363718
f1_score 0.633425206715109
