ML Лабораторная 1

Тимофеева Наталья

М8О-408Б-19

In [195]:
import pandas as pd
from pandas import DataFrame 
import matplotlib.pyplot as plt
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import euclidean_distances
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import TransformerMixin
from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_curve
from sklearn.utils.multiclass import unique_labels
import warnings

Подготовка данных

In [196]:
df = pd.read_csv('diabetes_binary_health_indicators_BRFSS2015.csv')

df = shuffle(df)
df = df[:len(df)//50]

In [197]:
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
168620,0.0,1.0,0.0,1.0,32.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,2.0,15.0,0.0,0.0,12.0,6.0,7.0
127467,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,13.0,3.0,3.0
247876,0.0,1.0,0.0,0.0,36.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,13.0,4.0,5.0
97857,1.0,0.0,1.0,1.0,28.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,0.0,3.0,0.0,0.0,9.0,6.0,6.0
140191,0.0,0.0,0.0,1.0,33.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,4.0,10.0,1.0,0.0,0.0,1.0,4.0,3.0


In [198]:
df.shape

(5073, 22)

In [199]:
target = 'Diabetes_binary'
numerical_features = ['BMI', 'MentHlth', 'PhysHlth',]
categorical_features = [feat for feat in list(df.columns) if feat != target and feat not in numerical_features]

Разделим на тренировочную и тестирующую выборку

In [200]:
X = df.drop(target,axis=1)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True)

In [201]:
Preprocessing = ColumnTransformer([
   ('MinMax', MinMaxScaler(), [feature for feature in numerical_features if feature != 'BMI']),
   ('StandartScaler', StandardScaler(), ['BMI'])
   ], remainder='passthrough')

Считаем метрики

In [202]:
def get_metrics(model, X, true, threshold = 0.5, use_probas = True):
    if use_probas:
        pred_probas = model.predict_proba(X)
        if len(pred_probas.shape) == 2:
            pred_probas = pred_probas[:, 1] 
        predicted = pred_probas > threshold
    else:
        predicted = model.predict(X)
    print('Accuracy = ', accuracy_score(true, predicted))
    print('Precision = ', precision_score(true, predicted))
    print('Recall = ', recall_score(true, predicted))
    if use_probas:
        print('ROC AUC = ', roc_auc_score(true, pred_probas))
    print('Confusion matrix:')
    print(confusion_matrix(true, predicted))

SVM

In [203]:
class MySVM(ClassifierMixin, BaseEstimator):
    def __init__(self, epochs = 10, lr = 0.1, alpha = 0.1):
        self.w = None
        self.epochs = epochs
        self.lr = lr
        self.alpha = alpha
    def fit(self, X, y):
        X, y = check_X_y(X, y)
        y = np.where(y == 1, 1, -1)
        n, k = X.shape
        if self.w is None:
            np.random.seed(66)
            self.w = np.random.randn(k + 1)
        X = np.concatenate((np.ones((n, 1)), X), axis = 1)
        for i in range(self.epochs):
            for j, x in enumerate(X):
                margin = y[j] * np.dot(self.w, x)
                if margin >= 1:
                    self.w -= self.lr * self.alpha * self.w / self.epochs
                else:
                    self.w += self.lr * (y[j] * x - self.alpha * self.w / self.epochs)
        return self
    def predict(self, X):
        X = check_array(X)
        n, _ = X.shape
        X = np.concatenate((np.ones((n, 1)), X), axis=1)
        y = np.ndarray((n))
        for i, elem in enumerate(X):
            prediction = np.dot(self.w, elem)
            if prediction > 0:
                y[i] = 1
            else:
                y[i] = 0
        return y
    def _hinge_loss(self, x, y):
        return max(0, 1 - y * np.dot(x, self.w))
    def _soft_margin_loss(self, x, y):
        return self._hinge_loss(x, y) + self.alpha * np.dot(self.w, self.w)

In [204]:
my_SVM_Pipeline = Pipeline([
    ('Column Transform', Preprocessing),
    ('MySVM', MySVM())
])
my_SVM_Grid_Search = RandomizedSearchCV(my_SVM_Pipeline, 
                                       {'MySVM__lr': [0.001, 0.01, 0.05, 0.1,],
                                        'MySVM__epochs': [10, 20, 30, 40],
                                        'MySVM__alpha': [0.01, 0.1, 1, 10]})
my_SVM_Grid_Search.fit(X_train, y_train);
my_SVM_Grid_Search.best_params_

{'MySVM__lr': 0.1, 'MySVM__epochs': 10, 'MySVM__alpha': 0.1}

In [205]:
get_metrics(my_SVM_Grid_Search, X_test, y_test, use_probas = False)

Accuracy =  0.8502463054187193
Precision =  0.0
Recall =  0.0
Confusion matrix:
[[863   1]
 [151   0]]


Sklearn SVM

In [206]:
sklearn_SVM_Pipeline = Pipeline(
    steps=[
        ('Column Transform', Preprocessing),
        ('SVM', LinearSVC(class_weight='balanced'))
    ]
)
sklearn_SVM_Grid_Search = GridSearchCV(sklearn_SVM_Pipeline, {'SVM__loss': ['hinge', 'squared_hinge'], 'SVM__C': [0.01, 0.1, 1, 10]})
sklearn_SVM_Grid_Search.fit(X_train, y_train);
sklearn_SVM_Grid_Search.best_params_

{'SVM__C': 10, 'SVM__loss': 'hinge'}

In [207]:
get_metrics(sklearn_SVM_Grid_Search, X_test, y_test, use_probas=False)

Accuracy =  0.8266009852216749
Precision =  0.41007194244604317
Recall =  0.37748344370860926
Confusion matrix:
[[782  82]
 [ 94  57]]


Logistic Regression

In [208]:
class MyLogisticRegression(BaseEstimator, ClassifierMixin):
    def __init__(self, epochs=10, lr=0.1, batch_size=256):
        self.w = None
        self.epochs = epochs
        self.lr = lr
        self.batch_size = batch_size
    def fit(self, X, y):
        X, y = check_X_y(X, y)
        n, k = X.shape
        if self.w is None:
            np.random.seed(0xDEAD)
            self.w = np.random.randn(k + 1)
        X = np.concatenate((np.ones((n, 1)), X), axis=1)
        for i in range(self.epochs):
            for j in range(0, len(X), self.batch_size):
                X_batch = X[j:j+self.batch_size]
                y_batch = y[j:j+self.batch_size]
                y_pred = self._predict_proba_internal(X_batch)
                self.w -= self.lr * self._get_gradient(X_batch, y_batch, y_pred)
        return self
    def _get_gradient(self, X_batch, y_batch, y_pred):
        gradient = X_batch.T @ (y_pred - y_batch)
        return gradient
    def predict_proba(self, X):
        X = check_array(X)
        n = X.shape[0]
        X = np.concatenate((np.ones((n, 1)), X), axis=1)
        return self._sigmoid(np.dot(X, self.w))
    def _predict_proba_internal(self, X): 
        return self._sigmoid(np.dot(X, self.w))
    def predict(self, X, threshold=0.5):
        return self.predict_proba(X) > threshold
    def _sigmoid(self, a):
        warnings.filterwarnings('ignore')
        return 1. / (1 + np.exp(-a))

In [209]:
my_LogisticRegression_Pipeline = Pipeline(steps = [('Column Transform', Preprocessing), ('myLR' ,MyLogisticRegression())])
param_grid = {
    'myLR__epochs': [100, 500, 200],
    'myLR__lr': [0.1, 0.5, 0.01],
}
my_LR_Grid_Search = RandomizedSearchCV(my_LogisticRegression_Pipeline, param_grid)
my_LR_Grid_Search.fit(X_train, y_train)
my_LR_Grid_Search.best_params_

{'myLR__lr': 0.01, 'myLR__epochs': 100}

In [210]:
get_metrics(my_LR_Grid_Search, X_test, y_test, threshold=0.5)

Accuracy =  0.7921182266009852
Precision =  0.3584905660377358
Recall =  0.5033112582781457
ROC AUC =  0.7779157468727006
Confusion matrix:
[[728 136]
 [ 75  76]]


Sklearn Logistic Regression

In [211]:
sklearn_LogisticRegression_Pipeline = Pipeline([
    ('Column Transform', Preprocessing),
    ('logistic', LogisticRegression())
])
sk_logist = GridSearchCV(sklearn_LogisticRegression_Pipeline, {'logistic__C':[10,1,0.1,0.01],'logistic__max_iter':[500,1000,5000]})
sk_logist.fit(X_train, y_train)

In [212]:
get_metrics(sk_logist, X_test, y_test)

Accuracy =  0.8522167487684729
Precision =  0.5128205128205128
Recall =  0.13245033112582782
ROC AUC =  0.8031870860927152
Confusion matrix:
[[845  19]
 [131  20]]


Naive bayes

In [213]:
class MyNaivBais(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass
    def predict(self,X):
        def gaussian_prob(x,std,mean):
            return np.exp(-(x - mean)**2 / (2 * std**2)) / np.sqrt(2*np.pi*std**2)
        y = np.zeros(X.shape[0])
        for i, x in enumerate(X):
            var_prob = np.array(self.prob_y)
            for j in range(len(self.prob_y)):
                p = np.array([gaussian_prob(x[k], self.std_X_y[j][k], self.means_X_y[j][k]) for k in range(X.shape[1])])
                var_prob[j] *= np.prod(p)
            y[i] = np.argmax(var_prob)
        return y
    def fit(self, X, y):
        labels, counts = np.unique(y, return_counts=True)
        self.prob_y = [count / y.shape[0] for count in counts]
        self.means_X_y = [X[y == label].mean(axis = 0) for label in labels]
        self.std_X_y = [X[y == label].std(axis = 0) for label in labels]
        return self

In [214]:
my_NaivBais_Pipeline = Pipeline([
    ('Column Transform', Preprocessing),
    ('MyNB',  MyNaivBais())
])
my_NaivBais_Pipeline.fit(X_train, y_train)

In [215]:
get_metrics(my_NaivBais_Pipeline, X_test, y_test, use_probas=False)

Accuracy =  0.7605911330049261
Precision =  0.31746031746031744
Recall =  0.5298013245033113
Confusion matrix:
[[692 172]
 [ 71  80]]


Sklearn Naive bayes

In [216]:
sklearn_NaivBais_Pipeline = Pipeline([
    ('Column Transform', Preprocessing),
    ('NB', GaussianNB())
])
sklearn_NaivBais_Pipeline.fit(X_train, y_train)

In [217]:
get_metrics(sklearn_NaivBais_Pipeline, X_test, y_test, use_probas=False)

Accuracy =  0.7605911330049261
Precision =  0.31746031746031744
Recall =  0.5298013245033113
Confusion matrix:
[[692 172]
 [ 71  80]]


KNN

In [218]:
class My_KNN(ClassifierMixin, BaseEstimator):
    def __init__(self, k = 1):
        self.k = k
    def fit(self, X, y):
        X, y = check_X_y(X, y)
        self.classes_ = unique_labels(y)
        self.X_ = X
        self.y_ = y
        return self
    def predict(self, X):
        check_is_fitted(self, ['X_', 'y_'])
        X = check_array(X)
        y = np.ndarray((X.shape[0],))
        for (i, elem) in enumerate(X):
            distances = euclidean_distances([elem], self.X_)[0]
            neighbors = np.argpartition(distances, kth = self. k - 1)
            k_neighbors = neighbors[:self.k]
            labels, cnts = np.unique(self.y_[k_neighbors], return_counts = True)
            y[i] = labels[cnts.argmax()]
        return y

In [219]:
my_KNN_Pipeline = Pipeline([
    ('Column Transform', Preprocessing),
    ('MYKNN', My_KNN())
])
My_KNN = RandomizedSearchCV(my_KNN_Pipeline,{'MYKNN__k':[1,2,3,4,5]})
My_KNN.fit(X_train, y_train)

In [220]:
get_metrics(My_KNN, X_test, y_test, use_probas=False)

Accuracy =  0.8472906403940886
Precision =  0.4375
Recall =  0.09271523178807947
Confusion matrix:
[[846  18]
 [137  14]]


KNN Sklearn

In [221]:
sklearn_KNN_Pipeline = Pipeline([
    ('Column Transform', Preprocessing),
    ('KNN', KNeighborsClassifier())
])
KNN = GridSearchCV(sklearn_KNN_Pipeline,{'KNN__n_neighbors':[1,3,5,7]})
KNN.fit(X_train, y_train)

In [222]:
get_metrics(KNN, X_test, y_test, use_probas=False)

Accuracy =  0.8453201970443349
Precision =  0.4318181818181818
Recall =  0.12582781456953643
Confusion matrix:
[[839  25]
 [132  19]]
