## ML Лабораторная 0

### Тимофеева Наталья

### М8О-408Б-19

#### Подготовка данных

In [589]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import euclidean_distances
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import TransformerMixin
from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [590]:
df = pd.read_csv('diabetes_binary_health_indicators_BRFSS2015.csv')

df = shuffle(df)
df = df[:len(df)//50]

In [591]:
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
192471,0.0,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,13.0,4.0,4.0
30221,0.0,1.0,0.0,1.0,20.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,4.0,0.0,0.0,0.0,0.0,13.0,5.0,6.0
112048,0.0,0.0,0.0,1.0,24.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,2.0,0.0,0.0,1.0,7.0,5.0,6.0
205715,0.0,1.0,0.0,1.0,31.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,4.0,8.0
229442,0.0,1.0,1.0,1.0,29.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,5.0,30.0,30.0,1.0,0.0,12.0,4.0,2.0


In [592]:
df.shape

(5073, 22)

In [593]:
target = 'Diabetes_binary'
numerical_features = ['BMI', 'MentHlth', 'PhysHlth',]
categorical_features = [feat for feat in list(df.columns) if feat != target and feat not in numerical_features]

##### Разделим на тренировочную и тестирующую выборку

In [594]:
X = df.drop(target,axis=1)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True)

In [595]:
Preprocessing = ColumnTransformer([
   # ('Encoder', OneHotEncoder(), categorical_features),
   ('MinMax', MinMaxScaler(), [feature for feature in numerical_features if feature != 'BMI']),
   ('StandartScaler', StandardScaler(), ['BMI'])
   ], remainder='passthrough')

##### Считаем метрики

In [596]:
def Metrics(model, X, y):
    y_pred = model.predict(X)
    print('Accuracy = ', accuracy_score(y, y_pred))
    print('Precision = ', precision_score(y, y_pred))
    print('Recall = ', recall_score(y, y_pred))
    print('ROC_AUC_score = ', roc_auc_score(y, y_pred))
    print('Confudion_Matrix =')
    print(confusion_matrix(y, y_pred))

## SVM

In [597]:
class MySVM(BaseEstimator, ClassifierMixin):
    def __init__(self, epochs=1000, learning_rate=0.001, lambd = 0.01):
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.lambd = lambd
        self.w = None
    def gradLoss(self, X, y):
        loss = self.lambd*self.w
        for i, Xi in enumerate(X):
            if (y[i] * np.dot(self.w, Xi)) <= 0:
                loss -= y[i]*Xi
        return loss
    def fit(self, X, y):
        if self.w is None:
            self.w = np.random.randn(X.shape[1] + 1)
        X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
        y = np.where(y == 0, -1, y)
        for i in range(self.epochs):
            self.w -= self.learning_rate*self.gradLoss(X, y)
    def predict(self, X):
        X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
        pred = np.dot(X,self.w)
        pred = np.where(pred >= 0, 1, 0)
        return pred

In [598]:
myPipelineSVM = Pipeline([
    ('Column Transform', Preprocessing),
    ('SVM', MySVM())
])
SVMmy = GridSearchCV(myPipelineSVM,{'SVM__epochs':[500,1000], 'SVM__learning_rate':[0.001, 0.00001]})
SVMmy.fit(X_train, y_train)

In [599]:
Metrics(SVMmy, X_test, y_test)

Accuracy =  0.8541871921182266
Precision =  0.44
Recall =  0.07586206896551724
ROC_AUC_score =  0.5298850574712644
Confudion_Matrix =
[[856  14]
 [134  11]]


In [600]:
pickle.dump(SVMmy,open('data/mySVM.pkl','wb'))

##### Sklearn SVM

In [601]:
pipelineSVM = Pipeline([
    ('Column Transform', Preprocessing),
    ('SVM', LinearSVC())
])
svm = GridSearchCV(pipelineSVM,{'SVM__C':[10, 1, 0.1, 0.01], 'SVM__loss': ['hinge', 'squared_hinge']})
svm.fit(X_train, y_train)



In [602]:
Metrics(svm, X_test, y_test)

Accuracy =  0.8522167487684729
Precision =  0.3684210526315789
Recall =  0.04827586206896552
ROC_AUC_score =  0.5172413793103449
Confudion_Matrix =
[[858  12]
 [138   7]]


In [603]:
pickle.dump(svm,open('data/sklearnSVM.pkl','wb'))

## Naive bayes

In [604]:
class MyNB(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass
    def predict(self,X):
        def gaussian_prob(x,std,mean):
            return np.exp(-(x - mean)**2 / (2 * std**2)) / np.sqrt(2*np.pi*std**2)
        y = np.zeros(X.shape[0])
        for i, x in enumerate(X):
            var_prob = np.array(self.prob_y)
            for j in range(len(self.prob_y)):
                p = np.array([gaussian_prob(x[k], self.std_X_y[j][k], self.means_X_y[j][k]) for k in range(X.shape[1])])
                var_prob[j] *= np.prod(p)
            y[i] = np.argmax(var_prob)
        return y
    def fit(self, X, y):
        labels, counts = np.unique(y, return_counts=True)
        self.prob_y = [count / y.shape[0] for count in counts]
        self.means_X_y = [X[y == label].mean(axis = 0) for label in labels]
        self.std_X_y = [X[y == label].std(axis = 0) for label in labels]
        return self

In [605]:
myPipelineNB = Pipeline([
    ('Column Transform', Preprocessing),
    ('NB', MyNB())
])
myPipelineNB.fit(X_train, y_train)

In [606]:
Metrics(myPipelineNB, X_test, y_test)

Accuracy =  0.7527093596059113
Precision =  0.2827868852459016
Recall =  0.47586206896551725
ROC_AUC_score =  0.6373563218390804
Confudion_Matrix =
[[695 175]
 [ 76  69]]


In [607]:
pickle.dump(myPipelineNB,open('data/myNB.pkl','wb'))

##### Sklearn Naive bayes

In [608]:
pipelineNB = Pipeline([
    ('Column Transform', Preprocessing),
    ('NB', GaussianNB())
])
pipelineNB.fit(X_train, y_train)

In [609]:
Metrics(pipelineNB, X_test, y_test)

Accuracy =  0.7527093596059113
Precision =  0.2827868852459016
Recall =  0.47586206896551725
ROC_AUC_score =  0.6373563218390804
Confudion_Matrix =
[[695 175]
 [ 76  69]]


In [610]:
pickle.dump(pipelineNB,open('data/sklearnNB.pkl','wb'))

## Logistic Regression

In [611]:
class MyLogist(BaseEstimator, ClassifierMixin):
    def __init__(self, epochs = 100, learning_rate = 0.1):
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.w = None
    def fit(self, X, y):
        if self.w is None:
            self.w = np.random.randn(X.shape[1] + 1)
        X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) 
        for i in range(self.epochs):
            self.w -= self.learning_rate*self._loss_grad(X,y)
        return self
    def _sigmoid(self, X):
        return 1.0 / (1.0 + np.exp(-X))
    def predict(self, X):
        X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
        return self._sigmoid(np.dot(X, self.w)) > 0.5
    def _predict_no_bias(self, X):
        return self._sigmoid(np.dot(X, self.w)) > 0.5
    def _loss_grad(self, X, y):
        return -((y - self._predict_no_bias(X)) @ X)

In [612]:
myPipelineLogist = Pipeline([
    ('Column Transform', Preprocessing),
    ('logit', MyLogist())
])
myLogit = GridSearchCV(myPipelineLogist,{'logit__learning_rate':[0.1, 0.5],'logit__epochs':[100, 500, 1000]})
myLogit.fit(X_train, y_train)

  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 / (1.0 + np.exp(-X))
  return 1.0 /

In [613]:
Metrics(myLogit, X_test, y_test)

Accuracy =  0.858128078817734
Precision =  1.0
Recall =  0.006896551724137931
ROC_AUC_score =  0.503448275862069
Confudion_Matrix =
[[870   0]
 [144   1]]


  return 1.0 / (1.0 + np.exp(-X))


In [614]:
pickle.dump(myLogit,open('data/myLogist.pkl','wb'))

##### Sklearn Logistic Regression

In [615]:
pipelineLogist = Pipeline([
    ('Column Transform', Preprocessing),
    ('logit', LogisticRegression())
])

logist = GridSearchCV(pipelineLogist,{'logit__C':[10,1,0.1,0.01],'logit__max_iter':[500,1000,5000]})
logist.fit(X_train, y_train)

In [616]:
Metrics(logist, X_test, y_test)
pickle.dump(logist,open('data/sklearnLogist.pkl','wb'))

Accuracy =  0.8463054187192118
Precision =  0.37209302325581395
Recall =  0.1103448275862069
ROC_AUC_score =  0.5396551724137931
Confudion_Matrix =
[[843  27]
 [129  16]]


## KNN

In [617]:
class MyKNN(BaseEstimator, ClassifierMixin):
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors
    def fit(self, X, y):
        X, y = check_X_y(X, y)
        self.X = X
        self.y = y
        return self
    def predict_proba(self, X):
        X = check_array(X)   
        y = np.ndarray((X.shape[0]))
        for i, elem in enumerate(X):
            distances = euclidean_distances([elem], self.X)[0]
            distances_with_labels = np.stack((distances, self.y), axis=1)
            distances_with_labels.sort(axis=0)
            k_neighbors = distances_with_labels[:self.n_neighbors]
            labels, counts = np.unique(k_neighbors[:, 1], return_counts=True)
            for j, label in enumerate(labels):
                if label == 1:
                    proba1 = counts[j] / self.n_neighbors
                    break
            else:
                proba1 = 0
            y[i] = proba1
        return y
    def predict(self, X, threshold=0.1):
        return self.predict_proba(X) > threshold

In [618]:
myPipelineKNN = Pipeline([
    ('Column Transform', Preprocessing),
    ('knn', MyKNN())
])
KNNmy = GridSearchCV(myPipelineKNN,{'knn__n_neighbors':[1,3,5,7]})
KNNmy.fit(X_train, y_train)

In [619]:
Metrics(KNNmy, X_test, y_test)

Accuracy =  0.8571428571428571
Precision =  0.0
Recall =  0.0
ROC_AUC_score =  0.5
Confudion_Matrix =
[[870   0]
 [145   0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [620]:
pickle.dump(KNNmy,open('data/myKNN.pkl','wb'))

In [621]:
KNNmy.best_params_

{'knn__n_neighbors': 1}

KNN Sklearn

In [622]:
pipelineKNN = Pipeline([
    ('Column Transform', Preprocessing),
    ('knn', KNeighborsClassifier())
])
knn = GridSearchCV(pipelineKNN,{'knn__n_neighbors':[1,3,5,7]})
knn.fit(X_train, y_train)

In [623]:
Metrics(knn, X_test, y_test)
pickle.dump(knn,open('data/sklearnKNN.pkl','wb'))

Accuracy =  0.8384236453201971
Precision =  0.32075471698113206
Recall =  0.11724137931034483
ROC_AUC_score =  0.5379310344827586
Confudion_Matrix =
[[834  36]
 [128  17]]


In [624]:
knn.best_params_

{'knn__n_neighbors': 7}