##### Подключение необходимых библиотек

In [104]:
#Работа с данными
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

#Оптимизация гиперпараметров
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#модели
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.naive_bayes import GaussianNB

#Качество моделей
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.metrics import multilabel_confusion_matrix

##### Получение данных и обзор

In [105]:
df=pd.read_excel('dataset.xlsx',sheet_name='full_dataset')
df.dtypes

Id        object
Name      object
Read       int64
Write      int64
FA         int64
FAR        int64
FAW        int64
FD         int64
FR         int64
WE       float64
RE       float64
LZJe       int64
LZJc       int64
FEC        int64
FER        int64
FEW        int64
HER        int64
REE       object
WEE       object
PN        object
Class      int64
dtype: object

In [106]:
df['REE']=df['REE'].astype('str')
df['WEE']=df['WEE'].astype('str')
df.sample(5)

Unnamed: 0,Id,Name,Read,Write,FA,FAR,FAW,FD,FR,WE,...,LZJe,LZJc,FEC,FER,FEW,HER,REE,WEE,PN,Class
86,3660,ShellExperienceHost.exe,563,0,14,14,0,0,0,0.0,...,0,13,0,4,4,0,.bin.jpg.png,.bin.jpg.png,ShellExperienceHost.exe,0
81,88,Registry,0,808,8,0,8,0,0,4.741343,...,0,0,0,0,4,0,,.DAT.LOG1.LOG2.dat,Registry,0
70,5608,OneDrive.exe,13554,33,16,14,2,0,0,4.536111,...,0,23,0,2,4,0,.exe.ini,.aodl.exe.ini.log,OneDrive.exe,0
33,3200,Zv7uN.exe,1415475,1405763,542,540,534,855,855,7.999891,...,79,139,855,23,25,0,.DAT.cpp.csv.doc.docx.ini.jpg.lnk.m.md.o.pdf.p...,.DAT.Zv7uN.cpp.csv.doc.docx.ini.jpg.lnk.m.md.o...,Zv7uN,1
103,4964,W10Privacy.exe,5793,367,19,19,11,19,0,4.952679,...,0,81,0,7,7,0,.dat.jpg.log.reg.tmp.txt,.dat.jpg.log.reg.tmp.txt,W10Privacy.exe,0


##### Формирование показателей для модели

Добавление показателя о том, что расширение на чтение или запись превышает 4 символа

In [107]:
r=[]
w=[]
for i in df['REE']:
    for j,st in enumerate(i.split('.')):
        if len(st)>4:
            r.append(1)
            break
        if j==len(i.split('.'))-1:
            r.append(0)
for i in df['REE']:
    for j,st in enumerate(i.split('.')):
        if len(st)>4:
            w.append(1)
            break
        if j==len(i.split('.'))-1:
            w.append(0)
df['long_read_ext']=pd.Series(r)
df['long_write_ext']=pd.Series(w)

Добавление остальных переменных

In [108]:
df['files_read_write']=df['FAR']/df['FAW']
df['files_del_acc']=df['FD']/df['FA']
df['files_read_acc']=df['FAR']/df['FA']
df['files_write_acc']=df['FAW']/df['FA']
df['volume_read_write']=df['Read']/df['Write']
df['entropy_write_read']=(df['WE']-df['RE'])/(8-df['RE'])
df['files_deleted_read']=df['FD']/df['FAR']
df['files_deleted_acc']=df['FD']/df['FA']
df['files_deleted_write']=df['FD']/df['FAW']
df['files_renamed_accessed']=df['FR']/df['FA']
df['files_renamed_read']=df['FR']/df['FAR']
df['files_renamed_write']=df['FR']/df['FAW']

In [109]:
df=df.replace(-np.Inf,np.nan).copy()
df=df.replace(np.Inf,np.nan).copy()
df.apply(lambda x: sum(x.isnull()), axis=0)

Id                         0
Name                      18
Read                       0
Write                      0
FA                         0
FAR                        0
FAW                        0
FD                         0
FR                         0
WE                         0
RE                         0
LZJe                       0
LZJc                       0
FEC                        0
FER                        0
FEW                        0
HER                        0
REE                        0
WEE                        0
PN                        14
Class                      0
long_read_ext              0
long_write_ext             0
files_read_write          19
files_del_acc              0
files_read_acc             0
files_write_acc            0
volume_read_write         21
entropy_write_read         0
files_deleted_read         7
files_deleted_acc          0
files_deleted_write       19
files_renamed_accessed     0
files_renamed_read         7
files_renamed_

In [110]:
df['Name']=df['Name'].fillna('')
df['PN']=df['PN'].fillna('')

In [111]:
df=df.dropna().copy()

In [112]:
df.groupby('Class').count()

Unnamed: 0_level_0,Id,Name,Read,Write,FA,FAR,FAW,FD,FR,WE,...,files_read_acc,files_write_acc,volume_read_write,entropy_write_read,files_deleted_read,files_deleted_acc,files_deleted_write,files_renamed_accessed,files_renamed_read,files_renamed_write
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,58,58,58,58,58,58,58,58,58,58,...,58,58,58,58,58,58,58,58,58,58
1,34,34,34,34,34,34,34,34,34,34,...,34,34,34,34,34,34,34,34,34,34


##### Предобработка данных

In [113]:
num_cols = ['files_read_write','files_del_acc','files_read_acc','files_write_acc','volume_read_write','entropy_write_read',
            'files_deleted_read','files_deleted_acc','files_deleted_write','files_renamed_accessed','files_renamed_read','files_renamed_write']
cat_cols = ['long_read_ext','long_write_ext']
target_col = 'Class'
cols = num_cols + cat_cols + [target_col]
regr_cols=num_cols+cat_cols

In [114]:
x_train, x_test, y_train, y_test = train_test_split(df[regr_cols], df[target_col], random_state=42,test_size=0.4)
y_train=pd.DataFrame(y_train,columns=['Class'])
y_test=pd.DataFrame(y_test,columns=['Class'])

In [115]:
print('Размер тренировочной выборки= {0}'.format(len(y_train)))
print('Доля вредоносных процессов в тренировочной выборке',round(float(y_train.sum()/y_train.count()),4))
print('Размер тестовой выборки= {0}'.format(len(y_test)))
print('Доля вредоносных процессов в тестовой выборке',round(float(y_test.sum()/y_test.count()),4))

Размер тренировочной выборки= 55
Доля вредоносных процессов в тренировочной выборке 0.3273
Размер тестовой выборки= 37
Доля вредоносных процессов в тестовой выборке 0.4324


In [116]:
norm = preprocessing.StandardScaler() #Для стандартизации z-меток
# norm = preprocessing.MaxAbsScaler() #Для стандартизации в интервал [-1;1]
# norm = preprocessing.MinMaxScaler() #Для стандартизации в интервал [0;1]
norm.fit(x_train)
X= norm.transform(x_train) 
x_train = pd.DataFrame(X, index=x_train.index, columns=x_train.columns)

In [117]:
norm = preprocessing.StandardScaler() #Для стандартизации z-меток
# norm = preprocessing.MaxAbsScaler() #Для стандартизации в интервал [-1;1]
# norm = preprocessing.MinMaxScaler() #Для стандартизации в интервал [0;1]
norm.fit(x_test)
X= norm.transform(x_test) 
x_test = pd.DataFrame(X, index=x_test.index, columns=x_test.columns)

### Построение моделей

Создание новой метрики качества

In [120]:
def new_score(y_true,y_predict):
    MCM=multilabel_confusion_matrix(y_true, y_predict)
    tp=MCM[0,1,1]
    fp=MCM[0,1,0]
    fn=MCM[0,0,1]
    tn=MCM[0,0,0]
    specificity=tn/(tn+fp)
    recall=tp/(tp+fn)
    return ((2*specificity+recall)/3)

In [121]:
new_scorer=make_scorer(new_score,greater_is_better=True)

##### Поиск по сетке параметров моделей

In [122]:
def search(x, y, model, param_names, grids):
    parameters = {}
    for i,p in enumerate(param_names):
        parameters[p]=grids[i]
    
    CV_model = GridSearchCV(estimator=model, 
                            param_grid=parameters,
                            cv=6, 
                            scoring=new_scorer,
                            n_jobs=-1, 
                            verbose=10)
    CV_model.fit(x, y)
    means = CV_model.cv_results_['mean_test_score']
    error = CV_model.cv_results_['std_test_score']
    bp=CV_model.best_params_
    bi=CV_model.best_index_
    bs=CV_model.best_score_
    return means,error,bp,bi,bs

#### Деревья решений

In [138]:
model = DecisionTreeClassifier()                              

model.fit(x_train, y_train)
#Качество на тренировочной выборке 
fold=6
print('Средний результат на тренировочной выборке равен ',round(sum(cross_val_score(model,x_train,y_train,scoring=new_scorer,cv=fold,n_jobs=-1))/fold,5))

#Важность переменных
pd.DataFrame({'feature': x_train.columns,
              'importance': model.feature_importances_}).sort_values('importance', ascending=False)

Средний результат на тренировочной выборке равен  0.88413


Unnamed: 0,feature,importance
5,entropy_write_read,0.640819
3,files_write_acc,0.264264
8,files_deleted_write,0.074324
13,long_write_ext,0.020592
0,files_read_write,0.0
1,files_del_acc,0.0
2,files_read_acc,0.0
4,volume_read_write,0.0
6,files_deleted_read,0.0
7,files_deleted_acc,0.0


In [141]:
#Качество на тестовой выборке
model.fit(x_train, y_train)    
y_pred = model.predict(x_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred) 
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_)
print('\nТочность на тестовой выборке',round(new_score(y_test,y_pred),5),'\n')
print('Матрица ошибок\n',conf_mat,'\n')
print (metrics.classification_report(y_pred, y_test))

  model.fit(x_train, y_train)



Точность на тестовой выборке 0.94667 

Матрица ошибок
     0   1
0  21   0
1   4  12 

              precision    recall  f1-score   support

           0       1.00      0.84      0.91        25
           1       0.75      1.00      0.86        12

    accuracy                           0.89        37
   macro avg       0.88      0.92      0.89        37
weighted avg       0.92      0.89      0.89        37



In [142]:
model=DecisionTreeClassifier()
param_names = [ 'max_depth','min_samples_split','min_samples_leaf']
grids = [np.arange(1,11), np.arange(2, 11),np.arange(1,11)]
[means,error,bp,bi,bs]=search(x_train, y_train, model, param_names, grids)

Fitting 6 folds for each of 900 candidates, totalling 5400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0359s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0967s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1683s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 288 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 584 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 704 tasks      | elapsed:  

In [143]:
print('Лучший набор параметров {0}'.format(bp))
print('Лучшее значение score при этом наборе параметров {0}'.format(round(bs,5)))

Лучший набор параметров {'max_depth': 3, 'min_samples_leaf': 6, 'min_samples_split': 4}
Лучшее значение score при этом наборе параметров 0.96925


In [144]:
model = DecisionTreeClassifier(random_state=3,
                               max_depth=bp['max_depth'],
                               min_samples_split=bp['min_samples_split'],
                               min_samples_leaf=bp['min_samples_leaf']
                              )
fold=6
print('Средний результат на тренировочной выборке равен ',round(sum(cross_val_score(model,x_train,y_train,scoring=new_scorer,cv=fold,n_jobs=-1))/fold,5))

model.fit(x_train, y_train)    
y_pred = model.predict(x_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred) 
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_)
print('\nТочность на тестовой выборке',round(new_score(y_test,y_pred),5),'\n')
print('Матрица ошибок\n',conf_mat,'\n')
print (metrics.classification_report(y_pred, y_test))

Средний результат на тренировочной выборке равен  0.9494

Точность на тестовой выборке 0.9089 

Матрица ошибок
     0   1
0  20   1
1   3  13 

              precision    recall  f1-score   support

           0       0.95      0.87      0.91        23
           1       0.81      0.93      0.87        14

    accuracy                           0.89        37
   macro avg       0.88      0.90      0.89        37
weighted avg       0.90      0.89      0.89        37



#### Метод ближайших соседей

In [145]:
model = KNeighborsClassifier ()
fold=6
print('Средний результат на тренировочной выборке равен ',round(sum(cross_val_score(model,x_train,y_train,scoring=new_scorer,cv=fold,n_jobs=-1))/fold,5))

model.fit(x_train, y_train)    
y_pred = model.predict(x_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred) 
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_)
print('\nТочность на тестовой выборке',round(new_score(y_test,y_pred),5),'\n')
print('Матрица ошибок\n',conf_mat,'\n')
print (metrics.classification_report(y_pred, y_test))

Средний результат на тренировочной выборке равен  0.87718

Точность на тестовой выборке 0.82738 

Матрица ошибок
     0   1
0  18   3
1   3  13 

              precision    recall  f1-score   support

           0       0.86      0.86      0.86        21
           1       0.81      0.81      0.81        16

    accuracy                           0.84        37
   macro avg       0.83      0.83      0.83        37
weighted avg       0.84      0.84      0.84        37



  model.fit(x_train, y_train)


In [146]:
model=KNeighborsClassifier()
param_names = ['n_neighbors','weights','metric']
grids = [np.arange(1,11), np.array(['uniform','distance']),np.array(['euclidean','manhattan','chebyshev','minkowski','wminkowski','seuclidean','mahalanobis'])]
[means,error,bp,bi,bs]=search(x_train, y_train['Class'].ravel(), model, param_names, grids)

Fitting 6 folds for each of 140 candidates, totalling 840 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0539s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1954s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1739s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 240 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 432 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 656 tasks      | elapsed:  

In [147]:
print('Лучший набор параметров {0}'.format(bp))
print('Лучшее значение score при этом наборе параметров {0}'.format(round(bs,5)))

Лучший набор параметров {'metric': 'manhattan', 'n_neighbors': 6, 'weights': 'uniform'}
Лучшее значение score при этом наборе параметров 0.91885


In [148]:
model = KNeighborsClassifier (n_neighbors=bp['n_neighbors'], metric=bp['metric'],weights=bp['weights'])
fold=6
print('Средний результат на тренировочной выборке равен ',round(sum(cross_val_score(model,x_train,y_train,scoring=new_scorer,cv=fold,n_jobs=-1))/fold,5))

model.fit(x_train, y_train)    
y_pred = model.predict(x_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred) 
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_)
print('\nТочность на тестовой выборке',round(new_score(y_test,y_pred),5),'\n')
print('Матрица ошибок\n',conf_mat,'\n')
print (metrics.classification_report(y_pred, y_test))

Средний результат на тренировочной выборке равен  0.91885

Точность на тестовой выборке 0.78468 

Матрица ошибок
     0   1
0  18   3
1   5  11 

              precision    recall  f1-score   support

           0       0.86      0.78      0.82        23
           1       0.69      0.79      0.73        14

    accuracy                           0.78        37
   macro avg       0.77      0.78      0.78        37
weighted avg       0.79      0.78      0.79        37



  model.fit(x_train, y_train)


#### Случайный лес

In [149]:
model = RandomForestClassifier(#random_state=42,
                               n_estimators=150,
                               criterion='gini',
                               max_depth=8,
                               min_samples_split=2,
                               min_samples_leaf=1,
                               oob_score=True,
                               warm_start=False,
                               class_weight=None)

fold=6
print('Средний результат на тренировочной выборке равен ',round(sum(cross_val_score(model,x_train,y_train,scoring=new_scorer,cv=fold,n_jobs=-1))/fold,5))

model.fit(x_train, y_train)    
y_pred = model.predict(x_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred) 
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_)
print('\nТочность на тестовой выборке',round(new_score(y_test,y_pred),5),'\n')
print('Матрица ошибок\n',conf_mat,'\n')
print (metrics.classification_report(y_pred, y_test))

Средний результат на тренировочной выборке равен  0.87851


  model.fit(x_train, y_train)



Точность на тестовой выборке 0.9359 

Матрица ошибок
     0   1
0  21   0
1   5  11 

              precision    recall  f1-score   support

           0       1.00      0.81      0.89        26
           1       0.69      1.00      0.81        11

    accuracy                           0.86        37
   macro avg       0.84      0.90      0.85        37
weighted avg       0.91      0.86      0.87        37



In [134]:
model=RandomForestClassifier(n_estimators=200)
param_names = [ 'max_depth','min_samples_split','min_samples_leaf']
grids = [np.arange(5,12), np.arange(2, 8),np.arange(1,8)]
[means,error,bp,bi,bs]=search(x_train, y_train['Class'].ravel(), model, param_names, grids)

Fitting 6 folds for each of 294 candidates, totalling 1764 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   45.6s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   

In [135]:
print('Лучший набор параметров {0}'.format(bp))
print('Лучшее значение score при этом наборе параметров {0}'.format(round(bs,5)))

Лучший набор параметров {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 6}
Лучшее значение score при этом наборе параметров 0.93353


In [140]:
model = RandomForestClassifier(random_state=1,
                                   n_estimators=200,
                                   max_depth=bp['max_depth'],
                                   min_samples_split=bp['min_samples_split'],
                                   min_samples_leaf=bp['min_samples_leaf'],
                                   oob_score=True,
                                   warm_start=False,
                                   class_weight=None
                                  )
fold=6
print('Средний результат на тренировочной выборке равен ',round(sum(cross_val_score(model,x_train,y_train,scoring=new_scorer,cv=fold,n_jobs=-1))/fold,5))

model.fit(x_train, y_train)    
y_pred = model.predict(x_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred) 
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_)
print('\nТочность на тестовой выборке',round(new_score(y_test,y_pred),5),'\n')
print('Матрица ошибок\n',conf_mat,'\n')
print (metrics.classification_report(y_pred, y_test))

Средний результат на тренировочной выборке равен  0.91687


  model.fit(x_train, y_train)



Точность на тестовой выборке 0.94667 

Матрица ошибок
     0   1
0  21   0
1   4  12 

              precision    recall  f1-score   support

           0       1.00      0.84      0.91        25
           1       0.75      1.00      0.86        12

    accuracy                           0.89        37
   macro avg       0.88      0.92      0.89        37
weighted avg       0.92      0.89      0.89        37



#### Логистическая регрессия

In [150]:
model = LogisticRegression()

fold=6
print('Средний результат на тренировочной выборке равен ',round(sum(cross_val_score(model,x_train,y_train,scoring=new_scorer,cv=fold,n_jobs=-1))/fold,5))

model.fit(x_train, y_train)    
y_pred = model.predict(x_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred) 
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_)
print('\nТочность на тестовой выборке',round(new_score(y_test,y_pred),5),'\n')
print('Матрица ошибок\n',conf_mat,'\n')
print (metrics.classification_report(y_pred, y_test))

Средний результат на тренировочной выборке равен  0.86144

Точность на тестовой выборке 0.71562 

Матрица ошибок
     0  1
0  18  3
1   8  8 

              precision    recall  f1-score   support

           0       0.86      0.69      0.77        26
           1       0.50      0.73      0.59        11

    accuracy                           0.70        37
   macro avg       0.68      0.71      0.68        37
weighted avg       0.75      0.70      0.71        37



  return f(**kwargs)


In [151]:
model = LogisticRegression()
param_names = ['penalty','solver']
grids = [['l1', 'l2', 'elasticnet'],['newton-cg', 'lbfgs', 'liblinear']]
[means,error,bp,bi,bs]=search(x_train, y_train['Class'].ravel(), model, param_names, grids)

Fitting 6 folds for each of 9 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0239s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0319s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1269s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:    0.2s finished


In [152]:
print('Лучший набор параметров {0}'.format(bp))
print('Лучшее значение score при этом наборе параметров {0}'.format(round(bs,5)))

Лучший набор параметров {'penalty': 'l1', 'solver': 'liblinear'}
Лучшее значение score при этом наборе параметров 0.87401


In [153]:
model = LogisticRegression(multi_class='auto',penalty=bp['penalty'],solver=bp['solver'])

fold=6
print('Средний результат на тренировочной выборке равен ',round(sum(cross_val_score(model,x_train,y_train,scoring=new_scorer,cv=fold,n_jobs=-1))/fold,5))

model.fit(x_train, y_train)    
y_pred = model.predict(x_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred) 
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_)
print('\nТочность на тестовой выборке',round(new_score(y_test,y_pred),5),'\n')
print('Матрица ошибок\n',conf_mat,'\n')
print (metrics.classification_report(y_pred, y_test))

Средний результат на тренировочной выборке равен  0.87401

Точность на тестовой выборке 0.7679 

Матрица ошибок
     0  1
0  19  2
1   8  8 

              precision    recall  f1-score   support

           0       0.90      0.70      0.79        27
           1       0.50      0.80      0.62        10

    accuracy                           0.73        37
   macro avg       0.70      0.75      0.70        37
weighted avg       0.80      0.73      0.74        37



  return f(**kwargs)


#### Support vector machine

In [154]:
model = SVC(kernel='linear')

fold=6
print('Средний результат на тренировочной выборке равен ',round(sum(cross_val_score(model,x_train,y_train,scoring=new_scorer,cv=fold,n_jobs=-1))/fold,5))

model.fit(x_train, y_train)    
y_pred = model.predict(x_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred) 
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_)
print('\nТочность на тестовой выборке',round(new_score(y_test,y_pred),5),'\n')
print('Матрица ошибок\n',conf_mat,'\n')
print (metrics.classification_report(y_pred, y_test))

Средний результат на тренировочной выборке равен  0.83598

Точность на тестовой выборке 0.76218 

Матрица ошибок
     0   1
0  16   5
1   3  13 

              precision    recall  f1-score   support

           0       0.76      0.84      0.80        19
           1       0.81      0.72      0.76        18

    accuracy                           0.78        37
   macro avg       0.79      0.78      0.78        37
weighted avg       0.79      0.78      0.78        37



  return f(**kwargs)


In [155]:
model = SVC()
param_names = ['kernel','C']
grids = [['linear','poly','rbf','sigmoid'],np.linspace(1,10,200)]#[1,1.1,1.2,1.3,1.5,2,3,4,5,10,100,1000]]
[means,error,bp,bi,bs]=search(x_train, y_train['Class'].ravel(), model, param_names, grids)

Fitting 6 folds for each of 800 candidates, totalling 4800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0240s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0638s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0970s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1735s.) Setting batch_size=16.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 456 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 840 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 1048 

In [156]:
print('Лучший набор параметров {0}'.format(bp))
print('Лучшее значение score при этом наборе параметров {0}'.format(round(bs,5)))

Лучший набор параметров {'C': 2.9899497487437188, 'kernel': 'linear'}
Лучшее значение score при этом наборе параметров 0.86376


In [157]:
model = SVC(kernel=bp['kernel'], C=bp['C'])

fold=6
print('Средний результат на тренировочной выборке равен ',round(sum(cross_val_score(model,x_train,y_train,scoring=new_scorer,cv=fold,n_jobs=-1))/fold,5))

model.fit(x_train, y_train)    
y_pred = model.predict(x_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred) 
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_)
print('\nТочность на тестовой выборке',round(new_score(y_test,y_pred),5),'\n')
print('Матрица ошибок\n',conf_mat,'\n')
print (metrics.classification_report(y_pred, y_test))

Средний результат на тренировочной выборке равен  0.86376

Точность на тестовой выборке 0.82738 

Матрица ошибок
     0   1
0  18   3
1   3  13 

              precision    recall  f1-score   support

           0       0.86      0.86      0.86        21
           1       0.81      0.81      0.81        16

    accuracy                           0.84        37
   macro avg       0.83      0.83      0.83        37
weighted avg       0.84      0.84      0.84        37



  return f(**kwargs)


#### Naive bayesian method

In [158]:
model = GaussianNB()
y_pred = model.fit(x_train, y_train['Class'].ravel()).predict(x_test)
model.fit(x_train, y_train['Class'].ravel())
fold=6
print('Точность на тренировочной выборке', sum(cross_val_score(model,x_train,y_train,scoring=new_scorer,cv=fold,n_jobs=-1))/fold)

conf_mat = metrics.confusion_matrix(y_test, y_pred) 
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_)
print('Матрица ошибок\n',conf_mat)
print('Точность на тестовой выборке',new_score(y_test,y_pred))
print (metrics.classification_report(y_pred, y_test))

Точность на тренировочной выборке 0.8216269841269841
Матрица ошибок
     0  1
0  21  0
1  16  0
Точность на тестовой выборке nan
              precision    recall  f1-score   support

           0       1.00      0.57      0.72        37
           1       0.00      0.00      0.00         0

    accuracy                           0.57        37
   macro avg       0.50      0.28      0.36        37
weighted avg       1.00      0.57      0.72        37



  specificity=tn/(tn+fp)
  _warn_prf(average, modifier, msg_start, len(result))


In [69]:
model = GaussianNB()
param_names = ['var_smoothing']
grids = [[1e-14,(1e-14)/2,1e-13,(1e-13)/2,1e-12,(1e-12)/2,1e-11,(1e-11)/2,1e-10,(1e-10)/2,1e-9,(1e-9)/2,1e-8,(1e-8)/2,1e-7,(1e-7)/2,1e-6,(1e-6)/2,1e-5,(1e-5)/2,1e-4,(1e-4)/2,1e-3,(1e-3)/2,1e-2,(1e-2)/2,1e-1,(1e-1)/2]]
[means,error,bp,bi,bs]=search(x_train, y_train['Class'].ravel(), model, param_names, grids)

Fitting 6 folds for each of 28 candidates, totalling 168 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0269s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0808s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1049s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1863s.) Setting batch_size=16.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 168 out of 168 | elapsed:    0.8s finished


In [70]:
print('Лучший набор параметров {0}'.format(bp))
print('Лучшее значение score при этом наборе параметров {0}'.format(round(bs,5)))

Лучший набор параметров {'var_smoothing': 0.001}
Лучшее значение score при этом наборе параметров 0.8430555555555558


In [71]:
model=GaussianNB(var_smoothing=bp['var_smoothing'])
model.fit(x_train, y_train['Class'].ravel())
fold=6
print('Точность на тренировочной выборке', sum(cross_val_score(model,x_train,y_train,scoring=new_scorer,cv=fold,n_jobs=-1))/fold)

y_pred = model.predict(x_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred) 
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_)
print('Матрица ошибок\n',conf_mat)
print('Точность на тестовой выборке',new_score(y_test,y_pred))
print (metrics.classification_report(y_pred, y_test))

Точность на тренировочной выборке 0.8430555555555558
Матрица ошибок
     0   1
0  18   3
1   1  15
Точность на тестовой выборке 0.8713450292397661
              precision    recall  f1-score   support

           0       0.86      0.95      0.90        19
           1       0.94      0.83      0.88        18

    accuracy                           0.89        37
   macro avg       0.90      0.89      0.89        37
weighted avg       0.90      0.89      0.89        37

