In [647]:
import pandas as pd
import numpy as np
import pandas_profiling
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

Анализ и подготовка данных

In [648]:
df = pd.read_csv('data_DZ_6.txt', sep=",", header=None, names=[   
   "Sample_code_number",           # id number
   "Class",                        # 2 for benign, 4 for malignant
   "Clump_Thickness",              # 1 - 10
   "Uniformity_of_Cell_Size",      # 1 - 10
   "Uniformity_of_Cell_Shape",     # 1 - 10
   "Marginal_Adhesion",            # 1 - 10
   "Single_Epithelial_Cell_Size",  # 1 - 10
   "Bare_Nuclei",                  # 1 - 10
   "Bland_Chromatin",              # 1 - 10
   "Normal_Nucleoli",              # 1 - 10
   "Mitoses"                       # 1 - 10
   ])

In [494]:
pandas_profiling.ProfileReport(df)



In [649]:
X = df.drop(['Sample_code_number','Class'], axis=1)

In [650]:
y = np.where(df['Class'] == 2, 0, 1) # заменили 2 на 0, 4 на 1

In [651]:
X = preprocessing.scale(X) # стандартизировали данные

In [652]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)  # разбили на подвыборки

Метод опорных векторов

In [653]:
clf = SVC() # SVM с ядерным трюком

In [654]:
param_grid =[{'kernel': ['poly'], # задаем сетку параметров для полимиального ядра и RBF ядра
              'degree': [2, 5, 8, 10, 20], 
              'C': [0.0001, 0.001, 0.1, 3, 10, 100, 1000]},
             {'kernel': ['rbf'], 
              'gamma': [0.000001, 0.00001, 0.001, 0.01, 0.1, 10], 
              'C': [100, 1000, 10000, 100000]}]

In [655]:
grid_serch = GridSearchCV(clf, param_grid, cv=5, scoring ='f1')

In [656]:
grid_serch.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'kernel': ['poly'], 'degree': [2, 5, 8, 10, 20], 'C': [0.0001, 0.001, 0.1, 3, 10, 100, 1000]}, {'kernel': ['rbf'], 'gamma': [1e-06, 1e-05, 0.001, 0.01, 0.1, 10], 'C': [100, 1000, 10000, 100000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [657]:
grid_serch.best_params_ # лучшие параметры классификатора

{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}

In [658]:
y_pred = grid_serch.best_estimator_.predict(X_train) # предсказанные значения для тренировочной выборки

In [659]:
print('Accuracy_train_SVM:', accuracy_score(y_train, y_pred))
print('F1 score_train_SVM:', f1_score(y_train, y_pred))
print('Recall_train_SVM:', recall_score(y_train, y_pred))
print('Precision_train_SVM:', precision_score(y_train, y_pred))

Accuracy_train_SVM: 0.9700854700854701
F1 score_train_SVM: 0.9575757575757576
Recall_train_SVM: 0.9634146341463414
Precision_train_SVM: 0.9518072289156626


In [660]:
y_pred_test = grid_serch.best_estimator_.predict(X_test) # предсказанные значения для тестовой выборки

In [661]:
print('Accuracy_test_SVM:', accuracy_score(y_test, y_pred_test))
print('F1 score_test_SVM:', f1_score(y_test, y_pred_test))
print('Recall_test_SVM:', recall_score(y_test, y_pred_test))
print('Precision_test_SVM:', precision_score(y_test, y_pred_test))

Accuracy_test_SVM: 0.9696969696969697
F1 score_test_SVM: 0.9536423841059603
Recall_test_SVM: 0.935064935064935
Precision_test_SVM: 0.972972972972973


Логистическая регрессия

In [662]:
log_reg = LogisticRegression()

In [663]:
param_grid_log_reg =[{'penalty' : ['l1'], 'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]},
                     {'penalty' : ['l2'], 'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}]

In [664]:
grid_serch_log_reg = GridSearchCV(log_reg, param_grid_log_reg, cv=5, scoring ='f1')

In [665]:
grid_serch_log_reg.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'penalty': ['l1'], 'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}, {'penalty': ['l2'], 'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [666]:
grid_serch_log_reg.best_params_ # лучший параметр классификатора

{'C': 0.01, 'penalty': 'l2'}

In [667]:
y_pred_log_reg = grid_serch_log_reg.best_estimator_.predict(X_train) # предсказанные значения для тренировочной выборки

In [668]:
print('Accuracy_train_log_reg:', accuracy_score(y_train, y_pred_log_reg))
print('F1 score_train_log_reg:', f1_score(y_train, y_pred_log_reg))
print('Recall_train_log_reg:', recall_score(y_train, y_pred_log_reg))
print('Precision_train_log_reg:', precision_score(y_train, y_pred_log_reg))

Accuracy_train_log_reg: 0.967948717948718
F1 score_train_log_reg: 0.954954954954955
Recall_train_log_reg: 0.9695121951219512
Precision_train_log_reg: 0.9408284023668639


In [669]:
y_pred_test_log_reg = grid_serch_log_reg.best_estimator_.predict(X_test) # предсказанные значения для тестовой выборки

In [670]:
print('Accuracy_test_log_reg:', accuracy_score(y_test, y_pred_test_log_reg))
print('F1 score_test_log_reg:', f1_score(y_test, y_pred_test_log_reg))
print('Recall_test_log_reg:', recall_score(y_test, y_pred_test_log_reg))
print('Precision_test_log_reg:', precision_score(y_test, y_pred_test_log_reg))

Accuracy_test_log_reg: 0.974025974025974
F1 score_test_log_reg: 0.961038961038961
Recall_test_log_reg: 0.961038961038961
Precision_test_log_reg: 0.961038961038961
