# Цель: Реализовать SVM и Logistic Regression для данного датасета: https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original) Метрики оценки: Accuracy, Precision, Recall, F1-Score

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [3]:
import urllib.parse
path =  '../../../Загрузки/'
file = 'breast-cancer-wisconsin.data'
columns = ['id', 'Clump_Thick', 'Uniform_Size', 'Uniform_Shape', 'Marg_Adhesion', \
           'Sing_Epit_Cell_Size', 'Bare_Nuclei', 'Bland_ Chrom', 'Normal_Nuc', 'Mitoses', 'Class']  


In [5]:
data = pd.read_csv(path + file, sep=',', names=columns, index_col=False)
data.head(3)

Unnamed: 0,id,Clump_Thick,Uniform_Size,Uniform_Shape,Marg_Adhesion,Sing_Epit_Cell_Size,Bare_Nuclei,Bland_ Chrom,Normal_Nuc,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
id                     699 non-null int64
Clump_Thick            699 non-null int64
Uniform_Size           699 non-null int64
Uniform_Shape          699 non-null int64
Marg_Adhesion          699 non-null int64
Sing_Epit_Cell_Size    699 non-null int64
Bare_Nuclei            699 non-null object
Bland_ Chrom           699 non-null int64
Normal_Nuc             699 non-null int64
Mitoses                699 non-null int64
Class                  699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [15]:
type(data['Bare_Nuclei'][1])

str

In [16]:
data['Bare_Nuclei'] = pd.to_numeric(data['Bare_Nuclei'], errors='coerce')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
id                     699 non-null int64
Clump_Thick            699 non-null int64
Uniform_Size           699 non-null int64
Uniform_Shape          699 non-null int64
Marg_Adhesion          699 non-null int64
Sing_Epit_Cell_Size    699 non-null int64
Bare_Nuclei            683 non-null float64
Bland_ Chrom           699 non-null int64
Normal_Nuc             699 non-null int64
Mitoses                699 non-null int64
Class                  699 non-null int64
dtypes: float64(1), int64(10)
memory usage: 60.1 KB


In [17]:
data = data[np.isfinite(data['Bare_Nuclei'])]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
id                     683 non-null int64
Clump_Thick            683 non-null int64
Uniform_Size           683 non-null int64
Uniform_Shape          683 non-null int64
Marg_Adhesion          683 non-null int64
Sing_Epit_Cell_Size    683 non-null int64
Bare_Nuclei            683 non-null float64
Bland_ Chrom           683 non-null int64
Normal_Nuc             683 non-null int64
Mitoses                683 non-null int64
Class                  683 non-null int64
dtypes: float64(1), int64(10)
memory usage: 64.0 KB


In [18]:
data['Class'].value_counts()

2    444
4    239
Name: Class, dtype: int64

In [19]:
data['Class'] = data['Class'].replace({2:0,4:1})
data['Class'].value_counts()

0    444
1    239
Name: Class, dtype: int64

In [20]:
y = data['Class']
data.drop(['Class', 'id'], axis=1, inplace=True)
data.head(2)

Unnamed: 0,Clump_Thick,Uniform_Size,Uniform_Shape,Marg_Adhesion,Sing_Epit_Cell_Size,Bare_Nuclei,Bland_ Chrom,Normal_Nuc,Mitoses
0,5,1,1,1,2,1.0,3,1,1
1,5,4,4,5,7,10.0,3,2,1


In [21]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3)

In [41]:
scaler = StandardScaler()
scaler.fit(X_train,y_train)
X_scaled = scaler.transform (X_train)
X_test_scaled = scaler.transform(X_test)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


## LogisticRegression

In [45]:
model_LR = LogisticRegression()
paramgrid_LR = {'C': [0.01, 0.05, 0.1, 0.5, 1], 'penalty': ['l1','l2']}

optimizer_LR = GridSearchCV(model_LR, paramgrid_LR, cv=10)
optimizer_LR.fit(X_scaled, y_train)
predict_LR = optimizer_LR.best_estimator_.predict(X_test_scaled)



In [59]:
optimizer_LR.best_params_

{'C': 0.5, 'penalty': 'l1'}

## SVC

In [63]:
model_SVC = SVC()
paramgrid_SVC = {'kernel': ['linear', 'rbf', 'poly'], 'gamma': [0, 1,  4, 8], 'C': [0.1, 1, 10]}

optimizer_SVC = GridSearchCV(model_SVC, paramgrid_SVC, cv=10)
optimizer_SVC.fit(X_scaled, y_train)
predict_SVC = optimizer_SVC.best_estimator_.predict(X_test_scaled)



In [64]:
optimizer_SVC.best_params_

{'C': 1, 'gamma': 0, 'kernel': 'linear'}

# Score

$accuracy = \frac {TP+TN}{TP+TN+FP+FN}$

In [65]:
print('accuracy score LogisticRegression: ', accuracy_score(y_test, predict_LR))
print('accuracy score SVC: ', accuracy_score(y_test, predict_SVC))  
print('\n---------------------')

accuracy score LogisticRegression:  0.9707317073170731
accuracy score SVC:  0.9609756097560975

---------------------


$recall = \frac {TP}{TP+FN}$

In [66]:
print('recall LogisticRegression: ', recall_score(y_test, predict_LR))
print('recall SVC: ', recall_score(y_test, predict_SVC))
print('\n---------------------')

recall LogisticRegression:  0.9705882352941176
recall SVC:  0.9558823529411765

---------------------


$precision = \frac {TP}{TP+FP}$

In [67]:
print('precision LogisticRegression: ', precision_score(y_test, predict_LR))
print('precision SVC: ', precision_score(y_test, predict_SVC))
print('\n---------------------')

precision LogisticRegression:  0.9428571428571428
precision SVC:  0.9285714285714286

---------------------


$f1 = 2*\frac {precision*recall}{precision+recall}$

In [68]:
print('f1 score LogisticRegression: ', f1_score(y_test, predict_LR))
print('f1 score SVC: ', f1_score(y_test, predict_SVC))

f1 score LogisticRegression:  0.9565217391304348
f1 score SVC:  0.9420289855072465
