In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_table("breast-cancer-wisconsin.data",sep=",",header=None)
df.columns=['ID','clump_thickness','size_uniformity','shape_uniformity',
           'marginal_adhesion','epithelial_size','bare_nucleoli',
           'bland_chromatin','normal_nucleoli','mitoses','class']
df.set_index('ID',inplace=True)
df

Unnamed: 0_level_0,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1,3,1,1,2
1002945,5,4,4,5,7,10,3,2,1,2
1015425,3,1,1,1,2,2,3,1,1,2
1016277,6,8,8,1,3,4,3,7,1,2
1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
776715,3,1,1,1,3,2,1,1,1,2
841769,2,1,1,1,2,1,1,1,1,2
888820,5,10,10,3,7,3,8,10,2,4
897471,4,8,6,4,3,4,10,6,1,4


In [3]:
print(df.describe())

       clump_thickness  size_uniformity  shape_uniformity  marginal_adhesion  \
count       699.000000       699.000000        699.000000         699.000000   
mean          4.417740         3.134478          3.207439           2.806867   
std           2.815741         3.051459          2.971913           2.855379   
min           1.000000         1.000000          1.000000           1.000000   
25%           2.000000         1.000000          1.000000           1.000000   
50%           4.000000         1.000000          1.000000           1.000000   
75%           6.000000         5.000000          5.000000           4.000000   
max          10.000000        10.000000         10.000000          10.000000   

       epithelial_size  bland_chromatin  normal_nucleoli     mitoses  \
count       699.000000       699.000000       699.000000  699.000000   
mean          3.216023         3.437768         2.866953    1.589413   
std           2.214300         2.438364         3.053634    1.7

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 699 entries, 1000025 to 897471
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   clump_thickness    699 non-null    int64 
 1   size_uniformity    699 non-null    int64 
 2   shape_uniformity   699 non-null    int64 
 3   marginal_adhesion  699 non-null    int64 
 4   epithelial_size    699 non-null    int64 
 5   bare_nucleoli      699 non-null    object
 6   bland_chromatin    699 non-null    int64 
 7   normal_nucleoli    699 non-null    int64 
 8   mitoses            699 non-null    int64 
 9   class              699 non-null    int64 
dtypes: int64(9), object(1)
memory usage: 60.1+ KB
None


In [5]:
print(df['bare_nucleoli'].unique())

['1' '10' '2' '4' '3' '9' '7' '?' '5' '8' '6']


In [6]:
df.replace("?",np.nan,inplace=True)
df.dropna(axis=0,inplace=True)

In [7]:
print(df.isna().sum())

clump_thickness      0
size_uniformity      0
shape_uniformity     0
marginal_adhesion    0
epithelial_size      0
bare_nucleoli        0
bland_chromatin      0
normal_nucleoli      0
mitoses              0
class                0
dtype: int64


In [8]:
X = df.drop('class',axis=1)
y = df['class']

In [9]:
print(y.value_counts())
from sklearn.metrics import accuracy_score

2    444
4    239
Name: class, dtype: int64


In [10]:
def FindBestModel(X,y,Scalers):
    
    scaled_X = []
    for key, scaler in Scalers.items():
        X_scale = pd.DataFrame(scaler.fit_transform(X),columns=X.columns)
        scaled_X.append(X_scale)

    for i in range(len(scaled_X)):
        X_train, X_test, y_train, y_test = train_test_split(scaled_X[i],y,test_size=0.2,random_state=42)
        
        oversample = SMOTE()
        
        X_balanced, y_balanced = oversample.fit_resample(X_train, y_train)
        
        grid_params_dt = {
            'min_samples_split':[2,3,4,5],
            'max_features':[3,5,7,9],
            'max_depth':[3,5,7,9],
            'max_leaf_nodes':list(range(5,100))
        }
        
        grid_params_lr = {
            'penalty':['l2','none'],
            'C':[0.01,0.1,1.,3.,5.0]
        }

        grid_params_svm = [
            {'kernel':['linear'],'C':[10.,30.,100.,300.0]},
            {'kernel':['rbf'],'C':[1.0,3.0,10.,30.,100.,300.0],
                 'gamma':[0.01,0.03,0.1,0.3,1.0,3.0]}
        ]
        gs_dt_gini = GridSearchCV(DecisionTreeClassifier(criterion="gini"),grid_params_dt,verbose=1,cv=5,n_jobs=-1)
        gs_dt_gini.fit(X_balanced, y_balanced)
        dt_gini_score = gs_dt_gini.score(X_test,y_test)
        
        gs_dt_entropy = GridSearchCV(DecisionTreeClassifier(criterion="entropy"),grid_params_dt,verbose=1,cv=5,n_jobs=-1)
        gs_dt_entropy.fit(X_balanced, y_balanced)
        dt_entropy_score = gs_dt_entropy.score(X_test,y_test)
        
        gs_lr = GridSearchCV(LogisticRegression(), grid_params_lr,verbose=1,cv=5,n_jobs=-1)
        gs_lr.fit(X_balanced, y_balanced)
        lr_score = gs_lr.score(X_test,y_test)
        
        gs_svm = GridSearchCV(SVC(),grid_params_svm,cv=5,verbose=1,n_jobs=-1)
        gs_svm.fit(X_balanced, y_balanced)
        svm_score = gs_svm.score(X_test,y_test)
        
        if i==0:
            print("==========Standard Scaling==========")
        elif i==1:
            print("==========MinMax Scaling==========")
        elif i==2:
            print("==========MaxAbs Scaling==========")
        elif i==3:
            print("==========Robust Scaling==========")
            
        print("DecisionTree(Gini)\n best_parameter:",gs_dt_gini.best_params_)
        print("score:",dt_gini_score)
        print("DecisionTree(Entropy)\n best_parameter:",gs_dt_entropy.best_params_)
        print("score:",dt_entropy_score)
        print("LogisticRegression \n best_parameter:",gs_lr.best_params_)
        print("score:",lr_score)
        print("SVC \n best_parameter:",gs_svm.best_params_)
        print("score:",svm_score)


Scalers = {
    'StandardScaler':StandardScaler(),
    'MinMaxScaler':MinMaxScaler(),
    'MaxAbsScaler':MaxAbsScaler(),
    'RobustScaler':RobustScaler()
}

    
FindBestModel(X,y, Scalers)

Fitting 5 folds for each of 6080 candidates, totalling 30400 fits
Fitting 5 folds for each of 6080 candidates, totalling 30400 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits




Fitting 5 folds for each of 40 candidates, totalling 200 fits
DecisionTree(Gini)
 best_parameter: {'max_depth': 3, 'max_features': 5, 'max_leaf_nodes': 53, 'min_samples_split': 4}
score: 0.9343065693430657
DecisionTree(Entropy)
 best_parameter: {'max_depth': 5, 'max_features': 3, 'max_leaf_nodes': 44, 'min_samples_split': 2}
score: 0.9562043795620438
LogisticRegression 
 best_parameter: {'C': 0.01, 'penalty': 'none'}
score: 0.9781021897810219
SVC 
 best_parameter: {'C': 10.0, 'kernel': 'linear'}
score: 0.9635036496350365
Fitting 5 folds for each of 6080 candidates, totalling 30400 fits
Fitting 5 folds for each of 6080 candidates, totalling 30400 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 40 candidates, totalling 200 fits
DecisionTree(Gini)
 best_parameter: {'max_depth': 3, 'max_features': 5, 'max_leaf_nodes': 89, 'min_samples_split': 2}
score: 0.9343065693430657
DecisionTree(Entropy)
 best_parameter: {'max_depth': 3, 'max_features': 3,