In [1]:
# import warnings
# warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
cancer_data = pd.read_csv("data/breast-cancer-wisconsin.csv")
banknote_data = pd.read_csv("data/data_banknote_authentication.csv")
diabetes_data = pd.read_csv("data/diabetes.csv")
seeds_data = pd.read_csv("data/seeds_dataset.csv")

In [3]:
cancer_data["diagnosis"] = cancer_data["diagnosis"].apply(lambda x: 1 if x == 'M' else 0)
cancer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    int64  
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [4]:
cancer_Y = cancer_data["diagnosis"]
cancer_X = cancer_data.drop(columns=["id", "diagnosis"])

In [5]:
banknote_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1372 entries, 0 to 1371
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   variance  1372 non-null   float64
 1   skewness  1372 non-null   float64
 2   curtosis  1372 non-null   float64
 3   entropy   1372 non-null   float64
 4   class     1372 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 53.7 KB


In [6]:
banknote_Y = banknote_data["class"]
banknote_X = banknote_data.drop(columns=["class"])

In [7]:
diabetes_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [8]:
diabetes_Y = diabetes_data["Outcome"]
diabetes_X = diabetes_data.drop(columns=["Outcome"])

In [9]:
seeds_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   area           199 non-null    float64
 1   perimeter      199 non-null    float64
 2   compactness    199 non-null    float64
 3   kernel_length  199 non-null    float64
 4   width          199 non-null    float64
 5   asymmetry      199 non-null    float64
 6   groove_length  199 non-null    float64
 7   class          199 non-null    int64  
dtypes: float64(7), int64(1)
memory usage: 12.6 KB


In [10]:
seeds_Y = seeds_data["class"]
seeds_X = seeds_data.drop(columns=["class"])

In [11]:
datasets = []
datasets.append(('Cancer', cancer_X, cancer_Y))
datasets.append(('Banknote', banknote_X, banknote_Y))
datasets.append(('Diabetes', diabetes_X, diabetes_Y))
datasets.append(('Seeds', seeds_X, seeds_Y))

In [12]:
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('KNN', KNeighborsClassifier()))
models.append(('TREE', DecisionTreeClassifier()))
models.append(('SVM', SVC(gamma='auto', probability=True)))
models.append(('GB', GradientBoostingClassifier()))
models.append(('RF', RandomForestClassifier()))

In [13]:
def runModels(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split (
        X, Y, test_size = 0.20, random_state=42)

    for name, model in models:
#         kfold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
#         cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
#         print('%s: %.4f (%.4f)' % (name, cv_results.mean(), cv_results.std()))
        
        model.fit(X_train, Y_train)
        predictions = model.predict(X_test)
        print('%s: %.4f' % (name, accuracy_score(Y_test, predictions)))

In [14]:
for name, X, Y in datasets:
    print(name)
    runModels(X, Y)
    print()

Cancer
LR: 0.9561
KNN: 0.9561
TREE: 0.9474
SVM: 0.6228
GB: 0.9561
RF: 0.9649

Banknote
LR: 0.9855
KNN: 1.0000
TREE: 0.9782
SVM: 1.0000
GB: 1.0000
RF: 0.9927

Diabetes
LR: 0.7597
KNN: 0.6623
TREE: 0.7273
SVM: 0.6429
GB: 0.7403
RF: 0.7273

Seeds
LR: 0.8500
KNN: 0.8500
TREE: 0.8500
SVM: 0.8750
GB: 0.8750
RF: 0.8500



In [15]:
scaler = StandardScaler()
for name, X, Y in datasets:
    print('Scaled', name)
    scaled_X = scaler.fit(X).transform(X)
    runModels(scaled_X, Y)
    print()

Scaled Cancer
LR: 0.9737
KNN: 0.9474
TREE: 0.9298
SVM: 0.9737
GB: 0.9561
RF: 0.9649

Scaled Banknote
LR: 0.9782
KNN: 1.0000
TREE: 0.9818
SVM: 1.0000
GB: 1.0000
RF: 0.9927

Scaled Diabetes
LR: 0.7532
KNN: 0.6883
TREE: 0.7468
SVM: 0.7338
GB: 0.7403
RF: 0.7273

Scaled Seeds
LR: 0.8750
KNN: 0.8750
TREE: 0.8500
SVM: 0.8750
GB: 0.9000
RF: 0.8500

