In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [2]:
# !pip install -U scikit-learn

In [3]:
# Single seed shared by the train/test split, the shuffling CV splitters and every model.
seed = 42

# Part 1. Implement 4 Cross Validation Methods

## 1) Load Data

In [4]:
# Synthetic names feature_1 .. feature_30 for the 30 feature columns.
feature_cols = list(map('feature_{}'.format, range(1, 31)))

In [5]:
# Full column layout: identifier, label, then the feature columns.
cols = ["ID", "Diagnosis", *feature_cols]

In [6]:
# header=None: the first line of the file is read as data, so column names are supplied explicitly.
df = pd.read_csv("wdbc.data", names=cols, header=None)

In [7]:
# Feature matrix: every row, feature columns only (ID and Diagnosis are left out).
X = df.loc[:, feature_cols]

In [8]:
# Encode the label as integers: 'B' -> 0, 'M' -> 1 (presumably benign / malignant).
y = df['Diagnosis'].map(dict(B=0, M=1))

In [9]:
# Sanity check on the encoding: a label missing from the mapping would show up here as NaN.
y.unique()

array([1, 0])

In [10]:
# Hold out 20% of the rows as a test set; stratify on y so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=seed,
)

## 2) DecisionTree Model

In [11]:
# Decision tree with default hyperparameters; the same estimator is scored under each splitter below.
dt = DecisionTreeClassifier(random_state=seed)

## 3) Cross Validations

### 1. KFold

In [12]:
# Plain 3-fold splitter (KFold is imported in the top import cell).
# With the default shuffle=False the folds are consecutive blocks of rows.
cv1 = KFold(n_splits=3)
cv1

KFold(n_splits=3, random_state=None, shuffle=False)

In [13]:
# Print the number of train / test indices produced for each fold.
# NOTE(review): this splits the full X, y, whereas the scoring cell below uses
# X_train, y_train, so these sizes are not the sizes of the folds actually scored.
for train_idx, test_idx in cv1.split(X, y):
    print(train_idx.shape, test_idx.shape)

(379,) (190,)
(379,) (190,)
(380,) (189,)


In [14]:
# No `scoring` argument is given, so each fold reports the classifier's default score (accuracy).
cross_val_score(dt, X_train, y_train, cv=cv1)

array([0.96052632, 0.92763158, 0.92715232])

### 2. RepeatedKFold

In [15]:
# 3-fold CV repeated twice (6 splits in total), seeded so the shuffles are reproducible.
# RepeatedKFold is imported in the top import cell.
cv2 = RepeatedKFold(n_splits=3, n_repeats=2, random_state=seed)
cv2

RepeatedKFold(n_repeats=2, n_splits=3, random_state=42)

In [16]:
# Print the number of train / test indices for each of the 3 x 2 splits.
# NOTE(review): this splits the full X, y, whereas the scoring cell below uses
# X_train, y_train, so these sizes are not the sizes of the folds actually scored.
for train_idx, test_idx in cv2.split(X, y):
    print(train_idx.shape, test_idx.shape)

(379,) (190,)
(379,) (190,)
(380,) (189,)
(379,) (190,)
(379,) (190,)
(380,) (189,)


In [17]:
# One default (accuracy) score per split: 3 folds x 2 repeats = 6 values.
cross_val_score(dt, X_train, y_train, cv=cv2)

array([0.91447368, 0.94078947, 0.9205298 , 0.92105263, 0.92763158,
       0.92715232])

### 3. StratifiedKFold

In [18]:
# 3-fold splitter that keeps the class proportions of y approximately equal in every fold.
# StratifiedKFold is imported in the top import cell.
cv3 = StratifiedKFold(n_splits=3)
cv3

StratifiedKFold(n_splits=3, random_state=None, shuffle=False)

In [19]:
# Print the number of train / test indices produced for each stratified fold.
# NOTE(review): this splits the full X, y, whereas the scoring cell below uses
# X_train, y_train, so these sizes are not the sizes of the folds actually scored.
for train_idx, test_idx in cv3.split(X, y):
    print(train_idx.shape, test_idx.shape)

(379,) (190,)
(379,) (190,)
(380,) (189,)


In [20]:
# Default (accuracy) score per stratified fold.
# NOTE(review): the recorded output is identical to the plain KFold cell above, which is
# unexpected for a different splitter; re-run to confirm the output is not stale.
cross_val_score(dt, X_train, y_train, cv=cv3)

array([0.96052632, 0.92763158, 0.92715232])

### 4. ShuffleSplit

In [21]:
# Three independent random 70/30 train/test partitions, seeded for reproducibility.
# Unlike K-fold, the test sets of different splits may overlap.
# ShuffleSplit is imported in the top import cell.
cv4 = ShuffleSplit(n_splits=3, test_size=0.3, random_state=seed)
cv4

ShuffleSplit(n_splits=3, random_state=42, test_size=0.3, train_size=None)

In [22]:
# Print the number of train / test indices produced for each random partition.
# NOTE(review): this splits the full X, y, whereas the scoring cell below uses
# X_train, y_train, so these sizes are not the sizes of the partitions actually scored.
for train_idx, test_idx in cv4.split(X, y):
    print(train_idx.shape, test_idx.shape)

(398,) (171,)
(398,) (171,)
(398,) (171,)


In [23]:
# Default (accuracy) score on each of the three random 70/30 partitions of the training data.
cross_val_score(dt, X_train, y_train, cv=cv4)

array([0.91240876, 0.91240876, 0.94160584])

# Part 2. Compare with previous best models

In [24]:
def _print_cv_scores(estimator, cv, title):
    """Print per-fold precision, recall and f1 of ``estimator`` on the training split.

    Reads the notebook-level ``X_train`` and ``y_train``.

    Args:
        estimator: estimator passed to ``cross_val_score``.
        cv: cross-validation splitter passed to ``cross_val_score``.
        title: heading line printed before the scores.
    """
    print(title)
    # String scorers 'precision' / 'recall' / 'f1' are the binary variants,
    # i.e. they are computed for the positive class (label 1).
    for m in ['precision', 'recall', 'f1']:
        print(f"----{m}---")
        scores = cross_val_score(estimator, X_train, y_train, cv=cv, scoring=m)
        print(scores)
        print(f"mean: {scores.mean()}; std: {scores.std()}")


def models(model_kind):
    """Grid-search one classifier family and print its evaluation report.

    The function reads the notebook-level ``seed``, ``X_train``, ``y_train``,
    ``X_test``, ``y_test``, ``cv1`` (KFold) and ``cv4`` (ShuffleSplit). It prints:

    1. the fitted ``GridSearchCV`` object and its best parameters,
    2. macro-averaged (precision, recall, f1, support) on the train and test splits,
    3. cross-validated precision / recall / f1 on the training split under
       ``cv1`` and ``cv4``.

    Args:
        model_kind: one of ``'svc'``, ``'dt'``, ``'adaboost'`` or ``'rf'``.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: if ``model_kind`` is not one of the supported names.
    """
    if model_kind == 'svc':
        base_estimator = SVC(random_state=seed)
        params = {'C': [0.01, 0.1],
                  'gamma': [0.1, 1],
                  'kernel': ['rbf']
                  }
    elif model_kind == 'dt':
        base_estimator = DecisionTreeClassifier(random_state=seed)
        params = {
            'criterion': ['gini', 'entropy'],
            'max_depth': range(1, 7),
        }
    elif model_kind == 'adaboost':
        base_estimator = AdaBoostClassifier(random_state=seed)
        # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases;
        # confirm it is still accepted by the installed version.
        params = {
            'n_estimators': [50, 70],
            'learning_rate': [0.01, 0.1],
            'algorithm': ['SAMME.R'],
        }
    elif model_kind == 'rf':
        base_estimator = RandomForestClassifier(random_state=seed)
        params = {
            'criterion': ['gini', 'entropy'],
            'max_depth': range(1, 7),
        }
    else:
        # Previously this branch only printed a message and then crashed further
        # down on the unbound `model` / `params`; fail immediately and explicitly.
        raise ValueError(
            f"invalid model kind {model_kind!r}; "
            "expected one of 'svc', 'dt', 'adaboost', 'rf'"
        )

    # Grid search with GridSearchCV's default cross-validation and scoring.
    search = GridSearchCV(base_estimator, params)
    search.fit(X_train, y_train)
    print(search)
    print('Best Model Params:')
    print(search.best_params_)
    print('')

    # predict() on the fitted search delegates to the refit best estimator.
    y_train_pred = search.predict(X_train)
    print('Best model measuresments on train data set (Precision, Recall, f1)')
    train_scores = precision_recall_fscore_support(y_train, y_train_pred, average='macro')
    print(train_scores)

    print('Best model measuresments on test data set (Precision, Recall, f1)')
    y_test_pred = search.predict(X_test)
    test_scores = precision_recall_fscore_support(y_test, y_test_pred, average='macro')
    print(test_scores)

    print('')
    print('##### Cross Validation Measurements #######')
    # The GridSearchCV wrapper itself is cross-validated: cross_val_score clones
    # it, so the whole grid search is re-run inside every outer fold.
    _print_cv_scores(search, cv1, '# 1. KFold')

    print('')
    _print_cv_scores(search, cv4, '# 2. ShuffleSplit')
    
    

## SVM

In [25]:
# RBF-kernel SVC report.
# NOTE(review): the recorded output shows macro recall of exactly 0.5 and all-zero
# positive-class CV scores, i.e. the fitted SVC appears to predict a single class.
# The features reach SVC unscaled; check whether standardising them changes this.
models('svc')

GridSearchCV(estimator=SVC(random_state=42),
             param_grid={'C': [0.01, 0.1], 'gamma': [0.1, 1],
                         'kernel': ['rbf']})
Best Model Params:
{'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}

Best model measuresments on train data set (Precision, Recall, f1)
(0.3131868131868132, 0.5, 0.38513513513513514, None)
Best model measuresments on test data set (Precision, Recall, f1)
(0.3157894736842105, 0.5, 0.3870967741935484, None)

##### Cross Validation Measurements #######
# 1. KFold
----precision---


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[0. 0. 0.]
mean: 0.0; std: 0.0
----recall---
[0. 0. 0.]
mean: 0.0; std: 0.0
----f1---
[0. 0. 0.]
mean: 0.0; std: 0.0

# 2. ShuffleSplit
----precision---


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[0. 0. 0.]
mean: 0.0; std: 0.0
----recall---
[0. 0. 0.]
mean: 0.0; std: 0.0
----f1---
[0. 0. 0.]
mean: 0.0; std: 0.0


## Decision Tree

In [26]:
# Decision-tree report (criterion x max_depth grid).
models('dt')

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(1, 7)})
Best Model Params:
{'criterion': 'entropy', 'max_depth': 6}

Best model measuresments on train data set (Precision, Recall, f1)
(0.9965156794425087, 0.9941176470588236, 0.9952931683700914, None)
Best model measuresments on test data set (Precision, Recall, f1)
(0.9412393162393162, 0.9097222222222223, 0.9220512820512821, None)

##### Cross Validation Measurements #######
# 1. KFold
----precision---
[0.98076923 0.9245283  0.83928571]
mean: 0.9148610823139126; std: 0.05816349204233454
----recall---
[0.86440678 0.84482759 0.88679245]
mean: 0.8653422728993675; std: 0.017144850896151958
----f1---
[0.91891892 0.88288288 0.86238532]
mean: 0.8880623743009064; std: 0.023368529679520182

# 2. ShuffleSplit
----precision---
[0.91836735 0.90697674 0.95833333]
mean: 0.9278924748193851; std: 0.022021517629616745
----recall---
[0.

## AdaBoost

In [27]:
# AdaBoost report (n_estimators x learning_rate grid).
models('adaboost')

GridSearchCV(estimator=AdaBoostClassifier(random_state=42),
             param_grid={'algorithm': ['SAMME.R'], 'learning_rate': [0.01, 0.1],
                         'n_estimators': [50, 70]})
Best Model Params:
{'algorithm': 'SAMME.R', 'learning_rate': 0.1, 'n_estimators': 70}

Best model measuresments on train data set (Precision, Recall, f1)
(0.9849237281032603, 0.9776573787409701, 0.9810795076513639, None)
Best model measuresments on test data set (Precision, Recall, f1)
(0.9736842105263157, 0.9523809523809523, 0.9614864864864865, None)

##### Cross Validation Measurements #######
# 1. KFold
----precision---
[1.         0.96428571 0.87272727]
mean: 0.9456709956709957; std: 0.0536001715711705
----recall---
[0.94915254 0.93103448 0.90566038]
mean: 0.9286158008299976; std: 0.017837780525495797
----f1---
[0.97391304 0.94736842 0.88888889]
mean: 0.936723451139927; std: 0.03551772566143225

# 2. ShuffleSplit
----precision---
[0.90566038 0.93181818 0.94117647]
mean: 0.9262183432549692; st

## Random Forest

In [28]:
# Random-forest report (criterion x max_depth grid).
models('rf')

GridSearchCV(estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(1, 7)})
Best Model Params:
{'criterion': 'gini', 'max_depth': 6}

Best model measuresments on train data set (Precision, Recall, f1)
(0.9965156794425087, 0.9941176470588236, 0.9952931683700914, None)
Best model measuresments on test data set (Precision, Recall, f1)
(0.98, 0.9642857142857143, 0.9712773998488284, None)

##### Cross Validation Measurements #######
# 1. KFold
----precision---
[1.         0.96226415 0.89090909]
mean: 0.9510577472841625; std: 0.045235636569711406
----recall---
[0.94915254 0.87931034 0.9245283 ]
mean: 0.9176637296957534; std: 0.028923172981924605
----f1---
[0.97391304 0.91891892 0.90740741]
mean: 0.9334131232681956; std: 0.02902081295767054

# 2. ShuffleSplit
----precision---
[0.92307692 0.97619048 0.96      ]
mean: 0.953089133089133; std: 0.022227347152870655
----recall---
[0.92307692 0.8913043