### Import Libraries

In [1]:
import sklearn 

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate

### Load Data

In [2]:
# Fetch the 'cancer_data' dataset from the Azure ML workspace and turn it into a DataFrame.
# NOTE(review): this import lives outside the notebook's import cell.
from azureml import Workspace

# NOTE(review): Workspace() is called with no arguments, so it presumably picks up
# its credentials from the hosting environment — confirm before running elsewhere.
ws = Workspace()
ds = ws.datasets['cancer_data']  # dataset is looked up by name

cancer_df = ds.to_dataframe()

In [3]:
# Preview 10 randomly drawn rows; no random_state is passed, so the rows differ between runs.
cancer_df.sample(10)

Unnamed: 0,Class,Age,Menopause,Tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
200,0,4,0,4,0,0,3,1,1,0
202,1,2,2,7,0,0,1,0,1,0
8,0,2,2,10,0,0,2,0,1,0
213,1,3,2,4,0,0,1,1,2,0
285,1,3,0,5,4,0,3,0,1,0
272,1,2,2,2,0,1,3,1,2,0
258,1,3,0,5,5,1,2,0,3,1
123,0,3,0,5,0,0,1,1,1,0
22,0,1,2,4,0,0,2,1,1,0
199,0,3,2,4,0,0,1,0,1,0


In [4]:
# Target vector: the 'Class' column of the loaded frame.
Y = cancer_df['Class']

In [5]:
# Feature matrix: every column except the 'Class' target.
X = cancer_df.drop(columns='Class')

In [6]:
# Sanity check: features and target should have the same number of rows.
X.shape , Y.shape

((286, 9), (286,))

### Stratified K Fold Cross Validation

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html

In [7]:
# 5-fold stratified splitter. Rows are shuffled before splitting; the seed is pinned
# so that the folds are identical on every run of the notebook.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [13]:
# Show the first 10 train/test row positions of each fold, numbering folds from 1.
# `train` and `test` are intentionally left bound to the last fold after the loop.
for fold_no, (train, test) in enumerate(skf.split(X, Y), start=1):
    print("\nTrain" + str(fold_no), train[:10])
    print("Test" + str(fold_no), test[:10])


Train1 [ 0  1  2  3  4  5  7  8  9 10]
Test1 [ 6 17 22 24 26 34 40 51 61 62]

Train2 [ 0  2  3  4  5  6  7  8  9 10]
Test2 [ 1 13 16 20 21 25 36 39 58 65]

Train3 [ 0  1  2  3  5  6 12 13 14 15]
Test3 [ 4  7  8  9 10 11 23 31 35 42]

Train4 [ 0  1  2  4  6  7  8  9 10 11]
Test4 [ 3  5 12 14 19 28 38 43 46 47]

Train5 [ 1  3  4  5  6  7  8  9 10 11]
Test5 [ 0  2 15 18 27 29 30 32 33 37]


In [14]:
# `train` and `test` still hold the indices of the last fold produced by the loop above.
len(X), len(train), len(test)

(286, 229, 57)

In [15]:
# Rebinds `skf` to a 2-fold splitter (no shuffle argument, so the library default applies).
skf = StratifiedKFold(n_splits=2)

In [17]:
# Materialise the split generator: one (train_indices, test_indices) pair per fold.
indices_list = list(skf.split(X, Y))
# NOTE(review): displaying the whole list prints every index array in full.
indices_list

[(array([101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
         114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
         127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
         140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152,
         153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
         166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178,
         179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
         192, 193, 194, 195, 196, 197, 198, 199, 200, 244, 245, 246, 247,
         248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260,
         261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273,
         274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285]),
  array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
          13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
          26,  27,  28,  29,  30,  31,  3

In [18]:
# First fold, first element of the pair: the training indices.
indices_list[0][0]

array([101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
       114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
       127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
       140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152,
       153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
       166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178,
       179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
       192, 193, 194, 195, 196, 197, 198, 199, 200, 244, 245, 246, 247,
       248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260,
       261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273,
       274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285])

In [19]:
# skf.split() yields *positional* row indices, so rows are selected with .iloc.
# Label-based .loc / Series[...] lookups only give the same rows when the index
# happens to be the default 0..n-1 range.
train_idx1, test_idx1 = indices_list[0]
train_idx2, test_idx2 = indices_list[1]

# Fold 1
x_train1, y_train1 = X.iloc[train_idx1], Y.iloc[train_idx1]
x_test1, y_test1 = X.iloc[test_idx1], Y.iloc[test_idx1]

# Fold 2
x_train2, y_train2 = X.iloc[train_idx2], Y.iloc[train_idx2]
x_test2, y_test2 = X.iloc[test_idx2], Y.iloc[test_idx2]

In [20]:
# Shapes of the fold-1 training features and labels.
x_train1.shape, y_train1.shape

((142, 9), (142,))

In [21]:
# Shapes of the fold-1 held-out features and labels.
x_test1.shape, y_test1.shape

((144, 9), (144,))

In [22]:
# Shapes of the fold-2 training features and labels.
x_train2.shape, y_train2.shape

((144, 9), (144,))

In [23]:
# Shapes of the fold-2 held-out features and labels.
x_test2.shape, y_test2.shape

((142, 9), (142,))

In [24]:
# Fold 1: fit on the training part, then report mean accuracy on the held-out part.
reg = LogisticRegression(solver='liblinear', C=1)
reg.fit(x_train1, y_train1)

reg.score(x_test1, y_test1)

0.73611111111111116

In [25]:
# Fold 2: a fresh model fitted on the second training part, scored on its held-out part.
reg = LogisticRegression(solver='liblinear', C=1)
reg.fit(x_train2, y_train2)

reg.score(x_test2, y_test2)

0.56338028169014087

In [26]:
# Unfitted estimator; cross_val_score clones and fits it once per fold.
clf = LogisticRegression(solver='liblinear', C=1)

# An integer `cv` with a classifier makes scikit-learn use stratified K-fold,
# so these scores reproduce the two manual fits.
cross_val_score(estimator=clf, X=X, y=Y, cv=2)

array([ 0.73611111,  0.56338028])

In [27]:
# Same 2-fold evaluation, but also returning fit/score timings and training accuracy.
cross_validate(
    clf,
    X,
    Y,
    cv=2,
    scoring='accuracy',
    return_train_score=True,
)

{'fit_time': array([ 0.0053544 ,  0.00475287]),
 'score_time': array([ 0.00217676,  0.00151443]),
 'test_score': array([ 0.73611111,  0.56338028]),
 'train_score': array([ 0.78873239,  0.8125    ])}

In [28]:
# List every string accepted by the `scoring` parameter.
# `sklearn.metrics.SCORERS` was removed in scikit-learn 1.3; `get_scorer_names()` is the
# supported replacement, with a fallback for older releases that lack it.
if hasattr(sklearn.metrics, "get_scorer_names"):
    scorer_names = sklearn.metrics.get_scorer_names()
else:
    scorer_names = sorted(sklearn.metrics.SCORERS.keys())

scorer_names

dict_keys(['precision_micro', 'neg_mean_squared_error', 'completeness_score', 'recall_weighted', 'fowlkes_mallows_score', 'explained_variance', 'f1_macro', 'recall', 'adjusted_mutual_info_score', 'r2', 'balanced_accuracy', 'precision_macro', 'precision_samples', 'normalized_mutual_info_score', 'accuracy', 'roc_auc', 'v_measure_score', 'precision', 'f1', 'mutual_info_score', 'neg_median_absolute_error', 'neg_mean_squared_log_error', 'f1_samples', 'homogeneity_score', 'neg_log_loss', 'precision_weighted', 'adjusted_rand_score', 'f1_micro', 'brier_score_loss', 'average_precision', 'f1_weighted', 'neg_mean_absolute_error', 'recall_samples', 'recall_macro', 'recall_micro'])

In [29]:
# Evaluate several metrics in one pass; each gets its own test_*/train_* entry in the result.
scoring = ['precision', 'recall']

cross_validate(
    clf,
    X,
    Y,
    cv=2,
    scoring=scoring,
    return_train_score=True,
)

{'fit_time': array([ 0.00429678,  0.00292516]),
 'score_time': array([ 0.00369167,  0.00323915]),
 'test_precision': array([ 1.        ,  0.37804878]),
 'test_recall': array([ 0.11627907,  0.73809524]),
 'train_precision': array([ 0.75      ,  0.94444444]),
 'train_recall': array([ 0.42857143,  0.39534884])}