In [4]:
import numpy as np
from sklearn import cross_validation
from sklearn import datasets
from sklearn import metrics
from sklearn import preprocessing
from sklearn import svm
from sklearn.pipeline import make_pipeline

from tak.tak import myprint

# Load the Iris dataset and display the shapes of its feature matrix and label vector.
iris = datasets.load_iris()
(iris.data.shape, iris.target.shape)

((150, 4), (150,))

# 3.1.1 Computing CV metrics (cross_val_score + cross_val_predict)
- The simplest way to use cross-validation is to call the `cross_val_score` helper function on the estimator and the dataset.
- The `scores` variable is a NumPy array holding the score from each CV fold.

In [5]:
# Linear-kernel SVM evaluated with 5-fold cross-validation on the full Iris data.
clf = svm.SVC(kernel='linear', C=1)
scores = cross_validation.cross_val_score(clf, iris.data, iris.target, cv=5)
# print() with a single parenthesised argument behaves the same on Python 2 and 3.
print(scores)

# Mean score with a +/- 2 standard deviation band, used here as an
# approximate 95% confidence interval of the score estimate.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 0.96666667  1.          0.96666667  0.96666667  1.        ]
Accuracy: 0.98 (+/- 0.03)


- By default, the score computed at each CV iteration is the score method of the estimator.
- It is possible to change this by using the scoring parameter.
- In the case of the Iris dataset, the samples are balanced across target classes, hence the accuracy and the F1-score are almost equal.


In [6]:
# Same 5-fold evaluation of `clf`, but scored with the 'f1_weighted' metric
# instead of the estimator's default score method.
scores = cross_validation.cross_val_score(
    clf, iris.data, iris.target, cv=5, scoring='f1_weighted')
scores

array([ 0.96658312,  1.        ,  0.96658312,  0.96658312,  1.        ])

### We can also use other CV strategies by passing a CV iterator
- **self-note**: I probably won't use the `ShuffleSplit` class; I'll just stick with stratified CV.

Why? See this note from the [ShuffleSplit documentation](http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.ShuffleSplit.html#sklearn.cross_validation.ShuffleSplit):
> Note: contrary to other cross-validation strategies, **random splits do not guarantee that all folds will be different**

In [8]:
# ShuffleSplit is itself an iterable that yields (train_indices, test_indices) pairs.
n_samples = iris.data.shape[0]
cv = cross_validation.ShuffleSplit(n_samples, n_iter=3,
                                   test_size=0.3, random_state=0)
print(cv.__class__)
for train_indices, test_indices in cv:
    # Explicit formatting keeps the space-separated output identical on
    # Python 2 and 3 (a bare two-argument print(...) would print a tuple on 2).
    print('%s %s' % (train_indices, test_indices))
# The iterator can also be handed straight to cross_val_score:
# cross_validation.cross_val_score(clf, iris.data, iris.target, cv=cv)

<class 'sklearn.cross_validation.ShuffleSplit'>
[ 60 116 144 119 108  69 135  56  80 123 133 106 146  50 147  85  30 101
  94  64  89  91 125  48  13 111  95  20  15  52   3 149  98   6  68 109
  96  12 102 120 104 128  46  11 110 124  41 148   1 113 139  42   4 129
  17  38   5  53 143 105   0  34  28  55  75  35  23  74  31 118  57 131
  65  32 138  14 122  19  29 130  49 136  99  82  79 115 145  72  77  25
  81 140 142  39  58  88  70  87  36  21   9 103  67 117  47] [114  62  33 107   7 100  40  86  76  71 134  51  73  54  63  37  78  90
  45  16 121  66  24   8 126  22  44  97  93  26 137  84  27 127 132  59
  18  83  61  92 112   2 141  43  10]
[ 80 107  90   0  36 112   5  57 102  55  34 128  33  21  73   7  45 129
 103 146 120  94  50 134  99 126 114   9  39  97 101  29  81  20  46  51
  53  23  27   2  28  37 111  10  84 137 127  43  87  69 144 140  35  76
   3  82 145 116  88  44 147   1  93  38  11 115  54  40  18  41  79  24
  56  71  13  31  85  70 132 125 123 100  32 104 

In [9]:
# KFold over 6 sample indices split into 3 folds; iterating yields
# (train_indices, test_indices) pairs, one per fold.
k_fold = cross_validation.KFold(n=6, n_folds=3)
# print() with a single parenthesised argument behaves the same on Python 2 and 3.
print(k_fold.__class__)
for train_indices, test_indices in k_fold:
    print('Train: %s | test: %s' % (train_indices, test_indices))

<class 'sklearn.cross_validation.KFold'>
Train: [2 3 4 5] | test: [0 1]
Train: [0 1 4 5] | test: [2 3]
Train: [0 1 2 3] | test: [4 5]


### Data transformation with held out data
- Data transformations should likewise be learnt from the training set and then applied to held-out data for prediction.
- A Pipeline makes it easier to compose estimators, providing this behavior under cross-validation.

In [10]:
# Hold out 40% of the samples. The scaler is fitted on the training split only
# and then applied, unchanged, to both the training and the held-out split.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    iris.data, iris.target, test_size=0.4, random_state=0)

scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

# Train on the scaled training split, then report the score on the scaled test split.
clf = svm.SVC(C=1)
clf.fit(X_train_transformed, y_train)
clf.score(X_test_transformed, y_test)


0.93333333333333335

### Use of `pipeline` to simplify things

In [11]:
# Chain the scaler and the SVM into one estimator so that, under
# cross-validation, the scaling is learnt from each training split only.
# (make_pipeline is imported in the notebook's top import cell.)
clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1))
# `cv` is the ShuffleSplit iterator created in an earlier cell.
print(cv.__class__)
cross_validation.cross_val_score(clf, iris.data, iris.target, cv=cv)

<class 'sklearn.cross_validation.ShuffleSplit'>


array([ 0.97777778,  0.93333333,  0.95555556])

## 3.1.1.1. Obtaining predictions by cross-validation (cross_val_predict)
- `cross_val_predict` has a similar interface to `cross_val_score`, but returns, for each element in the input, the prediction that was obtained for that element when it was in the test set.
- Only cross-validation strategies that assign all elements to a test set exactly once can be used (otherwise, an exception is raised).

In [13]:
# These predictions can then be used to evaluate the classifier.
# `clf` is presumably the StandardScaler + SVC pipeline from the make_pipeline
# cell above (assuming top-to-bottom execution). cv=10 is passed here rather
# than the ShuffleSplit iterator `cv`.
predicted = cross_validation.cross_val_predict(clf, iris.data,
                                               iris.target, cv=10)
# NOTE(review): myprint appears to echo the argument's source text (see the
# "expression = value" lines in the output), so these calls are kept exactly
# as written -- confirm against tak.tak before reformatting them.
myprint(metrics.accuracy_score(iris.target, predicted) )
myprint(predicted)

metrics.accuracy_score(iris.target, predicted)  = 0.966666666667
predicted = [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


# 3.1.2 CV iterators (KFold, StratifiedKFold, LeaveOneOut, LeavePOut, LeaveOneLabelOut, LeavePLabelOut)

# 3.1.3 A note on shuffling
- Read the section linked below; it contains important details.
- http://scikit-learn.org/stable/modules/cross_validation.html#a-note-on-shuffling

## About random_state parameter
- The random_state parameter defaults to None, meaning that the shuffling will be different every time KFold(..., shuffle=True) is iterated. 
- However, GridSearchCV will use the same shuffling for each set of parameters validated by a single call to its fit method.
- To ensure results are repeatable (on the same platform), use a fixed value for random_state.
