In [1]:
# Load the breast cancer dataset and split it into training and test subsets.
# SVC is imported here but is not used in this cell.
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

In [5]:
# Fit an SVM classifier with a radial basis function kernel on min-max scaled features
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=0,
)

# Per-feature minimum and range (maximum minus minimum), taken from the training subset only
min_train = X_train.min(axis=0)
range_train = X_train.max(axis=0) - min_train

# Shift by the training minimum and divide by the training range.
# The training features end up between 0 and 1; the test features reuse the
# same training statistics so both subsets share one scale.
X_train_scaled = (X_train - min_train) / range_train
X_test_scaled = (X_test - min_train) / range_train

# fit() returns the fitted estimator, so construction and training can be chained
svm = SVC(C=50).fit(X_train_scaled, y_train)

train_accuracy = svm.score(X_train_scaled, y_train)
test_accuracy = svm.score(X_test_scaled, y_test)

print('The accuracy on the training subset: {:.3f}'.format(train_accuracy))
print('The accuracy on the test subset: {:.3f}'.format(test_accuracy))



The accuracy on the training subset: 0.995
The accuracy on the test subset: 0.979


### <center> Uncertainty Estimation </center>

We know that our classifier can predict whether a specific tumor is benign or malignant, but we also want to know how certain the classifier is about that choice.<br>
We want to make an estimation of its uncertainty.<br>
Most classifiers inside scikit-learn can do this using two methods :
* Decision function.
* Predicting Probabilities.

**Decision function :** Given a point, it computes the signed distance to the separating hyperplane.
<br>
We look only at the first 20 samples.<br>
These values represent how strongly the SVM believes a certain point belongs to one class or the other. Positive values are associated with one class, while negative values are associated with the other class.


In [9]:
# Decision-function scores of the fitted SVM, restricted to the first 20 test samples
first_20_scores = svm.decision_function(X_test_scaled)[:20]
print('The decision function is:\n\n{}'.format(first_20_scores))

The decision function is:

[-3.80566784  0.90857477  2.74638735  1.58183853  1.93151405  3.36683719
  4.2198839   2.50708394  2.12907524  3.76941411  0.11263957  1.54449847
  3.60775422  0.46292768  0.0769851  -2.43330679  2.09127626 -4.04283667
 -3.86705139 -4.07489494]


In [10]:
# A positive score assigns the sample to the benign class; threshold the first 20 scores at 0.
first_20_is_positive = svm.decision_function(X_test_scaled)[:20] > 0
print('Thresholded decision function:\n\n{}'.format(first_20_is_positive))

Thresholded decision function:

[False  True  True  True  True  True  True  True  True  True  True  True
  True  True  True False  True False False False]


In [11]:
# Class labels known to the fitted SVM: 0 is malignant and 1 is benign
svm.classes_

array([0, 1])

**Predicting Probabilities :** It outputs the probability that an instance (a tumor sample in our case) belongs to each of the classes.<br>
That is, how likely it is for a tumor sample to be malignant, and how likely it is for it to be benign.

In [14]:
# Refit the same model with probability=True so that predict_proba is available.
# The probability estimates are calibrated with an internal cross-validation that
# shuffles the data, so random_state is fixed to make the printed probabilities
# reproducible from one run to the next.
svm = SVC(C=50, probability=True, random_state=0)
svm.fit(X_train_scaled, y_train)

# Columns follow svm.classes_: column 0 is class 0 (malignant), column 1 is class 1 (benign)
print('Predicted probabilities for the samples (malignant and benign):\n\n{}'\
      .format(svm.predict_proba(X_test_scaled[:20])))

Predicted probabilities for the samples (malignant and benign):

[[9.97907120e-01 2.09287976e-03]
 [1.39824912e-01 8.60175088e-01]
 [7.18519827e-03 9.92814802e-01]
 [4.93975097e-02 9.50602490e-01]
 [2.79549967e-02 9.72045003e-01]
 [2.52401169e-03 9.97475988e-01]
 [1.06848801e-06 9.99998932e-01]
 [1.07371541e-02 9.89262846e-01]
 [2.01695902e-02 9.79830410e-01]
 [4.91799838e-06 9.99995082e-01]
 [3.83655058e-01 6.16344942e-01]
 [5.24492893e-02 9.47550711e-01]
 [8.50730313e-06 9.99991493e-01]
 [2.57149594e-01 7.42850406e-01]
 [3.98696349e-01 6.01303651e-01]
 [9.78654460e-01 2.13455397e-02]
 [2.14737527e-02 9.78526247e-01]
 [9.98599480e-01 1.40052037e-03]
 [9.98113779e-01 1.88622135e-03]
 [9.98673498e-01 1.32650224e-03]]


In [15]:
# Predict the class label of every sample in the scaled test subset
svm.predict(X_test_scaled)

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0])

### <center> Uncertainty Estimation in MultiClass Datasets </center>
What should we do if we have more than two classes?<br>
Let's have a look at the iris dataset, which is preloaded and pre-processed in scikit-learn. In this dataset, flowers are divided into three categories: Setosa, Versicolour, and Virginica.

In [23]:
# Imports for the multiclass example
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

# Load the iris dataset and display the names of its target classes
iris = load_iris()
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [24]:
# Split the iris data into training and test subsets.
# NOTE: this rebinds X_train, X_test, y_train and y_test, which held the breast cancer split until now.
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=42)
# Gradient boosting classifier with a small learning rate and a fixed seed
gbrt = GradientBoostingClassifier(learning_rate=0.01, random_state=0)

# Fit on the training subset; as the cell's last expression, the call's result is displayed
gbrt.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=0.01, random_state=0)

In [18]:
# Decision-function scores for the first 10 iris test samples (one column per class)
iris_scores = gbrt.decision_function(X_test[:10])
print('The decision function for the 3-class iris dataset:\n\n{}'.format(iris_scores))

The decision function for the 3-class iris dataset:

[[-1.995715    0.04758267 -1.92720695]
 [ 0.06146394 -1.90755736 -1.92793758]
 [-1.99058203 -1.87637861  0.09686725]
 [-1.995715    0.04758267 -1.92720695]
 [-1.99730159 -0.13469108 -1.20341483]
 [ 0.06146394 -1.90755736 -1.92793758]
 [-1.995715    0.04758267 -1.92720695]
 [-1.99677434 -1.87637861  0.09686725]
 [-1.995715    0.04758267 -1.92720695]
 [-1.995715    0.04758267 -1.92720695]]


In [25]:
# Predicted class probabilities for the first 10 iris test samples (one column per class)
iris_probabilities = gbrt.predict_proba(X_test[:10])
print('Predicted probabilities for the samples in the iris dataset:\n\n{}'.format(iris_probabilities))

Predicted probabilities for the samples in the iris dataset:

[[0.10217718 0.78840034 0.10942248]
 [0.78347147 0.10936745 0.10716108]
 [0.09818072 0.11005864 0.79176065]
 [0.10217718 0.78840034 0.10942248]
 [0.10360005 0.66723901 0.22916094]
 [0.78347147 0.10936745 0.10716108]
 [0.10217718 0.78840034 0.10942248]
 [0.09763381 0.11012538 0.79224081]
 [0.10217718 0.78840034 0.10942248]
 [0.10217718 0.78840034 0.10942248]]
