In [35]:
from sklearn import datasets
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_predict, cross_validate
import pandas as pd
import numpy as np

# Load the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

# Feature matrix and the matching class labels.
X, y = iris.data, iris.target

# Print the dataset's own description for reference.
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [3]:
# Hold out 20% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Boolean targets for the binary "virginica vs. not virginica" task
# (class label 2 is treated as virginica throughout this notebook).
y_train_virginica = np.equal(y_train, 2)
y_test_virginica = np.equal(y_test, 2)

In [4]:
# Build the linear SGD classifier with a fixed seed and fit it on the binary
# virginica target; `fit` returns the estimator itself, so it can be chained.
sgd_clf = SGDClassifier(random_state=42).fit(X_train, y_train_virginica)
sgd_clf

In [5]:
# Out-of-fold predictions from 3-fold cross-validation on the training split.
y_train_pred = cross_val_predict(
    estimator=sgd_clf,
    X=X_train,
    y=y_train_virginica,
    cv=3,
)

# Confusion matrix of actual labels against the cross-validated predictions.
cm = confusion_matrix(y_true=y_train_virginica, y_pred=y_train_pred)
cm

array([[64, 17],
       [ 0, 39]], dtype=int64)

In [18]:
# Positions (within the training split) of the rows whose actual class is virginica.
virginicaIndexes = np.flatnonzero(y_train_virginica).tolist()

# Positions where the cross-validated prediction disagrees with the actual label.
# Computed with a vectorized mask instead of a row-by-row Python loop.
incorrectIndexes = np.flatnonzero(y_train_pred != y_train_virginica).tolist()

# Report every misclassified row with its predicted and actual class.
for row_index in incorrectIndexes:
    print('Row', row_index, '\t- classified:', y_train_pred[row_index], '\tactual:', y_train_virginica[row_index])


Row 2 	- classified: True 	actual: False
Row 6 	- classified: True 	actual: False
Row 11 	- classified: True 	actual: False
Row 12 	- classified: True 	actual: False
Row 15 	- classified: True 	actual: False
Row 18 	- classified: True 	actual: False
Row 20 	- classified: True 	actual: False
Row 22 	- classified: True 	actual: False
Row 25 	- classified: True 	actual: False
Row 29 	- classified: True 	actual: False
Row 34 	- classified: True 	actual: False
Row 45 	- classified: True 	actual: False
Row 59 	- classified: True 	actual: False
Row 62 	- classified: True 	actual: False
Row 73 	- classified: True 	actual: False
Row 79 	- classified: True 	actual: False
Row 93 	- classified: True 	actual: False


In [43]:
# Column labels shared by every per-group DataFrame in this cell.
featureColumns = ['sepal length', 'sepal width', 'petal length', 'petal width']

# A false positive is a row predicted as virginica whose actual class is not
# virginica. Selecting these rows explicitly (instead of every misclassified
# row) keeps the name accurate even if a run also produces false negatives.
falsePositiveMask = np.logical_and(y_train_pred, ~y_train_virginica)

falsePositives = pd.DataFrame(X_train[falsePositiveMask], columns=featureColumns)
virginicaData = pd.DataFrame(X_train[y_train_virginica], columns=featureColumns)
notVirginicaData = pd.DataFrame(X_train[~y_train_virginica], columns=featureColumns)

# Per-feature averages for each group, for side-by-side comparison.
print('Virginica Averages:')
print(virginicaData.mean())
print('\nNot Virginica Averages:')
print(notVirginicaData.mean())
print('\n False Positive Averages')
print(falsePositives.mean())

Virginica Averages:
sepal length    6.533333
sepal width     2.966667
petal length    5.520513
petal width     2.000000
dtype: float64

Not Virginica Averages:
sepal length    5.460494
sepal width     3.107407
petal length    2.862963
petal width     0.790123
dtype: float64

 False Positive Averages
sepal length    5.929412
sepal width     2.847059
petal length    4.417647
petal width     1.382353
dtype: float64


In [41]:
# Raw feature rows of the misclassified samples, shown as a plain array.
# Bound to its own name so the `falsePositives` DataFrame is not overwritten:
# a later cell calls `falsePositives.mean()` and expects per-column averages,
# which an ndarray's `.mean()` (a single scalar) would not provide.
falsePositiveRows = X_train[incorrectIndexes]
falsePositiveRows

array([[6.7, 3.1, 4.4, 1.4],
       [6.4, 3.2, 4.5, 1.5],
       [6. , 3.4, 4.5, 1.6],
       [6.7, 3.1, 4.7, 1.5],
       [5.5, 2.4, 3.7, 1. ],
       [6.6, 3. , 4.4, 1.4],
       [5.7, 2.9, 4.2, 1.3],
       [5.6, 3. , 4.5, 1.5],
       [5.8, 2.7, 4.1, 1. ],
       [5. , 2. , 3.5, 1. ],
       [5.6, 2.7, 4.2, 1.3],
       [5.4, 3. , 4.5, 1.5],
       [5.9, 3.2, 4.8, 1.8],
       [6. , 2.7, 5.1, 1.6],
       [6.3, 2.5, 4.9, 1.5],
       [6.1, 2.9, 4.7, 1.4],
       [5.5, 2.6, 4.4, 1.2]])

In [46]:
# Split the non-virginica training rows into their two actual species
# (label 1 -> versicolour frame, label 0 -> setosa frame).
speciesColumns = ['sepal length', 'sepal width', 'petal length', 'petal width']
versicolourData = pd.DataFrame(X_train[y_train == 1], columns=speciesColumns)
setosaData = pd.DataFrame(X_train[y_train == 0], columns=speciesColumns)

# Print the per-feature averages of each group under its heading.
for heading, frame in [
    ('Not Virginica Averages:', notVirginicaData),
    ('\nVersicolour Averages:', versicolourData),
    ('\nSetosa Averages:', setosaData),
    ('\n False Positive Averages', falsePositives),
]:
    print(heading)
    print(frame.mean())

Not Virginica Averages:
sepal length    5.460494
sepal width     3.107407
petal length    2.862963
petal width     0.790123
dtype: float64

Versicolour Averages:
sepal length    5.919512
sepal width     2.770732
petal length    4.241463
petal width     1.321951
dtype: float64

Setosa Averages:
sepal length    4.9900
sepal width     3.4525
petal length    1.4500
petal width     0.2450
dtype: float64

 False Positive Averages
sepal length    5.929412
sepal width     2.847059
petal length    4.417647
petal width     1.382353
dtype: float64


In [52]:
# Actual (three-class) labels of the misclassified training rows.
misclassifiedLabels = y_train[incorrectIndexes]
falsePositivesActual = pd.DataFrame(data=misclassifiedLabels)
falsePositivesActual

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [None]:
# 3-fold cross-validated accuracy of the virginica detector on the training split.
cross_validate(
    sgd_clf,
    X_train,
    y_train_virginica,
    scoring="accuracy",
    cv=3,
)

{'fit_time': array([0.        , 0.00099874, 0.        ]),
 'score_time': array([0.00099945, 0.        , 0.0010035 ]),
 'test_score': array([0.725, 0.9  , 0.95 ])}

In [None]:
# Precision of the cross-validated predictions against the actual virginica labels.
precision_score(y_true=y_train_virginica, y_pred=y_train_pred)

0.6964285714285714

In [None]:
# Recall of the cross-validated predictions against the actual virginica labels.
recall_score(y_true=y_train_virginica, y_pred=y_train_pred)

1.0

## Observations
- The model has 100% recall and roughly 70% precision.
    - 100% recall means that every actual positive (virginica) sample was classified as positive.
    - Every misclassification was therefore a false positive; there were no false negatives.
- When comparing the feature averages of virginica vs. not-virginica vs. the false positives:
    - The averages of the false positives seem closer to those of virginica than to those of not-virginica.
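- All the false positives are actually Versicolour. They are only counted as "not virginica" because the model was set up as a binary virginica vs. not-virginica classifier.
    - The averages of the false positives are closer to the averages of Versicolour than to the combined not-virginica averages.
    - The not-virginica averages are pulled down (for every feature except sepal width) because Versicolour is mixed with Setosa in that class.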