In [5]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# How to Evaluate A Classifier in Scikit Learn

In [290]:
pima = pd.read_csv(r"C:\Users\Home\Desktop\Python Notes\pima-indians-diabetes.csv")
pima.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [291]:
# #This dataset has health status of different individuals & it is aimed at checking the outcome
# 0 : Non-diabetic
# 1 : Diabetic
# the result is based on other paramters such as Preganacies, Glocuse etc.
# Since the result has only 2 values i.e. 0 or 1, it is a classification problem

### Can we predict Diabeties Status of Individuals given the Health status?

In [296]:
pima.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [306]:
# choosing features
X = pima[['Pregnancies','Insulin','BMI','Age']]
y = pima["Outcome"]

In [304]:
# Using Logistic Regression with cros_val_score method to split and train the data
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
score = cross_val_score(log_reg, X,y,scoring = "accuracy", cv = 10)
print(score)
print(score.mean())

[ 0.71428571  0.66233766  0.7012987   0.63636364  0.67532468  0.71428571
  0.68831169  0.74025974  0.67105263  0.68421053]
0.688773069036


In [308]:
# Using Logistic Regression with train_test_split method to split and train the data
X_train, X_test,y_train,y_test = train_test_split(X,y,random_state = 0)
log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)
y_predict = log_reg.predict(X_test)

In [307]:
y_predict

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [310]:
# Checking model accuracy. Since its a classification problem, we need to use accuracy score (not MSE) to check the accuracy of
# results
score = metrics.accuracy_score(y_test,y_predict)
score

0.69270833333333337

#### Note : Train_test_split is giving better result in this case, when are using LogisticRegression model.

### Null Accuracy : An accuracy which can be achieved by always predicting the most frequent class

In [318]:
y_test.value_counts()

0    130
1     62
Name: Outcome, dtype: int64

So, 0 i.e. Non-diabetic are more in out testing dataset

In [319]:
# Let's find mean values of 0 and 1 in the y_test series/distribution. Note that the 2 values present are only 0 and 1
# mean of 1 = (0*130 + 1*60)/2

In [323]:
# percentage of 1s in testing set
mean_of_one = y_test.mean()
mean_of_one

0.3229166666666667

In [324]:
# percentage of 1s in traning set
mean_of_zero = 1-mean_of_one
mean_of_zero

0.6770833333333333

In [329]:
# NUll accuracy 
max(mean_of_one,mean_of_zero)

0.6770833333333333

This means that 68% is null accuracy of the problem i.e. in 68 out of 100 cases, the dumb model predicts that a person doesn't
have diabetes i.e. outcome = 0

In [330]:
# Shortcut for calculating NUll accuracy when outcome is binary i.e. outcome = 0 or 1
max(y_test.mean(), 1-y_test.mean())

0.6770833333333333

In [333]:
# Calculating Null accuracy when we have more than 2 outcomes i.e. 0 and 1(for multi-classification problem)
y_test.value_counts().head(1)/len(y_test)

# above will give max mean bcoz value_counts() method arranges the result from high to low so the upper most i.e head(1) will
# have highest frequency data , which we can divide with length of the series
# note : here the that the y_test should be a pandas series

0    0.677083
Name: Outcome, dtype: float64

In [335]:
### Comparing the test and predicted values
print(y_test.values[0:25])
print(y_predict[0:25])

# above is bcoz y_test is a pandas series while y_predict is a numpy array

[1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0]
[0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]


##### We can notice in above that most of the times when test value is 0, model predits it correctly.Howeever, it mostly fails to predict the test value 1 as 1

##### Conclusion

1. Classification accuracy is the easiest metric to understand
2. But it doesn't tell the underlying distn of true value which we here checked by calculating null accuracy
3. also, doesn't tell the types of error which model is making

## Confusion Matrix - An improvement over Classification accuracy metric

##### A confusion matrix is a table that describes the performance of a classification model

In [354]:
confusion = metrics.confusion_matrix(y_test,y_predict)
confusion

# Very Important: when using confusion matrix, pass y_true first and then pass y_predict. if we pass in other way, results will
# reverse. this may not be in the important factor in accuracy_score matrix

array([[118,  12],
       [ 47,  15]], dtype=int64)

since there are two outcomes i.e. 0 or 1, we get a 2 by 2 matrix

In [343]:
#total predicted/test observations
y_test.value_counts().sum()

192

In [355]:
confusion

array([[118,  12],
       [ 47,  15]], dtype=int64)

1. 118 = predicted : 0, actual : 0 - True Negative(TN) : Classifier correctly predicted that patients dont have diabetes
2. 15 = predicted : 1, actual : 1 - True Positive(TP) : Classifier correctly predicted that patients have diabetes
3. 12 = predicted :1, actual : 0 - False Positive (FP): Classifier incorrectly predicted that they have diabetes (Type 1 error)
2. 47 = predicted : 0, actual : 1, False Negative(FN) : Classifier incorrectly predicted that they dont have diabetes (Type 2 error)

Trick to Remember :
1. By convention 0 means Negative, 1 is Positive (here 0 means non-diabetic, 1 means diabetic)
2. if predicted = Actual , : True
3. if predicted is not equal to Actual, false

In [357]:
# saving the individual values

TN = confusion[0,0]
TP = confusion[1,1]
FN = confusion[1,0]
FP = confusion[0,1]

# Metrics Computed from a confusion matrix

### Classification accuracy : Overall, how often is the classifier accurate?

In [366]:
print((TP + TN)/(TP+TN+FP+FN))
print(metrics.accuracy_score(y_test,y_predict))

# thus accuracy_score gives the accuracy score of classifier i.e. the times it gies correct results when compared with the overal
# results

0.692708333333
0.692708333333


### Classification Error(also known as misclassification rate) : Overall, how often is the classifier incorrect?

In [365]:
print((FP + FN)/(TP+TN+FP+FN))
print(1 - metrics.accuracy_score(y_test,y_predict))

0.307291666667
0.307291666667


### Sensitivity(True Positive Rate) : When actual value is positive, how often is prediction correct?

In [367]:
confusion

array([[118,  12],
       [ 47,  15]], dtype=int64)

In [376]:
sensitivity = TP/(TP + FN)
print(sensitivity)
print(metrics.recall_score(y_test,y_predict))
 # it is basically when the model is predicting True out of total no. of times that the model actually has True value 
# i.e. 15/(47+15)

# metrics.recall_score can automatically caluclate the sensitivity

0.241935483871
0.241935483871


### Specificity(True Negative Rate) : When actual value is negative, how often is prediction correct?

In [377]:
specificity = TN/(TN + FP)
specificity

0.90769230769230769

#### False positive Rate : When actual is negative, how often model is incorrect?

In [379]:
FPR = 1 - specificity
FPR

0.092307692307692313

#### Precision : When a positive value is predicted, how often is prediction correct?

In [382]:
print(TP/(TP+FP))
print(metrics.precision_score(y_test,y_predict))

0.555555555556
0.555555555556


### Conclusion

1. Confusion matrix gives a better picture of model performance
2. there are many matrices such as F1 or mathew correlation coffecient etc. to calculate and chck the model performance
3. which matrices to focus on?
   
   It depends on your business problem/objective

Choice of metric depends on your business objective:

1. Spam filter (positive class is "spam"): Optimize for precision or specificity because false negatives (spam goes to the inbox) are more acceptable than false positives (non-spam is caught by the spam filter)

2. Fraudulent transaction detector (positive class is "fraud"): Optimize for sensitivity because false positives (normal transactions that are flagged as possible fraud) are more acceptable than false negatives (fraudulent transactions that are not detected)