In [1]:
import pandas as pd


## Exercise 1
Given the following confusion matrix, evaluate (by hand) the model's performance.


|               | actual cat | actual dog |
|:------------  |-----------:|-----------:|
| predicted cat |         34 |          7 |
| predicted dog |         13 |         46 |

- We treat "cat" as the positive case and "dog" as the negative case.

- If we treat the model as a cat classifier (cat = positive):
- FP false positive would be 7
- FN false negative would be 13
- TP 34
- TN 46

In [2]:
# Confusion matrix from Exercise 1: rows are the predicted class,
# columns are the actual class.
df = pd.DataFrame(
    data=[[34, 7], [13, 46]],
    index=['predicted cat', 'predicted dog'],
    columns=['actual cat', 'actual dog'],
)
df

Unnamed: 0,actual cat,actual dog
predicted cat,34,7
predicted dog,13,46


In [3]:
# Cell counts from the confusion matrix, with "cat" as the positive class.
TP, TN = 34, 46  # correct predictions (true positives, true negatives)
FP, FN = 7, 13   # errors (false positives, false negatives)
# Accuracy: correct predictions divided by all predictions.
Accuracy = (TP + TN) / sum((TP, TN, FP, FN))
Accuracy

0.8

In [4]:
# Precision: of everything predicted positive, the fraction that was truly positive.
Precision = TP / (FP + TP)
Precision

0.8292682926829268

In [5]:
# Recall: of all actual positives, the fraction the model identified.
Recall = TP / (FN + TP)
Recall

0.723404255319149

## Exercise 2

In [6]:
# Load the Exercise 2 data and preview the first five rows.
c3 = pd.read_csv("c3.csv")
c3.head(5)

Unnamed: 0,actual,model1,model2,model3
0,No Defect,No Defect,Defect,No Defect
1,No Defect,No Defect,Defect,Defect
2,No Defect,No Defect,Defect,No Defect
3,No Defect,Defect,Defect,Defect
4,No Defect,No Defect,Defect,No Defect


In [7]:
# Class balance of the true labels.
c3["actual"].value_counts()

No Defect    184
Defect        16
Name: actual, dtype: int64

In [8]:
# Baseline model: always predict the most frequent actual label.
# value_counts() orders labels by descending count, so the first index entry is the mode.
c3["baseline"] = c3["actual"].value_counts().index[0]
c3.head(n=3)

Unnamed: 0,actual,model1,model2,model3,baseline
0,No Defect,No Defect,Defect,No Defect,No Defect
1,No Defect,No Defect,Defect,Defect,No Defect
2,No Defect,No Defect,Defect,No Defect,No Defect


In [9]:
# Recall is the metric used for quality control: it measures the share of
# truly defective items that a model flags as "Defect".
# Model 1 - keep only the rows whose actual label is "Defect" (the positive class).
subset = c3.loc[c3["actual"] == "Defect"]
model_recall = subset["model1"].eq(subset["actual"]).mean()
baseline_recall = subset["actual"].eq(subset["baseline"]).mean()

print(
    "Model 1",
    f"Model recall: {model_recall:.2%}",
    f"Baseline recall: {baseline_recall:.2%}",
    sep="\n",
)

Model 1
Model recall: 50.00%
Baseline recall: 0.00%


In [10]:
# Model 2 recall: among the rows that are actually "Defect",
# the fraction the model also labelled "Defect".
subset = c3.loc[c3["actual"] == "Defect"]
model_recall = subset["model2"].eq(subset["actual"]).mean()
baseline_recall = subset["actual"].eq(subset["baseline"]).mean()

print(
    "Model 2",
    f"Model recall: {model_recall:.2%}",
    f"Baseline recall: {baseline_recall:.2%}",
    sep="\n",
)

Model 2
Model recall: 56.25%
Baseline recall: 0.00%


In [11]:
# Model 3 recall: among the rows that are actually "Defect",
# the fraction the model also labelled "Defect".
subset = c3.loc[c3["actual"] == "Defect"]
model_recall = subset["model3"].eq(subset["actual"]).mean()
baseline_recall = subset["actual"].eq(subset["baseline"]).mean()

print(
    "Model 3",
    f"Model recall: {model_recall:.2%}",
    f"Baseline recall: {baseline_recall:.2%}",
    sep="\n",
)

Model 3
Model recall: 81.25%
Baseline recall: 0.00%


### Model 3 is the best model since its recall value is the highest.

## Exercise 2 (second part)
- The PR team has decided to launch a program that gives customers with a defective duck a vacation to Hawaii.
- They need you to predict which ducks will have defects, but tell you they really don't want to accidentally give out a vacation package when the duck really doesn't have a defect.
- Which evaluation metric would be appropriate here? Which model would be the best fit for this use case?

In [12]:
# Since false positives are expensive in this case, we'll use precision to minimize false positives.

In [13]:
# Model 1 precision: among the rows the model labelled "Defect",
# the fraction that are actually "Defect".
subset = c3.loc[c3["model1"] == "Defect"]
model_precision = subset["model1"].eq(subset["actual"]).mean()

# Same calculation for the baseline. If the baseline never predicts "Defect"
# this subset is empty and the mean is NaN (precision is undefined).
subset = c3.loc[c3["baseline"] == "Defect"]
baseline_precision = subset["baseline"].eq(subset["actual"]).mean()

print(
    "Model 1",
    f"Model precision: {model_precision:.2%}",
    f"Baseline precision: {baseline_precision:.2%}",
    sep="\n",
)

Model 1
Model precision: 80.00%
Baseline precision: nan%


In [14]:
# Model 2 precision: among the rows the model labelled "Defect",
# the fraction that are actually "Defect".
subset = c3.loc[c3["model2"] == "Defect"]
model_precision = subset["model2"].eq(subset["actual"]).mean()

# Same calculation for the baseline. If the baseline never predicts "Defect"
# this subset is empty and the mean is NaN (precision is undefined).
subset = c3.loc[c3["baseline"] == "Defect"]
baseline_precision = subset["baseline"].eq(subset["actual"]).mean()

print(
    "Model 2",
    f"Model precision: {model_precision:.2%}",
    f"Baseline precision: {baseline_precision:.2%}",
    sep="\n",
)

Model 2
Model precision: 10.00%
Baseline precision: nan%


In [15]:
# Model 3 precision: among the rows the model labelled "Defect",
# the fraction that are actually "Defect".
subset = c3.loc[c3["model3"] == "Defect"]
model_precision = subset["model3"].eq(subset["actual"]).mean()

# Same calculation for the baseline. If the baseline never predicts "Defect"
# this subset is empty and the mean is NaN (precision is undefined).
subset = c3.loc[c3["baseline"] == "Defect"]
baseline_precision = subset["baseline"].eq(subset["actual"]).mean()

print(
    "Model 3",
    f"Model precision: {model_precision:.2%}",
    f"Baseline precision: {baseline_precision:.2%}",
    sep="\n",
)

Model 3
Model precision: 13.13%
Baseline precision: nan%


### Takeaway: choose model 1 as the best model since it has the highest precision value.

## Exercise 3

In [16]:
# Load the Exercise 3 data and preview the first five rows.
df = pd.read_csv("gives_you_paws.csv")
df.head(5)

Unnamed: 0,actual,model1,model2,model3,model4
0,cat,cat,dog,cat,dog
1,dog,dog,cat,cat,dog
2,dog,cat,cat,cat,dog
3,dog,dog,dog,cat,dog
4,cat,cat,cat,dog,dog


In [17]:
# Class balance of the true labels.
df["actual"].value_counts()

dog    3254
cat    1746
Name: actual, dtype: int64

In [18]:
# Baseline model: always predict the most frequent actual label.
# The label is derived from the data instead of being hard-coded, so the
# baseline stays correct if the class balance of the input file changes.
df['baseline_prediction'] = df['actual'].value_counts().idxmax()
df.head()

Unnamed: 0,actual,model1,model2,model3,model4,baseline_prediction
0,cat,cat,dog,cat,dog,dog
1,dog,dog,cat,cat,dog,dog
2,dog,cat,cat,cat,dog,dog
3,dog,dog,dog,cat,dog,dog
4,cat,cat,cat,dog,dog,dog


In [19]:
# Accuracy: share of rows where the prediction equals the actual label.
model1_accuracy = df["model1"].eq(df["actual"]).mean()
baseline_accuracy = df["baseline_prediction"].eq(df["actual"]).mean()
print(
    f"   model1 accuracy: {model1_accuracy:.2%}",
    f"baseline accuracy: {baseline_accuracy:.2%}",
    sep="\n",
)

   model1 accuracy: 80.74%
baseline accuracy: 65.08%


In [20]:
# Accuracy: share of rows where the prediction equals the actual label.
model2_accuracy = df["model2"].eq(df["actual"]).mean()
baseline_accuracy = df["baseline_prediction"].eq(df["actual"]).mean()
print(
    f"   model2 accuracy: {model2_accuracy:.2%}",
    f"baseline accuracy: {baseline_accuracy:.2%}",
    sep="\n",
)

   model2 accuracy: 63.04%
baseline accuracy: 65.08%


In [21]:
# Accuracy: share of rows where the prediction equals the actual label.
model3_accuracy = df["model3"].eq(df["actual"]).mean()
baseline_accuracy = df["baseline_prediction"].eq(df["actual"]).mean()
print(
    f"   model3 accuracy: {model3_accuracy:.2%}",
    f"baseline accuracy: {baseline_accuracy:.2%}",
    sep="\n",
)

   model3 accuracy: 50.96%
baseline accuracy: 65.08%


In [22]:
# Accuracy: share of rows where the prediction equals the actual label.
model4_accuracy = df["model4"].eq(df["actual"]).mean()
baseline_accuracy = df["baseline_prediction"].eq(df["actual"]).mean()
print(
    f"   model4 accuracy: {model4_accuracy:.2%}",
    f"baseline accuracy: {baseline_accuracy:.2%}",
    sep="\n",
)

   model4 accuracy: 74.26%
baseline accuracy: 65.08%


In [23]:
# Suppose you are working on a team that solely deals with dog pictures.
# Which of these models would you recommend for Phase I?
# For Phase II?
df.head(5)

Unnamed: 0,actual,model1,model2,model3,model4,baseline_prediction
0,cat,cat,dog,cat,dog,dog
1,dog,dog,cat,cat,dog,dog
2,dog,cat,cat,cat,dog,dog
3,dog,dog,dog,cat,dog,dog
4,cat,cat,cat,dog,dog,dog


In [24]:
from sklearn.metrics import classification_report

In [25]:
# classification_report matches target_names positionally against the labels
# in sorted order. unique() returns labels in order of first appearance, so
# sort them here to keep each display name aligned with its label.
target_names = sorted(df.actual.unique())

In [26]:
# Per-class precision / recall / F1 for model 1, returned as a dict and shown as a table.
x1 = classification_report(
    y_true=df["actual"],
    y_pred=df["model1"],
    target_names=target_names,
    output_dict=True,
)
pd.DataFrame(x1)

Unnamed: 0,cat,dog,accuracy,macro avg,weighted avg
precision,0.689772,0.890024,0.8074,0.789898,0.820096
recall,0.815006,0.803319,0.8074,0.809162,0.8074
f1-score,0.747178,0.844452,0.8074,0.795815,0.810484
support,1746.0,3254.0,0.8074,5000.0,5000.0


In [27]:
# Per-class precision / recall / F1 for model 2, returned as a dict and shown as a table.
x2 = classification_report(
    y_true=df["actual"],
    y_pred=df["model2"],
    target_names=target_names,
    output_dict=True,
)
pd.DataFrame(x2)

Unnamed: 0,cat,dog,accuracy,macro avg,weighted avg
precision,0.484122,0.893177,0.6304,0.688649,0.750335
recall,0.890607,0.490781,0.6304,0.690694,0.6304
f1-score,0.627269,0.633479,0.6304,0.630374,0.63131
support,1746.0,3254.0,0.6304,5000.0,5000.0


In [28]:
# Per-class precision / recall / F1 for model 3, returned as a dict and shown as a table.
x3 = classification_report(
    y_true=df["actual"],
    y_pred=df["model3"],
    target_names=target_names,
    output_dict=True,
)
pd.DataFrame(x3)

Unnamed: 0,cat,dog,accuracy,macro avg,weighted avg
precision,0.358347,0.659888,0.5096,0.509118,0.55459
recall,0.511455,0.508605,0.5096,0.51003,0.5096
f1-score,0.421425,0.574453,0.5096,0.497939,0.521016
support,1746.0,3254.0,0.5096,5000.0,5000.0


In [31]:
# Per-class precision / recall / F1 for model 4, returned as a dict and shown as a table.
x4 = classification_report(
    y_true=df["actual"],
    y_pred=df["model4"],
    target_names=target_names,
    output_dict=True,
)
pd.DataFrame(x4)

Unnamed: 0,cat,dog,accuracy,macro avg,weighted avg
precision,0.807229,0.731249,0.7426,0.769239,0.757781
recall,0.345361,0.955747,0.7426,0.650554,0.7426
f1-score,0.483755,0.82856,0.7426,0.656157,0.708154
support,1746.0,3254.0,0.7426,5000.0,5000.0
