# Evaluating a Model

UCI Machine Learning Repository: [Spambase Data Set](https://archive.ics.uci.edu/ml/datasets/Spambase)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the Spambase data straight from the UCI repository.
# The file has no header row, so pandas labels the columns 0, 1, 2, ...
spambase_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data'
spam = pd.read_csv(spambase_url, header=None)

In [None]:
# Peek at the first five rows to sanity-check the load.
spam.head(n=5)

In [None]:
# Column 57 is used as the target; every other column is a feature.
# presumably 1 = spam and 0 = not spam — confirm against the dataset description
y = spam.loc[:, 57]
X = spam.drop(columns=57)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. random_state pins the shuffle,
# so re-running the notebook yields the same train/test partition.
split_parts = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with scikit-learn's default settings on the training split.
# NOTE(review): the features are not scaled; with default solver settings this fit
# may emit a ConvergenceWarning — consider scaling or raising max_iter, and confirm.
# fit() returns the estimator itself, so construction and fitting can be chained.
logreg = LogisticRegression().fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred_class = logreg.predict(X_test)

In [None]:
from sklearn import metrics

# Classification accuracy: share of test rows whose predicted label equals the true label.
test_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(test_accuracy)

**Null accuracy:** the accuracy that could be achieved by always predicting the most frequent class. It is the baseline that the model's accuracy should be compared against.

In [None]:
# How many test rows fall in each class (y_test is already a pandas Series).
y_test.value_counts()

In [None]:
# Null accuracy: value_counts() sorts by frequency, so the first entry is the
# most common class; dividing by the number of test rows gives its share.
class_counts = y_test.value_counts()
class_counts.head(1) / len(y_test)

### Confusion Matrix

In [None]:
# Rows are the true classes, columns are the predicted classes.
confusion_counts = metrics.confusion_matrix(y_test, y_pred_class)
print(confusion_counts)

In [None]:
# Accuracy again, for comparison with the confusion matrix above:
# it equals the diagonal counts divided by the total.
test_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(test_accuracy)

In [None]:
# Recall (sensitivity): of the truly positive rows, the share predicted positive.
test_recall = metrics.recall_score(y_test, y_pred_class)
print(test_recall)

In [None]:
# Keep the 2x2 matrix and name its four cells.
# Flattened row by row, [[TN, FP], [FN, TP]] becomes TN, FP, FN, TP.
confusion = metrics.confusion_matrix(y_test, y_pred_class)
TN, FP, FN, TP = confusion.ravel()

In [None]:
# Precision by hand: of everything predicted positive, the share that is truly positive.
predicted_positive = float(TP + FP)
print(TP / predicted_positive)

# The same quantity from scikit-learn, as a cross-check.
print(metrics.precision_score(y_test, y_pred_class))

### Classification Report

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 in one table.
# presumably label 0 is ham and label 1 is spam — confirm against the dataset description
class_names = ['ham', 'spam']
report_text = classification_report(y_test, y_pred_class, target_names=class_names)
print(report_text)

### Receiver Operating Characteristic (ROC)

In [None]:
# predict_proba returns one column per class; keep the second column
# (presumably the probability of class 1, i.e. spam — verify with logreg.classes_).
class_probabilities = logreg.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]
y_pred_prob

In [None]:
# Histogram of the predicted probabilities for the test rows.
# Uses an explicit figure/axes pair instead of the implicit pyplot state.
fig, ax = plt.subplots()
ax.hist(y_pred_prob, bins=8)
ax.set_xlim(0, 1)  # probabilities live in [0, 1]
ax.set_title('Histogram of predicted probabilities')
ax.set_xlabel('Predicted probability of spam')
ax.set_ylabel('Frequency')
plt.show()  # render the figure without echoing the last call's Text(...) repr

In [None]:
# Sweep the decision threshold over the predicted probabilities and record the
# false positive rate and true positive rate at each threshold.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y_test, y_score=y_pred_prob)

In [None]:
# ROC curve: true positive rate against false positive rate across thresholds.
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.0)
ax.set_title('ROC curve for spam classifier')
ax.set_xlabel('False Positive Rate (1 - Specificity)')
ax.set_ylabel('True Positive Rate (Sensitivity)')
ax.grid(True)

### Area Under Curve (AUC)

In [None]:
# Area under the ROC curve, computed from the predicted probabilities
# (not from the hard class predictions).
metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob)