# 4. Evaluation Metrics for Classification

## Setting Import libraries

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score

## Preparation

In [2]:
df = pd.read_csv('AER_credit_card_data.csv')
df

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.5200,0.033270,124.983300,yes,no,3,54,1,12
1,yes,0,33.25000,2.4200,0.005217,9.854167,no,no,3,34,1,13
2,yes,0,33.66667,4.5000,0.004156,15.000000,yes,no,4,58,1,5
3,yes,0,30.50000,2.5400,0.065214,137.869200,no,no,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.503300,yes,no,2,64,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1314,yes,0,33.58333,4.5660,0.002146,7.333333,yes,no,0,94,1,19
1315,no,5,23.91667,3.1920,0.000376,0.000000,no,no,3,12,1,5
1316,yes,0,40.58333,4.6000,0.026513,101.298300,yes,no,2,1,1,2
1317,yes,0,32.83333,3.7000,0.008999,26.996670,no,yes,0,60,1,7


### Data Preprocessing

In [3]:
custom_seed = 1

In [4]:
target = 'card'

In [5]:
df.card = (df.card == 'yes').astype(int)

### Setting up the validation framework

In [6]:
def validation_framework(data=df, val=0.25, test=0.2, random_seed=custom_seed, target=target):
    """ Prepares the validation framework for a machine learning model
        :param str target: The target variable
        :param pd.DataFrame data: a dataframe
        :param float random_seed: a seed that controls reproducibility
        :return: the train, validation and test set, separated by features (X) and target (y)
        :rtype: tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]
        """
    df_full_train, df_test = train_test_split(data, test_size=test, random_state=random_seed)
    df_train, df_val = train_test_split(df_full_train, test_size=val, random_state=random_seed)

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    y_train = df_train[target].values
    y_val = df_val[target].values
    y_test = df_test[target].values

    del df_train[target]
    del df_val[target]
    del df_test[target]

    return df_full_train, df_train, df_val, df_test, y_train, y_val, y_test


df_full_train, df_train, df_val, df_test, y_train, y_val, y_test = validation_framework()

### Q1 - ROC AUC could also be used to evaluate feature importance of numerical variables.

In [7]:
categorical = [x for x in df.select_dtypes(object).columns]
numerical = [df.select_dtypes(np.number).columns]

areas = dict()
for variable in ['reports', 'age', 'income', 'share', 'expenditure', 'dependents', 'months', 'majorcards', 'active']:
    model = LogisticRegression()
    model.fit(df_train[[variable]], y_train)
    y_pred = model.predict_proba(df_train[[variable]])[:, 1]
    areas[variable] = roc_auc_score(y_train, y_pred).round(4)

print(sorted(areas, key=areas.get, reverse=True)[:3]) 

['expenditure', 'share', 'reports']


### Training the model

In [8]:
features = ["reports", "age", "income", "share", "expenditure", "dependents", "months", "majorcards", "active", "owner",
            "selfemp"]

In [9]:
def train_logistic_regression(df_train, y_train, C=1.0):
    dicts = df_train[features].to_dict(orient='records')

    dictionary_vectorizer = DictVectorizer(sparse=False)
    X_train = dictionary_vectorizer.fit_transform(dicts)

    logistic_model = LogisticRegression(solver='liblinear', C=C, max_iter=1_000)
    logistic_model.fit(X_train, y_train)

    return dictionary_vectorizer, logistic_model


def predict(df, dv, model):
    dicts = df[features].to_dict(orient='records')

    X = dv.fit_transform(dicts)
    y_pred = model.predict_proba(X)[:, 1]

    return y_pred


dv, model = train_logistic_regression(df_train, y_train)

y_pred = predict(df_val, dv, model)

roc_auc_score(y_val, y_pred)

0.995171242063847

### Question 2 What's the AUC of this model on the validation dataset? (round to 3 digits)

In [10]:
print(roc_auc_score(y_val, y_pred))

0.995171242063847


### Question 3 Computing Precision and Recall for our Model.
Evaluate the model on the validation dataset on all thresholds from 0.0 to 1.0 with step 0.01
For each threshold, compute precision and recall
Plot them
At which threshold precision and recall curves intersect?

In [None]:
thresholds = np.arange(0.0, 1.0, 0.01)
scores = []
for threshold in thresholds:
    real_positive = (y_val == 1)
    real_negative = (y_val == 0)

    predict_positive = (y_pred >= threshold)
    predict_negative = (y_pred < threshold)

    tp = (predict_positive & real_positive).sum()
    tn = (predict_negative & real_negative).sum()

    fp = (predict_positive & real_negative).sum()
    fn = (predict_negative & real_positive).sum()

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    scores.append((threshold, precision, recall))

scores = pd.DataFrame(scores, columns=['threshold', 'precision', 'recall'])
# Plotting the data
plt.plot(scores.threshold, scores.precision, label='Precision')  # Thresholds vs Precision
plt.plot(scores.threshold, scores.recall, label='Recall')  # Thresholds vs Recall
plt.legend()
plt.show()

### Q4- Computing the F1 Score
Precision and recall are conflicting - when one grows, the other goes down.
That's why they are often combined into the F1 score - a metrics that takes into account both

In [17]:
def f1_score(precision, recall):
    return 2 * ((precision * recall) / (precision + recall))


scores['f1_score'] = f1_score(scores.precision, scores.recall)

# At which threshold F1 is maximal?
maximal_f1_score = scores.nlargest(n=1, columns='f1_score')[
    ['threshold', 'f1_score']]  # F1 is maximal at the 0.35 threshold
print(maximal_f1_score)

# Plotting the f1_score across all the thresholds
plt.plot(scores.threshold, scores.f1_score, label='f1_score')
plt.legend()
plt.show()

AttributeError: 'list' object has no attribute 'precision'

### Q5 - Use the `KFold` class from sklearn to evaluate our model on 5 different folds:
Iterate over different folds of df_full_train
Split the data into train and validation
Train the model on train with these parameters: LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
Use AUC to evaluate the model on validation
How large is standard devidation of the AUC scores across different folds?

In [18]:
kFold = KFold(n_splits=5, shuffle=True, random_state=custom_seed)
scores = []

for train_idx, val_idx in kFold.split(df_full_train):
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    y_train = df_train.card.values
    y_val = df_val.card.values

    dv, model = train_logistic_regression(df_train, y_train, C=1.0)
    y_pred = predict(df_val, dv, model)

    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

std_folds = np.std(scores)
print(std_folds)

0.0027434713804377724


### Q6 - Now let's use 5-Fold cross-validation to find the best parameter C
Iterate over the following C values: [0.01, 0.1, 1, 10]
Initialize KFold with the same parameters as previously
Use these parametes for the model: LogisticRegression(solver='liblinear', C=C, max_iter=1000)
Compute the mean score as well as the std (round the mean and std to 3 decimal digits)
Which C leads to the best mean score?

In [19]:
C = [0.01, 0.1, 1, 10]
fold_stats = pd.DataFrame()

for c in [0.01, 0.1, 1, 10]:
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)

    scores = []
    for train_idx, val_idx in kfold.split(df_full_train):
        df_train = df_full_train.iloc[train_idx]
        df_val = df_full_train.iloc[val_idx]

        y_train = df_train.card.values
        y_val = df_val.card.values

        dv, model = train_logistic_regression(df_train, y_train, C=c)
        y_pred = predict(df_val, dv, model)

        auc = roc_auc_score(y_val, y_pred)
        scores.append(auc)
    data = pd.DataFrame([{'param': c, 'mean': np.mean(scores).round(3), 'sd': np.std(scores).round(3)}])
    fold_stats = pd.concat([fold_stats, data])

print(fold_stats) 

   param   mean     sd
0   0.01  0.992  0.006
0   0.10  0.995  0.004
0   1.00  0.996  0.003
0  10.00  0.996  0.003
