# Day 09. Exercise 02
# Metrics

## 0. Imports

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [31]:
# Read the feature table and attach the target column, which is stored in a
# separate CSV; the two frames are aligned on their default row index.
df = pd.read_csv("../data/day-of-week-not-scaled.csv").assign(
    dayofweek=pd.read_csv("../data/dayofweek.csv")['dayofweek']
)
df.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4


In [32]:
# Target is the weekday label; every other column is used as a feature.
y = df['dayofweek']
X = df.drop(columns=['dayofweek'])

# Stratify on the target so the class proportions are kept in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

## 2. SVM

1. Use the best parameters from the previous exercise and train the model of SVM.
2. You need to calculate `accuracy`, `precision`, `recall`, `ROC AUC`.

 - `precision` and `recall` should be calculated for each class (use `average='weighted'`)
 - `ROC AUC` should be calculated for each class against any other class (all possible pairwise combinations) and then weighted average should be applied for the final metric
 - the code in the cell should display the result as below:

```
accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878
```

In [None]:
# SVM with the hyper-parameters chosen in the previous exercise.
# probability=True is passed so that predict_proba can be called below.
svc = SVC(C=10, class_weight=None, gamma='auto', kernel='rbf', probability=True, random_state=21)
svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)          # hard class labels
y_score = svc.predict_proba(X_test)   # per-class probabilities for ROC AUC

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
# One-vs-one ROC AUC over all class pairs, combined with a weighted average.
roc_auc = roc_auc_score(y_test, y_score, multi_class='ovo', average='weighted')

# The task asks for exactly four lines with five decimals and no blank lines.
print(f'accuracy is {accuracy:.5f}')
print(f'precision is {precision:.5f}')
print(f'recall is {recall:.5f}')
print(f'roc_auc is {roc_auc:.5f}')

 
accuracy is 0.8875739644970414
precision is 0.8926729169690374
recall is 0.8875739644970414
roc_auc is 0.9787793228216216



## 3. Decision tree

1. The same task for decision tree

In [None]:
# Decision tree with the hyper-parameters chosen in the previous exercise.
tree = DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=21, random_state=21)
tree.fit(X_train, y_train)

y_pred = tree.predict(X_test)          # hard class labels
y_score = tree.predict_proba(X_test)   # per-class probabilities for ROC AUC

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
# One-vs-one ROC AUC over all class pairs, combined with a weighted average.
roc_auc = roc_auc_score(y_test, y_score, multi_class='ovo', average='weighted')

# Same output format as required for the SVM cell: five decimals, no blank lines.
print(f'accuracy is {accuracy:.5f}')
print(f'precision is {precision:.5f}')
print(f'recall is {recall:.5f}')
print(f'roc_auc is {roc_auc:.5f}')

 
accuracy is 0.8905325443786982
precision is 0.8940158937843722
recall is 0.8905325443786982
roc_auc is 0.9379290651156622



## 4. Random forest

1. The same task for random forest.

In [None]:
# Random forest with the hyper-parameters chosen in the previous exercise.
rf = RandomForestClassifier(class_weight=None, criterion='gini', max_depth=28, n_estimators=50, random_state=21)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)          # hard class labels
y_score = rf.predict_proba(X_test)   # per-class probabilities for ROC AUC

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
# One-vs-one ROC AUC over all class pairs, combined with a weighted average.
roc_auc = roc_auc_score(y_test, y_score, multi_class='ovo', average='weighted')

# Same output format as required for the SVM cell: five decimals, no blank lines.
print(f'accuracy is {accuracy:.5f}')
print(f'precision is {precision:.5f}')
print(f'recall is {recall:.5f}')
print(f'roc_auc is {roc_auc:.5f}')

 
accuracy is 0.9289940828402367
precision is 0.9300865038851309
recall is 0.9289940828402367
roc_auc is 0.9903274757720744



## 5. Predictions

1. Choose the best model.
2. Analyze: for which `weekday` your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which `labname` and for which `users`.
3. Save the model.

In [None]:
# Refit the random forest here so the error analysis below does not depend on
# which of the earlier model cells was executed last.
rf = RandomForestClassifier(
    class_weight=None,
    criterion='gini',
    max_depth=28,
    n_estimators=50,
    random_state=21,
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [85]:
# Test features with the true and predicted weekday side by side.
df_temp = X_test.assign(y_true=y_test, y_pred=y_pred)

# Error counters, filled by the loop further down in this cell.
weekdays, labnames, users = {}, {}, {}

# Names of the one-hot columns that encode the lab and the user.
labname_columns = [column for column in df_temp.columns if 'labname_' in column]
users_columns = [column for column in df_temp.columns if 'uid_' in column]

def _last_active_column(row, columns):
    """Return the last name in ``columns`` whose value in ``row`` equals 1, or ''."""
    active = ''
    for column in columns:
        if row[column] == 1:
            active = column
    return active


def find_labname_user(row, lab_columns=None, user_columns=None):
    """Find which lab and which user column are switched on in ``row``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (for example one DataFrame row).
    lab_columns : iterable of str, optional
        Candidate lab columns. Defaults to the notebook-level
        ``labname_columns`` list.
    user_columns : iterable of str, optional
        Candidate user columns. Defaults to the notebook-level
        ``users_columns`` list.

    Returns
    -------
    tuple of (str, str)
        ``(lab, user)`` column names. Each element is ``''`` when no candidate
        column equals 1; if several equal 1, the last one listed is returned.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if lab_columns is None:
        lab_columns = labname_columns
    if user_columns is None:
        user_columns = users_columns
    return _last_active_column(row, lab_columns), _last_active_column(row, user_columns)


# Compare predictions with the truth once, then visit only the wrong rows
# (by position, in their original order) and tally them per weekday, lab and user.
is_wrong = (df_temp['y_pred'] != df_temp['y_true']).to_numpy()
for pos in np.flatnonzero(is_wrong):
    weekday = df_temp['y_true'].iloc[pos]
    labname, user = find_labname_user(df_temp.iloc[pos])
    weekdays[weekday] = 1 + weekdays.get(weekday, 0)
    labnames[labname] = 1 + labnames.get(labname, 0)
    users[user] = 1 + users.get(user, 0)


In [86]:
# Number of test samples per true weekday; used as the denominator below.
# NOTE(review): the task text asks for "% of the total number of samples of that
# class in your full dataset", while this uses the test split only - confirm
# which denominator is intended.
weekday_counts = df_temp['y_true'].value_counts()

# Build the percentages in a new dict instead of overwriting `weekdays`, so the
# raw error counts stay intact and re-running this cell gives the same result.
weekday_error_pct = {
    day: n_errors / weekday_counts[day] * 100
    for day, n_errors in weekdays.items()
}

# Highest error percentage first; the first element is the worst weekday.
weekday_error = sorted(weekday_error_pct.items(), key=lambda x: x[1], reverse=True)
weekday_error[0]

(np.int64(0), np.float64(25.925925925925924))

In [87]:
# Rank labs by their number of misclassified test samples, largest first.
labname_error = list(labnames.items())
labname_error.sort(key=lambda pair: pair[1], reverse=True)
labname_error[0]

('labname_project1', 9)

In [88]:
# Rank users by their number of misclassified test samples, largest first.
users_errors = list(users.items())
users_errors.sort(key=lambda pair: pair[1], reverse=True)
users_errors[0]

('uid_user_19', 4)

In [90]:
from joblib import dump

# Persist the fitted random forest next to the notebook, with maximum compression.
dump(value=rf, filename='model.joblib', compress=9)

['model.joblib']

## 6. Function

1. Write a function that takes a list of different models and a corresponding list of parameters (dicts) and returns a dict that contains all the 4 metrics for each model.

In [93]:
def count_metrics(models, parameters):
    """Fit each model with its parameters and collect four test-set metrics.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    models : sequence of estimators
        Estimator instances exposing ``set_params``, ``fit``, ``predict`` and
        ``predict_proba``. They are configured and fitted in place.
    parameters : sequence of dict
        One keyword dict per model, passed to ``set_params``.

    Returns
    -------
    dict
        Maps a model name to a dict with the keys ``'accuracy'``,
        ``'precision'``, ``'recall'`` and ``'roc_auc'``. The name is the
        estimator's class name; when the same class appears more than once,
        later entries get a numeric suffix (``'SVC_2'``, ``'SVC_3'``, ...)
        so that no result is overwritten.

    Raises
    ------
    ValueError
        If ``models`` and ``parameters`` do not have the same length.
    """
    models = list(models)
    parameters = list(parameters)
    # zip() would silently drop the unmatched tail, so check the lengths first.
    if len(models) != len(parameters):
        raise ValueError(
            f'models and parameters must have the same length, '
            f'got {len(models)} and {len(parameters)}'
        )

    results = {}
    for model, model_params in zip(models, parameters):
        model.set_params(**model_params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        y_score = model.predict_proba(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        roc_auc = roc_auc_score(y_test, y_score, multi_class='ovo', average='weighted')

        # Keep the plain class name when it is unique; otherwise add a suffix
        # so that two models of the same class do not overwrite each other.
        base_name = model.__class__.__name__
        model_name = base_name
        suffix = 2
        while model_name in results:
            model_name = f'{base_name}_{suffix}'
            suffix += 1

        results[model_name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'roc_auc': roc_auc
        }

    return results



In [94]:
# Best hyper-parameters per model, in the same order as `models` below.
svc_params = {
    'C': 10,
    'class_weight': None,
    'gamma': 'auto',
    'kernel': 'rbf',
    'probability': True,
    'random_state': 21,
}
tree_params = {
    'class_weight': 'balanced',
    'criterion': 'gini',
    'max_depth': 21,
    'random_state': 21,
}
forest_params = {
    'class_weight': None,
    'criterion': 'gini',
    'max_depth': 28,
    'n_estimators': 50,
    'random_state': 21,
}

models = [SVC(), DecisionTreeClassifier(), RandomForestClassifier()]
params = [svc_params, tree_params, forest_params]

count_metrics(models, params)

{'SVC': {'accuracy': 0.8875739644970414,
  'precision': 0.8926729169690374,
  'recall': 0.8875739644970414,
  'roc_auc': np.float64(0.9787793228216216)},
 'DecisionTreeClassifier': {'accuracy': 0.8905325443786982,
  'precision': 0.8940158937843722,
  'recall': 0.8905325443786982,
  'roc_auc': np.float64(0.9379290651156622)},
 'RandomForestClassifier': {'accuracy': 0.9289940828402367,
  'precision': 0.9300865038851309,
  'recall': 0.9289940828402367,
  'roc_auc': np.float64(0.9903274757720744)}}