# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [133]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
import itertools
from tqdm.notebook import tqdm

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [110]:
df = pd.read_csv("../data/day-of-week-not-scaled.csv")
df['dayofweek'] = pd.read_csv("../data/dayofweek.csv")['dayofweek']
df.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4


In [111]:
X = df.drop(columns=['dayofweek'])
y = df['dayofweek']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21, test_size=0.2, stratify=y)

In [112]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, random_state=21, test_size=0.2, stratify=y_train)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [113]:
svc = SVC(C=10, class_weight=None, gamma='auto', kernel='rbf', probability=True, random_state=21).fit(X_train, y_train)
y_pred = svc.predict(X_valid)

accuracy = accuracy_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred, average='weighted')
recall = recall_score(y_valid, y_pred, average='weighted')
print(f''' 
accuracy is {accuracy}
precision is {precision}
recall is {recall}
''')

 
accuracy is 0.8777777777777778
precision is 0.8816152211617203
recall is 0.8777777777777778



In [114]:
tree = DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=21, random_state=21).fit(X_train, y_train)
y_pred = tree.predict(X_valid)

accuracy = accuracy_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred, average='weighted')
recall = recall_score(y_valid, y_pred, average='weighted')
print(f''' 
accuracy is {accuracy}
precision is {precision}
recall is {recall}
''')

 
accuracy is 0.8666666666666667
precision is 0.8716971333998339
recall is 0.8666666666666667



In [115]:
rf = RandomForestClassifier(class_weight=None, criterion='gini', max_depth=28, n_estimators=50, random_state=21).fit(X_train, y_train)
y_pred = rf.predict(X_valid)

accuracy = accuracy_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred, average='weighted')
recall = recall_score(y_valid, y_pred, average='weighted')
print(f''' 
accuracy is {accuracy}
precision is {precision}
recall is {recall}
''')

 
accuracy is 0.8925925925925926
precision is 0.8936100873030975
recall is 0.8925925925925926



## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [116]:
voting = VotingClassifier([('svc', svc), ('tree', tree), ('rf', rf)]).fit(X_train, y_train)
y_pred = voting.predict(X_valid)

accuracy = accuracy_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred, average='weighted')
recall = recall_score(y_valid, y_pred, average='weighted')
print(f''' 
accuracy is {accuracy}
precision is {precision}
recall is {recall}
''')

 
accuracy is 0.8925925925925926
precision is 0.8923580160769755
recall is 0.8925925925925926



In [121]:
weights = [list(x) for x in itertools.product(range(1, 5), range(1, 5), range(1, 5))]
param_grid = {
    'voting' : ['hard', 'soft'],
    'weights' : weights
}
params = list(itertools.product(
    param_grid['voting'],
    param_grid['weights']
))

results = []

for voting, weights in tqdm(params):
    model = VotingClassifier([('svc', svc), ('tree', tree), ('rf', rf)], voting=voting, weights=weights).fit(X_train, y_train)
    accuracy = accuracy_score(y_valid, model.predict(X_valid))
    precision = precision_score(y_valid, model.predict(X_valid), average='weighted')
    results.append({
        'voting' : voting,
        'weights' : weights,
        'accuracy' : accuracy,
        'precision' : precision
    })

  0%|          | 0/128 [00:00<?, ?it/s]

In [122]:
pd.DataFrame(results).sort_values(by=['accuracy', 'precision'], ascending=False).head(10)

Unnamed: 0,voting,weights,accuracy,precision
112,soft,"[4, 1, 1]",0.907407,0.910258
114,soft,"[4, 1, 3]",0.907407,0.909871
113,soft,"[4, 1, 2]",0.907407,0.909683
25,hard,"[2, 3, 2]",0.907407,0.908327
30,hard,"[2, 4, 3]",0.907407,0.908327
45,hard,"[3, 4, 2]",0.907407,0.908327
46,hard,"[3, 4, 3]",0.907407,0.908327
96,soft,"[3, 1, 1]",0.903704,0.905991
80,soft,"[2, 1, 1]",0.903704,0.905142
117,soft,"[4, 2, 2]",0.903704,0.905142


In [123]:
voting = VotingClassifier([('svc', svc), ('tree', tree), ('rf', rf)], voting='soft', weights=[4, 1, 1]).fit(X_train, y_train)
y_pred = voting.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
print(f''' 
accuracy is {accuracy}
precision is {precision}
recall is {recall}
''')

 
accuracy is 0.8994082840236687
precision is 0.9028443153805513
recall is 0.8994082840236687



## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [124]:
bg = BaggingClassifier(svc)
results = []

for n_estimators in tqdm(range(10, 101, 10)):
    model = BaggingClassifier(svc, n_estimators=n_estimators).fit(X_train, y_train)
    accuracy = accuracy_score(y_valid, model.predict(X_valid))
    precision = precision_score(y_valid, model.predict(X_valid), average='weighted')
    results.append({
        'n_estimators' : n_estimators,
        'accuracy' : accuracy,
        'precision' : precision
    })

  0%|          | 0/10 [00:00<?, ?it/s]

In [126]:
pd.DataFrame(results).sort_values(by=['accuracy', 'precision'], ascending=False).head(5)

Unnamed: 0,n_estimators,accuracy,precision
7,80,0.892593,0.899893
1,20,0.888889,0.897925
6,70,0.888889,0.89728
5,60,0.888889,0.897068
9,100,0.885185,0.89446


In [127]:
bg = BaggingClassifier(svc, n_estimators=80).fit(X_train, y_train)
y_pred = bg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
print(f''' 
accuracy is {accuracy}
precision is {precision}
recall is {recall}
''')

 
accuracy is 0.8846153846153846
precision is 0.8900420636290868
recall is 0.8846153846153846



## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [129]:
stacking = StackingClassifier([('svc', svc), ('tree', tree), ('rf', rf)]).fit(X_train, y_train)
y_pred = stacking.predict(X_valid)

accuracy = accuracy_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred, average='weighted')
recall = recall_score(y_valid, y_pred, average='weighted')
print(f''' 
accuracy is {accuracy}
precision is {precision}
recall is {recall}
''')

 
accuracy is 0.9037037037037037
precision is 0.9043709100721851
recall is 0.9037037037037037



In [136]:
param_grid = {
    'n_splits' : range(2, 8),
    'passthrough' : [True, False]
}

params = list(itertools.product(
    param_grid['n_splits'],
    param_grid['passthrough']
))

results = []

for n_splits, passthrough in tqdm(params):
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=21)
    model = StackingClassifier([('svc', svc), ('tree', tree), ('rf', rf)], 
                               final_estimator=LogisticRegression(solver='lbfgs', max_iter=1000),
                               passthrough=passthrough, cv=cv).fit(X_train, y_train)
    y_pred = model.predict(X_valid)

    accuracy = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred, average='weighted')
    
    results.append({
        'n_splits' : n_splits,
        'passthrough' : passthrough,
        'accuracy' : accuracy,
        'precision' : precision
    })

  0%|          | 0/12 [00:00<?, ?it/s]

In [137]:
pd.DataFrame(results).sort_values(by=['accuracy', 'precision'], ascending=False).head(5)

Unnamed: 0,n_splits,passthrough,accuracy,precision
7,5,False,0.922222,0.924848
6,5,True,0.911111,0.913366
8,6,True,0.911111,0.913214
4,4,True,0.907407,0.910764
10,7,True,0.907407,0.910305


In [138]:
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=21)
stacking = StackingClassifier([('svc', svc), ('tree', tree), ('rf', rf)], 
                               final_estimator=LogisticRegression(solver='lbfgs', max_iter=1000),
                               passthrough=False, cv=cv).fit(X_train, y_train)

y_pred = stacking.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
print(f''' 
accuracy is {accuracy}
precision is {precision}
recall is {recall}
''')

 
accuracy is 0.9112426035502958
precision is 0.9143638893231798
recall is 0.9112426035502958



## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [None]:
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=21)
stacking = StackingClassifier([('svc', svc), ('tree', tree), ('rf', rf)], 
                               final_estimator=LogisticRegression(solver='lbfgs', max_iter=1000),
                               passthrough=False, cv=cv).fit(X_train, y_train)
y_pred = stacking.predict(X_test)

In [139]:
df_temp = X_test.copy()
df_temp['y_true'] = y_test
df_temp['y_pred'] = y_pred

weekdays = dict()
labnames = dict()
users = dict()

labname_columns = [name for name in df_temp.columns if 'labname_' in name]
users_columns = [name for name in df_temp.columns if 'uid_' in name]

def find_labname_user(row):
    lab, usr = '', ''
    for labname in labname_columns:
        if row[labname] == 1:
            lab = labname
    for username in users_columns:
        if row[username] == 1:
            usr = username
    return lab, usr


for i in range(len(df_temp)):
    if df_temp['y_pred'].iloc[i] != df_temp['y_true'].iloc[i]:
        weekday = df_temp['y_true'].iloc[i]
        labname, user = find_labname_user(df_temp.iloc[i])
        weekdays[weekday] = weekdays.get(weekday, 0) + 1
        labnames[labname] = labnames.get(labname, 0) + 1
        users[user] = users.get(user, 0) + 1


In [140]:
weekday_counts = df_temp['y_true'].value_counts()
for weekday in weekdays.keys():
    weekdays[weekday] = weekdays[weekday]/weekday_counts[weekday]*100
weekday_error = sorted(weekdays.items(), key=lambda x : x[1], reverse=True)
weekday_error[0]

(np.int64(0), np.float64(25.925925925925924))

In [144]:
labname_error = sorted(labnames.items(), key=lambda x : x[1], reverse=True)
labname_error[0]

('labname_project1', 10)

In [145]:
users_errors = sorted(users.items(), key=lambda x : x[1], reverse=True)
users_errors[0]

('uid_user_2', 4)

In [147]:
from joblib import dump
dump(stacking, 'model.joblib', compress=9)

['model.joblib']