# Setup

In [1]:
%matplotlib inline

import numpy as np, pandas as pd
import matplotlib.pyplot as plt 
from pathlib import Path
import seaborn as sns 
import sklearn
from sklearn import datasets

In [2]:
# Detect whether this notebook is executing inside Google Colaboratory, since
# some of the code further down depends on it. The same check is done in every
# notebook of the course.
colab = 'google.colab' in str(get_ipython())
if colab:
    print('The notebook is running on Colab. colab=True.')
else:
    print('The notebook is not running on Colab. colab=False.')

The notebook is not running on Colab. colab=False.


# Data

In [3]:
# Use a `data` folder under the current working directory for data files.
NB_DIR = Path.cwd()
DATA = NB_DIR.joinpath('data')
DATA.mkdir(exist_ok=True)  # no error if the folder is already there

La oss bruke datasettet fra innlevering 1:

In [4]:
# Read the training data; the relative path means 'train.csv' is looked up in
# the current working directory.
df = pd.read_csv('train.csv')

In [5]:
# Show the first rows as a quick sanity check of the loaded data.
df.head()

Unnamed: 0,id,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,...,f_66,f_67,f_68,f_69,f_70,f_71,f_72,f_73,f_74,target
0,161363,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,8.0,0.0,0.0,0.0,,,0.0,0.0,6
1,78028,16.0,0.0,1.0,1.0,6.0,2.0,2.0,2.0,14.0,...,0.0,41.0,3.0,0.0,0.0,0.0,2.0,1.0,1.0,7
2,35324,0.0,3.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,5
3,67966,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1
4,110279,3.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,,...,0.0,0.0,1.0,5.0,4.0,0.0,0.0,0.0,1.0,5


# Forbered data for maskinlæring

In [6]:
# Features: every column except the identifier and the label. Label: `target`.
X = df.drop(columns=["id", "target"])
y = df["target"]

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Stratify on the label so both splits keep the same class proportions;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

Som vi vet må vi imputere manglende verdier og skalere features:

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [10]:
# Replace missing values with the per-column mean. The imputer is fitted on the
# training split only and then reused unchanged for the test split.
imp = SimpleImputer(strategy='mean')
X_train_imp = imp.fit_transform(X_train)
X_test_imp = imp.transform(X_test)

# Store the results as data frames again, for convenience. Carry over both the
# column names and the row index of the incoming frames: without `index=` the
# rows would be renumbered 0..n-1 and no longer match y_train / y_test by label.
X_train = pd.DataFrame(data=X_train_imp, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(data=X_test_imp, columns=X_test.columns, index=X_test.index)

In [11]:
# Min-max scale the features. The scaler is fitted on the training split only
# and then applied as-is to the test split.
scale = MinMaxScaler()
X_train_sc = scale.fit_transform(X_train)
X_test_sc = scale.transform(X_test)

# Wrap the scaled arrays as data frames again, carrying over the column names
# and the row index of the incoming frames instead of renumbering the rows.
X_train = pd.DataFrame(data=X_train_sc, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(data=X_test_sc, columns=X_test.columns, index=X_test.index)

In [12]:
# Inspect the first rows of the imputed and scaled training features.
X_train.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_65,f_66,f_67,f_68,f_69,f_70,f_71,f_72,f_73,f_74
0,0.032787,0.019608,0.390625,0.014286,0.026316,0.0,0.0,0.0,0.157895,0.013889,...,0.0,0.0,0.022987,0.0,0.0,0.0,0.0,0.0,0.392308,0.019231
1,0.0,0.0,0.0,0.014286,0.0,0.0,0.0,0.0,0.184211,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012253
2,0.163934,0.019608,0.171875,0.0,0.105263,0.171053,0.0,0.0,0.026316,0.027778,...,0.018519,0.0,0.063291,0.0,0.523077,0.0,0.333333,0.016393,0.076923,0.0
3,0.0,0.0,0.0,0.0,0.0,0.013158,0.0,0.0,0.0,0.013889,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.013158,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.016393,0.0,0.0


# Ensembling

Som vi har sett på finnes det mange ulike måter å bygge ensembles fra et sett med modeller. Noen relativt enkle fremgangsmåter, andre mer avanserte. 

Her er en relativt enkel en:

# "Voting" og "model averaging"

Som vi husker fra Random Forests kan det være en god idé å trene flere ulike modeller på samme oppgave, og så kombinere deres prediksjoner ved _avstemming_ (hvis klassifikasjon) eller ved å ta gjennomsnitt (hvis regresjon). 

Denne ideen er mer generell enn random forests: det er ikke nødvendig at hvert medlem i ensemblet er av samme type (beslutningstrær for random forests). En kan trene mange _ulike_ typer modeller, og så kombinere dem i et slikt ensemble. 

Vi kan bruke scikit-learns `VotingClassifier` og `VotingRegressor` til dette:

> NB!: Før man begynner å ensemble modeller vil en typisk allerede ha gjennomført hyperparameteroptimalisering for hvert medlem av ensemblet. Altså, en bør først forsøke å øke ytelsen til hvert medlem før en kombinerer dem i et ensemble (og forsøker å øke ytelsen til ensemblet).

In [13]:
from sklearn.ensemble import VotingClassifier

Her er et eksempel der vi bruker mange _litt_ ulike versjoner av samme modell:

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
n_models = 20
seed = 42

# Every member is the same kind of model; only the random seed differs, taking
# the consecutive values seed, seed + 1, ..., seed + n_models - 1.
models = [
    (f'rf{idx}', RandomForestClassifier(n_jobs=-1, random_state=member_seed))
    for idx, member_seed in enumerate(range(seed, seed + n_models))
]

In [16]:
# Display the list of (name, estimator) pairs.
models

[('rf0', RandomForestClassifier(n_jobs=-1, random_state=42)),
 ('rf1', RandomForestClassifier(n_jobs=-1, random_state=43)),
 ('rf2', RandomForestClassifier(n_jobs=-1, random_state=44)),
 ('rf3', RandomForestClassifier(n_jobs=-1, random_state=45)),
 ('rf4', RandomForestClassifier(n_jobs=-1, random_state=46)),
 ('rf5', RandomForestClassifier(n_jobs=-1, random_state=47)),
 ('rf6', RandomForestClassifier(n_jobs=-1, random_state=48)),
 ('rf7', RandomForestClassifier(n_jobs=-1, random_state=49)),
 ('rf8', RandomForestClassifier(n_jobs=-1, random_state=50)),
 ('rf9', RandomForestClassifier(n_jobs=-1, random_state=51)),
 ('rf10', RandomForestClassifier(n_jobs=-1, random_state=52)),
 ('rf11', RandomForestClassifier(n_jobs=-1, random_state=53)),
 ('rf12', RandomForestClassifier(n_jobs=-1, random_state=54)),
 ('rf13', RandomForestClassifier(n_jobs=-1, random_state=55)),
 ('rf14', RandomForestClassifier(n_jobs=-1, random_state=56)),
 ('rf15', RandomForestClassifier(n_jobs=-1, random_state=57)),
 (

In [17]:
# Combine the named estimators into one voting ensemble. No voting options are
# passed, so the VotingClassifier defaults apply.
ensemble = VotingClassifier(models)

In [18]:
# Display the (not yet fitted) ensemble and its members.
ensemble

VotingClassifier(estimators=[('rf0',
                              RandomForestClassifier(n_jobs=-1,
                                                     random_state=42)),
                             ('rf1',
                              RandomForestClassifier(n_jobs=-1,
                                                     random_state=43)),
                             ('rf2',
                              RandomForestClassifier(n_jobs=-1,
                                                     random_state=44)),
                             ('rf3',
                              RandomForestClassifier(n_jobs=-1,
                                                     random_state=45)),
                             ('rf4',
                              RandomForestClassifier(n_jobs=-1,
                                                     random_state=46)),
                             ('rf5',
                              RandomForestClassifier(n...
                             ('rf14',
  

> Default innstilling for `VotingClassifier` er såkalt uvektet "hard voting". Det betyr at hver modells stemme teller like mye, og at en teller opp output-prediksjonene for å finne hvem som vant majoriteten. Begge disse strategiene kan justeres. For eksempel kan man velge `voting="soft"` for å bruke modellenes _konfidens_ for prediksjonene og ikke bare hver enkelt modells endelige stemme. En modell som er veldig sikker i sin prediksjon vil da gis større vekt enn en modell som er mer usikker. Fritt frem å forsøke denne og andre strategier ved å modifisere på koden!

In [19]:
# Train the ensemble on the training split.
# NOTE(review): the recorded run of this cell ended in a MemoryError --
# presumably the 20 forests did not fit in RAM on that machine. If it happens
# again, try fewer members (n_models) or smaller forests/trees.
ensemble.fit(X_train, y_train)

MemoryError: 

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy of the voting ensemble on the held-out test split.
print(accuracy_score(y_test, ensemble.predict(X_test)))

Bedre enn hver av modellene for seg selv!

In [23]:
# Fit each member on its own and report its test accuracy, for comparison with
# the ensemble.
for name, model in models:
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    print(f'Accuracy til modell {name}: {acc}\n')

Accuracy til modell rf0: 0.35274666666666665

Accuracy til modell rf1: 0.35258666666666666

Accuracy til modell rf2: 0.35256

Accuracy til modell rf3: 0.3517866666666667

Accuracy til modell rf4: 0.35285333333333335

Accuracy til modell rf5: 0.3525066666666667

Accuracy til modell rf6: 0.35234666666666664

Accuracy til modell rf7: 0.35018666666666665

Accuracy til modell rf8: 0.35392

Accuracy til modell rf9: 0.3528266666666667

Accuracy til modell rf10: 0.3512

Accuracy til modell rf11: 0.35328

Accuracy til modell rf12: 0.3521066666666667

Accuracy til modell rf13: 0.3508266666666667

Accuracy til modell rf14: 0.3510666666666667

Accuracy til modell rf15: 0.35064

Accuracy til modell rf16: 0.35042666666666666

Accuracy til modell rf17: 0.3508266666666667

Accuracy til modell rf18: 0.3538133333333333

Accuracy til modell rf19: 0.35061333333333333



Men vi behøver altså ikke bruke små variasjoner av samme modell mange ganger. Vi kan ofte med fordel bruke mange ulike modeller. 

> Siden datasettet i innlevering 1 er så stort vil mange av modellene bruke lang tid på trening. Vi bruker derfor et annet, mindre datasett nedenfor. 

In [19]:
# Load the smaller diabetes data set directly from its URL.
df = pd.read_csv('https://assets.datacamp.com/production/course_1939/datasets/diabetes.csv')

# `diabetes` is the label; every other column is a feature.
X = df.drop(columns='diabetes')
y = df['diabetes']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
scale = MinMaxScaler()
X_train = scale.fit(X_train).transform(X_train)
X_test = scale.transform(X_test)

In [20]:
# Show the first rows of the diabetes data.
df.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [21]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [22]:
# Four different kinds of classifiers to combine. The same seed (42) is passed
# to every estimator that is given a random_state here.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
gb = GradientBoostingClassifier(random_state=42)
svc = SVC(random_state=42)
gnb = GaussianNB()  # constructed with defaults; no seed is passed

In [23]:
# Pair each estimator with a short name, giving a list of (name, estimator) tuples.
models = list(zip(('rf', 'gb', 'svc', 'gnb'), (rf, gb, svc, gnb)))

In [24]:
# Build the voting ensemble and train it in one go; `fit` returns the fitted
# estimator itself, so the name is bound to the trained ensemble.
ensemble = VotingClassifier(estimators=models).fit(X_train, y_train)
ensemble

VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(n_jobs=-1,
                                                     random_state=42)),
                             ('gb',
                              GradientBoostingClassifier(random_state=42)),
                             ('svc', SVC(random_state=42)),
                             ('gnb', GaussianNB())])

In [27]:
# Accuracy of the mixed-model ensemble on the held-out test split.
accuracy_score(y_test, ensemble.predict(X_test))

0.75

Bedre enn hver modell hver for seg:

In [28]:
# Fit each member on its own and report its test accuracy, for comparison with
# the ensemble. The model name is included in the message (as in the random
# forest loop earlier in the notebook) so each number can be tied to its model.
for name, model in models:
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    print(f'Accuracy til modell {name}: {acc}\n')

Accuracy: 0.7395833333333334

Accuracy: 0.7395833333333334

Accuracy: 0.7395833333333334

Accuracy: 0.734375



Du må gjerne utforske de mange andre mulighetene i scikit-learn (se f.eks. https://scikit-learn.org/stable/supervised_learning.html) og utenfor scikit-learn (f.eks. LightGBM og XGBoost).

# Andre, mer avanserte teknikker og bibliotek

Det er flere bibliotek for ensembling som kan resultere i kraftigere modeller enn det scikit-learn kan konstruere. To mye brukte (som du fort kommer borti på Kaggle-konkurranser!) er 

- LightGBM: Et mye brukt bibliotek for "boosting". Ta en titt på dokumentasjonen her: https://lightgbm.readthedocs.io/en/latest/index.html
- XGBoost: Et annet mye brukt bibliotek. Her er noen kilder som forklarer hvordan XGBoost fungerer og hvordan du effektivt kan bruke XGBoost: https://xgboost.readthedocs.io/en/latest, https://www.analyticsvidhya.com/blog/2018/09/an-end-to-end-guide-to-understand-the-math-behind-xgboost, https://campus.datacamp.com/courses/extreme-gradient-boosting-with-xgboost/classification-with-xgboost

En generell teknikk vi nevnte i forelesningen er **stacking**, der flere ulike modeller kombineres ved at man trener en **blender** (istedenfor enkel voting). Hvis du vil teste dette ut, ta en titt på scikit-learns `StackingClassifier` og `StackingRegressor`, eller ML-Ensemble https://github.com/flennerhag/mlens og vecstack https://github.com/vecxoz/vecstack.

<img width=30% src="assets/stack.png">

Hva hvis det var mulig å automatisk velge modeller _dynamisk_ når en predikerer? Altså, for et gitt datapunkt, bruke klassifikatorene som en anslår som mest lovende for akkurat dette datapunktet? Dette kan kalles **dynamic ensemble selection** eller **dynamic classifier selection**, og finnes for eksempel i biblioteket DESlib: https://github.com/scikit-learn-contrib/DESlib, https://deslib.readthedocs.io/en/latest/. 