In [None]:
%%javascript
// Replace the output area's _should_scroll hook with a function that always
// returns false, whatever number of lines it is given.
// NOTE(review): presumably this stops the classic Notebook front-end from
// collapsing long outputs into a scrollable box — confirm against the front-end in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<hr style="border-width:2px;border-color:#75DFC1">
<h1 style = "text-align:center" > UniGe Cosmo Machine Learning 2020/21</h1> 
<h2 style = "text-align:center"> Tutorial 3 </h2> 
<h3 style = "text-align:center"> 16.11.2020 - Michele Mancarella</h3> 
<hr style="border-width:2px;border-color:#75DFC1">


> This tutorial deals with classification problems and compares different algorithms. We will **distinguish galaxies from QSOs in the SDSS catalogue** using photometric data. 

> Summary:
* Reminder of logistic regression
* Decision Trees , Random Forests
* Hyperparameter tuning, grid search, pipelines
* Custom evaluation metrics
* Support Vector Machines
* k Nearest Neighbors



<img src="imgs/MLworkflow.png">

> Goals:
* Learn how to apply the most widely used ML algorithms with sklearn
* More advanced: change evaluation metric, pipelines, hyperparameter tuning


> Packages and resources:
* This tutorial is based on **scikit-learn**. [**scikit-learn**](https://scikit-learn.org/stable/) is an open source, user-friendly machine learning library. It has an extensive documentation as well as tutorials. Check that out ;)


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns#; sns.set()
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# library warnings (e.g. convergence or deprecation messages).
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Default figure size in inches (A4 landscape), reused by the plots below.
a4_dims = (11.7, 8.27)

<hr style="border-width:2px;border-color:#75DFC1">
<h2 style = "text-align:center"> Data </h2> 
<hr style="border-width:2px;border-color:#75DFC1">

Fetch data

In [None]:
from astroML.datasets import fetch_sdss_galaxy_colors
# Load the SDSS colour catalogue through astroML.
data = fetch_sdss_galaxy_colors()
# Keep every 5th row only (presumably to keep the tutorial fast to run).
data = data[::5]

In [None]:
data.shape

Numpy VS Pandas

In [None]:
type(data)

In [None]:
data[:5]

In [None]:
colnames = list(data.dtype.fields.keys())
colnames

In [None]:
df = pd.DataFrame(data , columns=colnames)

In [None]:
df.head()

In [None]:
# Build the four adjacent-band colour indices: u-g, g-r, r-i, i-z.
for blue_band, red_band in zip('ugri', 'griz'):
    df[blue_band + '-' + red_band] = df[blue_band] - df[red_band]

In [None]:
# Binary target: 1 for GALAXY, 0 otherwise (QSO).
# The explicit integer cast matters: recent pandas versions return boolean dummy
# columns, while later cells use the labels as integer indices (np.eye(2)[y]).
df['specClass_label'] = pd.get_dummies(df['specClass'], prefix='specClass')['specClass_GALAXY'].astype(int)

In [None]:
df.head()

In [None]:

# sns.pairplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure() call would only leave an empty figure behind.
# NOTE: despite its name, `ax` holds the object returned by pairplot (a PairGrid).
ax = sns.pairplot(df,
                  vars=['u-g', 'g-r', 'r-i', 'i-z'],
                  hue='specClass');


In [None]:
# Colour-colour diagram (u-g vs i-z), one hue per spectral class.
plt.figure(figsize=a4_dims)
ax = sns.scatterplot(data=df, x='u-g', y='i-z', hue='specClass')

# Zoom on a fixed window of the colour-colour plane.
ax.set(xlim=(-0.5, 2.5), ylim=(-0.5, 1.5));

In [None]:
# Features: the five magnitudes plus the four colours. Target: 1 = galaxy, 0 = QSO.
X_all = df[['u', 'g', 'r', 'i', 'z', 'u-g', 'g-r', 'r-i', 'i-z']]
y_all = df['specClass_label']

### Split into train and test set

In [None]:
from sklearn.model_selection import train_test_split 

In [None]:
# More advanced: use k-fold or multiple train-test split !

# Note: stratify on the target to keep the proportion of output classes when
# splitting. Super important if you have unbalanced classes.
# When doing k-fold cross validation, check Stratified k-fold.
# A fixed random_state makes the split (and every score computed from it)
# reproducible across kernel restarts.
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X_all, y_all, test_size=0.25, stratify=y_all, random_state=321)



<hr style="border-width:2px;border-color:#75DFC1">
<h2 style = "text-align:center"> General setup </h2> 
<hr style="border-width:2px;border-color:#75DFC1">

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, mean_absolute_error
from sklearn.model_selection import cross_val_score


def score_res(clf, test_X, test_y, X, y, myCV=10):
    """Print a hold-out report and a cross-validated accuracy for a classifier.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict``; it is also handed to
        ``cross_val_score`` for the k-fold evaluation.
    test_X, test_y : hold-out features and true labels used for the accuracy,
        the classification report and the confusion matrix.
    X, y : data passed to ``cross_val_score``.
    myCV : ``cv`` argument of ``cross_val_score`` (10 by default).

    Returns
    -------
    The mean of the cross-validation scores.
    """
    predicted = clf.predict(test_X)
    holdout_acc = accuracy_score(test_y, predicted)

    fold_scores = cross_val_score(clf, X, y, cv=myCV)
    cv_mean = fold_scores.mean()

    print('Accuracy : %s \n' % holdout_acc)
    print(classification_report(test_y, predicted))
    print('Confusion_matrix:')
    print(confusion_matrix(test_y, predicted))
    print('\n k-fold cross validation on full dataset with %s folds: ' % myCV)
    print(fold_scores)
    # Spread quoted as twice the standard deviation of the fold scores.
    print("\n Accuracy: %0.2f (+/- %0.2f)\n" % (cv_mean, fold_scores.std() * 2))
    return cv_mean

In [None]:
def plot_decision_regions(X, y, classifier, resolution=0.02, eps=0.1,
                          xlim=(-0.5, 2.5), ylim=(-1, 1.5)):
    """Draw the decision regions of a 2-feature classifier with the samples on top.

    Parameters
    ----------
    X : array-like or DataFrame with (at least) two columns; columns 0 and 1
        are used as the horizontal and vertical axes.
    y : array-like or Series of class labels, aligned with the rows of X.
    classifier : fitted object whose ``predict`` accepts an (n, 2) array.
    resolution : step of the grid on which the classifier is evaluated.
    eps : margin added around the data range when building the grid.
    xlim, ylim : viewing window of the plot; the defaults reproduce the
        window that was previously hard-coded.

    The classifier is evaluated only on the part of the (data range +/- eps)
    grid that falls inside the viewing window, since nothing else is displayed.
    This keeps fine resolutions affordable when the data contain far outliers.
    """
    # Accept DataFrames/Series as well as arrays, independently for X and y.
    X = np.asarray(X)
    y = np.asarray(y)

    def _grid_axis(values, view):
        # Data range with margin, restricted to the visible window when they overlap.
        lo, hi = values.min() - eps, values.max() + eps
        clipped_lo, clipped_hi = max(lo, view[0]), min(hi, view[1])
        if clipped_lo < clipped_hi:
            lo, hi = clipped_lo, clipped_hi
        return np.arange(lo, hi, resolution)

    # Predict the class on every node of the grid.
    xx1, xx2 = np.meshgrid(_grid_axis(X[:, 0], xlim), _grid_axis(X[:, 1], ylim))
    Z = classifier.predict(np.column_stack([xx1.ravel(), xx2.ravel()]))
    Z = Z.reshape(xx1.shape)

    plt.figure(figsize=a4_dims)
    plt.contourf(xx1, xx2, Z, alpha=0.2)

    # One scatter call per class, so each class gets the next colour of the cycle.
    for cl in np.unique(y):
        sns.scatterplot(x=X[y == cl, 0], y=X[y == cl, 1])
    plt.xlim(*xlim);
    plt.ylim(*ylim);
<hr style="border-width:2px;border-color:#75DFC1">
<h2 style = "text-align:center"> Basic ML w. sklearn -  Logistic Regression </h2> 
<hr style="border-width:2px;border-color:#75DFC1">

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
 
 Reminder: basic usage of sklearn
 
 > *  Split into train/test set:
```python 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
```

> * Train your model
```python 
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
```

> * Evaluate
```python 
logreg.score(X_test, y_test)
```
 gives accuracy. For other metrics:
```python 
from sklearn.metrics import precision_score, recall_score, f1_score
y_pred = logreg.predict(X_test)
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
```

Slightly better: evaluate with k-fold cross validation. This is what the function score_res() does (with accuracy as a metric)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default settings, trained on all nine features.
# NOTE(review): the features are not rescaled and all warnings are silenced at the
# top of the notebook, so a ConvergenceWarning would go unnoticed here — consider
# checking convergence (e.g. a larger max_iter) — TODO confirm.
logreg = LogisticRegression()
logreg.fit(X_train_all, y_train_all)

In [None]:
logreg.score(X_test_all, y_test_all)

In [None]:
logreg.score(X_train_all, y_train_all)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
y_pred_all = logreg.predict(X_test_all)
# Precision, recall and F1 on the test set, printed in that order.
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test_all, y_pred_all))

In [None]:
X_test_all.shape

In [None]:
from sklearn.metrics import roc_curve, auc
# Compute ROC curve and ROC area for each class (each class in turn is the positive one)
fpr = dict()
tpr = dict()
thr = dict()
roc_auc = dict()

# Column i of y_score is the predicted probability of class i.
y_score = logreg.predict_proba(X_test_all)

# One-hot encode the true labels: column i is 1 where the true class is i.
# The labels are first turned into a plain integer array, because indexing
# np.eye with a boolean or pandas object would fail or act as a mask.
y_test_onehot = np.eye(2)[np.asarray(y_test_all).astype(int)]

for i in range(2):
    fpr[i], tpr[i], thr[i] = roc_curve(y_test_onehot[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area (all class/sample pairs pooled)
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_onehot.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

In [None]:
# Annotate one threshold every `step` points of the curve (one step per class).
steps = [8, 10]

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,10))
labels_dict = {0: 'QSO', 1: 'Galaxy'}
colors = ['darkorange', 'darkgreen']

lw = 2
for i_lab in range(2):

    step = steps[i_lab]

    ax.plot(fpr[i_lab], tpr[i_lab], color=colors[i_lab],
            lw=lw, label=labels_dict[i_lab]+', area = %0.2f' % (roc_auc[i_lab]));

    # Write the decision threshold next to each sampled operating point.
    # Dedicated loop-variable names avoid overwriting notebook-level `x`/`y`.
    for x_pt, y_pt, thr_val in zip(fpr[i_lab][::step], tpr[i_lab][::step], thr[i_lab][::step]):
        ax.annotate(np.round(thr_val, 2), (x_pt, y_pt), fontsize=12);

    # Mark the sampled operating points once per class (not once per annotation).
    ax.plot(fpr[i_lab][::step], tpr[i_lab][::step], 'o', color=colors[i_lab], lw=lw);

ax.set_xlim([0.0, 1.0]);
ax.set_ylim([0.0, 1.05]);
ax.set_xlabel('False Positive Rate',fontsize=18);
ax.legend(loc="lower right",fontsize=18);
# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--');
ax.set_ylabel('True Positive Rate',fontsize=18);
fig.suptitle('Receiver operating characteristic',fontsize=18) ;

<hr style="border-width:2px;border-color:#75DFC1">
<h2 style = "text-align:center">  Decision Trees & feature selection </h2> 
<hr style="border-width:2px;border-color:#75DFC1">

 http://scikit-learn.org/stable/modules/tree.html
 


#### Blind application

In [None]:
from sklearn import tree

# Unconstrained decision tree trained on all nine features.
DTclf = tree.DecisionTreeClassifier().fit(X_train_all, y_train_all)

# Hold-out report plus 10-fold cross-validation on the full dataset.
acc_dectree = score_res(DTclf, X_test_all, y_test_all, X_all, y_all, myCV=10)



> With a decision tree, we can visualise the hierarchical partition of the data using [**Graphviz**](http://www.graphviz.org).

In [None]:
# sklearn.externals.six was removed from scikit-learn (0.23); the standard
# library provides the same in-memory text buffer.
from io import StringIO
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

In [None]:
# With out_file=None, export_graphviz returns the DOT source as a string, so no
# in-memory text buffer (StringIO) is needed to hand it over to pydotplus.
dot_data = export_graphviz(DTclf, out_file=None,
                           filled=True, rounded=True,
                           special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
# Render the tree as a PNG and display it inline.
Image(graph.create_png())

> <span style="color:#09b038; text-decoration : underline"> Question :</span><br>  
    
* Do you understand? If so, can you explain it to me? 


> Fortunately, Decision Trees are **white boxes**: we can compute and visualise *feature importance* to see what feature have larger impact on the final decision

In [None]:
def plot_model_var_imp(model, X, y):
    """Plot and return the feature importances of a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_`` and ``score``.
    X : DataFrame whose column names label the importances; also passed to ``score``.
    y : true labels passed to ``score``.

    Returns
    -------
    DataFrame with a single 'Importance' column, indexed by feature name and
    sorted in ascending order. The value of ``model.score(X, y)`` is printed.
    """
    imp = (
        pd.DataFrame({'Importance': model.feature_importances_}, index=X.columns)
        .sort_values('Importance', ascending=True)
    )
    # Horizontal bars: the largest importance ends up at the top of the chart.
    imp.plot(kind='barh')
    print (model.score(X, y))
    return imp

myImp=plot_model_var_imp(DTclf,X_test_all,y_test_all)

#### Regularization #1: use less features

In [None]:
# Keep only two colours, so that decision regions can be drawn in a plane.
X, y = df[['u-g', 'i-z']], df['specClass_label']
# Stratified split; the fixed random_state makes the split (and all the scores
# and plots that follow) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=321)

In [None]:
# Unconstrained decision tree, now trained on the two selected colours only.
DTclf = tree.DecisionTreeClassifier().fit(X_train, y_train)

acc_dectree = score_res(DTclf, X_test, y_test, X, y, myCV=10)


In [None]:
# With out_file=None, export_graphviz returns the DOT source as a string, so no
# in-memory text buffer (StringIO) is needed to hand it over to pydotplus.
dot_data = export_graphviz(DTclf, out_file=None,
                           filled=True, rounded=True,
                           special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
# Render the tree as a PNG and display it inline.
Image(graph.create_png())

In [None]:
plot_decision_regions(X_test, y_test, DTclf, resolution=0.01, eps=0.1)

> The above decision regions are the perfect example of **overfitting**. Check : see what the performance on the *training set* is

In [None]:
score_res(DTclf, X_train, y_train, X, y, 10)

Much better than on test set: we are indeed overfitting

In [None]:
plot_decision_regions(X_train, y_train, DTclf, resolution=0.001, eps=0.1)

<img src="imgs/overfitting.png">

#### Regularization #2 - Reduce depth

In [None]:
# Regularised tree: a single split (max_depth=1), and no node with fewer than
# 40 samples may be split.
DTclf = tree.DecisionTreeClassifier(max_depth=1, min_samples_split=40).fit(X_train, y_train)

acc_dectree = score_res(DTclf, X_test, y_test, X, y, myCV=10)



In [None]:
# With out_file=None, export_graphviz returns the DOT source as a string, so no
# in-memory text buffer (StringIO) is needed to hand it over to pydotplus.
dot_data = export_graphviz(DTclf, out_file=None,
                           filled=True, rounded=True,
                           special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
# Render the tree as a PNG and display it inline.
Image(graph.create_png())

In [None]:
plot_decision_regions(X_test, y_test, DTclf, resolution=0.02, eps=0.1)

In [None]:
score_res(DTclf, X_train, y_train, X, y, 10)

<hr style="border-width:2px;border-color:#75DFC1">
<h2 style = "text-align:center"> Advanced grid search </h2> 
<hr style="border-width:2px;border-color:#75DFC1">

> Make the above intuitions automatic, scalable, statistically more robust.
* automatic, scalable: with pipelines
* more robust: optimize hyperparameters w. cross validation

> Cross-validation in sklearn:

* Hyperparameters are specified through a dictionary

```python

params = {
   'C': np.logspace(-5, 5, 50) # 50 values equally spaced in Log between 10**-5 and 10**5
    
}
```
* Initialize classifier
```python
logreg = LogisticRegression(solver='lbfgs', max_iter=500)
```
* Use GridSearchCV
```python
grid = GridSearchCV(logreg, params, verbose=0, cv=3,\
                    scoring=make_scorer(f1_score))
# note: refit=True by default
grid.fit(X_train, y_train).score(X_test, y_test)
```


> Pipeline in sklearn. Pipelines are useful to optimize over multiple hyperparameters
* Build the pipeline: list all the transformations that we want to apply to our data. Each step is a tuple of 2 objects: (name, actual object performing the operation)

```python

steps = [
    ('scaler', MinMaxScaler()),
    ('pca', PCA()),
    ('rf', RandomForestClassifier())
]
pipeline = Pipeline(steps)
```

* When passing hyperparameters values, we should now specify which ones to use at each step
```python
params = {
    'pca__n_components': np.arange(1, 51, 5),
    'rf__n_estimators': np.arange(5, 55, 5)       
}
grid = GridSearchCV(pipeline, params, verbose=1)
```

* Fit

```python

grid.fit(X_train, y_train).score(X_test, y_test)

# Note: .fit method fits ALL the methods in steps !
```

* best params:
``` python
grid.best_params_
```

* Flexible! You can also optimize on methods , e.g.:
```python
params_1 = {
  'rf': [LogisticRegression(solver='lbfgs', max_iter=500)],
  'rf__C': np.logspace(-5, 5, 10)
}
params_2 = {
  'rf': [RandomForestClassifier()],
  'rf__n_estimators': np.arange(5,55,5) 
}
grid = GridSearchCV(pipeline, [params_1, params_2], verbose=1)
grid.fit(X_train, y_train)
grid.best_score_
```

In [None]:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV



def my_pipeline(my_clf, x_train, x_test , y_train, y_test, scorer, parameters):
    """Grid-search a classifier wrapped in a one-step pipeline and report the result.

    Parameters
    ----------
    my_clf : estimator, installed as the pipeline step named 'clf'.
    x_train, y_train : data on which the grid search is fitted.
    x_test, y_test : hold-out data used for the final classification report.
    scorer : value for the ``scoring`` argument of GridSearchCV
        (e.g. the result of ``make_scorer``).
    parameters : parameter grid for GridSearchCV, with keys prefixed by
        'clf__'. Either a dict or a list of dicts.

    Returns
    -------
    The fitted GridSearchCV object.
    """
    pipeline = Pipeline([('clf', my_clf)])

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring=scorer)
    grid_search.fit(x_train, y_train)
    print ('Best score: %0.3f' % grid_search.best_score_)
    print ('Best parameters set:')
    # best_params_ holds exactly the tuned parameters of the winning candidate,
    # so this also works when `parameters` is a list of grids (which has no .keys()).
    for param_name, value in sorted(grid_search.best_params_.items()):
        print ('\t%s: %r' % (param_name, value))

    # predict() goes through the best estimator found by the search.
    predictions = grid_search.predict(x_test)
    print (classification_report(y_test, predictions))

    return grid_search

### Random forest + different metrics

<img src="imgs/metrics.png">

Precision : how many predicted galaxies/QSO are actually galaxies? 

Recall: how many true galaxies/QSO are correctly identified? 

f1: harmonic mean

In [None]:
from sklearn import ensemble

# Hyperparameter grid of the random forest ('clf' is the pipeline step name).
parameters_rf = dict(
    clf__criterion=('gini', 'entropy'),
    clf__max_depth=(1, 2, 3, 5),
    clf__min_samples_leaf=(5, 10, 20),
    clf__min_samples_split=(5, 10, 20),
)

# Grid search with F1 as the selection metric.
class_RF = my_pipeline(
    ensemble.RandomForestClassifier(n_jobs=-1, random_state=321),
    X_train, X_test, y_train, y_test,
    make_scorer(f1_score), parameters_rf)

In [None]:
plot_decision_regions(X_train.to_numpy(), y_train.to_numpy(), class_RF, resolution=0.02, eps=0.1)

#### Change metric - recall

In [None]:
class_RF_2 = my_pipeline(ensemble.RandomForestClassifier(n_jobs=-1, random_state=321),
                       X_train, X_test, y_train, y_test,
                       make_scorer(recall_score), parameters_rf )

In [None]:
plot_decision_regions(X_train, y_train, class_RF_2, resolution=0.02, eps=0.1)
# reminder: goal is to correctly classify as many galaxies as possible (orange)
# price is to lower significantly the recall on the other class, i.e. the true QSOs that are correcly classified

#### Change metric - precision

In [None]:
class_RF_3 = my_pipeline(ensemble.RandomForestClassifier(n_jobs=-1, random_state=321),
                       X_train, X_test, y_train, y_test,
                       make_scorer(precision_score), parameters_rf )

In [None]:
plot_decision_regions(X_train, y_train, class_RF_3, resolution=0.02, eps=0.1)
# precision= ability of the classifier not to label as positive a sample that is negative
# Here I want not to label a quasar as galaxy - push to the right where possible

#### Custom metric

Define a function that specifies your metric and returns the score. Then pass it to the pipeline using make_scorer

In [None]:
def custom_metric(y_true, y_pred):
    ''' Geometric mean of recall and precision: sqrt(recall * precision) '''
    return np.sqrt(recall_score(y_true, y_pred) * precision_score(y_true, y_pred))


# Actually useful for multi class problems: precision/recall on specific subclass
def custom_recall(y_true, y_pred):
    ''' Recall with label 0 taken as the positive class '''
    return recall_score(y_true, y_pred, labels=[0], pos_label=0, average='binary')

my_scorer = make_scorer(custom_recall)



In [None]:
class_RF_1 = my_pipeline(ensemble.RandomForestClassifier(n_jobs=-1, random_state=321),
                       X_train, X_test, y_train, y_test,
                       my_scorer, parameters_rf )

In [None]:
plot_decision_regions(X_train, y_train, class_RF_1, resolution=0.02, eps=0.1)
# reminder: now, the goal is to correctly classify as many QSO as possible (blue) - pushes boundary to the right

### SVM

#### Linear boundaries - example: poor evaluation metric!

In [None]:
## 2. SVM
# http://scikit-learn.org/stable/modules/svm.html#svm-classification

from sklearn import svm

# Grid for SVMs with polynomial and linear kernels ('clf' is the pipeline step name).
parameters_SVM_1 = dict(
    clf__kernel=['poly', 'linear'],
    clf__C=[1, 0.1, 0.01],
    clf__gamma=[0.01, 0.1, 1],
)

SVMbest_1=my_pipeline(svm.SVC( class_weight='balanced'),
                       X_train, X_test, y_train, y_test,
                       make_scorer(precision_score) , parameters_SVM_1 )


In [None]:
plot_decision_regions(X_test.to_numpy(), y_test.to_numpy(), SVMbest_1, resolution=0.02, eps=0.1)

> According to our metric, this is almost the **perfect** classifier!! (looking for max fraction of quasars correctly identified as quasars)
let's use a more balanced metric


In [None]:


SVMbest_2=my_pipeline(svm.SVC( class_weight='balanced'),
                       X_train, X_test, y_train, y_test,
                       make_scorer(f1_score), parameters_SVM_1 )


In [None]:
plot_decision_regions(X_test.to_numpy(), y_test.to_numpy(), SVMbest_2, resolution=0.02, eps=0.1)

#### Non Linear boundaries

To do better I might need to draw more complex boundaries

In [None]:


# Grid for SVMs with an RBF kernel ('clf' is the pipeline step name).
parameters_SVM = dict(
    clf__kernel=['rbf'],
    clf__C=[0.01, 1, 0.5, 0.1],
    clf__gamma=[0.01, 0.1, 1, 5],
)

In [None]:
SVMbest_3=my_pipeline(svm.SVC( class_weight='balanced'),
                       X_train, X_test, y_train, y_test,
                       make_scorer(recall_score), parameters_SVM )

#acc_svc=score_res(SVMbest, X_test, y_test,X, y 10)

In [None]:
plot_decision_regions(X_test.to_numpy(), y_test.to_numpy(), SVMbest_3, resolution=0.02, eps=0.1)

In [None]:
SVMbest_4=my_pipeline(svm.SVC( class_weight='balanced'),
                       X_train, X_test, y_train, y_test,
                       make_scorer(precision_score), parameters_SVM )

#acc_svc=score_res(SVMbest, X_test, y_test,X, y 10)

In [None]:
plot_decision_regions(X_test.to_numpy(), y_test.to_numpy(), SVMbest_4, resolution=0.02, eps=0.1)

#### Play with metrics

In [None]:
def custom_recall_1(y_true, y_pred):
    ''' Recall with label 0 taken as the positive class '''
    return recall_score(y_true, y_pred, pos_label=0, average='binary')

my_scorer_1 = make_scorer(custom_recall_1)

# recall on label 0 (QSO): fraction of true quasars correctly classified as quasars : (quasars correctly classified)/(quasars correctly classified + quasars misclassified as galaxies)

In [None]:
SVMbest_5=my_pipeline(svm.SVC( class_weight='balanced'),
                       X_train, X_test, y_train, y_test,
                       my_scorer_1, parameters_SVM )

In [None]:
plot_decision_regions(X_train.to_numpy(), y_train.to_numpy(), SVMbest_5, resolution=0.02, eps=0.1)

In [None]:
def custom_precision(y_true, y_pred):
    ''' Precision with label 0 taken as the positive class '''
    return precision_score(y_true, y_pred, pos_label=0, average='binary')

my_scorer_2 = make_scorer(custom_precision)

# Maximise fraction of (correctly classified quasars)/(all objects classified as quasars), i.e. precision on label 0 (QSO)

In [None]:
SVMbest_6 = my_pipeline(svm.SVC(),
                       X_train, X_test, y_train, y_test,
                       my_scorer_2, parameters_SVM )


In [None]:
plot_decision_regions(X_train.to_numpy(), y_train.to_numpy(), SVMbest_6, resolution=0.02, eps=0.1)

#### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier


# Grid for k-nearest neighbours: neighbourhood size and distance metric.
parameters_KNN = dict(
    clf__n_neighbors=[1, 5, 10, 20, 100],
    clf__metric=['euclidean', 'manhattan', 'chebyshev'],
)

KNN_best_1=my_pipeline(KNeighborsClassifier(),
                       X_train, X_test, y_train, y_test,
                       my_scorer , parameters_KNN )



In [None]:
plot_decision_regions(X_test.to_numpy(), y_test.to_numpy(), KNN_best_1, resolution=0.02, eps=0.1)

In [None]:
KNN_best_2=my_pipeline(KNeighborsClassifier(),
                       X_train, X_test, y_train, y_test,
                       make_scorer(f1_score) , parameters_KNN )


In [None]:
plot_decision_regions(X_test.to_numpy(), y_test.to_numpy(), KNN_best_2, resolution=0.02, eps=0.1)

In [None]:
KNN_best_3=my_pipeline(KNeighborsClassifier(),
                       X_train, X_test, y_train, y_test,
                       make_scorer(precision_score)  , parameters_KNN )


In [None]:
plot_decision_regions(X_train.to_numpy(), y_train.to_numpy(), KNN_best_3, resolution=0.02, eps=0.1)

<hr style="border-width:2px;border-color:#75DFC1">
<h2 style = "text-align:center"> Summary and take away message </h2> 
<hr style="border-width:2px;border-color:#75DFC1">


> Cross-validation and hyperparameter tuning: do it!

> Metric is crucial and subjective: depends on your goals and what kind of error you want to penalise more

> Different algorithms according to the kind of boundary (e.g. choice of kernels for nonlinear problems)

> sklearn-related: use pipelines + gridsearchCV 