In [None]:
%load_ext autoreload
%autoreload 2

import os
import sys

while any(marker in os.getcwd() for marker in ('exercises', 'notebooks', 'students', 'research')):
    os.chdir("..")
sys.path.append('src')
os.getcwd()

In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import (
    FunctionTransformer,
)
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split


from src.custom_transformers import (
    DropColumnTransformer,
    CustomImputer,
    CustomStandardScaler,
    CustomLabelEncoder,
    CustomOneHotEncoder,
)

In [None]:
# Load Titanic dataset from seaborn
raw_data = sns.load_dataset("titanic")
# Preview the first rows of the raw frame as the cell output.
raw_data.head()

In [None]:
def _drop_duplicate_rows(X):
    """Return ``X`` without duplicated rows (delegates to ``X.drop_duplicates()``)."""
    return X.drop_duplicates()


# Cleaning steps, applied in the order listed.
cleaning_steps = [
    # Remove the "deck" column.
    DropColumnTransformer(columns=["deck"]),
    # Fill missing "age" with the mean and missing "embarked" with the mode.
    CustomImputer(strategy="mean", columns=["age"]),
    CustomImputer(strategy="most_frequent", columns=["embarked"]),
    # Row-level de-duplication; done here, before features and target are split.
    FunctionTransformer(_drop_duplicate_rows, validate=False),
    # add your own outlier remover transformer step here
]
data_cleaning = make_pipeline(*cleaning_steps)

df_cleaned = data_cleaning.fit_transform(raw_data)
df_cleaned.head()

In [None]:
# "alive" is the target; "survived" is dropped from the features together with
# it (presumably because it encodes the same outcome — confirm against the data).
features = df_cleaned.drop(columns=["alive", "survived"])
target = df_cleaned["alive"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [None]:
# Columns handled by each preprocessing step.
label_encoded_columns = ["embarked", "embark_town"]
one_hot_columns = ["sex", "who", "adult_male", "class"]
scaled_columns = ["fare", "age"]

# Feature preprocessing shared by the models below: encode, then scale.
preprocessing_pipeline = make_pipeline(
    CustomLabelEncoder(columns=label_encoded_columns),
    CustomOneHotEncoder(columns=one_hot_columns),
    CustomStandardScaler(columns=scaled_columns),
    ## add your own preprocessing steps here
)

# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# `multi_class` is deprecated since scikit-learn 1.5 and emits a FutureWarning;
# the target here ("alive") has two classes, so the default binary logistic
# regression is the right model and the argument is simply left out.
clf = LogisticRegression(random_state=0, solver="newton-cg")

# Chain the shared preprocessing with the classifier so both are fitted on the
# training split only.
pipeline = make_pipeline(preprocessing_pipeline, clf)

pipeline.fit(X_train, y_train)

In [None]:
# Predict on the held-out test split with the currently fitted `pipeline`
# and show the fraction of correct predictions as the cell output.
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Text summary of per-class metrics for the predictions computed above.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix wrapped in a DataFrame for display
# (scikit-learn convention: rows = true classes, columns = predicted classes).
pd.DataFrame(confusion_matrix(y_test, y_pred))

## Exercise 1

Create a custom function that shows the confusion matrix in a nice way. You may, for instance, try using the seaborn package.

In [None]:
# Placeholder for the Exercise 1 solution — replace `pass` with your code.
pass

## Use cross validation

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate on the full cleaned data set (not only the training split).
X = df_cleaned.drop(columns=["alive", "survived"])
y = df_cleaned["alive"]
subset_count = 5  # number of folds

# cross_val_score fits a fresh clone of `pipeline` on every fold, so the fit
# performed in an earlier cell does not carry over into these scores.
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    scoring="accuracy",
    cv=subset_count,
)

# One accuracy value per fold.
pd.DataFrame(scores)

# SVM

In [None]:
import warnings

# NOTE(review): this filter is global — it hides every warning for the rest of
# the kernel session, not only those raised in this cell. Consider narrowing it
# to the specific warning category that is actually noisy.
warnings.filterwarnings("ignore")
from sklearn.svm import SVC

# Same three preprocessing steps as `preprocessing_pipeline`, followed by an
# RBF-kernel support vector classifier. `pipeline` is rebound here, so the
# evaluation cells below now refer to the SVM model.
pipeline = make_pipeline(
    CustomLabelEncoder(columns=["embarked", "embark_town"]),
    CustomOneHotEncoder(columns=["sex", "who", "adult_male", "class"]),
    CustomStandardScaler(columns=["fare", "age"]),
    ## add your own preprocessing steps here
    SVC(kernel="rbf"),
)

pipeline.fit(X_train, y_train)

In [None]:
# Test-split predictions of the SVM pipeline and their accuracy.
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Per-class metrics for the SVM predictions.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the SVM predictions, shown as a DataFrame.
pd.DataFrame(confusion_matrix(y_test, y_pred))

## Exercise 2

Perform cross-validation for the SVM pipeline.

In [None]:
# Placeholder for the Exercise 2 solution — replace `pass` with your code.
pass

## Exercise 3

Try using different kernels with SVM, perhaps polynomial.

In [None]:
# Placeholder for the Exercise 3 solution — replace `pass` with your code.
pass

# kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

k = 5  # count of neighbors
knn = KNeighborsClassifier(n_neighbors=k)

# Same preprocessing as before, with k-nearest-neighbours as the final estimator.
# `pipeline` is rebound, so the evaluation cells below refer to the kNN model.
pipeline = make_pipeline(
    CustomLabelEncoder(columns=["embarked", "embark_town"]),
    CustomOneHotEncoder(columns=["sex", "who", "adult_male", "class"]),
    CustomStandardScaler(columns=["fare", "age"]),
    ## add your own preprocessing steps here
    knn,
)

pipeline.fit(X_train, y_train)

In [None]:
# Test-split predictions of the kNN pipeline and their accuracy.
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Per-class metrics for the kNN predictions.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the kNN predictions, shown as a DataFrame.
pd.DataFrame(confusion_matrix(y_test, y_pred))

# Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Same preprocessing as before, with a decision tree as the final estimator.
# `pipeline` is rebound, so the evaluation cells below refer to the tree model.
pipeline = make_pipeline(
    CustomLabelEncoder(columns=["embarked", "embark_town"]),
    CustomOneHotEncoder(columns=["sex", "who", "adult_male", "class"]),
    CustomStandardScaler(columns=["fare", "age"]),
    ## add your own preprocessing steps here
    # Tree construction is randomised (features are permuted at each split),
    # so a fixed seed is needed for the scores below to be reproducible on a
    # re-run, as with the other seeded models in this notebook.
    DecisionTreeClassifier(random_state=42),
)
pipeline.fit(X_train, y_train)

In [None]:
# Test-split predictions of the decision-tree pipeline and their accuracy.
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Per-class metrics for the decision-tree predictions.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the decision-tree predictions, shown as a DataFrame.
pd.DataFrame(confusion_matrix(y_test, y_pred))

# Perceptron Network

In [None]:
from sklearn.neural_network import MLPClassifier

# Two hidden layers (100 and 50 units), ReLU activations, Adam optimiser;
# seeded so that weight initialisation is repeatable.
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation="relu",
    solver="adam",
    max_iter=1000,
    random_state=42,
)

# Same preprocessing as before, with the MLP as the final estimator.
# `pipeline` is rebound, so the evaluation cells below refer to the MLP model.
pipeline = make_pipeline(
    CustomLabelEncoder(columns=["embarked", "embark_town"]),
    CustomOneHotEncoder(columns=["sex", "who", "adult_male", "class"]),
    CustomStandardScaler(columns=["fare", "age"]),
    ## add your own preprocessing steps here
    mlp,
)

pipeline.fit(X_train, y_train)

In [None]:
# Test-split predictions of the MLP pipeline and their accuracy.
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Per-class metrics for the MLP predictions.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the MLP predictions, shown as a DataFrame.
pd.DataFrame(confusion_matrix(y_test, y_pred))

# Compare with dummy classifier

In [None]:
# Share of each class in the test split. The largest value is the accuracy a
# classifier would reach by always predicting the most frequent test class,
# which is the baseline the models above should beat.
y_test.value_counts() / len(y_test)

## Exercise 4

Neural nets are sensitive to their hyperparameters. Try changing the activation function and the number of neurons in the hidden layers to see if you can increase performance.

# Learning curve

We can try to plot the performance of each model during its training and evaluation

In [None]:
# Same MLP configuration as above, but with a much larger iteration budget.
pipeline = make_pipeline(
    CustomLabelEncoder(columns=["embarked", "embark_town"]),
    CustomOneHotEncoder(columns=["sex", "who", "adult_male", "class"]),
    CustomStandardScaler(columns=["fare", "age"]),
    ## add your own preprocessing steps here
    MLPClassifier(
        hidden_layer_sizes=(100, 50),
        activation="relu",
        solver="adam",
        max_iter=10000,
        random_state=42,
    ),
)
pipeline.fit(X_train, y_train)

# The last pipeline step is the fitted network; it records one loss value per
# training iteration in `loss_curve_`.
fitted_mlp = pipeline[-1]

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fitted_mlp.loss_curve_)
ax.set(xlabel="Number of iterations", ylabel="Loss")
plt.show()

In [None]:
from sklearn.model_selection import learning_curve

# Refit the current `pipeline` on growing portions of the training folds
# (50 sizes between 10% and 99%) under 5-fold cross-validation. This trains
# many models, so it can take a while; n_jobs=-1 uses all available cores.
train_sizes, train_scores, test_scores = learning_curve(
    pipeline, X, y, cv=5, n_jobs=-1, train_sizes=np.linspace(0.1, 0.99, 50)
)

# Aggregate across folds: one mean / standard deviation per training size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

fig, ax = plt.subplots()

# Mean score with a +/- one standard deviation band for each curve.
ax.plot(train_sizes, train_mean, label="(Train)", marker="o")
ax.fill_between(
    train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.15
)
ax.plot(train_sizes, test_mean, label="(Test)", marker="o")
ax.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.15)

# Without legend() the labels passed above are never displayed; axis labels and
# a title let the figure stand on its own.
ax.set_xlabel("Number of training samples")
ax.set_ylabel("Accuracy")
ax.set_title("Learning curve")
ax.legend()
plt.show()

# Exercise 5

Try
```python
from sklearn.datasets import load_iris
```

Then try to create the best-performing classifier as measured by 10-fold cross-validation.

# Pass-Fail Exercise 

Complete the exercises presented in this notebook. Then copy this notebook to your student directory and create a merge request with it. Please do not commit this file.