In [None]:
%load_ext autoreload
%autoreload 2

import os
import sys

while any(marker in os.getcwd() for marker in ('exercises', 'notebooks', 'students', 'research')):
    os.chdir("..")
sys.path.append('src')
os.getcwd()

In [72]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import (
    FunctionTransformer,
)
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split


from src.custom_transformers import (
    DropColumnTransformer,
    CustomImputer,
    CustomStandardScaler,
    CustomLabelEncoder,
    CustomOneHotEncoder,
)

In [None]:
# Load Titanic dataset from seaborn
raw_data = sns.load_dataset("titanic")
raw_data.head()

In [74]:
from src.outlier_remover_wojciech_jurewicz import OutlierRemoveTransformer

In [None]:
# Cleaning steps, applied in order: drop "deck", fill "age" with the mean and
# "embarked" with the most frequent value, drop duplicate rows, then filter
# outliers (threshold=3 — presumably a z-score cutoff; confirm in the
# transformer's source).
# NOTE(review): the pipeline is fitted on the full dataset before the
# train/test split, so the imputation statistics also see the test rows.
data_cleaning = make_pipeline(
    DropColumnTransformer(columns=["deck"]),
    CustomImputer(columns=["age"], strategy="mean"),
    CustomImputer(columns=["embarked"], strategy="most_frequent"),
    FunctionTransformer(func=pd.DataFrame.drop_duplicates, validate=False),
    OutlierRemoveTransformer(threshold=3),
)

df_cleaned = data_cleaning.fit_transform(X=raw_data)
df_cleaned.head(n=5)

In [76]:
# Hold out 20% of the cleaned rows for testing. "alive" is the target;
# "survived" is removed from the features as well (presumably the numeric
# twin of "alive" — keeping it would leak the label).
# Stratifying on the target keeps the class ratio the same in both splits, so
# the class-share baseline computed later from y_test is representative.
X_train, X_test, y_train, y_test = train_test_split(
    df_cleaned.drop(columns=["alive", "survived"]),
    df_cleaned["alive"],
    test_size=0.2,
    random_state=42,
    stratify=df_cleaned["alive"],
)

In [77]:
# Feature preprocessing shared by the models below. The steps run in the
# listed order: label-encode the embarkation columns, one-hot encode the
# categorical/boolean columns, standardise the numeric columns, and finally
# label-encode "alone".
preprocessing_pipeline = make_pipeline(
    CustomLabelEncoder(columns=["embarked", "embark_town"]),
    CustomOneHotEncoder(columns=["sex", "who", "adult_male", "class"]),
    CustomStandardScaler(columns=["fare", "age"]),
    CustomLabelEncoder(columns=["alone"])
)

# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# `multi_class` is deprecated in recent scikit-learn releases and the target
# here has only two classes, so the argument is omitted and the library
# default is used.
clf = LogisticRegression(random_state=0, solver="newton-cg")

# Chain preprocessing and classifier so both are fitted on the training split.
pipeline = make_pipeline(preprocessing_pipeline, clf)

pipeline.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and report the share of correct predictions.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# classification_report returns plain text, so it is printed rather than displayed.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Rows are true classes, columns are predicted classes (scikit-learn convention).
pd.DataFrame(data=confusion_matrix(y_true=y_test, y_pred=y_pred))

## Exercise 1

Create a custom function that displays the confusion matrix in a readable way. You may, for instance, try using the seaborn package.

In [82]:
from sklearn import metrics
import matplotlib.pyplot as plt

In [None]:
def plot_confusion_matrix(y_true, y_pred, labels=None, title="Confusion matrix", ax=None):
    """Draw the confusion matrix of `y_pred` against `y_true` as an annotated heatmap.

    Args:
        y_true: iterable of ground-truth class labels.
        y_pred: iterable of predicted class labels, same length as `y_true`.
        labels: class labels in the order they should appear on both axes.
            Defaults to the sorted labels that occur in `y_true` or `y_pred`.
        title: text shown above the plot.
        ax: matplotlib Axes to draw on; a new figure is created when omitted.

    Returns:
        The matplotlib Axes containing the heatmap.
    """
    if labels is None:
        # Use the real class names rather than hard-coded 0/1 tick labels.
        labels = sorted(set(y_true) | set(y_pred))
    cnf_matrix = metrics.confusion_matrix(y_true, y_pred, labels=labels)
    if ax is None:
        _, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        cnf_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        cbar=False,
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set(xlabel="Predicted label", ylabel="True label", title=title)
    return ax


plot_confusion_matrix(y_test, y_pred)
plt.show()

## Use cross validation

In [None]:
from sklearn.model_selection import cross_val_score

subset_count = 5
X = df_cleaned.drop(columns=["alive", "survived"])
y = df_cleaned["alive"]

# scikit-learn fits a fresh clone of the estimator on every fold, so the
# pipeline starts unfitted in each iteration (assuming the custom transformers
# follow the estimator cloning conventions).
scores = cross_val_score(
    estimator=pipeline, X=X, y=y, scoring="accuracy", cv=subset_count
)
pd.DataFrame(data=scores)

# SVM

In [None]:
import warnings

# Silence only FutureWarning noise. A blanket "ignore" would also hide
# convergence and data warnings raised by the models fitted further down.
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn.svm import SVC

# Same preprocessing steps as before, followed by an RBF-kernel SVM.
pipeline = make_pipeline(
    CustomLabelEncoder(columns=["embarked", "embark_town"]),
    CustomOneHotEncoder(columns=["sex", "who", "adult_male", "class"]),
    CustomStandardScaler(columns=["fare", "age"]),
    CustomLabelEncoder(columns=["alone"]),
    SVC(kernel="rbf"),
)

pipeline.fit(X_train, y_train)

In [None]:
# Accuracy of the RBF SVM on the held-out split.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Per-class precision/recall/F1 for the RBF SVM predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Confusion matrix of the RBF SVM (rows: true class, columns: predicted class).
pd.DataFrame(data=confusion_matrix(y_true=y_test, y_pred=y_pred))

## Exercise 2

Perform a cross validation in case of SVM.

In [None]:
# 5-fold cross-validated accuracy of the RBF SVM pipeline on the full cleaned data.
subset_count = 5
X = df_cleaned.drop(columns=["alive", "survived"])
y = df_cleaned["alive"]

scores = cross_val_score(
    estimator=pipeline, X=X, y=y, scoring="accuracy", cv=subset_count
)
pd.DataFrame(data=scores)

## Exercise 3

Try using different kernels with SVM, for example a polynomial one.

In [None]:
# Exercise 3: identical preprocessing, but the SVM uses a polynomial kernel.
pipeline = make_pipeline(
    CustomLabelEncoder(columns=["embarked", "embark_town"]),
    CustomOneHotEncoder(columns=["sex", "who", "adult_male", "class"]),
    CustomStandardScaler(columns=["fare", "age"]),
    CustomLabelEncoder(columns=["alone"]),
    SVC(kernel="poly"),
)

pipeline.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the polynomial-kernel SVM on the held-out split.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Per-class precision/recall/F1 for the polynomial-kernel SVM predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Confusion matrix of the polynomial-kernel SVM (rows: true, columns: predicted).
pd.DataFrame(data=confusion_matrix(y_true=y_test, y_pred=y_pred))

In [None]:
# 5-fold cross-validated accuracy of the polynomial-kernel SVM pipeline.
subset_count = 5
X = df_cleaned.drop(columns=["alive", "survived"])
y = df_cleaned["alive"]

scores = cross_val_score(
    estimator=pipeline, X=X, y=y, scoring="accuracy", cv=subset_count
)
pd.DataFrame(data=scores)

# kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

k = 5  # number of neighbours that vote on each prediction

# Same preprocessing steps, followed by a k-nearest-neighbours classifier.
pipeline = make_pipeline(
    CustomLabelEncoder(columns=["embarked", "embark_town"]),
    CustomOneHotEncoder(columns=["sex", "who", "adult_male", "class"]),
    CustomStandardScaler(columns=["fare", "age"]),
    CustomLabelEncoder(columns=["alone"]),
    KNeighborsClassifier(n_neighbors=k),
)
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the kNN pipeline on the held-out split.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Per-class precision/recall/F1 for the kNN predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Confusion matrix of the kNN pipeline (rows: true class, columns: predicted class).
pd.DataFrame(data=confusion_matrix(y_true=y_test, y_pred=y_pred))

# Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Same preprocessing steps, followed by a decision tree. The tree is seeded
# so that re-running the notebook reproduces the same model and scores, in
# line with the other seeded estimators in this notebook.
pipeline = make_pipeline(
    CustomLabelEncoder(columns=["embarked", "embark_town"]),
    CustomOneHotEncoder(columns=["sex", "who", "adult_male", "class"]),
    CustomStandardScaler(columns=["fare", "age"]),
    CustomLabelEncoder(columns=["alone"]),
    DecisionTreeClassifier(random_state=42),
)
pipeline.fit(X_train, y_train)

In [None]:
# Accuracy of the decision-tree pipeline on the held-out split.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Per-class precision/recall/F1 for the decision-tree predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Confusion matrix of the decision tree (rows: true class, columns: predicted class).
pd.DataFrame(data=confusion_matrix(y_true=y_test, y_pred=y_pred))

# Perceptron Network

In [None]:
from sklearn.neural_network import MLPClassifier

# Same preprocessing steps, followed by a multilayer perceptron with two
# hidden layers (100 and 50 units).
pipeline = make_pipeline(
    CustomLabelEncoder(columns=["embarked", "embark_town"]),
    CustomOneHotEncoder(columns=["sex", "who", "adult_male", "class"]),
    CustomStandardScaler(columns=["fare", "age"]),
    CustomLabelEncoder(columns=["alone"]),
    MLPClassifier(
        solver="adam",
        activation="relu",
        hidden_layer_sizes=(100, 50),
        max_iter=1000,
        random_state=42,
    ),
)
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the MLP pipeline on the held-out split.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Per-class precision/recall/F1 for the MLP predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Confusion matrix of the MLP (rows: true class, columns: predicted class).
pd.DataFrame(data=confusion_matrix(y_true=y_test, y_pred=y_pred))

# Compare with dummy classifier

In [None]:
# Share of each class in the test split. The largest share is the accuracy a
# classifier would reach by always predicting the most common test class.
y_test.value_counts() / len(y_test)

## Exercise 4

Neural nets are sensitive to their hyperparameters. Try changing the activation function and the number of neurons in the hidden layers to see if you can improve performance.

In [None]:
# Exercise 4: MLP variant with a logistic activation and hidden layers of
# 200 and 10 units; the preprocessing steps are unchanged.
pipeline = make_pipeline(
    CustomLabelEncoder(columns=["embarked", "embark_town"]),
    CustomOneHotEncoder(columns=["sex", "who", "adult_male", "class"]),
    CustomStandardScaler(columns=["fare", "age"]),
    CustomLabelEncoder(columns=["alone"]),
    MLPClassifier(
        solver="adam",
        activation="logistic",
        hidden_layer_sizes=(200, 10),
        max_iter=1000,
        random_state=42,
    ),
)
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the MLP variant on the held-out split.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Per-class precision/recall/F1 for the MLP variant.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Confusion matrix of the MLP variant (rows: true class, columns: predicted class).
pd.DataFrame(data=confusion_matrix(y_true=y_test, y_pred=y_pred))

# Learning curve

We can try to plot the performance of each model during its training and evaluation

In [None]:
# Refit the (100, 50) ReLU network with a larger iteration budget and plot
# the training loss recorded by the final pipeline step.
pipeline = make_pipeline(
    CustomLabelEncoder(columns=["embarked", "embark_town"]),
    CustomOneHotEncoder(columns=["sex", "who", "adult_male", "class"]),
    CustomStandardScaler(columns=["fare", "age"]),
    CustomLabelEncoder(columns=["alone"]),
    MLPClassifier(
        solver="adam",
        activation="relu",
        hidden_layer_sizes=(100, 50),
        max_iter=10000,
        random_state=42,
    ),
)
pipeline.fit(X=X_train, y=y_train)

fig, ax = plt.subplots(figsize=(6, 4))
# pipeline[-1] is the fitted MLPClassifier; loss_curve_ holds its loss history.
ax.plot(pipeline[-1].loss_curve_)
ax.set(xlabel="Number of iterations", ylabel="Loss")
plt.show()

In [None]:
from sklearn.model_selection import learning_curve

# Score the pipeline with 5-fold CV at 50 training-set sizes between 10% and
# 99% of the available training data. Relies on X and y from the
# cross-validation cells above.
train_sizes, train_scores, test_scores = learning_curve(
    pipeline, X, y, cv=5, n_jobs=-1, train_sizes=np.linspace(0.1, 0.99, 50)
)
# Each score array has one row per training size and one column per fold,
# so axis=1 aggregates over the folds.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(train_sizes, train_mean, label="(Train)", marker="o")
ax.fill_between(
    train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.15
)
ax.plot(train_sizes, test_mean, label="(Test)", marker="o")
ax.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.15)
# No `scoring` is passed, so the values are the estimator's default score.
ax.set(xlabel="Training set size", ylabel="Score", title="Learning curve")
# The curve labels only become visible once a legend is drawn.
ax.legend()
plt.show()

# Exercise 5

Try
```python
from sklearn.datasets import load_iris
```

Then try to build the best-performing classifier, evaluated with 10-fold cross-validation.

In [114]:
from sklearn.datasets import load_iris

In [115]:
data = load_iris()
# Feature columns plus a "target" column; the class index is stored as float,
# which matches what stacking it next to the float features produces.
raw_df_iris = pd.DataFrame(data=data.data, columns=data.feature_names).assign(
    target=data.target.astype(float)
)

In [None]:
# Preview the first rows of the raw iris frame.
raw_df_iris.head()

In [None]:
# (rows, columns) of the raw iris frame, including the target column.
raw_df_iris.shape

In [None]:
# Missing values per column, shown as the cell's output instead of printed.
raw_df_iris.isnull().sum()

In [None]:
# Rows that repeat an earlier row; an empty result means there are no duplicates.
raw_df_iris[raw_df_iris.duplicated()]

In [None]:
# Iris cleaning: drop duplicate rows, then filter outliers (threshold=3 —
# presumably a z-score cutoff; confirm in the transformer's source).
data_cleaning_iris = make_pipeline(
    FunctionTransformer(func=pd.DataFrame.drop_duplicates, validate=False),
    OutlierRemoveTransformer(threshold=3),
)

df_cleaned_iris = data_cleaning_iris.fit_transform(X=raw_df_iris)
df_cleaned_iris.head(n=5)

In [121]:
# 80/20 split of the cleaned iris rows. This rebinds the X_train/X_test/
# y_train/y_test names used by the Titanic section above.
X_train, X_test, y_train, y_test = train_test_split(
    df_cleaned_iris.drop("target", axis=1),
    df_cleaned_iris.loc[:, "target"],
    random_state=42,
    test_size=0.2,
)

## Logistic regression

In [140]:
# Iris preprocessing: standardise all four measurement columns. This rebinds
# the `preprocessing_pipeline` name defined for the Titanic data.
preprocessing_pipeline = make_pipeline(
    CustomStandardScaler(
        columns=[
            "sepal length (cm)",
            "sepal width (cm)",
            "petal length (cm)",
            "petal width (cm)",
        ]
    ),
)

In [None]:
# `multi_class` is deprecated in recent scikit-learn releases; for a target
# with three or more classes the default already fits a multinomial model,
# so the argument is omitted.
clf = LogisticRegression(random_state=0, solver="newton-cg")

pipeline = make_pipeline(preprocessing_pipeline, clf)

pipeline.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled cross-validation of the logistic-regression pipeline on
# the full cleaned iris data; the fixed seed keeps the folds reproducible.
X = df_cleaned_iris.drop("target", axis=1)
y = df_cleaned_iris.loc[:, "target"]

kfold = KFold(n_splits=10, random_state=42, shuffle=True)

scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring="accuracy", cv=kfold)

pd.DataFrame(data=scores)

## SVM

In [None]:
# Standardise the four measurements, then fit an RBF-kernel SVM.
pipeline = make_pipeline(
    CustomStandardScaler(
        columns=[
            "sepal length (cm)",
            "sepal width (cm)",
            "petal length (cm)",
            "petal width (cm)",
        ]
    ),
    SVC(kernel="rbf"),
)
pipeline.fit(X=X_train, y=y_train)

In [None]:
# 10-fold shuffled cross-validation of the SVM pipeline on the cleaned iris data.
X = df_cleaned_iris.drop("target", axis=1)
y = df_cleaned_iris.loc[:, "target"]

kfold = KFold(n_splits=10, random_state=42, shuffle=True)

scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring="accuracy", cv=kfold)

pd.DataFrame(data=scores)

## KNN

In [None]:
# Standardise the four measurements, then fit a 3-nearest-neighbours classifier.
pipeline = make_pipeline(
    CustomStandardScaler(
        columns=[
            "sepal length (cm)",
            "sepal width (cm)",
            "petal length (cm)",
            "petal width (cm)",
        ]
    ),
    KNeighborsClassifier(n_neighbors=3),
)
pipeline.fit(X=X_train, y=y_train)

In [None]:
# 10-fold shuffled cross-validation of the kNN pipeline on the cleaned iris data.
X = df_cleaned_iris.drop("target", axis=1)
y = df_cleaned_iris.loc[:, "target"]

kfold = KFold(n_splits=10, random_state=42, shuffle=True)

scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring="accuracy", cv=kfold)

pd.DataFrame(data=scores)

## Decision tree

In [None]:
# Standardise the four measurements, then fit a decision tree. The tree is
# seeded so the cross-validation scores below are reproducible across runs.
pipeline = make_pipeline(
    CustomStandardScaler(columns=["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"]),
    DecisionTreeClassifier(random_state=42),
)
pipeline.fit(X_train, y_train)

In [None]:
# 10-fold shuffled cross-validation of the decision-tree pipeline on the cleaned iris data.
X = df_cleaned_iris.drop("target", axis=1)
y = df_cleaned_iris.loc[:, "target"]

kfold = KFold(n_splits=10, random_state=42, shuffle=True)

scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring="accuracy", cv=kfold)

pd.DataFrame(data=scores)

## Perceptron Network

In [None]:
# Standardise the four measurements, then fit a small tanh MLP with hidden
# layers of 10 and 5 units.
pipeline = make_pipeline(
    CustomStandardScaler(
        columns=[
            "sepal length (cm)",
            "sepal width (cm)",
            "petal length (cm)",
            "petal width (cm)",
        ]
    ),
    MLPClassifier(
        solver="adam",
        activation="tanh",
        hidden_layer_sizes=(10, 5),
        max_iter=1000,
        random_state=42,
    ),
)
pipeline.fit(X=X_train, y=y_train)

In [None]:
# 10-fold shuffled cross-validation of the MLP pipeline on the cleaned iris data.
X = df_cleaned_iris.drop("target", axis=1)
y = df_cleaned_iris.loc[:, "target"]

kfold = KFold(n_splits=10, random_state=42, shuffle=True)

scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring="accuracy", cv=kfold)

pd.DataFrame(data=scores)

The Perceptron Network performed the best.

# Pass-Fail Exercise 

Complete the exercises presented in this notebook. Then copy this notebook to your student directory and create a merge request with it. Please do not commit this file.