In [None]:
import pandas as pd
import numpy as np
import os
%matplotlib inline

# Data Transformations

In [None]:
# Column labels applied to the header-less data file, in file order.
# The final entry, "Good flag", is the column later used as the target.
col_names = [
    "Status of existing checking account",
    "Duration in months",
    "Credit history",
    "Purpose",
    "Credit amount",
    "Savings account/bonds",
    "Present employment since",
    "Installment rate in percentage of disposable income",
    "Personal status and sex",
    "Other debtors / guarantors",
    "Present residence since",
    "Property",
    "Age in years",
    "Other installment plans",
    "Housing",
    "Number of existing credits at this bank",
    "Job",
    "Number of people being liable to provide maintenance for",
    "Telephone",
    "Foreign worker",
    "Good flag",
]

In [None]:
# Load the space-separated data file; column names are supplied explicitly
# through col_names rather than read from the file.
raw_data = pd.read_csv("german.data", sep=" ", names=col_names)

In [None]:
# Spearman rank correlation of every numeric column with the target.
# The frame also holds string-coded categorical columns; they are excluded
# explicitly because recent pandas versions raise on non-numeric columns in
# corr() instead of silently skipping them.
raw_data.select_dtypes(include="number").corr("spearman")["Good flag"]

In [None]:
# Shift the target down by one. This overwrites the column in place, so
# re-running the cell shifts the values again - run it exactly once per load.
# NOTE(review): presumably the file codes the target as 1/2 (giving 0/1 here);
# confirm which of the two values actually marks a "good" customer.
raw_data['Good flag'] = raw_data["Good flag"] - 1

In [None]:
# For every column except "Credit amount", print the share of flagged rows
# (sum / count of "Good flag") per distinct value, followed by the group sizes.
for column in raw_data.columns:
    if column == "Credit amount":
        continue
    flag_by_value = raw_data.groupby(by=column)["Good flag"]
    group_sizes = flag_by_value.count()
    print(flag_by_value.sum() / group_sizes, group_sizes)

In [None]:
# Distribution of the loan duration; used to pick the bin edges for the
# binned duration column created in the next cell.
raw_data["Duration in months"].hist(bins=100)

In [None]:
# Bucket the two variables into ordered intervals (stored as new columns).
raw_data["Duration in months binned"] = pd.cut(raw_data["Duration in months"], bins=[0, 6, 12, 15, 18, 24, 30, 36, 72])
# pd.cut builds left-open intervals by default, so an age equal to the lowest
# edge (19) would match no bin and turn into NaN. include_lowest=True closes
# the first interval so such rows land in the youngest bucket.
raw_data["Age in years binned"] = pd.cut(raw_data["Age in years"], bins=[19, 25, 30, 35, 40, 50, 75], include_lowest=True)

In [None]:
# Split the columns by dtype: int64 columns (minus the target) are the
# numeric features, every other column is treated as categorical.
is_int64 = raw_data.dtypes == 'int64'
numeric_vars = is_int64[is_int64].index.tolist()
numeric_vars.remove('Good flag')
cat_vars = is_int64[~is_int64].index.tolist()

from sklearn.preprocessing import scale

# Standardize the numeric features.
raw_data_scaled = pd.DataFrame(scale(raw_data[numeric_vars]), columns=numeric_vars)
# One-hot encode the categorical features; dummy_na=True additionally emits
# a NaN indicator column per encoded variable.
dummy_attr = pd.get_dummies(raw_data[cat_vars], dummy_na=True)
# Put both parts side by side (matched on the row index) and re-attach the target.
dummy_data = raw_data_scaled.join(dummy_attr, how="inner")
dummy_data['Good flag'] = raw_data['Good flag']

## Data split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as an out-of-sample set. The split is stratified
# on the target and seeded so it is reproducible.
train_data, oos_data = train_test_split(
    dummy_data,
    test_size=0.2,
    random_state=0,
    stratify=dummy_data["Good flag"],
)

# Every column except the target is a model input.
attr = [name for name in train_data.columns if name != "Good flag"]

# Plain NumPy arrays for the estimators.
X = train_data[attr].values
y = train_data["Good flag"].values

# Scikit Learn API

Scikit-Learn's estimator API consists of three main methods:
<code>fit()</code> - fits the given object to the data;
<code>transform()</code> - transforms the data in a certain way;
<code>predict()</code> - returns the output of a model or algorithm.

# Algorithms

## Decision Tree - basics

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

# Baseline tree with all-default hyperparameters (no regularization
# arguments are passed); it is fitted in the next cell.
tree_clf = DecisionTreeClassifier()

In [None]:
tree_clf.fit(X, y)
# roc_auc_score expects (y_true, y_score): true labels first, model output
# second. The score is computed on the training data, so it is in-sample.
roc_auc_score(y, tree_clf.predict(X))

In [None]:
# The in-sample score above suggests the tree is overfitting. Inspect its
# structure, starting with the total number of nodes in the fitted tree.
tree_clf.tree_.node_count

In [None]:
from sklearn.tree import export_graphviz
import graphviz

def graph_tree(clf, attr_list):
    """Render a fitted decision tree as a Graphviz graph.

    Parameters
    ----------
    clf : fitted tree estimator accepted by ``export_graphviz``
    attr_list : list of str
        Feature names, forwarded as ``feature_names``.

    Returns
    -------
    graphviz.Source
        Graph object built from the exported DOT text.
    """
    return graphviz.Source(
        export_graphviz(
            clf,
            out_file=None,  # return the DOT text instead of writing a file
            feature_names=attr_list,
            filled=True,
            rounded=True,
            special_characters=True,
        )
    )


graph_tree(tree_clf, attr)

## Decision trees - regularization

- <code>max_depth</code> - specifies how many "levels" a decision tree can have;

In [None]:
tree_clf2 = DecisionTreeClassifier(max_depth=2, class_weight='balanced')
tree_clf2.fit(X, y)
# roc_auc_score expects the true labels first, then the model output.
print(roc_auc_score(y, tree_clf2.predict(X)))

graph_tree(tree_clf2, attr)

In [None]:
# Example: Gini impurity of node 1 (the "middle" node), computed by hand as
# p0 * (1 - p0) + p1 * (1 - p1), where p0 and p1 are the shares of the two
# classes among the values stored for that node.
node_value = tree_clf2.tree_.value
sum_vals = node_value[1].sum()
class_shares = node_value[1][0][:2] / sum_vals
gini = (class_shares * (1 - class_shares)).sum()
print(gini)

In [None]:
def feature_importances(clf, feature_names=None, n=5):
    """Return the ``n`` most important features of a fitted model.

    Parameters
    ----------
    clf : fitted estimator exposing ``feature_importances_``
    feature_names : sequence of str, optional
        One name per feature, in the order the model was trained on.
        Defaults to the notebook-level ``attr`` list.
    n : int, default 5
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with the importance in column ``0``,
        sorted from most to least important.
    """
    if feature_names is None:
        # Backward-compatible default: the global feature list built with X.
        feature_names = attr
    features = pd.DataFrame([clf.feature_importances_], columns=feature_names).transpose()
    return features.sort_values(by=0, ascending=False).head(n=n)

# Top features of the depth-limited tree fitted above.
feature_importances(tree_clf2)

- <code>min_samples_split</code> - the minimum number of samples required to split a node (can be given as a fraction);

In [None]:
tree_clf2 = DecisionTreeClassifier(min_samples_split=80, class_weight='balanced')
tree_clf2.fit(X, y)
# roc_auc_score expects the true labels first, then the model output.
print(roc_auc_score(y, tree_clf2.predict(X)))

graph_tree(tree_clf2, attr)

- <code>min_samples_leaf</code> - the minimum number of samples required to be at a leaf node (can be given as a fraction);

In [None]:
tree_clf2 = DecisionTreeClassifier(min_samples_leaf=80, class_weight='balanced')
# Alternative: give the threshold as a fraction of the samples instead:
# tree_clf2 = DecisionTreeClassifier(min_samples_leaf=0.1)
tree_clf2.fit(X, y)
# roc_auc_score expects the true labels first, then the model output.
print(roc_auc_score(y, tree_clf2.predict(X)))

graph_tree(tree_clf2, attr)

- <code>min_weight_fraction_leaf</code> - the minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node;

In [None]:
tree_clf2 = DecisionTreeClassifier(min_weight_fraction_leaf=0.1)
tree_clf2.fit(X, y)
# roc_auc_score expects the true labels first, then the model output.
print(roc_auc_score(y, tree_clf2.predict(X)))

graph_tree(tree_clf2, attr)

- <code>max_features</code> - the number of features to consider when looking for the best split;

In [None]:
tree_clf2 = DecisionTreeClassifier(max_features=50, class_weight='balanced')
tree_clf2.fit(X, y)
# roc_auc_score expects the true labels first, then the model output.
print(roc_auc_score(y, tree_clf2.predict(X)))

graph_tree(tree_clf2, attr)

- <code>max_leaf_nodes</code> - grow a tree with <code>max_leaf_nodes</code> in best-first fashion;

In [None]:
tree_clf2 = DecisionTreeClassifier(max_leaf_nodes=4, class_weight='balanced')
tree_clf2.fit(X, y)
# roc_auc_score expects the true labels first, then the model output.
print(roc_auc_score(y, tree_clf2.predict(X)))

graph_tree(tree_clf2, attr)

- <code>min_impurity_decrease</code> - a node will be split if this split induces a decrease of the impurity greater than or equal to this value;

In [None]:
tree_clf2 = DecisionTreeClassifier(min_impurity_decrease=0.01, class_weight='balanced')
tree_clf2.fit(X, y)
# roc_auc_score expects the true labels first, then the model output.
print(roc_auc_score(y, tree_clf2.predict(X)))

graph_tree(tree_clf2, attr)

- <code>class_weight</code> - weights associated with classes.

## Random Forests - basics

<b>Exercise</b>

Using the <code>RandomForestClassifier</code> class, train a random forest and compute its ROC AUC score. Use two of the regularization parameters for decision trees to train your model - a random forest is basically a lot of decision trees. Also try changing the number of trees in your forest with the <code>n_estimators</code> parameter. Store the fitted model as <code>frst_clf</code>; the next code cell uses that name.

Checking which features were the most important for the forest trained above.

In [None]:
# NOTE(review): frst_clf is not defined anywhere in the visible notebook; it
# is presumably the forest trained in the exercise above, so this cell raises
# NameError until that exercise has been completed under that name.
feature_importances(frst_clf)

## Gradient Boosting - basics

<b>Exercise</b>

Using <code>GradientBoostingClassifier</code> class train a gradient boosting classifier and compute ROC AUC score. Afterwards, check which features were the most important using <code>feature\_importances\_</code> attribute. You can control learning rate with <code>learning\_rate</code> parameter.

## Neural networks - basics

In [None]:
from sklearn.neural_network import MLPClassifier

nn_clf = MLPClassifier()
nn_clf.fit(X, y)
# roc_auc_score expects (y_true, y_score): true labels first, model output
# second. Scored on the training data, so this is an in-sample figure.
roc_auc_score(y, nn_clf.predict(X))

- <code>hidden_layer_sizes</code> - a tuple whose i-th element is the number of neurons in the i-th hidden layer

<b>Exercise</b>

Using <code>MLPClassifier</code> class train a neural network with 600 neurons in the first hidden layer. Compute ROC AUC score.

In [None]:
# Basic attributes of the fitted network, shown as a tuple: number of
# layers, number of outputs and the output activation function.
nn_clf.n_layers_, nn_clf.n_outputs_, nn_clf.out_activation_

- <code>activation</code> - activation function from a list \[‘identity’, ‘logistic’, ‘tanh’, ‘relu’\]

<b>Exercise</b>

Using the code from the cell above, train a neural network with one of the activation functions listed above (the default is 'relu').

In [None]:
# Display the estimator (its repr) to check how it is configured.
nn_clf

- <code>learning_rate</code> - defines the learning rate update schedule as per one of the schedules from the list {‘constant’, ‘invscaling’, ‘adaptive’} - 'constant' means no change.

In [None]:
# Source: https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
# The example script's leading `print(__doc__)` is intentionally omitted: it
# prints the docstring of a stand-alone script, which this notebook does not have.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot training and cross-validation score curves for an estimator.

    The scores come from ``sklearn.model_selection.learning_curve``; for each
    training-set size the mean over the CV splits is drawn as a line and a
    band of one standard deviation around it is shaded.

    Parameters
    ----------
    estimator : object implementing "fit" and "predict"
        Passed unchanged to ``learning_curve``.

    title : string
        Title of the chart.

    X : array-like, shape (n_samples, n_features)
        Training vectors.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X; None for unsupervised learning.

    ylim : tuple (ymin, ymax), optional
        Limits of the y axis. Left untouched when None.

    cv : int, cross-validation generator or an iterable, optional
        Cross-validation splitting strategy, forwarded to ``learning_curve``
        (None selects that function's default).

    n_jobs : int or None, optional (default=None)
        Number of parallel jobs, forwarded to ``learning_curve``.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative (float, within (0, 1]) or absolute (int) training-set sizes
        at which the curve is evaluated.
        (default: np.linspace(0.1, 1.0, 5))

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the figure drawn as current figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    # One row of scores per training-set size, one column per CV split.
    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    plt.grid()

    curves = (
        (fit_scores, "r", "Training score"),
        (cv_scores, "g", "Cross-validation score"),
    )

    # Shaded +/- one standard deviation bands first ...
    for scores, colour, _ in curves:
        centre = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)

    # ... then the mean curves themselves.
    for scores, colour, label in curves:
        plt.plot(sizes, np.mean(scores, axis=1), 'o-', color=colour,
                 label=label)

    plt.legend(loc="best")
    return plt

In [None]:
# NOTE(review): the title assumes nn_clf was retrained with activation='tanh'
# in the exercise above; the nn_clf created in this notebook uses the default
# activation.
title = "Learning Curves ('tanh' activation function)"
# Cross validation with 20 random splits to get smoother mean test and train
# score curves, each time with 20% of the data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=20, test_size=0.2, random_state=0)

plot_learning_curve(nn_clf, title, X, y, ylim=(0.6, 1), cv=cv, n_jobs=-1)

plt.show()
plt.close()

In [None]:
nn_clf1 = MLPClassifier(hidden_layer_sizes=(600,), activation='logistic')
# Fit the network defined on the line above, so that it can also be scored
# directly with the (optional) line below.
nn_clf1.fit(X, y)
# roc_auc_score(y, nn_clf1.predict(X))

title = "Learning Curves ('logistic' activation function)"

plot_learning_curve(nn_clf1, title, X, y, ylim=(0.65, 1.05), cv=cv, n_jobs=-1)

plt.show()
plt.close()

In [None]:
# The learning_rate schedule is only used by the 'sgd' solver, so the solver
# is set explicitly; with the default solver 'invscaling' has no effect and
# this cell would just repeat the previous one.
nn_clf1 = MLPClassifier(hidden_layer_sizes=(600,), activation='logistic',
                        solver='sgd', learning_rate='invscaling')
# Fit the network defined above, so that it can also be scored directly with
# the (optional) line below.
nn_clf1.fit(X, y)
# roc_auc_score(y, nn_clf1.predict(X))

title = "Learning Curves ('logistic' activation function)"

plot_learning_curve(nn_clf1, title, X, y, ylim=(0.65, 1.05), cv=cv, n_jobs=-1)

plt.show()
plt.close()

- <code>max_iter</code> - terminate learning after specified amount of iterations;

<b>Exercise</b>

Train a neural network with a non-default value of <code>max_iter</code>.

## SVM - basics

<b>Exercise</b>

Using the <code>SVC</code> class, train a support vector machine and compute its ROC AUC score.

# Validation techniques

## Out-of-sample validation

<b>Exercise</b>

Using the <code>train_test_split</code> function, split the <code>train_data</code> set into <code>train_in_sample</code> and <code>train_out_of_sample</code> sets in an 80-20 proportion, then split those into <code>X_is, X_oos, y_is, y_oos</code>.

<b>Exercise</b>

Choose a classification algorithm and train it on the in-sample set produced above. Using the <code>predict()</code> method, compute the ROC AUC score on the in-sample and out-of-sample sets.

Compare your results with other colleagues who used the same algorithm.

## K-fold validation

In [None]:
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold splitter; the second line just displays the number of
# splits it will produce.
skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X, y)

In [None]:
# Refit tree_clf2 on each fold's training part and report the ROC AUC on
# both the training part (in-sample) and the held-out part (out-of-sample).
for index, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    tree_clf2.fit(X_train, y_train)

    # roc_auc_score expects (y_true, y_score): true labels first.
    print("The in-sample ROC AUC score for the ", index, "split is ",
          roc_auc_score(y_train, tree_clf2.predict(X_train)), ".")
    print("The out-of-sample ROC AUC score for the ", index, "split is ",
          roc_auc_score(y_test, tree_clf2.predict(X_test)), ".")
    print("\n")

<b>Exercise</b>

Validate any of the algorithms discussed above and decide whether it's better than the decision tree trained above using average ROC AUC score from the out-of-sample.

# Parameter grids

In [None]:
# Each dict is searched as its own grid: one regularization parameter at a
# time, crossed with both splitter strategies.
param_grid_tree = [
    {'splitter': ['best', 'random'], 'max_depth': [2, 4, 6, 8]},
    {'splitter': ['best', 'random'], 'min_samples_split': [2, 6, 10, 16]},
    {'splitter': ['best', 'random'], 'min_samples_leaf': [1, 2, 4]},
    # "auto" is no longer accepted by recent scikit-learn releases (for
    # classification trees it was an alias of "sqrt"); "log2" is searched instead.
    {'splitter': ['best', 'random'], 'max_features': [10, "log2", "sqrt", None]},
    {'splitter': ['best', 'random'], 'max_leaf_nodes': [50, 100, 150]}
]

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid search over param_grid_tree with 5-fold cross-validation, ranking
# the candidates by ROC AUC.
grd_tree = GridSearchCV(DecisionTreeClassifier(class_weight='balanced'), param_grid_tree,
                        cv=5, scoring='roc_auc')

In [None]:
# Run the search, then show the best score found (ROC AUC, as requested via
# scoring='roc_auc' when the search was set up).
grd_tree.fit(X, y)
grd_tree.best_score_

In [None]:
# Draw the tree that the grid search selected.
graph_tree(grd_tree.best_estimator_,attr)

<b>Exercise</b>

Try out some combinations of regularization parameters. Check which parameters were chosen!

<b>Exercise</b>

Experiment with <code>RandomForestClassifier</code> and <code>GradientBoostingClassifier</code>. Check what the best results you can get are.

# Pipelines

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class DataSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the named columns of a DataFrame.

    Parameters
    ----------
    attr_names : list of str
        Labels of the columns to select.
    """

    def __init__(self, attr_names):
        self.attr_names = attr_names

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` as a NumPy array."""
        selected = X[self.attr_names]
        return selected.values
    
class DataDummy(BaseEstimator, TransformerMixin):
    """One-hot encode the input with ``pd.get_dummies(..., dummy_na=True)``.

    ``fit`` records the dummy columns produced by the fitting data, and
    ``transform`` returns exactly those columns, in that order, for any later
    data set: categories unseen at fit time are dropped and categories absent
    from the new data come back as all-zero columns. Without this, two data
    sets with different category values would be encoded into arrays of
    different width/layout.

    Calling ``transform`` on an instance that was never fitted falls back to
    the plain, unaligned encoding.
    """

    def __init__(self):
        # No hyper-parameters.
        pass

    @staticmethod
    def _encode(X):
        """Return the dummy-encoded DataFrame for array-like ``X``."""
        return pd.get_dummies(pd.DataFrame(X), dummy_na=True)

    def fit(self, X, y=None):
        """Remember the dummy columns generated by ``X``; ``y`` is ignored."""
        self.columns_ = self._encode(X).columns
        return self

    def transform(self, X):
        """Return the dummy-encoded values of ``X`` as a NumPy array."""
        dummy = self._encode(X)
        columns = getattr(self, "columns_", None)
        if columns is None or dummy.columns.equals(columns):
            # Same layout as at fit time (or never fitted): nothing to align.
            return dummy.values
        # Fill absent categories with a zero of the encoded dtype, so the
        # filler columns do not change the dtype of the resulting array.
        dtypes = dummy.dtypes.unique()
        zero = dtypes[0].type(0) if len(dtypes) == 1 else 0
        return dummy.reindex(columns=columns, fill_value=zero).values

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion

# Numeric branch: pick the numeric columns and standardize them.
num_pipeline = Pipeline([
    ('select_attr', DataSelector(numeric_vars)),
    ('scaler', StandardScaler())
])

# Categorical branch: pick the categorical columns and one-hot encode them.
# The second step is named after what it does (it was labelled 'scaler',
# although DataDummy does no scaling).
cat_pipeline = Pipeline([
    ('select_attr', DataSelector(cat_vars)),
    ('dummy', DataDummy())
])

# Run both branches on the same input and concatenate their outputs
# column-wise (numeric block first, then the dummy block).
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipe', num_pipeline),
    ('cat_pipe', cat_pipeline)
])

data_prepared = full_pipeline.fit_transform(raw_data)

In [None]:
# Display the array produced by the full pipeline.
data_prepared