[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/umatter/EDFB/blob/main/notebooks/Python/Clustering_and_Credit_Risk.ipynb)

# Practical of Real Financial Data

---
This notebook includes an end-to-end unsupervised and supervised learning task on financial data. It is prepared for GitHub + Colab sharing: the first cell bootstraps exact dependencies, and the dataset is loaded from the repo (or downloaded) so it runs end-to-end without manual uploads.

*   Describe the data and carry out all necessary pre-processing
*   Run k-means clustering using different k values
*   Evaluate the performance of the different clusters and select the best value for k
*   Train a logistic regression classifier that predicts whether the company will default on its loan using the full dataset
*   Train a separate model for each of the identified clusters



In [None]:
# @title Setup (installs exact versions; safe to rerun)
import sys
!pip -q install --upgrade pip
!pip -q install numpy==1.26.4 pandas==2.2.2 scikit-learn==1.5.1 matplotlib==3.9.0 seaborn==0.13.2 statsmodels==0.14.2 scipy==1.13.1 plotly==5.23.0

# Imports
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import MaxNLocator # PyLab is a procedural interface to the Matplotlib object-oriented plotting library.
from pathlib import Path
import urllib.request
%matplotlib inline

print('Python:', sys.version)
import sklearn, matplotlib
print('Loaded versions:')
print('- numpy', np.__version__)
print('- pandas', pd.__version__)
print('- scikit-learn', sklearn.__version__)
print('- matplotlib', matplotlib.__version__)
print('- seaborn', sns.__version__)


In [None]:
# To make this notebook's output stable across runs (we make the output reproducible)
np.random.seed(42)

In [None]:
# Define cluster_kmeans function
from sklearn.cluster import KMeans
from sklearn import metrics
# from sklearn.metrics import pairwise_distances

# silhouette: 1=good, 0=overlap, -1=bad
# Within Cluster Sum of Squares: lower is better

def cluster_kmeans(df, nclust):
    """Fit k-means with ``nclust`` clusters on ``df`` and score the result.

    Returns a 4-tuple ``(sil, wcss, label, centroids)``: the silhouette
    score of the fitted labels (Euclidean metric), the model's inertia
    (within-cluster sum of squares), the per-row cluster labels and the
    fitted cluster centres.
    """
    model = KMeans(n_clusters=nclust, random_state=0)
    model.fit(df)
    assignments = model.labels_
    silhouette = metrics.silhouette_score(
        df, assignments, metric='euclidean', random_state=0
    )
    return silhouette, model.inertia_, assignments, model.cluster_centers_

In [None]:
# Load borrower_companies.csv from the local checkout (data/borrower_companies.csv)
# if present; otherwise download it once from GitHub and cache it at that path.
data_path = Path('data/borrower_companies.csv')
if data_path.exists():
    print('Loading data from', data_path)
else:
    url = 'https://raw.githubusercontent.com/umatter/EDFB/main/data/borrower_companies.csv'
    data_path.parent.mkdir(parents=True, exist_ok=True)
    # Download under a temporary name first, so an interrupted transfer never
    # leaves a truncated file at `data_path` that a rerun would silently load.
    partial_path = data_path.with_name(data_path.name + '.part')
    try:
        print('Attempting to download borrower_companies.csv from', url)
        urllib.request.urlretrieve(url, partial_path)
        partial_path.replace(data_path)
        print('Downloaded to', data_path)
    except Exception as e:
        partial_path.unlink(missing_ok=True)
        # Chain the original exception so the root cause stays in the traceback.
        raise RuntimeError(f'Could not obtain borrower_companies.csv: {e}') from e
dataset = pd.read_csv(data_path)

In [None]:
# Data is loaded into `dataset` above.

In [None]:
# In the following steps, we investigate the properties of the data.
dataset.head()

In [None]:
dataset.shape

In [None]:
dataset.dtypes

In [None]:
dataset.isna().any()

In [None]:
dataset.describe()

In [None]:
# We check the distribution of the features included in the dataset through box plots. A box plot is a method for graphically depicting groups of numerical data through their quartiles.
from sklearn import preprocessing
def box_plot(df, standardize=True):
    """Draw one horizontal box plot per column of ``df`` and show the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns to plot; each column becomes one box. Must be numeric when
        ``standardize`` is true, because the values go through
        ``StandardScaler``.
    standardize : bool, default True
        If true, every column is rescaled with ``StandardScaler`` first so
        that variables on different scales can share one axis.
    """
    plt.figure(figsize=(20, 10))

    if standardize:
        # standardize columns for better visualization: (x - mean) / std
        df = pd.DataFrame(
            preprocessing.StandardScaler().fit_transform(df.values),
            columns=df.columns,
        )

    # Long format: one (variable, value) row per cell, column by column.
    long_df = df.melt(var_name='variable', value_name='value')
    ax = sns.boxplot(x='value', y='variable', data=long_df, orient='h')
    ax.tick_params(labelsize=10)
    ax.set_xlabel('')
    ax.set_ylabel('')
    # The note is only true when the data were actually standardized.
    if standardize:
        ax.set_title('Note that variables are standardized\nfor better visualization', fontsize=20)
    plt.show()


box_plot(dataset.drop(columns="status"), standardize=True)

In [None]:
# The above graph indicates the presence of many outliers. In the following step we apply the z-score.
# A z-score indicates the number of standard deviations above or below the mean that each value falls.
# For example, a z-score of 3 indicates that an observation is three standard deviations above the average,
# while a z-score of -3 signifies it is three standard deviations below the mean. A standard cut-off for
# finding outliers is an absolute z-score greater than 3 or 4.
from scipy import stats
# Flag outliers on the feature columns only. `status` is the 0/1 target: when
# fewer than roughly 6% of the rows have status == 1, those rows have a
# z-score above 4 on `status` itself, and filtering on it would discard every
# defaulting company.
feature_cols = dataset.columns.drop('status')
z = np.abs(stats.zscore(dataset[feature_cols].to_numpy(dtype=float), axis=0))
# Keep the rows whose every feature lies within 4 standard deviations of its mean.
dataset_o = dataset[(z < 4).all(axis=1)]

In [None]:
# We check the shape of the new data. We have reduced the dataset significantly
dataset_o.shape

In [None]:
dataset_o.describe()

In [None]:
# Similarly, we again check the distribution of the features through box plots. Although reduced, we still have a significant number of outliers in the sample.
box_plot(dataset_o.drop(columns="status"), standardize=True)

In [None]:
# In the next section, we proceed with running a clustering algorithm on the data so to identify groups of homogenous borrower-companies.
# Features and target taken from the outlier-filtered data.
X = dataset_o.drop(columns="status")
y = dataset_o["status"].copy()
# Standardize the features but keep dataset_o's row index, so that X stays
# label-aligned with y and dataset_o in any later index-based assignment or
# join instead of silently switching to a fresh 0..n-1 index.
X = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X.values), columns=X.columns, index=X.index)

In [None]:
# Since, we cannot plot the data as it is multidimensional, we use the dimensionality reduction technique - Principal Component Analysis (PCA).
# We notice that the first 2 PC account for ~40% of the variations in the dataset.

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px

# X is already standardized at this point, so it is projected as-is.
# Scaling it a second time was redundant and passed PCA a bare array after it
# had been fitted on a DataFrame (scikit-learn warns about missing feature names).
pca = PCA(n_components=X.shape[1], random_state=0).fit(X)
scores = np.asarray(pca.transform(X))  # plain array: later cells index it as scores[:, k]

exp_var_pca = pca.explained_variance_ratio_
cum_sum_eigenvalues = np.cumsum(exp_var_pca)
# Number the components from 1 so the x axis matches its label.
component_numbers = range(1, len(exp_var_pca) + 1)

plt.figure(figsize=(10,5))
plt.bar(component_numbers, exp_var_pca, alpha=0.5, align='center', label='Individual explained variance')
plt.step(component_numbers, cum_sum_eigenvalues, where='mid',label='Cumulative explained variance')
plt.ylabel('Cumulative Explained Variance', size=15)
plt.xlabel('Number of Principal Components', size=15)
plt.legend(loc='best', fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# In the next step, we validate the number of clusters i.e. we evaluate clusters on X
max_n_clusters = 7
cluster_counts = range(2, max_n_clusters + 1)

# Collect one row per k and build the table once at the end; growing an
# initially empty DataFrame with pd.concat is deprecated in pandas 2.x.
rows = []
label_list = {}

n_rows = math.ceil(len(cluster_counts) / 2)
fig, ax = plt.subplots(n_rows, 2, figsize=(20, max_n_clusters * 4), constrained_layout=True)
ax = ax.flatten()
for i, nclust in enumerate(cluster_counts):

    sil, wcss, label, _ = cluster_kmeans(X, nclust)
    # Cluster centres in PCA space: mean PCA score of each cluster's members.
    centroids = pd.DataFrame(data=scores, index=label).groupby(level=0).mean().values
    rows.append([nclust, sil, wcss])
    label_list[str(nclust)] = label

    ax[i].scatter(scores[:,0], scores[:,1], c=label, cmap='Accent', s=40)
    ax[i].scatter(centroids[:,0], centroids[:,1], c=range(nclust), cmap='Accent', s=300, marker='P')
    ax[i].set_title('Clusters: ' + str(nclust), fontsize = 30)
    textstr = 'Sil: ' + str(round(sil, 3)) + '\nWCSS: ' + str(int(wcss))
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    ax[i].text(0.75, 0.97, textstr, transform=ax[i].transAxes, fontsize=25,
        verticalalignment='top', bbox=props)

# Hide the unused trailing panel when the number of plots is odd.
for unused_axis in ax[len(cluster_counts):]:
    unused_axis.set_visible(False)

tab = pd.DataFrame(rows, columns=['Clusters', 'Silhouette(max)', 'WCSS(min)'])

plt.show()
display(tab)


In [None]:
# Next, we determine the optimal number of clusters with the Elbow method and the Silhouette coefficient.
# What would you suggest as the ideal cut-off point?

fig, ax1 = plt.subplots(figsize=(10,5))
# The format string carries only marker and line style; the colour is given
# once through `color`. ('bx-' together with color=... defined the colour
# twice, and for the WCSS line the two disagreed: blue vs. red.)
ax1.plot(tab.Clusters, tab['Silhouette(max)'], 'x-', color = 'blue')
ax1.set_xlabel('Number of clusters', fontsize = 20)
ax1.set_ylabel('Silhouette', fontsize = 20, color = 'blue')
ax1.tick_params(axis="x", labelsize=15)
ax1.tick_params(axis='y', labelcolor='blue', labelsize=13)
# The number of clusters is an integer: avoid fractional tick positions.
ax1.xaxis.set_major_locator(MaxNLocator(integer=True))

# Second y axis sharing the same x axis, for the within-cluster sum of squares.
ax2 = ax1.twinx()
ax2.plot(tab.Clusters, tab['WCSS(min)'], 'x-', color = 'red')
ax2.set_ylabel('WCSS', fontsize = 20, color = 'red')
ax2.tick_params(axis='y', labelcolor='red', labelsize=13)

In [None]:
# In the next step, we inspect the clusters' features. Specifically, we want to check whether there is a significant difference in the distribution of the features among the clusters.

import warnings
# NOTE(review): this silences every warning for the rest of the session, not
# only for this cell. Kept unchanged because later cells were written under it.
warnings.filterwarnings('ignore')
number_of_clusters = 3


label = label_list[str(number_of_clusters)]
fig, ax = plt.subplots(math.ceil(X.shape[1] / 2), 2, figsize=(20,20), constrained_layout=True)
ax=ax.flatten()
from sklearn import preprocessing
# Same feature values as X, but indexed by cluster label so that one
# cluster's rows can be selected through the index.
X_labels=pd.DataFrame(data=X.values, index=label, columns=X.columns)
# One fixed colour per cluster, identical in every panel.
cluster_colors = sns.color_palette(n_colors=number_of_clusters)

for panel, var in zip(ax, X_labels.columns):

    for clust in range(number_of_clusters):

        values = X_labels.loc[X_labels.index == clust, var]
        # histplot(stat='density', kde=True) replaces the deprecated
        # distplot(norm_hist=True): a density-scaled histogram plus a KDE curve.
        sns.histplot(values, ax=panel, stat='density', kde=True, alpha=0.4,
                     color=cluster_colors[clust], label='Cluster ' + str(clust+1))

    panel.set_title(var, fontsize=30)
    panel.set_xlabel('')
    panel.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=16)

# Hide the unused trailing panel when the number of features is odd.
for unused_axis in ax[X_labels.shape[1]:]:
    unused_axis.set_visible(False)

In [None]:
# Next, we run k-means with 3 clusters and tag every row of X with its
# cluster label and its target value.
kmeans = KMeans(3, random_state=0).fit(X)
label = kmeans.labels_
X["label"] = label
# Assign the target by position. X was built row-for-row from dataset_o, so
# positions always match, whereas assigning the Series itself aligns on index
# labels and pairs rows with the wrong status (or NaN) whenever X and
# dataset_o do not share the same index.
X['status'] = dataset_o['status'].to_numpy()

In [None]:
# One frame per cluster. The helper column `label` is dropped so that it is
# not carried into the per-cluster feature matrices as a constant "feature".
cluster_0 = X[X["label"] == 0].drop(columns="label")
cluster_1 = X[X["label"] == 1].drop(columns="label")
cluster_2 = X[X["label"] == 2].drop(columns="label")

**Training a classifier on the entire dataset vs one for each of the clusters**
In this section, we are going to demonstrate the usefulness of unsupervised learning algorithms, and clustering in particular, as a pre-modelling step. Specifically, we will:

* Train a logistic regression classifier that predicts whether the company will default on its loan using the full dataset
* Train a separate model for each of the identified clusters


In [None]:
# We start with the outlier free datasets containing all observations
dataset_o.describe()

In [None]:
# As a reminder, we check the dispersion with box plot
box_plot(dataset_o.drop(columns="status"), standardize=True)

In [None]:
# Check distribution for target variable.
# catplot is figure-level and always creates its own figure, so the size is
# passed to it directly; a preceding plt.figure() only rendered an extra
# empty figure.
sns.catplot(x='status', kind="count", data=dataset_o, height=10, aspect=1) # categorical plots
plt.show()

In [None]:
# The dataset is very unbalanced, so we "undersample": all y=1 rows are kept
# and y=0 is cut down to (at most) twice the number of y=1 rows.
from sklearn.model_selection import train_test_split

# We keep all y=1
data_1 = dataset_o[dataset_o['status'] == 1]
print(data_1.shape)

# We draw the y=0 sample at random (shuffled, not stratified) with
# train_test_split and keep only its "test" part.
all_data_0 = dataset_o[dataset_o['status'] == 0]
n_majority_keep = 2 * data_1.shape[0]

if n_majority_keep < all_data_0.shape[0]:
    # An integer test_size requests exactly that many rows; a float ratio is
    # rounded up internally, so floating-point error could add one extra row.
    data_0_big, data_0_small = train_test_split(all_data_0, test_size=n_majority_keep,
                                                random_state=0, shuffle=True)
else:
    # There are no more than 2x as many y=0 rows as y=1 rows (the equivalent
    # ratio >= 1 makes train_test_split raise): keep every y=0 row.
    data_0_big, data_0_small = all_data_0.iloc[0:0], all_data_0
print(data_0_big.shape) # remaining from the dataset
print(data_0_small.shape)


In [None]:
# We merge the two dataset

dataset=pd.concat([data_1, data_0_small], axis= 0).reset_index(drop=True)  # axis = 1 by column and = 0 by row
print(dataset.shape)

In [None]:
# We define X and y and standardise
X = dataset.drop(columns=['status'])
y = dataset['status'].values.reshape(-1,1)
print(X.shape)
print(y.shape)

In [None]:
# Standardize every feature column with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test rows contribute to the scaling statistics — consider fitting
# it on the training rows only.
X = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X.values), columns = X.columns)

In [None]:
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=dataset.status)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Fit the model on training set
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=0) # solver (https://towardsdatascience.com/dont-sweat-the-solver-stuff-aea7cddc3451)
model.fit(X_train,y_train) # training the algorithm

In [None]:
import statsmodels.api as sm
# statsmodels does not add an intercept on its own. Without the constant the
# logit is forced through the origin and is not comparable to the
# scikit-learn LogisticRegression fitted above, which includes an intercept
# by default.
logit_model = sm.Logit(y_train, sm.add_constant(X_train))
result = logit_model.fit()
print(result.summary2())

In [None]:
# Get fitted value on testing set
y_test_predicted = model.predict(X_test)

# Compare predictions
display(pd.DataFrame({'True': y_test.flatten(), 'Predicted': y_test_predicted.flatten()}))

# Compare predicted probabilities (default threshold for converting to 0 or 1 is 0.5)
y_test_predicted_prob = model.predict_proba(X_test)[:,1]
display(pd.DataFrame({'True': y_test.flatten(), 'Predicted_prob': y_test_predicted_prob.flatten(), 'Predicted': y_test_predicted.flatten()}))

In [None]:
# Evaluate confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_test_predicted)

In [None]:
# Evaluate confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

def plot_confusion_matrix(y_true, y_pred,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of a binary (0/1) classifier.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class labels, coded as 0 and 1.
    normalize : bool, default False
        If true, every row is divided by its total, so each cell shows the
        share of that true class. A true class with no samples yields a row
        of zeros.
    title : str, optional
        Plot title; a default depending on ``normalize`` is used when empty.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap for the matrix image.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Tick labels are fixed to the two classes, so both labels are requested
    # explicitly: the matrix is then always 2x2, even when y_true/y_pred
    # happen to contain a single class.
    classes = ['0', '1']
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; empty rows stay 0
        # instead of turning into NaN.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)

    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell; white text on cells darker than half the maximum.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plot_confusion_matrix(y_test, y_test_predicted)
plt.show()

In [None]:
# Evaluate precision, recall, F1-score on test set
# A macro-average will compute the metric independently for each class and then take the average (hence treating all classes equally),
# whereas a micro-average will aggregate the contributions of all classes to compute the average metric.
from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_predicted))

In [None]:
# Finally, we plot the ROC curve and the corresponding area under the curve.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# The AUC is computed from the predicted probabilities, i.e. the same scores
# that generate the ROC curve below. Using the hard 0/1 predictions measures
# a single operating point, so the number in the legend did not match the
# plotted curve.
fpr, tpr, thresholds = roc_curve(y_test, y_test_predicted_prob)
logit_roc_auc = roc_auc_score(y_test, y_test_predicted_prob)


plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

**Unsupervised learning as a pre-modelling step**

In the next section, we evaluate a logistic classifier trained separately on each cluster, starting with "cluster_0"

In [None]:
# We start by describing the subset.
cluster_0.describe()

In [None]:
# Check dispersion with box plot
box_plot(cluster_0.drop(columns="status"), standardize=True)

In [None]:
# Check distribution for target variable.
# catplot is figure-level and always creates its own figure, so the size is
# passed to it directly; a preceding plt.figure() only rendered an extra
# empty figure.
sns.catplot(x='status', kind="count", data=cluster_0, height=10, aspect=1) # categorical plots
plt.show()

In [None]:
# As before, the subsample is very unbalanced, so we undersample: all y=1
# rows are kept and y=0 is cut down to (at most) twice the number of y=1 rows.
from sklearn.model_selection import train_test_split

# We keep all y=1
data_1 = cluster_0[cluster_0['status'] == 1]
print(data_1.shape)

# We draw the y=0 sample at random (shuffled, not stratified) with
# train_test_split and keep only its "test" part.
all_data_0 = cluster_0[cluster_0['status'] == 0]
n_majority_keep = 2 * data_1.shape[0]

if n_majority_keep < all_data_0.shape[0]:
    # An integer test_size requests exactly that many rows; a float ratio is
    # rounded up internally, so floating-point error could add one extra row.
    data_0_big, data_0_small = train_test_split(all_data_0, test_size=n_majority_keep,
                                                random_state=0, shuffle=True)
else:
    # There are no more than 2x as many y=0 rows as y=1 rows (the equivalent
    # ratio >= 1 makes train_test_split raise): keep every y=0 row.
    data_0_big, data_0_small = all_data_0.iloc[0:0], all_data_0
print(data_0_big.shape) # remaining from the dataset
print(data_0_small.shape)

In [None]:
# We merge the two dataset
dataset=pd.concat([data_1, data_0_small], axis= 0).reset_index(drop=True)  # axis = 1 by column and = 0 by row
print(dataset.shape)

In [None]:
# We check distribution for target variable after downsampling.
# catplot is figure-level and always creates its own figure, so the size is
# passed to it directly; a preceding plt.figure() only rendered an extra
# empty figure.
sns.catplot(x='status', kind="count", data=dataset, height=10, aspect=1)
plt.show()

In [None]:
# We define X and y and standardize
X = dataset.drop(columns=['status'])
y = dataset['status'].values.reshape(-1,1)
print(X.shape)
print(y.shape)

In [None]:
# Standardize every feature column with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test rows contribute to the scaling statistics — consider fitting
# it on the training rows only.
X = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X.values), columns = X.columns)

In [None]:
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=dataset.status)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Fit the model on training set
model = LogisticRegression(solver='lbfgs', random_state=0) # solver (https://towardsdatascience.com/dont-sweat-the-solver-stuff-aea7cddc3451)
model.fit(X_train,y_train) # training the algorithm

In [None]:
# Get fitted value on test set
y_test_predicted = model.predict(X_test)

# Compare predictions
display(pd.DataFrame({'True': y_test.flatten(), 'Predicted': y_test_predicted.flatten()}))

# Compare predicted probabilities (default threshold for converting to 0 or 1 is 0.5)
y_test_predicted_prob = model.predict_proba(X_test)[:,1]
display(pd.DataFrame({'True': y_test.flatten(), 'Predicted_prob': y_test_predicted_prob.flatten(), 'Predicted': y_test_predicted.flatten()}))

In [None]:
# Evaluate confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_test_predicted)

In [None]:
# Evaluate confusion matrix
plot_confusion_matrix(y_test, y_test_predicted)
plt.show()

In [None]:
# Evaluate precision, recall, F1-score on test set
# A macro-average will compute the metric independently for each class and then take the average (hence treating all classes equally),
# whereas a micro-average will aggregate the contributions of all classes to compute the average metric.
from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_predicted))

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# The AUC is computed from the predicted probabilities, i.e. the same scores
# that generate the ROC curve below. Using the hard 0/1 predictions measures
# a single operating point, so the number in the legend did not match the
# plotted curve.
fpr, tpr, thresholds = roc_curve(y_test, y_test_predicted_prob)
logit_roc_auc = roc_auc_score(y_test, y_test_predicted_prob)


plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()



**Cluster_1: ML model**

In [None]:
cluster_1.describe()

In [None]:
cluster_1.shape

In [None]:
# Check dispersion with box plot
box_plot(cluster_1.drop(columns="status"), standardize=True)

In [None]:
# Check distribution for target variable.
# catplot is figure-level and always creates its own figure, so the size is
# passed to it directly; a preceding plt.figure() only rendered an extra
# empty figure.
sns.catplot(x='status', kind="count", data=cluster_1, height=10, aspect=1) # categorical plots
plt.show()

In [None]:
# The subsample is very unbalanced, so we undersample: all y=1 rows are kept
# and y=0 is cut down to (at most) twice the number of y=1 rows.
from sklearn.model_selection import train_test_split

# We keep all y=1
data_1 = cluster_1[cluster_1['status'] == 1]
print(data_1.shape)

# We draw the y=0 sample at random (shuffled, not stratified) with
# train_test_split and keep only its "test" part.
all_data_0 = cluster_1[cluster_1['status'] == 0]
n_majority_keep = 2 * data_1.shape[0]

if n_majority_keep < all_data_0.shape[0]:
    # An integer test_size requests exactly that many rows; a float ratio is
    # rounded up internally, so floating-point error could add one extra row.
    data_0_big, data_0_small = train_test_split(all_data_0, test_size=n_majority_keep,
                                                random_state=0, shuffle=True)
else:
    # There are no more than 2x as many y=0 rows as y=1 rows (the equivalent
    # ratio >= 1 makes train_test_split raise): keep every y=0 row.
    data_0_big, data_0_small = all_data_0.iloc[0:0], all_data_0
print(data_0_big.shape) # remaining from the dataset
print(data_0_small.shape)

In [None]:
# Merge two dataset

dataset=pd.concat([data_1, data_0_small], axis= 0).reset_index(drop=True)  # axis = 1 by column and = 0 by row
print(dataset.shape)

In [None]:
# Check distribution for target variable after downsampling.
# catplot is figure-level and always creates its own figure, so the size is
# passed to it directly; a preceding plt.figure() only rendered an extra
# empty figure.
sns.catplot(x='status', kind="count", data=dataset, height=10, aspect=1)
plt.show()

In [None]:
X = dataset.drop(columns=['status'])
y = dataset['status'].values.reshape(-1,1)
print(X.shape)
print(y.shape)

In [None]:
# Standardize every feature column with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test rows contribute to the scaling statistics — consider fitting
# it on the training rows only.
X = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X.values), columns = X.columns)

In [None]:
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=dataset.status)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Fit the model on training set
model = LogisticRegression(solver='lbfgs', random_state=0) # solver (https://towardsdatascience.com/dont-sweat-the-solver-stuff-aea7cddc3451)
model.fit(X_train,y_train) # training the algorithm

In [None]:
# Get fitted value on test set
y_test_predicted = model.predict(X_test)

# Compare predictions
display(pd.DataFrame({'True': y_test.flatten(), 'Predicted': y_test_predicted.flatten()}))

# Compare predicted probabilities (default threshold for converting to 0 or 1 is 0.5)
y_test_predicted_prob = model.predict_proba(X_test)[:,1]
display(pd.DataFrame({'True': y_test.flatten(), 'Predicted_prob': y_test_predicted_prob.flatten(), 'Predicted': y_test_predicted.flatten()}))

In [None]:
# Evaluate confusion matrix
plot_confusion_matrix(y_test, y_test_predicted)
plt.show()

In [None]:
# Plot the ROC curve and the corresponding AUC value.
# The AUC is computed from the predicted probabilities, i.e. the same scores
# that generate the ROC curve below. Using the hard 0/1 predictions measures
# a single operating point, so the number in the legend did not match the
# plotted curve.
fpr, tpr, thresholds = roc_curve(y_test, y_test_predicted_prob)
logit_roc_auc = roc_auc_score(y_test, y_test_predicted_prob)


plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

**Cluster_2: ML model**

In [None]:
cluster_2.describe()

In [None]:
cluster_2.shape

In [None]:
# Check dispersion with box plot
box_plot(cluster_2.drop(columns="status"), standardize=True)

In [None]:
# Check distribution for target variable.
# catplot is figure-level and always creates its own figure, so the size is
# passed to it directly; a preceding plt.figure() only rendered an extra
# empty figure.
sns.catplot(x='status', kind="count", data=cluster_2, height=10, aspect=1) # categorical plots
plt.show()

In [None]:
# The subsample is very unbalanced, so we undersample: all y=1 rows are kept
# and y=0 is cut down to (at most) twice the number of y=1 rows.
from sklearn.model_selection import train_test_split

# We keep all y=1
data_1 = cluster_2[cluster_2['status'] == 1]
print(data_1.shape)

# We draw the y=0 sample at random (shuffled, not stratified) with
# train_test_split and keep only its "test" part.
all_data_0 = cluster_2[cluster_2['status'] == 0]
n_majority_keep = 2 * data_1.shape[0]

if n_majority_keep < all_data_0.shape[0]:
    # An integer test_size requests exactly that many rows; a float ratio is
    # rounded up internally, so floating-point error could add one extra row.
    data_0_big, data_0_small = train_test_split(all_data_0, test_size=n_majority_keep,
                                                random_state=0, shuffle=True)
else:
    # There are no more than 2x as many y=0 rows as y=1 rows (the equivalent
    # ratio >= 1 makes train_test_split raise): keep every y=0 row.
    data_0_big, data_0_small = all_data_0.iloc[0:0], all_data_0
print(data_0_big.shape) # remaining from the dataset
print(data_0_small.shape)

In [None]:
# Merge two dataset

dataset=pd.concat([data_1, data_0_small], axis= 0).reset_index(drop=True)  # axis = 1 by column and = 0 by row
print(dataset.shape)

In [None]:
# Check distribution for target variable after downsampling.
# catplot is figure-level and always creates its own figure, so the size is
# passed to it directly; a preceding plt.figure() only rendered an extra
# empty figure.
sns.catplot(x='status', kind="count", data=dataset, height=10, aspect=1)
plt.show()

In [None]:
X = dataset.drop(columns=['status'])
y = dataset['status'].values.reshape(-1,1)
print(X.shape)
print(y.shape)

In [None]:
# Standardize every feature column with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test rows contribute to the scaling statistics — consider fitting
# it on the training rows only.
X = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X.values), columns = X.columns)

In [None]:
# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=dataset.status)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Fit the model on training set
model = LogisticRegression(solver='lbfgs', random_state=0) # solver (https://towardsdatascience.com/dont-sweat-the-solver-stuff-aea7cddc3451)
model.fit(X_train,y_train) # training the algorithm

In [None]:
# Get fitted value on test set
y_test_predicted = model.predict(X_test)

# Compare predictions
display(pd.DataFrame({'True': y_test.flatten(), 'Predicted': y_test_predicted.flatten()}))

# Compare predicted probabilities (default threshold for converting to 0 or 1 is 0.5)
y_test_predicted_prob = model.predict_proba(X_test)[:,1]
display(pd.DataFrame({'True': y_test.flatten(), 'Predicted_prob': y_test_predicted_prob.flatten(), 'Predicted': y_test_predicted.flatten()}))

In [None]:
# Evaluate confusion matrix
plot_confusion_matrix(y_test, y_test_predicted)
plt.show()

In [None]:
# The AUC is computed from the predicted probabilities, i.e. the same scores
# that generate the ROC curve below. Using the hard 0/1 predictions measures
# a single operating point, so the number in the legend did not match the
# plotted curve.
fpr, tpr, thresholds = roc_curve(y_test, y_test_predicted_prob)
logit_roc_auc = roc_auc_score(y_test, y_test_predicted_prob)


plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()