# Introduction



# Dataset and Libraries setup

## Libraries and Dataset setup

Download the dataset and the libraries needed.

**Libraries**
- pandas
- numpy
- kaggle
- plotly

In [None]:
### Download files
! apt-get update > /dev/null
! apt-get upgrade > /dev/null
! apt-get install unzip > /dev/nulll

# Python libs
! pip install -q kaggle > /dev/null
! pip install plotly==4.14.3 > /dev/null
! pip install -U kaleido > /dev/null

Extracting templates from packages: 100%


Import plot and other useful libraries

In [None]:
### Import section

# Plot
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.graph_objects as go

# Others
import csv
from io import StringIO
from datetime import datetime

## Import dataset
There are two ways to download the dataset: from Kaggle using the Kaggle API, or from GitHub (the GitHub copy may be out of date).

In [None]:
### Delete old folders and create new ones
! rm -r /content/data > /dev/null
! mkdir /content/data/ > /dev/null
! rm -r ~/.kaggle > /dev/null
! mkdir ~/.kaggle > /dev/null

### Kaggle

Connect to Kaggle, download the dataset and set up the files

In [None]:
### Kaggle download setup 

# Insert here the link to the token json file
! wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1-4sfcaQg3DdP6ZoRnm0uFvM5VlMhPDSM' -P /content -O kaggle.json > /dev/null

# Copy token in the right folder
! cp kaggle.json ~/.kaggle/ > /dev/null
! chmod 600 ~/.kaggle/kaggle.json > /dev/null

# Download dataset
! kaggle datasets download -d fedesoriano/company-bankruptcy-prediction -p /content/data > /dev/null

# Unzip and remove the zip
! unzip /content/data/company-bankruptcy-prediction.zip -d /content/data > /dev/null
! rm /content/data/company-bankruptcy-prediction.zip > /dev/null

### Github
Connect to GitHub, download the dataset and set up the files

In [None]:
# ### Download dataset from github repository

# # Download all the files
# ! wget -P /content/data https://raw.githubusercontent.com/thisispivi/Deep-Learning-Company-Bankruptcy-Prediction/main/data/data.zip

# # Unzip and remove the zip
# ! unzip /content/data/data.zip -d /content/data
# ! rm /content/data/data.zip

## Read Files
In this section we import the csv files.

In [None]:
# Read the CSV from the same absolute folder the download cells unzip into
# (/content/data), so the load does not depend on the current working directory.
df = pd.read_csv('/content/data/data.csv')

Split the dataset into labels and data

In [None]:
# Target column on one side, every remaining column as features on the other.
labels = df['Bankrupt?']
data = df.drop(columns=['Bankrupt?'])

# Deep Learning

## Import Section

In [None]:
from tensorflow import keras
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE



## Analyze dataset

In this section we will analyze the dataset shape, balance and if it has null values in its rows

### Shape

Check the shape of the dataset

In [None]:
# Report the (rows, columns) of the features and the length of the target.
print(f'Data shape: {data.shape}')
print(f'Labels shape: {labels.shape}')

NameError: ignored

### Null values

Check if there are null values

In [None]:
# Number of missing values per column (displayed as the cell output).
df.isna().sum()

There are no null values so we don't have to deal with them

### Balance

Check if the dataset is balanced

In [None]:
# Rows per class, looked up by label: 0 and 1 are the values of 'Bankrupt?'.
result = df['Bankrupt?'].value_counts()
zero_percentage = round((result[0] * 100) / (result[0] + result[1]), 2)
print(
    f"No. of 0: {result[0]!s}\n"
    f"No. of 1: {result[1]!s}\n"
    f"Percentage of 0: {zero_percentage!s} %\n"
    f"Percentage of 1: {round((100 - zero_percentage), 2)!s} %"
)

In [None]:
# Bar chart of the per-class row counts stored in `result`.
plt.bar(
    ["No Bankrupt", "Bankrupt"],
    [result[0], result[1]],
    color=["royalblue", "indianred"],
)
plt.ylabel("Count")
plt.title("Number of No Bankrupt rows vs number of Bankrupt rows ")

In [None]:
# Pie chart of the class shares; the first wedge is pulled out slightly.
plt.pie(
    x=[result[0], result[1]],
    labels=["No Bankrupt", "Bankrupt"],
    explode=(0.1, 0),
    autopct='%1.2f%%',
    colors=["thistle", "paleturquoise"],
    radius=1.2,
)
plt.title("Percentage of No Bankrupt vs percentage of Bankrupt")

The dataset is not balanced. If we don't fix this, the model will reach a misleadingly high accuracy when we train it, because the network can simply predict the majority class for every row.

## Normalize values

In this section we normalize the values: we take the columns whose maximum value is greater than 1 (and check for columns with values below 0) and standardize them using *StandardScaler()*. This scaler subtracts each column's mean and divides by its standard deviation, so the scaled columns have zero mean and unit variance (the values are not confined to the range 0 to 1).

In [None]:
### Normalize values

# Take the columns with values over 1 (per-column maxima computed only once)
column_max = df.max()
cols_for_scale = column_max[column_max > 1]
scale_columns = list(cols_for_scale.keys())
# The original cell also evaluated `df.min()[df.min()<0]` and discarded the
# result; the author noted it was empty (no negative values), so no further
# columns are selected here.

# Standardize the selected columns (zero mean, unit variance per column).
# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/validation/test split made further down - confirm this is intended.
scale = StandardScaler()
scaled = scale.fit_transform(df[scale_columns])
# Substitute the old values with the normalized ones in a single assignment;
# the columns of `scaled` are in the same order as `scale_columns`.
df[scale_columns] = scaled
# Update labels and data
labels = df['Bankrupt?']
data = df.drop(['Bankrupt?'], axis=1)

## Balance Dataset using SMOTE
To balance the dataset we use SMOTE (Synthetic Minority Oversampling Technique).

[Link](https://towardsdatascience.com/applying-smote-for-class-imbalance-with-just-a-few-lines-of-code-python-cdf603e58688)

Import SMOTE and resample the data

In [None]:
# Fixed seed so the synthetic minority rows - and every split and model built
# on top of them - are reproducible after Restart & Run All.
# NOTE(review): the whole dataset is oversampled here, before the
# train/validation/test split, so synthetic rows can end up in the held-out
# sets - confirm this is intended.
sm = SMOTE(random_state=42)
data_new, labels_new = sm.fit_resample(data, labels)

Check the shapes

In [None]:
# Shapes of the resampled features and labels.
print(f'Data shape: {data_new.shape}')
print(f'Labels shape: {labels_new.shape}')

In [None]:
new_df = pd.DataFrame(labels_new)
# Count the classes straight from the labels instead of from column `0` of
# `new_df`: depending on the imbalanced-learn version `labels_new` is either a
# NumPy array (column 0) or a pandas Series named 'Bankrupt?' (no column 0, so
# `new_df[0]` raises KeyError). pd.Series() accepts both.
result = pd.Series(labels_new).value_counts()
zero_percentage = round((result[0]*100)/(result[0]+result[1]),2)
print("No. of 0: "+ str(result[0]) + "\nNo. of 1: " + str(result[1]) + 
      "\nPercentage of 0: "+ str(zero_percentage)+ " %\nPercentage of 1: "+
      str(round((100-zero_percentage),2))+" %")

In [None]:
# Bar chart of the per-class row counts after resampling.
plt.bar(
    ["No Bankrupt", "Bankrupt"],
    [result[0], result[1]],
    color=["royalblue", "indianred"],
)
plt.ylabel("Count")
plt.title("Number of No Bankrupt rows vs number of Bankrupt rows ")

In [None]:
# Pie chart of the class shares after resampling.
plt.pie(
    x=[result[0], result[1]],
    labels=["No Bankrupt", "Bankrupt"],
    explode=(0.1, 0),
    autopct='%1.2f%%',
    colors=["thistle", "paleturquoise"],
    radius=1,
)
plt.title("Percentage of No Bankrupt vs percentage of Bankrupt")

As we can see the dataset is perfectly balanced.

## Split data into training, validation and test set

Split the data in:
* x_train: The training set data
* y_train: The training set label
* x_valid: The validation set data
* y_valid: The validation set label
* x_test: The test set data
* y_test: The test set label

The dimension will be something like

* Training: 72% (80% of the 90% left after removing the test set)
* Validation: 18%
* Test: 10%

In [None]:
# Hold out 10% of the resampled rows as the test set, then keep 80% of the
# remainder for training and 20% for validation (72% / 18% / 10% overall).
# A fixed random_state makes both shuffles reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    data_new, labels_new, train_size=0.9, random_state=42)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, train_size=0.8, random_state=42)

Print all the sizes

In [None]:
# One line per array: data/labels shapes for the train, validation and test sets.
print(
    f'Train data shape: {x_train.shape}',
    f'Train labels shape: {y_train.shape}',
    f'Validation data shape: {x_valid.shape}',
    f'Validation labels shape: {y_valid.shape}',
    f'Test data shape: {x_test.shape}',
    f'Test labels shape: {y_test.shape}',
    sep='\n',
)

## Create New Model

### Options

In this section there are some boolean variables to tune what the code will do:

* train_model -> True: the network will be trained / False: the network won't be trained
* model_loss -> True: plot the model loss / False: don't plot the model loss
* model_accuracy -> True: plot the model accuracy / False: don't plot the model accuracy
* evaluate_model -> True: evaluate the model / False: don't evaluate the model
* conf_matr -> True: plot the confusion matrix / False: don't plot the confusion matrix
* plot_model -> True: plot the structure of the network / False: don't plot the structure of the network
* save_model -> True: save the model / False: don't save the model

In [None]:
# Switches read by the `if <flag> == True:` guards of the cells below.

# Training
train_model = True
# Diagnostics: loss/accuracy curves, test evaluation, confusion matrices
model_loss = model_accuracy = True
evaluate_model = conf_matr = True
# Draw the network structure to an image file
plot_model = True
# Persist the trained model to disk
save_model = False

### Create the network

In [None]:
# Fully connected binary classifier: 95 input features, four ReLU hidden
# layers (128-64-32-16) and a single sigmoid output unit. The 64/32/16 layers
# carry an L2 weight penalty; dropout follows the 64- and 32-unit layers.
model = keras.models.Sequential([
    keras.layers.Dense(128, activation='relu', input_shape=(95,)),
    keras.layers.Dense(64, activation='relu',
                       kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(32, activation='relu',
                       kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(16, activation='relu',
                       kernel_regularizer=keras.regularizers.l2(0.001)),
    keras.layers.Dense(1, activation='sigmoid'),
])
optimizer = keras.optimizers.RMSprop(learning_rate=0.001)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

### Train the network

In [None]:
# Fit for 200 epochs, tracking loss/accuracy on the validation split; the
# returned History object feeds the loss and accuracy plots.
if train_model:
  history = model.fit(
      x_train,
      y_train,
      validation_data=(x_valid, y_valid),
      epochs=200,
  )

### Loss graph of the model

In [None]:
# Training vs validation loss per epoch, drawn on an explicit Axes.
if model_loss:
  fig, ax = plt.subplots(figsize=(12, 8))
  ax.plot(history.history['loss'])
  ax.plot(history.history['val_loss'])
  ax.set_title('Model Loss')
  ax.set_ylabel('Loss')
  ax.set_xlabel('Epoch')
  ax.legend(['Train', 'Validation'], loc='upper right')
  plt.show()

### Accuracy graph of the model

In [None]:
# Training vs validation accuracy per epoch, drawn on an explicit Axes.
if model_accuracy:
  fig, ax = plt.subplots(figsize=(12, 8))
  ax.plot(history.history['accuracy'])
  ax.plot(history.history['val_accuracy'])
  ax.set_title('Model Accuracy')
  ax.set_ylabel('Accuracy')
  ax.set_xlabel('Epoch')
  ax.legend(['Train', 'Validation'], loc='lower right')
  plt.show()

### Evaluate the model

Check how well the model performs on the test set


In [None]:
# Compute loss and accuracy on the held-out test split.
if evaluate_model:
  model.evaluate(x=x_test, y=y_test)

### Confusion Matrix

Compute the label prediction using the test set and plot the confusion matrix.

In [None]:
if conf_matr:
  # Threshold the model outputs at 0.5 to obtain boolean class predictions.
  predictions = model.predict(x_test)
  classes = predictions > 0.5
  cm = confusion_matrix(y_test, classes)

  # Plot
  plt.figure(figsize=(10, 7))
  ax = plt.subplot()
  # annot=True writes the count inside each cell; fmt='g' disables scientific notation
  sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap="PuBu")
  ax.set_xlabel('Predicted labels')
  ax.set_ylabel('True labels')
  ax.set_title('Confusion Matrix')
  ax.xaxis.set_ticklabels(['No Bankrupt', 'Bankrupt'])
  ax.yaxis.set_ticklabels(['No Bankrupt', 'Bankrupt'])
  print(classification_report(y_test, classes))

### Test performance original dataset

Here we see how the network performs on the dataset unmodified by the SMOTE

In [None]:
# 10% sample of the original (not oversampled) data for a second evaluation.
# A fixed random_state keeps the sampled rows, and therefore the reported
# metrics, the same between runs.
# NOTE(review): these rows were all part of the data passed to SMOTE before the
# training split, so most of them were probably seen during training - verify
# before treating this as an independent test.
x_original_train, x_original_test, y_original_train, y_original_test = train_test_split(
    data, labels, train_size=0.9, random_state=42)

#### Confusion Matrix

In [None]:
if conf_matr:
  # Threshold the model outputs at 0.5 to obtain boolean class predictions
  # for the original-data test rows.
  predictions = model.predict(x_original_test)
  classes = predictions > 0.5
  cm = confusion_matrix(y_original_test, classes)

  # Plot
  plt.figure(figsize=(10, 7))
  ax = plt.subplot()
  # annot=True writes the count inside each cell; fmt='g' disables scientific notation
  sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap="PuBu")
  ax.set_xlabel('Predicted labels')
  ax.set_ylabel('True labels')
  ax.set_title('Confusion Matrix')
  ax.xaxis.set_ticklabels(['No Bankrupt', 'Bankrupt'])
  ax.yaxis.set_ticklabels(['No Bankrupt', 'Bankrupt'])
  # Report against the labels that belong to these predictions; the original
  # passed `y_test` (labels of the SMOTE test split), which does not match
  # `classes` computed from `x_original_test`.
  print(classification_report(y_original_test, classes))

### Plot model

In [None]:
# Render the layer graph, with tensor shapes, to an image file.
if plot_model:
  dot_img_file = "network.png"
  keras.utils.plot_model(model, show_shapes=True, to_file=dot_img_file)

### Save the model

In [None]:
# Persist the trained model under the name used by the zip/load cells below.
if save_model:
  file_name = 'acc_97'
  model.save(filepath=file_name)

# ! zip -r model.zip acc_97/

## Load the model

Upload the model.zip file to Colab. Uncomment the cells below to use this section.

Unzip the model

In [None]:
# ! unzip model.zip

Import the model

In [None]:
# file_name = 'acc_97'
# model = keras.models.load_model(file_name)