In [1]:
import sys
# Put the parent directory on the import path. Presumably this is where the
# project-local packages imported below (data_cleaning, data_exploration) live;
# verify against the repository layout.
sys.path.append("../")

import pandas as pd
import numpy as np

from data_cleaning.pre_process import pre_process_raw_data
import data_exploration.visualisation_functions as vf 

from bokeh.io import output_notebook, curdoc
from bokeh.plotting import show

# Configure Bokeh so that show() renders figures inline in the notebook output.
output_notebook()

## Load data


In [34]:
# Read the raw loan export (first CSV column becomes the index) and run it
# through the project's pre-processing step.
df = pd.read_csv("../static/data/LoanData.csv", index_col=[0], low_memory=False)
df = pre_process_raw_data(df)

# Percentage of missing fields per loan. The row-wise mean of the null mask
# divides by the actual number of columns, so the figure stays correct if the
# pre-processing step adds or removes columns (previously a hard-coded 49).
# Computed before the two helper columns below are added, so they do not
# count towards the denominator.
df['perc_null'] = df.isnull().mean(axis=1).mul(100).round(2)

# Bucket each loan by its share of missing fields: exactly 0 -> 'None',
# below 10 -> 'Low', everything else -> 'High'.
df['perc_null_category'] = df['perc_null'].apply(
    lambda x: 'None' if x == 0 else ('Low' if x < 10 else 'High')
)

# Basic stats of loans

In [29]:
# Count, mean, standard deviation and median (the 50% row) of the three
# headline loan fields across every loan, rounded to two decimals.
loan_columns = ['Amount', 'Interest', 'LoanDuration']
summary_rows = ['count', 'mean', 'std', '50%']
df[loan_columns].describe().loc[summary_rows].round(2)

Unnamed: 0,Amount,Interest,LoanDuration
count,21450.0,21450.0,21450.0
mean,2126.16,26.29,37.76
std,2061.99,8.52,19.25
50%,1500.0,28.0,36.0


In [30]:
# Same summary as above, restricted to loans whose PaidLoan flag is True.
paid_mask = df['PaidLoan'].eq(True)
df_paid = df.loc[paid_mask]
(
    df_paid[['Amount', 'Interest', 'LoanDuration']]
    .describe()
    .loc[['count', 'mean', 'std', '50%']]
    .round(2)
)

Unnamed: 0,Amount,Interest,LoanDuration
count,11998.0,11998.0,11998.0
mean,1851.56,25.18,33.13
std,1858.35,8.17,19.35
50%,1100.0,26.05,36.0


In [31]:
# Summary statistics for loans whose PaidLoan flag is False.
# Bound to its own name: this subset used to be assigned to `df_paid`, which
# both mislabelled it and overwrote the paid-loan frame built in the previous
# cell with the opposite subset.
df_unpaid = df[df['PaidLoan'] == False]
df_unpaid[['Amount', 'Interest', 'LoanDuration']].describe().loc[['count', 'mean', 'std', '50%']].round(2)

Unnamed: 0,Amount,Interest,LoanDuration
count,9452.0,9452.0,9452.0
mean,2474.72,27.69,43.63
std,2246.88,8.73,17.42
50%,1893.5,28.0,48.0


## Distribution of categorical variables

In [3]:
# Keep only the object-dtype columns, minus UserName and Country, and draw one
# bar chart per remaining column in a three-column layout.
object_vars_df = df.select_dtypes(include=[object]).drop(columns=['UserName', 'Country'])
vf.create_plot_layout(
    df=object_vars_df,
    number_columns=3,
    plot_func=vf.plot_bar_chart_distribution,
)

## Distribution of numerical variables

In [4]:
# Integer and float dtypes that count as "numerical" for this section.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Numeric columns except LoanNumber, plotted as binned (8 bins) bar charts.
num_vars_df = df.select_dtypes(include=numerics).drop(columns=['LoanNumber'])
vf.create_plot_layout(
    df=num_vars_df,
    number_columns=3,
    plot_func=vf.plot_bar_chart_distribution,
    is_categorical=False,
    bins=8,
)

## Distribution of categorical values by gender

In [53]:
# Rebuild the object-dtype frame (without UserName and Country) and plot each
# column grouped by Gender; the Gender column itself is skipped and the
# 'Unknown' value is passed as vars_to_drop.
object_vars_df = df.select_dtypes(include=[object]).drop(columns=['UserName', 'Country'])
vf.create_plot_layout(
    df=object_vars_df,
    number_columns=2,
    plot_func=vf.plot_multiple_categorical_bar_chart_distribution,
    ignore_cols=['Gender'],
    group_category='Gender',
    vars_to_drop='Unknown',
    plot_width=500,
)

## Distribution of categorical values by age

In [55]:
# Same grouped layout as the by-gender section, this time grouped by AgeGroup.
# Reads `object_vars_df` as left in the kernel by an earlier cell.
vf.create_plot_layout(
    df=object_vars_df,
    number_columns=2,
    plot_func=vf.plot_multiple_categorical_bar_chart_distribution,
    ignore_cols=['AgeGroup'],
    group_category='AgeGroup',
    plot_width=500,
)

# Distribution of null values

In [10]:
# Bar chart of the per-loan null percentage, binned into 5 buckets.
p = vf.plot_bar_chart_distribution(
    df,
    'perc_null',
    is_categorical=False,
    bins=5,
)
show(p)

# Distribution of categorical values for entries with no, low, or high levels of null values

In [11]:
# Palette for the grouped bars; one colour is taken per distinct value of the
# column currently used as the grouping category.
# NOTE(review): the last two entries are the same colour in different letter
# case, so only 11 distinct shades are available. Confirm whether a twelfth
# shade was intended.
indigo_shades = ["#9fa8da", "#5c6bc0", "#3949ab", "#283593", "#8c9eff", "#3f51b5","#1a237e","#7986cb","#c5cae9",
                 "#303f9f", "#e8eaf6", "#E8EAF6"]

# Object-dtype columns minus UserName, Country and OccupationArea, with missing
# values replaced by the literal string 'nan' so they show up as their own
# category. The fill does not depend on the loop variable, so it is done once
# here instead of on every iteration. The frame has its own name so that it
# does not overwrite `object_vars_df`, which the by-age section reads.
null_level_vars_df = (
    df.select_dtypes(include=[object])
    .drop(columns=['UserName', 'Country', 'OccupationArea'])
    .fillna('nan')
)

# One chart per categorical column: distribution of perc_null_category, split
# by that column's values.
for col in null_level_vars_df.columns.drop('perc_null_category'):
    n_levels = null_level_vars_df[col].nunique()
    p = vf.plot_multiple_categorical_bar_chart_distribution(
        null_level_vars_df,
        'perc_null_category',
        group_category=col,
        plot_width=600,
        colours=indigo_shades[:n_levels],
    )
    show(p)