# Exploratory analysis

## Setup

In [35]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

import plotly.express as px
import textwrap

In [36]:
files_path = ''
if 'google.colab' in str(get_ipython()):
    print('TO DO: Set up Google Colab')
    # print('Running in Google Colab')
    # from google.colab import drive
    # mount_point = '/content/drive'
    # drive.mount(mount_point)
    # files_path = mount_point + '/MyDrive/Colab Notebooks/'
else:
    print('Running locally')
    files_path = '../data/'

Running locally


In [37]:
from matplotlib import cycler, rcdefaults
rcdefaults()
colors = cycler('color',
                ['#EE6666', '#3388BB', '#9988DD',
                 '#EECC55', '#88BB44', '#FFBBBB'])
plt.rc('axes', facecolor='#E6E6E6', edgecolor='none',
       axisbelow=True, grid=True, prop_cycle=colors)
plt.rc('grid', color='w', linestyle='solid')
plt.rc('xtick', direction='out', color='gray')
plt.rc('ytick', direction='out', color='gray')
plt.rc('patch', edgecolor='#E6E6E6')
plt.rc('lines', linewidth=2)
plt.rc('text', color='gray')
plt.rc('axes', labelcolor='gray')

## 1. Exploratory analysis

In [38]:
german_df = pd.read_csv(files_path + 'syntetic_sample.csv')

In [39]:
german_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 21 columns):
 #   Column                   Non-Null Count    Dtype 
---  ------                   --------------    ----- 
 0   sex                      1000000 non-null  object
 1   marrital_status          1000000 non-null  object
 2   age                      1000000 non-null  int64 
 3   n_of_liables             1000000 non-null  int64 
 4   job                      1000000 non-null  object
 5   foreign_worker           1000000 non-null  int64 
 6   present_employee_since   1000000 non-null  object
 7   telephone                1000000 non-null  int64 
 8   housing                  1000000 non-null  object
 9   present_residence_since  1000000 non-null  int64 
 10  property                 1000000 non-null  object
 11  checking_account         1000000 non-null  object
 12  savings                  1000000 non-null  object
 13  purpose                  1000000 non-null  object
 14  cre

In [40]:
german_df.head()

Unnamed: 0,sex,marrital_status,age,n_of_liables,job,foreign_worker,present_employee_since,telephone,housing,present_residence_since,...,checking_account,savings,purpose,credit_history,duration,credit_amount,guarantors,other_installment_plans,credits_at_bank,risk
0,female,divorced,58,1,unskilled resident,1,>=7y,0,for free,4,...,< 0 DM,<100 DM,used car,5,48,6416,none,bank,2,0
1,male,single,47,1,qualified,1,1-4y,0,own,3,...,>= 200 DM,100-500 DM,business,3,36,8229,none,none,4,0
2,male,married/widowed,35,1,qualified,1,>=7y,1,own,1,...,< 0 DM,100-500 DM,radio/television,3,48,6999,none,bank,1,0
3,female,divorced,26,1,highly qualified,1,unemployed,1,rent,4,...,no checking account,0 or unk.,used car,3,48,9055,none,none,1,1
4,male,single,43,2,highly qualified,1,>=7y,1,own,4,...,0 <= ... < 200 DM,100-500 DM,radio/television,5,24,1175,none,none,1,0


In [41]:
non_numerical_cols = list(german_df.select_dtypes(exclude=[np.number]).columns)
print("Non-numerical columns:")
non_numerical_cols

Non-numerical columns:


['sex',
 'marrital_status',
 'job',
 'present_employee_since',
 'housing',
 'property',
 'checking_account',
 'savings',
 'purpose',
 'guarantors',
 'other_installment_plans']

In [42]:
for col in non_numerical_cols:
    print(f"Unique values in {col}:")
    print(german_df[col].unique())
    print()

Unique values in sex:
['female' 'male']

Unique values in marrital_status:
['divorced' 'single' 'married/widowed']

Unique values in job:
['unskilled resident' 'qualified' 'highly qualified'
 'unemployed/unskilled non-resident']

Unique values in present_employee_since:
['>=7y' '1-4y' 'unemployed' '<1y' '4-7y']

Unique values in housing:
['for free' 'own' 'rent']

Unique values in property:
['unk. / no property' 'real estate' 'building society / life insurance'
 'car or other']

Unique values in checking_account:
['< 0 DM' '>= 200 DM' 'no checking account' '0 <= ... < 200 DM']

Unique values in savings:
['<100 DM' '100-500 DM' '0 or unk.' '500-1000 DM' '>1000 DM']

Unique values in purpose:
['used car' 'business' 'radio/television' 'education'
 'furniture/equipment' 'new car' 'repairs' 'retraining' 'others'
 'domestic appliances']

Unique values in guarantors:
['none' 'guarantor' 'co-applicant']

Unique values in other_installment_plans:
['bank' 'none' 'stores']



In [45]:
ordinal_non_numeric_cols = ['present_employee_since', 'checking_account', 'savings', 'job']
non_ordinal_non_numeric_cols = list(set(non_numerical_cols) - set(ordinal_non_numeric_cols))
numeric_cols = list(german_df.select_dtypes(include=[np.number]).columns)

In [46]:
print("Ordinal non-numerical columns:")
ordinal_non_numeric_cols

Ordinal non-numerical columns:


['present_employee_since', 'checking_account', 'savings', 'job']

In [47]:
print("Non-ordinal non-numerical columns:")
non_ordinal_non_numeric_cols

Non-ordinal non-numerical columns:


['other_installment_plans',
 'sex',
 'property',
 'marrital_status',
 'guarantors',
 'housing',
 'purpose']

In [48]:
print("Numerical columns:")
numeric_cols

Numerical columns:


['age',
 'n_of_liables',
 'foreign_worker',
 'telephone',
 'present_residence_since',
 'credit_history',
 'duration',
 'credit_amount',
 'credits_at_bank',
 'risk']