# CreditCardApproval EDA Diana Max

## Setup notebook & fetch data

In [None]:
from ucimlrepo import fetch_ucirepo
#from ydata_profiling import ProfileReport

import numpy as np
import pandas as pd


import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats.contingency import association
from scipy.stats import pointbiserialr, pearsonr, spearmanr

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import TargetEncoder, FunctionTransformer, LabelBinarizer, label_binarize, OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import itertools
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

%matplotlib inline

In [None]:
# Download dataset 27 from the UCI ML repository.
credit_approval = fetch_ucirepo(id=27)

# Shorthand handles: feature frame, target frame and the original combined table.
data_bundle = credit_approval.data
X, y, df = data_bundle.features, data_bundle.targets, data_bundle.original

## First look

In [None]:
# df is the same object as credit_approval.data.original (bound in the fetch cell).
df

In [None]:
# ProfileReport(df)

## Univariate EDA

### A1

In [None]:
A1 = df[['A1','A16']]

# Number of rows per (A1, A16) pair; the former row index serves as the counted column.
A1_counts = A1.reset_index().groupby(['A1', 'A16']).count().reset_index()

ax = sns.barplot(data=A1_counts, x='A1', y='index', hue='A16')
ax.set(ylabel='value count')

In [None]:
# Number of missing entries in A1.
df['A1'].isna().sum()

Impute missing values with most frequent category.

### A2

In [None]:
# Raw A2 paired with the target, drawn as per-class density histograms.
A2 = df[['A2', 'A16']]
sns.histplot(data=A2, x='A2', hue='A16', stat='density')

Let's try a log1p transform.

In [None]:
# log1p-transformed A2. Derived from the raw frame (not from the previous value of A2)
# so that re-running this cell never stacks a second log1p on top of the first.
A2 = df[['A2', 'A16']].assign(A2=lambda frame: np.log1p(frame['A2']))
sns.histplot(A2, x='A2', hue='A16', stat='density')

The distribution is less skewed now, so it seems like a useful transformation.

In [None]:
# Number of missing entries in A2.
df['A2'].isna().sum()

It's still skewed, so impute missing values with the median.

### A3

In [None]:
# Raw A3 paired with the target, drawn as per-class density histograms.
A3 = df[['A3', 'A16']]
sns.histplot(data=A3, x='A3', hue='A16', stat='density')

Let's try a log1p transform.

In [None]:
# log1p-transformed A3. Derived from the raw frame (not from the previous value of A3)
# so that re-running this cell never stacks a second log1p on top of the first.
A3 = df[['A3', 'A16']].assign(A3=lambda frame: np.log1p(frame['A3']))
sns.histplot(A3, x='A3', hue='A16', stat='density')

The distribution is much less skewed now. Looks like a very good option.

In [None]:
# Number of missing entries in A3.
df['A3'].isna().sum()

It's still skewed, so impute missing values with the median.

### A4

In [None]:
A4 = df[['A4','A16']]

# Number of rows per (A4, A16) pair; the former row index serves as the counted column.
A4_counts = A4.reset_index().groupby(['A4', 'A16']).count().reset_index()

ax = sns.barplot(data=A4_counts, x='A4', y='index', hue='A16')
ax.set(ylabel='value count')

In [None]:
# Fold level 'l' into 'u'. A4 is a column subset of df, so build a new frame with
# .assign instead of assigning into it (which triggers SettingWithCopyWarning).
A4 = A4.assign(A4=A4['A4'].replace('l', 'u'))

In [None]:
# Same count plot as before, now on the merged A4 levels.
A4_counts = A4.reset_index().groupby(['A4', 'A16']).count().reset_index()

ax = sns.barplot(data=A4_counts, x='A4', y='index', hue='A16')
ax.set(ylabel='value count')

Bin categories to u and non-u.

In [None]:
# Number of missing entries in A4.
df['A4'].isna().sum()

Replace missing values with the most-frequent category.

### A5

In [None]:
A5 = df[['A5','A16']]

# Number of rows per (A5, A16) pair; the former row index serves as the counted column.
A5_counts = A5.reset_index().groupby(['A5', 'A16']).count().reset_index()

ax = sns.barplot(data=A5_counts, x='A5', y='index', hue='A16')
ax.set(ylabel='value count')

Bin categories to g and non-g.

In [None]:
# Collapse levels 'gg' and 'p' into a single 'non-g' level. A5 is a column subset of df,
# so build a new frame with .assign instead of assigning into it
# (which triggers SettingWithCopyWarning).
A5 = A5.assign(A5=A5['A5'].replace(['gg', 'p'], 'non-g'))

In [None]:
# Same count plot as before, now on the merged A5 levels.
A5_counts = A5.reset_index().groupby(['A5', 'A16']).count().reset_index()

ax = sns.barplot(data=A5_counts, x='A5', y='index', hue='A16')
ax.set(ylabel='value count')

In [None]:
# Number of missing entries in A5.
df['A5'].isna().sum()

Replace missing values with the most-frequent category.

### A6

In [None]:
# Number of rows per (A6, A16) pair; the former row index serves as the counted column.
A6_counts = df.reset_index().groupby(['A6', 'A16']).count().reset_index()

ax = sns.barplot(data=A6_counts, x='A6', y='index', hue='A16')
ax.set(ylabel='value count')

Many categories with quite a range of different distributions of A16 (target) values. Target encoding seems like a reasonable choice.

In [None]:
#make_pipeline(TargetEncoder(), SimpleImputer(strategy="median"))

In [None]:
# Target-encode A6 against A16. The encoder expects a 2-D input, hence the added axis.
# A fixed random_state keeps the encoder's internal shuffled cross-fitting, and with it
# the plot, reproducible between runs.
A6_encoded = TargetEncoder(random_state=0).fit_transform(df.A6.values[:, np.newaxis], df.A16)

ax = sns.histplot(
    data=pd.DataFrame({'A6': A6_encoded.ravel(), 'A16': df.A16.values}),
    x='A6',
    hue='A16',
    stat='density'
)

Target encoding looks promising.

In [None]:
# Number of missing entries in A6.
df['A6'].isna().sum()

Impute with median.

### A7

In [None]:
# Number of rows per (A7, A16) pair; the former row index serves as the counted column.
A7_counts = df.reset_index().groupby(['A7', 'A16']).count().reset_index()

ax = sns.barplot(data=A7_counts, x='A7', y='index', hue='A16')
ax.set(ylabel='value count')

Naively try target encoding first.

In [None]:
# Target-encode A7 against A16. The encoder expects a 2-D input, hence the added axis.
# A fixed random_state keeps the encoder's internal shuffled cross-fitting, and with it
# the plot, reproducible between runs.
A7_encoded = TargetEncoder(random_state=0).fit_transform(df.A7.values[:, np.newaxis], df.A16)

ax = sns.histplot(
    data=pd.DataFrame({'A7': A7_encoded.ravel(), 'A16': df.A16.values}),
    x='A7',
    hue='A16',
    kde=True
)

This does not look very useful yet, as the kernel density estimates show curves of almost the same shape and location.

Many categories with almost no values. Summarising those categories, that have similar distributions of A16, so: (v, dd, j), (h, z), (bb, n, o), (ff), could be an option.

In [None]:
# Collapse every listed A7 level into a single 'not v' bucket; 'v' is left as is.
non_v_levels = ['ff', 'dd', 'j', 'h', 'z', 'bb', 'n', 'o']
A7_binned = df.A7.replace(dict.fromkeys(non_v_levels, 'not v'))

ax = sns.histplot(
    # to_numpy() replaces the deprecated Series.ravel() and yields the same array
    data=pd.DataFrame({'A7 new': A7_binned.to_numpy(), 'A16': df.A16.values}),
    x='A7 new',
    hue='A16',
    multiple='dodge',
    shrink=.8
)

This still doesn't look very useful, suggesting either the feature is generally bad (at least for a linear classifier) or the new categories are bad.

We will come back to this feature later on. We will have a look at correlations and mutual information first to get a little more information about this feature. If it contains a lot of information, one-hot encoding might be worth it (it adds dimensions, but this might be worthwhile for a good feature).


Otherwise we might try other binnings of categories or throw the feature out altogether.

In [None]:
# Number of missing entries in A7.
df['A7'].isna().sum()

Impute missing values with most frequent.

### A8

In [None]:
# Raw A8 paired with the target, drawn as per-class density histograms.
A8 = df[['A8', 'A16']]
sns.histplot(data=A8, x='A8', hue='A16', stat='density')

Try a log1p transform.

In [None]:
# log1p-transformed A8. Derived from the raw frame (not from the previous value of A8)
# so that re-running this cell never stacks a second log1p on top of the first.
A8 = df[['A8', 'A16']].assign(A8=lambda frame: np.log1p(frame['A8']))
sns.histplot(A8, x='A8', hue='A16', stat='density')

Although this already looks quite promising, a common logarithm (base = 10) might be even better, as the distribution is still strongly right-skewed.

In [None]:
# Common logarithm applied on top of the log1p transform of A8. Recomputed from the raw
# column so that re-running this cell cannot stack additional logarithms.
# NOTE(review): any row with A8 == 0 becomes log10(0) = -inf here; confirm how the
# histogram treats those rows, and whether log10 of the raw value was intended instead.
A8 = df[['A8', 'A16']].assign(A8=lambda frame: np.log10(np.log1p(frame['A8'])))
sns.histplot(A8, x='A8', hue='A16', stat='density')

Looks reasonable.

In [None]:
# Number of missing entries in A8.
df['A8'].isna().sum()

No missing values.

### A9

In [None]:
A9 = df[['A9','A16']]

# Number of rows per (A9, A16) pair; the former row index serves as the counted column.
A9_counts = A9.reset_index().groupby(['A9', 'A16']).count().reset_index()

ax = sns.barplot(data=A9_counts, x='A9', y='index', hue='A16')
ax.set(ylabel='value count')

In [None]:
# Number of missing entries in A9.
df['A9'].isna().sum()

Feature is already in binary format and doesn't have any missing values. Perfect.

### A10

In [None]:
A10 = df[['A10','A16']]

# Number of rows per (A10, A16) pair; the former row index serves as the counted column.
A10_counts = A10.reset_index().groupby(['A10', 'A16']).count().reset_index()

ax = sns.barplot(data=A10_counts, x='A10', y='index', hue='A16')
ax.set(ylabel='value count')

In [None]:
# Number of missing entries in A10.
df['A10'].isna().sum()

Feature is already in binary format and doesn't have any missing values. Perfect.

### A11

In [None]:
# Raw A11 paired with the target, drawn as per-class count histograms.
A11 = df[['A11', 'A16']]
sns.histplot(data=A11, x='A11', hue='A16')

Extremely strongly right-skewed distribution with mostly zeros. Apply a log1p transform.

In [None]:
# log1p-transformed A11. Derived from the raw frame (not from the previous value of A11)
# so that re-running this cell never stacks a second log1p on top of the first.
A11 = df[['A11', 'A16']].assign(A11=lambda frame: np.log1p(frame['A11']))
sns.histplot(A11, x='A11', hue='A16')

This looks much better.

In [None]:
# Number of missing entries in A11.
df['A11'].isna().sum()

### A12

In [None]:
A12 = df[['A12','A16']]

# Number of rows per (A12, A16) pair; the former row index serves as the counted column.
A12_counts = A12.reset_index().groupby(['A12', 'A16']).count().reset_index()

ax = sns.barplot(data=A12_counts, x='A12', y='index', hue='A16')
ax.set(ylabel='value count')

In [None]:
# Number of missing entries in A12.
df['A12'].isna().sum()

Feature is already in binary format and doesn't have any missing values. Perfect.

### A13

In [None]:
A13 = df[['A13','A16']]

# Number of rows per (A13, A16) pair; the former row index serves as the counted column.
A13_counts = A13.reset_index().groupby(['A13', 'A16']).count().reset_index()

ax = sns.barplot(data=A13_counts, x='A13', y='index', hue='A16')
ax.set(ylabel='value count')

Binarise. Since the distribution of A16 among p resembles more that of g, summarise g and p into one category.

In [None]:
# Collapse levels 's' and 'p' into a single 'non-g' level. A13 is a column subset of df,
# so build a new frame with .assign instead of assigning into it
# (which triggers SettingWithCopyWarning).
# NOTE(review): the text above this cell speaks of merging g and p, whereas this merges
# s and p (g vs. rest) — confirm which grouping is intended.
A13 = A13.assign(A13=A13['A13'].replace(['s', 'p'], 'non-g'))

In [None]:
# Same count plot as before, now on the merged A13 levels.
A13_counts = A13.reset_index().groupby(['A13', 'A16']).count().reset_index()

ax = sns.barplot(data=A13_counts, x='A13', y='index', hue='A16')
ax.set(ylabel='value count')

In [None]:
# Number of missing entries in A13.
df['A13'].isna().sum()

No missing values.

### A14

In [None]:
# Raw A14 paired with the target, drawn as per-class count histograms.
A14 = df[['A14', 'A16']]
sns.histplot(data=A14, x='A14', hue='A16')

Strongly right-skewed, apply log1p transform.

In [None]:
# log1p-transformed A14. Derived from the raw frame (not from the previous value of A14)
# so that re-running this cell never stacks a second log1p on top of the first.
A14 = df[['A14', 'A16']].assign(A14=lambda frame: np.log1p(frame['A14']))
sns.histplot(A14, x='A14', hue='A16')

Fine.

In [None]:
# Number of missing entries in A14.
df['A14'].isna().sum()

Impute missing values with most frequent value, Zero, which is reasonably well balanced between values for the target.

### A15

In [None]:
# Raw A15 paired with the target, drawn as per-class count histograms.
A15 = df[['A15', 'A16']]
sns.histplot(data=A15, x='A15', hue='A16')

Apply a log1p transform.

In [None]:
# log1p-transformed A15. Derived from the raw frame (not from the previous value of A15)
# so that re-running this cell never stacks a second log1p on top of the first.
A15 = df[['A15', 'A16']].assign(A15=lambda frame: np.log1p(frame['A15']))
sns.histplot(A15, x='A15', hue='A16')

Looks much better. Keep that.

In [None]:
# Number of missing entries in A15.
df['A15'].isna().sum()

No missing values.

### A16

In [None]:
# Class balance of the target column.
df['A16'].value_counts()

## Multivariate EDA

### NaN Analysis

In [None]:
# Per-variable missing-value flag as listed in the dataset's variable table.
credit_approval.variables.loc[:, ['name', 'missing_values']]

In [None]:
# Boolean mask of missing entries, same shape as df.
df.isnull()

In [None]:
# Share of rows that contain at least one missing value.
rows_with_missing = df.isna().any(axis=1)
rows_with_missing.sum() / len(df)

About 5% of the data contains missing values.

TODO: look if some features are always NaN at the same time

### Correlations with target

Correlate every variable with the target:
- categorical data: Cramers V
- continuous data: Point biserial correlation

#### categorical

In [None]:
from matplotlib.pyplot import figure

# Explicit figure/axes pair; the bar chart below is drawn onto this axes.
fig, ax = plt.subplots(figsize=(15, 9), dpi=80)

is_categorical_feature = ((credit_approval.variables.type == 'Categorical') &
                          (credit_approval.variables.role == 'Feature'))
categorical_variables = credit_approval.variables[is_categorical_feature].name.values

# Association (scipy's `association`, default method) between each categorical feature
# and the target, computed on their contingency table. The Series is created with an
# explicit float dtype instead of relying on the default dtype of an empty Series.
s = pd.Series(
    {var: association(pd.crosstab(df[var], df.A16)) for var in categorical_variables},
    dtype=float
)

s.plot(kind='bar', title='Correlation with target \n categorical data', ax=ax)


#### continuous

In [None]:
# Explicit figure/axes pair; the bar chart below is drawn onto this axes.
fig, ax = plt.subplots(figsize=(15, 9), dpi=80)

# Numeric target: 1 for '+', 0 otherwise. Built by comparison so the result is an
# integer frame without depending on replace() down-casting an object column.
target = y.eq('+').astype(int)

is_continuous_feature = ((credit_approval.variables.type == 'Continuous') &
                         (credit_approval.variables.role == 'Feature'))
continuous_variables = credit_approval.variables[is_continuous_feature].name.values

# Point-biserial correlation of each continuous feature with the target, using only
# the rows where that feature is observed.
point_biserial = {}
for var in continuous_variables:
    observed = df[var].notna()
    point_biserial[var] = pointbiserialr(
        target[observed].to_numpy().ravel(),
        df.loc[observed, var].to_numpy()
    ).statistic

s = pd.Series(point_biserial, dtype=float)

s.plot(kind='bar', title='Correlation with target \n continious data', ax=ax)

### Correlations and mutual information between variables

In [None]:
"""
Compute mutual information in every combination of two variables. We don't use product-moment or rank correlation since it assumes at least a monotonic relationship, which is an assumption we don't want to make at this point.

- Cramer's V for all combinations of categorical variables
- Pearson and Spearman for all combinations of continuous variables

- Mutual information for every combination of variables.

"""

In [None]:
# Lookup table: variable name (index) -> declared type (single column 'type').
category = credit_approval.variables.set_index('name')[['type']]

#### Between categorical variables
Cramer's V

In [None]:
# Names of all variables declared categorical, and an all-NaN square matrix for them.
vars_cat = category.loc[category.type == 'Categorical'].index
n_cat = len(vars_cat)
cramers_mat = pd.DataFrame(np.nan, index=vars_cat, columns=vars_cat, dtype=np.float64)

# Each unordered pair is visited once, so only one triangle of the matrix gets filled.
for var1, var2 in itertools.combinations(vars_cat, r=2):
    contingency = pd.crosstab(df[var1], df[var2])
    cramers_mat.loc[var1, var2] = association(contingency)

fig, ax = plt.subplots(figsize=(15, 12))
axi = ax.matshow(cramers_mat)
ax.set_xticks(range(n_cat), labels=cramers_mat.columns, fontsize=30)
ax.set_yticks(range(n_cat), labels=cramers_mat.index, fontsize=30)

cbar = fig.colorbar(axi, ax=ax)
cbar.ax.tick_params(labelsize=30)
plt.title('Cramer´s V \n categorical data', fontsize=30)

#### Between continuous variables

In [None]:
# Names of all variables declared continuous.
vars_con = category.loc[category.type == 'Continuous'].index
n_con = len(vars_con)

# All-NaN square matrices that the Pearson and Spearman cells fill in.
pearson_mat = pd.DataFrame(np.nan, index=vars_con, columns=vars_con, dtype=np.float64)
spearman_mat = pd.DataFrame(np.nan, index=vars_con, columns=vars_con, dtype=np.float64)


#### Pearson product-moment correlation coefficient

In [None]:
# Pearson correlation per unordered pair, on the rows where both variables are present.
for var1, var2 in itertools.combinations(vars_con, r=2):
    complete_pair = df[[var1, var2]].dropna(how='any')
    pearson_mat.loc[var1, var2] = pearsonr(complete_pair[var1], complete_pair[var2]).statistic

fig, ax = plt.subplots(figsize=(15, 12))
axi = ax.matshow(pearson_mat)
ax.set_xticks(range(n_con), labels=pearson_mat.columns, fontsize=30)
ax.set_yticks(range(n_con), labels=pearson_mat.index, fontsize=30)

cbar = fig.colorbar(axi, ax=ax)
cbar.ax.tick_params(labelsize=30)
plt.title('Pearson product-moment correlation coefficient \n continious data', fontsize=30)


#### Spearman rank correlation coefficient

In [None]:
# Spearman correlation per unordered pair, on the rows where both variables are present.
for var1, var2 in itertools.combinations(vars_con, r=2):
    complete_pair = df[[var1, var2]].dropna(how='any')
    spearman_mat.loc[var1, var2] = spearmanr(complete_pair[var1], complete_pair[var2]).statistic

fig, ax = plt.subplots(figsize=(15, 12))
axi = ax.matshow(spearman_mat)
ax.set_xticks(range(n_con), labels=spearman_mat.columns, fontsize=30)
ax.set_yticks(range(n_con), labels=spearman_mat.index, fontsize=30)

cbar = fig.colorbar(axi, ax=ax)
cbar.ax.tick_params(labelsize=30)
plt.title('Spearman rank correlation coefficient \n continious data', fontsize=30)

#### Mutual information (combinations of categorical and continuous)

In this context, there is no reasonable way known to the authors to correlate a multi-class (k > 2) categorical variable with a continuous variable.

- Therefore, we already do some tweaking of the data here, applying reasonable ways to allow us to correlate as many features as possible. Namely: make A4, A5 and A13 binary variables by summarising classes.

- Also, we already apply the log transform to features with a strongly skewed distribution (A2, A3, A8, A11, A14, A15), to make the correlations more meaningful, since we are not (!) doing rank correlations.

- An obvious way to deal with A6 and A7 is target encoding; we will already apply that here as well, for the sake of not leaving A6 and A7 completely out of the feature-feature correlations. This is, however, still EDA and not the feature engineering part.

- Since only 5% of the data contains missing values, we will for now throw out any row that has missing values.

In [None]:
def _equals_level(level):
    """Return a transformer that maps a single column to a 0/1 indicator for `level`."""
    return FunctionTransformer(lambda col: label_binarize(col, classes=[level]))


# Column groups, listed once so the transformer spec and the output column names
# cannot drift apart.
onehot_cols = ['A1', 'A9', 'A10', 'A12', 'A16']
target_encoded_cols = ['A6', 'A7']
log_cols = ['A2', 'A3', 'A8', 'A11', 'A14', 'A15']

column_tweaker = make_column_transformer(
    # One indicator column per feature (first level dropped).
    # NOTE(review): with drop='first' the kept A16 indicator presumably marks '-'
    # (since '+' sorts first) — confirm before interpreting signs.
    (OneHotEncoder(sparse_output=False, drop='first'), onehot_cols),
    (_equals_level('u'), ['A4']),
    (_equals_level('g'), ['A5']),
    # becoming numeric columns; fixed seed so the cross-fitted encoding is reproducible
    (TargetEncoder(random_state=0), target_encoded_cols),
    (_equals_level('g'), ['A13']),
    (FunctionTransformer(np.log1p), log_cols),

    # columns not listed above are discarded (every column is listed, so none are lost)
    remainder='drop'
)
# Output columns follow the order of the transformer list above.
tweaked_cols = onehot_cols + ['A4', 'A5'] + target_encoded_cols + ['A13'] + log_cols

# Keep only rows without any missing value; dropna returns a new frame, df is untouched.
df_complete = df.dropna(how='any')
var_names = df.columns

# The transformer output carries no index, so df_tweaked is indexed 0..n-1 rather than
# by the original row labels of df.
df_tweaked = pd.DataFrame(
    column_tweaker.fit_transform(df_complete, y=df_complete.A16),
    columns=tweaked_cols
)
df_tweaked = df_tweaked[var_names] # bring columns in right order again

# A6 and A7 are numeric after target encoding, so flag them as continuous from here on.
category = credit_approval.variables[['name','type']].set_index('name')
category.loc[['A6','A7'],:] = 'Continuous'

df_tweaked.head(7)

In [None]:
warnings.simplefilter('ignore', UserWarning)

# Float matrix from the start, so no object column has to be down-cast by fillna later.
mutual_informaion = pd.DataFrame(index=var_names, columns=var_names, dtype=float)

# each variable is taken as target variable in turn
for var in var_names:
    other_vars = [name for name in var_names if name != var]

    # Look the types up by label so the mask is in the same order as the columns passed
    # to the estimator, whatever the row order of `category` is.
    discrete_features_mask = (category.loc[other_vars, 'type'] == 'Categorical').to_numpy()
    var_type = category.loc[var, 'type']

    if var_type == 'Continuous':
        # use mutual_info_regression if targeted variable is continuous
        estimate_mi = mutual_info_regression
    elif var_type == 'Categorical':
        # use mutual_info_classif if targeted variable is categorical
        estimate_mi = mutual_info_classif
    else:
        # any other declared type is skipped; its row is zero-filled below
        continue

    mutual_informaion.loc[var, other_vars] = estimate_mi(
        df_tweaked[other_vars],
        df_tweaked[var],
        discrete_features=discrete_features_mask,
        random_state=0  # fixed seed so the estimates are reproducible between runs
    )

# Diagonal (and skipped rows) were never written; show them as zero.
mutual_informaion = mutual_informaion.fillna(0)

fig, ax = plt.subplots(figsize=(15, 12))
axi = ax.matshow(mutual_informaion)
ax.set_xticks(range(len(var_names)), labels=var_names)
ax.set_yticks(range(len(var_names)), labels=var_names)

plt.colorbar(axi, ax=ax);

### Determining clustering for A7

Principal component analysis allows us to project data into a 2D space, where we can visualise it.

We wanted to summarize categories for A7. Since we don't have any semantic information on the categories but still want a way to figure out which categories to group together, we can try to utilise PCA.


If we project the data, we can visually inspect, if there are any clusters which suggest a possible grouping of A7's categories.

We try PCA for different sets of features, and selectively in combination with the target.

#### Set up PCA for projection into 2D space

In [None]:
# Split by column name rather than by position, so the result does not depend on the
# target being the last column of df_tweaked.
X_tweaked = df_tweaked.drop(columns='A16')
y_tweaked = df_tweaked[['A16']]

In [None]:
# 90/10 split; fixed seed so the partition is the same on every run.
X_tweaked_train, X_tweaked_test, y_tweaked_train, y_tweaked_test = train_test_split(
    X_tweaked, y_tweaked, test_size=.1, random_state=0)

In [None]:
# Min-max scale every column, then project onto the first two principal components.
scaler_step = MinMaxScaler()
projection_step = PCA(n_components=2, svd_solver='full')
pca = make_pipeline(scaler_step, projection_step)
pca

#### Features and target

In [None]:
res = pd.DataFrame(pca.fit_transform(df_tweaked), columns = ['x','y'])
# df_tweaked holds only the complete rows of df and is indexed 0..n-1, so passing df.A7
# as hue would be matched by index label and colour points with the wrong category.
# Take A7 from the same complete rows and attach it by position instead.
res['A7'] = df.dropna(how='any')['A7'].to_numpy()
sns.scatterplot(res, x='x', y='y', hue='A7')

#### Feature space only

In [None]:
# Display the tweaked feature frame that the projection below is fitted on.
X_tweaked

In [None]:
res = pd.DataFrame(pca.fit_transform(X_tweaked), columns = ['x','y'])
# X_tweaked comes from the complete rows of df and is indexed 0..n-1, so passing df.A7
# as hue would be matched by index label and colour points with the wrong category.
# Take A7 from the same complete rows and attach it by position instead.
res['A7'] = df.dropna(how='any')['A7'].to_numpy()
# Title corrected: this projection is fitted on the features only, without the target.
sns.scatterplot(res, x='x', y='y', hue='A7').set(title='PCA A7 \n exclusive target')
plt.legend(bbox_to_anchor=(0.67, 0.5), ncol=2, loc='upper left')

#### exclude A7

In [None]:
# Everything except the encoded A7 column (the target column A16 is still included).
df_a7 = df_tweaked.drop(columns='A7')

res = pd.DataFrame(pca.fit_transform(df_a7), columns = ['x','y'])
# df_tweaked holds only the complete rows of df and is indexed 0..n-1, so passing df.A7
# as hue would be matched by index label and colour points with the wrong category.
# Take A7 from the same complete rows and attach it by position instead.
res['A7'] = df.dropna(how='any')['A7'].to_numpy()
# Title corrected: A7 is what is left out of this projection, not the target.
sns.scatterplot(res, x='x', y='y', hue='A7').set(title='PCA \n exclusive A7')
plt.legend(bbox_to_anchor=(0.67, 0.55), ncol=2, loc='upper left')

#### Result

There are no obvious visible clusters resulting.

## Feature space PCA 

To further improve our understanding of the dataset, we again perform a PCA.

This time we want to look into how many principal components our feature space will be decomposed and how much variance each of these components explains.

This is done to get an idea of the dimensionality of the data, which will further our understanding of the dataset, will be useful when deciding how much PCA to perform during feature engineering, and gives a first idea of which classifiers we might use later on, as different classifiers differ in effectiveness on high- or low-dimensional data.

In [None]:
# Same scaling step as before, but PCA now keeps every component.
pca = make_pipeline(MinMaxScaler(), PCA(svd_solver='full'))

res = pca.fit(X_tweaked).transform(X_tweaked)

# Share of variance carried by each component, in component order.
variance_ratio = pca['pca'].explained_variance_ratio_
plt.bar(range(len(variance_ratio)), variance_ratio)
plt.title('Explained variance ratio of individual components');

In [None]:
# Running total of the explained-variance ratios of the fitted PCA step.
cumulative_ratio = np.cumsum(pca['pca'].explained_variance_ratio_)

plt.bar(range(len(cumulative_ratio)), cumulative_ratio)
plt.title('Cumulative explained variance ratio of components');
plt.grid();
np.round(cumulative_ratio, 2)

We already explain 80% of the variance with 4 components and around 99% with 12 components.

## Conclusions

### A4 & A5

A4 and A5 have a correlation coefficient of 1. Therefore they are identical and one of them can instantly be discarded from the dataset (we discard A5).

### A7

In [None]:
# cramers_mat is filled for only one ordering of each pair, so the requested cell may be
# NaN; fall back to the mirrored cell in that case.
cramers_a6_a7 = cramers_mat.loc['A7', 'A6']
if pd.isna(cramers_a6_a7):
    cramers_a6_a7 = cramers_mat.loc['A6', 'A7']
cramers_a6_a7

In [None]:
# Mutual information estimated with A7 as the target and A6 as the feature.
mutual_informaion.at['A7', 'A6']

How to deal with A7 ?

A categorical feature with 9 categories. Target encoding didn't bring very promising univariate results even after summarizing from 9 to 3 categories.


However, we also found that A7 has only a medium correlation with the target variable in comparison to the other features, and shares a relatively high amount of mutual information with A6.

Therefore it seems like a reasonable option to either just keep the not very promising looking target encoding, so as to only add one dimension through this feature, or leave it out altogether (which feature selection should do automatically if our univariate observations also apply in combination with the other features).

In [None]:
# Strategy table: one row per column of df, every cell initialised to a single space.
strat = pd.DataFrame(" ", index=df.columns, columns=['tweaking', 'imputation'])

# Decisions recorded so far.
strat.loc['A1', ['tweaking', 'imputation']] = ['OneHotEncoding', 'MostFrequent category']

strat