# German credit data

We are using the Statlog (German Credit Data) dataset (Dheeru Dua and Casey Graff. UCI machine learning repository, 2017. URL [http://archive.ics.uci.edu](http://archive.ics.uci.edu)). The German Credit dataset classifies people described by a set of 20 features as good or bad credit risk.

**import packages**

In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Load the dataset

Make sure the dataset is saved as `../datasets/credit/credit-g_csv.csv` (relative to this notebook), or adjust the file path below.

In [3]:
# Location of the raw German credit CSV, relative to this notebook.
credit_csv_path = '../datasets/credit/credit-g_csv.csv'

# Read the table into a DataFrame and preview its first rows.
df = pd.read_csv(credit_csv_path)
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,bad
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,bad


## Prep data

We will first create a dictionary that stores the name of the target to predict, as well as the column names of the numerical and categorical features. Then, we'll recode the categorical variables.

### define feature types

In [4]:
# Column roles: the label to predict and the features treated as numeric.
target_column = 'class'
numerical_columns = [
    'duration', 'credit_amount', 'installment_commitment', 'age',
    'residence_since', 'existing_credits', 'num_dependents',
]

d = {'target': target_column, 'numerical': numerical_columns}

# Every remaining column (neither the target nor numerical) is categorical.
# Index.difference hands the leftover names back as a sorted pandas Index.
d['categorical'] = df.columns.difference(numerical_columns + [target_column])

Next, we will recode the target variable from str ('bad', 'good') into int (0/1), so that 1 = good and 0 = bad.

In [5]:
# recode response variable: 'good' -> 1, 'bad' -> 0
target_codes = {'good': 1, 'bad': 0}
target_values = df[d['target']]

# Recode only while the column still holds the raw string labels. Without this
# guard, re-running the cell would compare the already-recoded integers with
# "good" and silently turn every row into 0.
if not target_values.isin(list(target_codes.values())).all():
    unexpected = sorted(
        set(target_values.dropna().unique()) - set(target_codes), key=str
    )
    if unexpected or target_values.isna().any():
        # Fail loudly instead of quietly counting missing or misspelled labels
        # as "bad" (0).
        raise ValueError(
            f"Target column {d['target']!r} contains missing or unexpected "
            f"values: {unexpected}"
        )
    df[d['target']] = target_values.map(target_codes).astype('int64')

df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,1
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,0
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,1
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,1
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,0


In [6]:
# Separate the label column (y) from the feature columns (X).
target_name = d['target']
y = df[target_name]
X = df.drop(columns=[target_name])

### Encoding categorical variables

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
# Positional index within X of every categorical column, listed in the same
# order as d['categorical'].
categorical_features_idx = list(map(X.columns.get_loc, d['categorical']))
categorical_features_idx

[0, 2, 6, 19, 14, 16, 9, 13, 18, 8, 11, 3, 5]

In [9]:
# Encode each categorical feature as integer codes and record, per positional
# column index, which original label every code stands for.
categorical_names = {}
for feature in categorical_features_idx:
    column = X.columns[feature]
    le = LabelEncoder()
    # Fit on the raw labels that are still stored in `df`, not on X itself:
    # re-running this cell then reproduces the same codes and label names
    # instead of re-encoding integers that were already encoded.
    # Assigning by column name replaces the whole column, so it takes the
    # integer dtype of the codes. With `X.iloc[:, feature] = ...` pandas >= 2.0
    # writes into the existing object-dtype column and the dtype stays object.
    X[column] = le.fit_transform(df[column])
    categorical_names[feature] = le.classes_

categorical_names

{0: array(['0<=X<200', '<0', '>=200', 'no checking'], dtype=object),
 2: array(['all paid', 'critical/other existing credit', 'delayed previously',
        'existing paid', 'no credits/all paid'], dtype=object),
 6: array(['1<=X<4', '4<=X<7', '<1', '>=7', 'unemployed'], dtype=object),
 19: array(['no', 'yes'], dtype=object),
 14: array(['for free', 'own', 'rent'], dtype=object),
 16: array(['high qualif/self emp/mgmt', 'skilled', 'unemp/unskilled non res',
        'unskilled resident'], dtype=object),
 9: array(['co applicant', 'guarantor', 'none'], dtype=object),
 13: array(['bank', 'none', 'stores'], dtype=object),
 18: array(['none', 'yes'], dtype=object),
 8: array(['female div/dep/mar', 'male div/sep', 'male mar/wid',
        'male single'], dtype=object),
 11: array(['car', 'life insurance', 'no known property', 'real estate'],
       dtype=object),
 3: array(['business', 'domestic appliance', 'education',
        'furniture/equipment', 'new car', 'other', 'radio/tv', 'repairs',


---
## Version 1: numerical & categorical variables, no one-hot encoding

### split into train and test data

In [10]:
# Hold out 20% of the rows as the test set, stratified on y and with a fixed
# seed so that the split is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

# Give every part a fresh 0..n-1 index; the shuffled original row labels are dropped.
X_train, X_test, y_train, y_test = [
    part.reset_index(drop=True) for part in split_parts
]

X_train.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,0,24,3,4,1246,2,2,4,3,2,2,3,23,2,1,1,3,1,0,1
1,1,12,3,4,900,4,0,4,2,2,2,0,23,1,1,1,1,1,0,1
2,3,6,3,4,672,2,4,1,0,2,4,3,54,1,1,1,2,1,1,1
3,3,10,3,9,2848,0,0,1,3,0,2,3,32,1,1,1,1,2,0,1
4,3,48,1,0,7629,4,3,4,1,2,2,0,46,0,1,2,0,2,0,1


### save .csv files

In [11]:
# Saving is switched off: every line below is commented out. Uncomment them to
# write the Version 1 train/test features and labels to CSV (row index omitted).
# X_train.to_csv('../datasets/credit/credit_X_train.csv', index=False)
# y_train.to_csv('../datasets/credit/credit_y_train.csv', index=False)
# X_test.to_csv('../datasets/credit/credit_X_test.csv', index=False)
# y_test.to_csv('../datasets/credit/credit_y_test.csv', index=False)

---
## Version 2: numerical & one-hot encoded categorical variables

### One-hot encoding for categorical data 
Next, we apply one-hot encoding to the categorical variables and split the data into train and test sets.

In [12]:
# one-hot encoding
# Expand every categorical column into 0/1 indicator columns. drop_first=True
# omits the first level of each feature, which then acts as the reference level.
one_hot_options = {'columns': d['categorical'], 'drop_first': True, 'dtype': int}
X_ = pd.get_dummies(X, **one_hot_options)
X_.head()

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,checking_status_1,checking_status_2,checking_status_3,...,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8,purpose_9,savings_status_1,savings_status_2,savings_status_3,savings_status_4
0,6,1169,4,4,67,2,1,1,0,0,...,0,0,1,0,0,0,0,0,0,1
1,48,5951,2,2,22,1,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,12,2096,2,3,49,1,2,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,42,7882,2,4,45,1,2,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,24,4870,3,4,53,2,2,1,0,0,...,1,0,0,0,0,0,0,1,0,0


### split into train and test data

In [13]:
# Hold out 20% of the one-hot encoded rows as the test set, stratified on y and
# with a fixed seed so that the split is repeatable.
split_parts = train_test_split(X_, y, test_size=0.2, stratify=y, random_state=0)

# Give every part a fresh 0..n-1 index; the shuffled original row labels are dropped.
X_train, X_test, y_train, y_test = [
    part.reset_index(drop=True) for part in split_parts
]

X_train.head()

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,checking_status_1,checking_status_2,checking_status_3,...,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8,purpose_9,savings_status_1,savings_status_2,savings_status_3,savings_status_4
0,24,1246,4,2,23,1,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
1,12,900,4,2,23,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1
2,6,672,1,4,54,1,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
3,10,2848,1,2,32,1,2,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,48,7629,4,2,46,2,2,0,0,1,...,0,0,0,0,0,0,0,0,0,1


### save .csv files

In [14]:
# Saving is switched off: both lines below are commented out. Uncomment them to
# write the one-hot encoded train/test features to CSV (row index omitted).
# Only the feature tables are listed here; the labels are not written by this cell.
# X_train.to_csv('../datasets/credit/encoded_credit_X_train.csv', index=False)
# X_test.to_csv('../datasets/credit/encoded_credit_X_test.csv', index=False)

---
## Version 3: Only binary features

### create binary version of dataset

In [15]:
# Build an all-categorical copy of X: numerical features with more than two
# distinct values are cut into 5 equal-width bins labelled 1..5; every other
# column is carried over unchanged.
# NOTE(review): for numerical features with only a few distinct values, some of
# the 5 bins may stay empty, which would show up as all-zero indicator columns
# below. Confirm that this is intended.
bin_labels = [1, 2, 3, 4, 5]
binned_columns = {}
for col_name in X.columns:
    col_values = X[col_name]
    needs_binning = (
        col_name in d['numerical'] and col_values.nunique(dropna=False) > 2
    )
    binned_columns[col_name] = (
        pd.cut(col_values, 5, labels=bin_labels) if needs_binning else col_values
    )
df_c = pd.DataFrame(binned_columns)

# One-hot encode every column, dropping the first level of each, so that the
# resulting table contains only 0/1 features.
df_binary = pd.get_dummies(df_c, columns=df_c.columns, drop_first=True, dtype=int)

### split into train and test data

In [16]:
# Hold out 20% of the binary-feature rows as the test set, stratified on y and
# with a fixed seed so that the split is repeatable.
split_parts = train_test_split(df_binary, y, test_size=0.2, stratify=y, random_state=0)

# Give every part a fresh 0..n-1 index; the shuffled original row labels are dropped.
X_train, X_test, y_train, y_test = [
    part.reset_index(drop=True) for part in split_parts
]

X_train.head()

Unnamed: 0,checking_status_1,checking_status_2,checking_status_3,duration_2,duration_3,duration_4,duration_5,credit_history_1,credit_history_2,credit_history_3,...,existing_credits_2,existing_credits_3,existing_credits_4,existing_credits_5,job_1,job_2,job_3,num_dependents_2,own_telephone_1,foreign_worker_1
0,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1
1,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,1,1
3,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,1
4,0,0,1,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,1,0,1


### save .csv files

In [17]:
# Saving is switched off: both lines below are commented out. Uncomment them to
# write the binary-feature train/test tables to CSV (row index omitted).
# Only the feature tables are listed here; the labels are not written by this cell.
# X_train.to_csv('../datasets/credit/bin_credit_X_train.csv', index=False)
# X_test.to_csv('../datasets/credit/bin_credit_X_test.csv', index=False)