# Week 01 – Intro

This week will be all about setting up your machine and taking a look at the data that we'll be using in the coming weeks.

**import packages**

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Load the dataset

We are using the Statlog (German Credit Data) dataset (Dheeru Dua and Casey Graff. UCI machine learning repository, 2017. URL [http://archive.ics.uci.edu](http://archive.ics.uci.edu)). The German Credit dataset classifies people described by a set of 20 features as good or bad credit risk.

Make sure to save the dataset as `../datasets/credit/credit-g_csv.csv` (relative to this notebook), or adjust the file path below.

In [11]:
# Read the raw German Credit data from its relative path and preview the first rows.
df = pd.read_csv(filepath_or_buffer='../datasets/credit/credit-g_csv.csv')
df.head(n=5)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,bad
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,bad


## Prep data

We will first create a dictionary in which we save the name of the target to predict, as well as the column names of the numerical and categorical features.

In [12]:
# Column roles: the target name and the list of numerical feature names.
d = dict(
    target='class',
    numerical=[
        'duration',
        'credit_amount',
        'installment_commitment',
        'age',
        'residence_since',
        'existing_credits',
        'num_dependents',
    ],
)

# Every remaining column (neither numerical nor the target) counts as categorical.
# Index.difference returns these labels as a pandas Index.
d['categorical'] = df.columns.difference([*d['numerical'], d['target']])

Next, we will recode the target variable from str to int: 'good' becomes 1 and 'bad' becomes 0.

In [13]:
# recode response variable
df[d['target']] = df[d['target']].apply(lambda x: 1 if x == "good" else 0)
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,1
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,0
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,1
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,1
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,0


In [14]:
# Separate the label column (y) from the feature columns (X).
y = df[d['target']]
X = df.drop(columns=d['target'])

--- 
## One-hot encoding for categorical data & save .csv files

Then, we will apply one-hot encoding to the categorical variables and split the data into a training and a test set.

In [15]:
# one-hot encoding
X_ = pd.get_dummies(X, columns=d['categorical'], drop_first=True, dtype=int)

# split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X_, y, random_state=0, test_size=0.2, stratify=y)

# reset the index
X_train, y_train, X_test, y_test = X_train.reset_index(drop=True), y_train.reset_index(drop=True), X_test.reset_index(drop=True), y_test.reset_index(drop=True)

X_train.head()

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,checking_status_<0,checking_status_>=200,checking_status_no checking,...,purpose_new car,purpose_other,purpose_radio/tv,purpose_repairs,purpose_retraining,purpose_used car,savings_status_500<=X<1000,savings_status_<100,savings_status_>=1000,savings_status_no known savings
0,24,1246,4,2,23,1,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
1,12,900,4,2,23,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1
2,6,672,1,4,54,1,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
3,10,2848,1,2,32,1,2,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,48,7629,4,2,46,2,2,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [16]:
# Writing the one-hot encoded split to disk is opt-in, so running the whole
# notebook never overwrites existing files. Set the flag to True to export.
SAVE_ONEHOT_SPLIT = False

if SAVE_ONEHOT_SPLIT:
    X_train.to_csv('../datasets/credit_g_X_train.csv', index=False)
    y_train.to_csv('../datasets/credit_g_y_train.csv', index=False)
    X_test.to_csv('../datasets/credit_g_X_test.csv', index=False)
    y_test.to_csv('../datasets/credit_g_y_test.csv', index=False)

---
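## Create binary version of dataset

Numerical features with more than two distinct values are discretised into five equal-width bins; afterwards every column is one-hot encoded, so that all resulting features are binary. The data is then split into a training and a test set with the same settings as above.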

In [17]:
# Build a fully categorical copy of X: numerical columns with more than two
# distinct values are cut into 5 equal-width bins labelled 1..5, every other
# column is taken over unchanged.
# NOTE(review): for numerical columns with fewer than 5 distinct values some
# bins may stay empty, which would yield all-zero dummy columns - confirm this
# is intended.
df_c = pd.DataFrame({
    column: (
        pd.cut(X[column], 5, labels=[1, 2, 3, 4, 5])
        if column in d['numerical'] and len(X[column].unique()) > 2
        else X[column]
    )
    for column in X.columns
})

# One-hot encode every column so that all features become 0/1 indicators.
df_binary = pd.get_dummies(
    df_c,
    columns=df_c.columns,
    drop_first=True,
    dtype=int,
)

# Stratified 80/20 train/test split with a fixed seed; each part gets a fresh
# 0..n-1 index right away.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df_binary, y, test_size=0.2, stratify=y, random_state=0
    )
)

X_train.head()

Unnamed: 0,checking_status_<0,checking_status_>=200,checking_status_no checking,duration_2,duration_3,duration_4,duration_5,credit_history_critical/other existing credit,credit_history_delayed previously,credit_history_existing paid,...,existing_credits_2,existing_credits_3,existing_credits_4,existing_credits_5,job_skilled,job_unemp/unskilled non res,job_unskilled resident,num_dependents_2,own_telephone_yes,foreign_worker_yes
0,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1
1,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,1,1
3,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,1
4,0,0,1,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,1,0,1


In [9]:
# Writing the binary split to disk is opt-in, so running the whole notebook
# never overwrites existing files. Set the flag to True to export.
SAVE_BINARY_SPLIT = False

if SAVE_BINARY_SPLIT:
    X_train.to_csv('../datasets/bin_credit_g_X_train.csv', index=False)
    y_train.to_csv('../datasets/bin_credit_g_y_train.csv', index=False)
    X_test.to_csv('../datasets/bin_credit_g_X_test.csv', index=False)
    y_test.to_csv('../datasets/bin_credit_g_y_test.csv', index=False)