In [1]:
## Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

## Train test split
from sklearn.model_selection import train_test_split

## Pipeline
from imblearn.pipeline import Pipeline  #  Pipeline
from sklearn.compose import ColumnTransformer  # Join transformers
from imblearn.over_sampling import SMOTE  # Oversampling technique

## Model
from sklearn.tree import DecisionTreeClassifier

## Precision metrics
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

## Working with data frames
import pandas as pd

In [2]:
# Create data frame of people
pple = (
    pd.read_csv("application_record.csv")  # Read the csv file
    .drop_duplicates('ID', keep='last')  # Drop duplicate ids
    .drop('OCCUPATION_TYPE', axis=1)  # Drop occupation type column
)  # 438 510 rows

In [3]:
# Create data frame of record
record = (
    # Read the csv file
    pd.read_csv("credit_record.csv")  
    # Drop months balance its not needed
    .drop('MONTHS_BALANCE', axis=1)  
     # Replace the text - it means the same thing
    .replace({'C': 0, 'X' : 0}) 
    # Convert to number
    .assign(STATUS=lambda x: pd.to_numeric(x['STATUS']))  
)  # 45985 rows

# Mark the rows where the person owed the debt for two or more monts
# as default marker 1 otherwise 0
record['RESULT'] = record['STATUS'].apply(lambda x:1 if x >= 2 else 0) 

# Join the two data frames based on the ID inner wise
df = pple.join(record.set_index('ID'), on='ID', how='inner')  # 36457 rows

In [4]:
# Features that need to be manipulated with
numeric_features = ['AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'CNT_CHILDREN', 
                    'DAYS_EMPLOYED', 'CNT_FAM_MEMBERS']

categorical_features = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 
                        'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 
                        'NAME_FAMILY_STATUS','NAME_HOUSING_TYPE']

In [5]:
# Apply transformers
numeric_transformer = Pipeline(steps=[
    # Replace missing values with median
    ('imputer', SimpleImputer(strategy='median')),  
    ('scaler', StandardScaler())])  # Scale normally the values

categorical_transformer = Pipeline(steps=[
    # Replace missing values with the word missing
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), 
    # Encode the categorical values to prevent bias
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])  

In [6]:
# Join the transformers into one function 
transformer = ColumnTransformer(
    transformers = [
        # Mame of the operation, what transformer is applied, 
        # on which features
        ('numeric_data_preprocessing', 
         numeric_transformer, numeric_features),
        ('categorical_data_preprocessing', 
         categorical_transformer, categorical_features),
    ]
)

In [7]:
# Creating a pipeline with specific steps
final_pipeline = Pipeline(
    steps=[
        # Apply transformation rules on the features
        ('transformer', transformer),  
        # Imbalanced dataset
        ('imbalanced', SMOTE(random_state=38)),  
        # Apply statistical model
        ('rf_estimator', DecisionTreeClassifier())  
    ])

In [8]:
# Preparing before using the pipeline on the dataframe

# Split the data into train and test
# Divide the data frame and the actual results
X = df.drop('RESULT', axis=1) 
y = df['RESULT']

# Split into test, train and data, results
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [9]:
# Run the pipeline
final_pipeline.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('numeric_data_preprocessing',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['AMT_INCOME_TOTAL',
                                                   'DAYS_BIRTH', 'CNT_CHILDREN',
                                                   'DAYS_EMPLOYED',
                                                   'CNT_FAM_MEMBERS']),
                                                 ('categorical_data_preprocessing',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_val

In [10]:
# Predict 
y_pred = final_pipeline.predict(X_test)

In [11]:
# Measure accuracy
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred, average='weighted'))
print("Precision Score: ", precision_score(y_test, y_pred, average='weighted'))
print("Recall Score: ", recall_score(y_test, y_pred, average='weighted'))

Accuracy Score:  0.9704068971281253
F1 Score:  0.9820919764046849
Precision Score:  0.9963519530160267
Recall Score:  0.9704068971281253
