In [1]:
%matplotlib inline
## Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

## Train test split
from sklearn.model_selection import train_test_split

## Pipeline
from imblearn.pipeline import Pipeline  #  Pipeline
from sklearn.compose import ColumnTransformer  # Join transformers
from imblearn.over_sampling import SMOTE  # Oversampling technique
from imblearn import FunctionSampler

## Model
from sklearn.tree import DecisionTreeClassifier

## Precision metrics
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Create data frame of people
pple = (
    pd.read_csv("application_record.csv")  # Read the csv file
    .drop_duplicates('ID', keep='last')  # Drop duplicate ids
    .drop('OCCUPATION_TYPE', axis=1)  # Drop occupation type column
    .assign(
        AMT_INCOME_TOTAL=lambda x: pd.to_numeric(x['AMT_INCOME_TOTAL']),
        DAYS_BIRTH=lambda x: pd.to_numeric(x['DAYS_BIRTH']),
        CNT_CHILDREN=lambda x: pd.to_numeric(x['CNT_CHILDREN']),
        DAYS_EMPLOYED=lambda x: pd.to_numeric(x['DAYS_EMPLOYED']),
        CNT_FAM_MEMBERS=lambda x: pd.to_numeric(x['CNT_FAM_MEMBERS']),
    )  # Convert to number
)  # 438 510 rows

In [4]:
# Create data frame of record
record = (
    pd.read_csv("credit_record.csv")  # Read the csv file
    .drop('MONTHS_BALANCE', axis=1)  # Drop months balance its not needed
    .replace({'C': 0, 'X' : 0})  # Replace the text - it means the same thing
    .assign(STATUS=lambda x: pd.to_numeric(x['STATUS']))  # Convert to number
)  # 45985 rows

# Mark the rows where the person owed the debt for two or more monts as default
# marker 1 otherwise 0
record['RESULT'] = record['STATUS'].apply(lambda x:1 if x >= 2 else 0) 

# Join the two data frames based on the ID inner wise
df = pple.join(record.set_index('ID'), on='ID', how='inner')  # 36457 rows

In [5]:
# Features that need to be manipulated with
numeric_features = ['AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'CNT_CHILDREN', 'DAYS_EMPLOYED', 'CNT_FAM_MEMBERS']
categorical_features = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
                       'NAME_HOUSING_TYPE']

In [6]:
def CustomSampler_IQR (X, y):
    
    features = X.columns
    df = X.copy()
    df['Outcome'] = y
    
    indices = [x for x in df.index]    
    out_indexlist = []
        
    for col in features:
       
        #Using nanpercentile instead of percentile because of nan values
        Q1 = np.nanpercentile(df[col], 25.)
        Q3 = np.nanpercentile(df[col], 75.)
        
        cut_off = (Q3 - Q1) * 1.5
        upper, lower = Q3 + cut_off, Q1 - cut_off
                
        outliers_index = df[col][(df[col] < lower) | (df[col] > upper)].index.tolist()
        outliers = df[col][(df[col] < lower) | (df[col] > upper)].values        
        out_indexlist.extend(outliers_index)
        
    #using set to remove duplicates
    out_indexlist = list(set(out_indexlist))
    
    clean_data = np.setdiff1d(indices,out_indexlist)

    return X.loc[clean_data], y.loc[clean_data]

In [7]:
def IQR_Outliers (X, features):

    print('# of features: ', len(features))
    print('Features: ', features)

    indices = [x for x in X.index]
    #print(indices)
    print('Number of samples: ', len(indices))
    
    out_indexlist = []
        
    for col in features:
       
        #Using nanpercentile instead of percentile because of nan values
        Q1 = np.nanpercentile(X[col], 25.)
        Q3 = np.nanpercentile(X[col], 75.)
        
        cut_off = (Q3 - Q1) * 1.5
        upper, lower = Q3 + cut_off, Q1 - cut_off
        print ('\nFeature: ', col)
        print ('Upper and Lower limits: ', upper, lower)
                
        outliers_index = X[col][(X[col] < lower) | (X[col] > upper)].index.tolist()
        outliers = X[col][(X[col] < lower) | (X[col] > upper)].values
        print('Number of outliers: ', len(outliers))
        print('Outliers Index: ', outliers_index)
        print('Outliers: ', outliers)
        
        out_indexlist.extend(outliers_index)
        
    #using set to remove duplicates
    out_indexlist = list(set(out_indexlist))
    out_indexlist.sort()
    print('\nNumber of rows with outliers: ', len(out_indexlist))
    print('List of rows with outliers: ', out_indexlist)

In [8]:
# Apply transformers
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Replace missing values with median
    ('scaler', StandardScaler())])  # Scale normally the values

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Replace missing values with the word missing
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])  # Encode the categorical values to prevent bias

In [9]:
# Join the transformers into one function 
transformer = ColumnTransformer(
    transformers = [
        # Mame of the operation, what transformer is applied, on which features
        ('numeric_data_preprocessing', numeric_transformer, numeric_features),
        ('categorical_data_preprocessing', categorical_transformer, categorical_features),
    ]
)

In [13]:
# Creating a pipeline with specific steps
final_pipeline = Pipeline(
    steps=[
        ('transformer', transformer),  # Apply transformation rules on the features
        ('imbalanced', SMOTE(random_state=38)),  # Imbalanced dataset
        ('outlier', FunctionSampler(func=CustomSampler_IQR, validate=False)),

        ('rf_estimator', DecisionTreeClassifier())  # Apply model
    ])

In [14]:
# Preparing before using the pipeline on the dataframe

# Split the data into train and test
# Divide the data frame and the actual results
X = df.drop('RESULT', axis=1) 
y = df['RESULT']

# Split into test, train and data, results
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [15]:
# Run the pipeline
final_pipeline.fit(X_train, y_train)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
# Predict 
y_pred = final_pipeline.predict(X_test)

In [None]:
# Mesure accuracy
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred, average='weighted'))
print("Precision Score: ", precision_score(y_test, y_pred, average='weighted'))
print("Recall Score: ", recall_score(y_test, y_pred, average='weighted'))