# Classification Task: Credit Approval

### Goal: Create a classifier using the sci-kit learn package

We note that this is a fairly balanced dataset where 44% of the target is "True".

In [22]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import *

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

Create Target and replace unknown symbols in the dataset

In [16]:
df = spark.read.csv("crx.csv", header = False, inferSchema = True)\
    .withColumn("_c1", f.col("_c1").cast(DoubleType()))\
    .withColumn("_c13", f.col("_c13").cast(IntegerType()))\
    .withColumn("target", f.when(f.col("_c15") == '+', f.lit(1)).otherwise(f.lit(0)))\
    .drop("_c15")\
    .replace('?', None)
total = df.count()
print(total, len(df.columns))

690 16


Display percentage of missing values and note continuous vs categorical variables for encoding and imputing in later steps.
We will not be using standard scaler or capping variables for this classification task.

In [4]:
categorical = []
continuous = []
for i, v in enumerate(df.columns):
    unique_values = df.groupby(v).count()\
            .withColumn('perc_of_count_total', (f.col('count') / total) * 100)
    
    if df.dtypes[i][1] == 'string':
        categorical.append(v)
        
        unique_values.show()
    else:
        continuous.append(v)
        unique_values.filter(f.col(v).isNull()).show()
continuous.remove("target")        

+----+-----+-------------------+
| _c0|count|perc_of_count_total|
+----+-----+-------------------+
|null|   12| 1.7391304347826086|
|   b|  468|  67.82608695652173|
|   a|  210| 30.434782608695656|
+----+-----+-------------------+

+----+-----+-------------------+
| _c1|count|perc_of_count_total|
+----+-----+-------------------+
|null|   12| 1.7391304347826086|
+----+-----+-------------------+

+---+-----+-------------------+
|_c2|count|perc_of_count_total|
+---+-----+-------------------+
+---+-----+-------------------+

+----+-----+-------------------+
| _c3|count|perc_of_count_total|
+----+-----+-------------------+
|   l|    2| 0.2898550724637681|
|null|    6| 0.8695652173913043|
|   y|  163|   23.6231884057971|
|   u|  519|  75.21739130434783|
+----+-----+-------------------+

+----+-----+-------------------+
| _c4|count|perc_of_count_total|
+----+-----+-------------------+
|   g|  519|  75.21739130434783|
|null|    6| 0.8695652173913043|
|   p|  163|   23.6231884057971|
|  gg|    

Spark Dataframe to Pandas Dataframe. As this is a small dataset, no sampling is done.

In [5]:
full_pd = df.toPandas()
full_pd.describe()

Unnamed: 0,_c1,_c2,_c7,_c10,_c13,_c14,target
count,678.0,690.0,690.0,690.0,677.0,690.0,690.0
mean,31.568171,4.758725,2.223406,2.4,184.014771,1017.385507,0.444928
std,11.957862,4.978163,3.346513,4.86294,173.806768,5210.102598,0.497318
min,13.75,0.0,0.0,0.0,0.0,0.0,0.0
25%,22.6025,1.0,0.165,0.0,75.0,0.0,0.0
50%,28.46,2.75,1.0,0.0,160.0,5.0,0.0
75%,38.23,7.2075,2.625,3.0,276.0,395.5,1.0
max,80.25,28.0,28.5,67.0,2000.0,100000.0,1.0


Impute and encoding variables. For variables with only 2 values, drop a column to reduce redundancy.

In [6]:
for col in categorical:
    full_pd[col] = full_pd[col].fillna(full_pd.mode()[col][0])
for col in continuous:
    full_pd[col] = full_pd[col].fillna(full_pd.mean()[col])
    
dum_pd = pd.get_dummies(full_pd, columns=categorical, prefix=categorical)
dum_pd.drop('_c0_b', axis=1, inplace=True)
dum_pd.drop('_c11_f', axis=1, inplace=True)
dum_pd.drop('_c8_f', axis=1, inplace=True)
dum_pd.drop('_c9_f', axis=1, inplace=True)

Train test split, stratify by target value

In [None]:
y = dum_pd.pop('target')
X_train, X_holdout, y_train, y_holdout = train_test_split(dum_pd, y, test_size=0.20, random_state=1, stratify=y)

Model selection, feature selection, hyperparameter tuning

In [7]:
rfe = RFE(estimator=RandomForestClassifier())
model = RandomForestClassifier()
pipeline = Pipeline(steps=[('features', rfe),('model', model)])

param_grid = {
    'features__n_features_to_select':[5, 6, 8],
    'model__max_depth': [3, 4, 5],
    'model__min_samples_split': [25, 50]
}

search = GridSearchCV(pipeline, param_grid, n_jobs = -1)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.869):
{'features__n_features_to_select': 8, 'model__max_depth': 3, 'model__min_samples_split': 50}


Create Model Pipeline

In [27]:
rfe = RFE(estimator=RandomForestClassifier(max_depth = 3, min_samples_split = 50), n_features_to_select = 8)
model = RandomForestClassifier(max_depth = 3, min_samples_split = 50)
pipeline = Pipeline(steps=[('features', rfe),('model', model)])

pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 RFE(estimator=RandomForestClassifier(max_depth=3,
                                                      min_samples_split=50),
                     n_features_to_select=8)),
                ('model',
                 RandomForestClassifier(max_depth=3, min_samples_split=50))])

Display Feature Importance

In [28]:
pd.DataFrame(pipeline.named_steps['model'].feature_importances_,
             index = X_train.columns[pipeline.named_steps['features'].ranking_ == 1],
             columns = ['Importance']).sort_values(by=['Importance'], ascending=False)

Unnamed: 0,Importance
_c8_t,0.442669
_c10,0.168993
_c9_t,0.136454
_c7,0.088988
_c14,0.079866
_c2,0.037161
_c13,0.029166
_c1,0.016703


Display Model Metrics with hold out set to ensure no over-fitting

In [29]:
print ('train auc: ', roc_auc_score(y_train, pipeline.predict_proba(X_train)[:,1]))
print ('holdout auc: ', roc_auc_score(y_holdout, pipeline.predict_proba(X_holdout)[:,1]))

train auc:  0.9425447685849407
holdout auc:  0.9503938684266554
