# Credit Default Prediction on Amex Dataset

### Importing the necessary libraries

In [1]:
import numpy as np
import pandas as pd 

import pyspark
from pyspark import StorageLevel
from pyspark.sql import (
    SparkSession, 
    types, 
    functions as F,
)
from pyspark.sql.functions import (
    col,
    isnan,
    when,
    count,
)
from pyspark.ml import Pipeline 
from pyspark.ml.feature import (
    OneHotEncoder, 
    StringIndexer, 
    VectorAssembler, 
    Imputer,
)
from pyspark.ml.classification import (
    LogisticRegression, 
    LinearSVC,
    DecisionTreeClassifier,
    GBTClassifier,
    RandomForestClassifier,
)
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator,
)

import itertools

### Create a Spark Session

In [2]:
spark = SparkSession.builder \
                    .appName("amex-app") \
                    .master("local[*]") \
                    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/12/16 02:38:54 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
22/12/16 02:38:54 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
22/12/16 02:38:54 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
22/12/16 02:38:54 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


### Important Global Variables

In [3]:
TRAIN_DATA_PATH = 'gs://icdp-bigdata-bucket/train_data.csv'
TRAIN_LABEL_PATH = 'gs://icdp-bigdata-bucket/train_labels.csv'

### Miscellaneous Utility Functions

In [4]:
## Function to create a Schema Object for the Dataframe 
def create_spark_schema(series):
    fields = list()
    
    for value in series: 
        if value in string_dtypes:
            fields.append(
                types.StructField(
                    value, 
                    types.StringType(), 
                    True,
                )
            )
        elif value in date_dtypes:
            fields.append(
                types.StructField(
                    value, 
                    types.DateType(), 
                    True,
                )
            )
        elif value in integer_dtypes:
            fields.append(
                types.StructField(
                    value, 
                    types.IntegerType(), 
                    True,
                )
            )
        else:
            fields.append(
                types.StructField(
                    value, 
                    types.FloatType(), 
                    True,
                )
            )
    return types.StructType(fields)

In [5]:
#Add Suffix to List Elements
def add_suffix(names, suffix):
    return [name + suffix for name in names]

In [6]:
# Drop Columns with Null values above a certain threshold
def dropNullColumns(df, threshold):
    """
    This function drops columns containing all null values.
    :param df: A PySpark DataFrame
    """
  
    null_counts = df.select([F.count(F.when(F.col(c).isNull(), c)).alias(
        c) for c in df.columns]).collect()[0].asDict()
    print("null counts calculated...")
    df_count = df.count()
    col_to_drop = [k for k, v in null_counts.items() if v >(df_count * threshold)]  
    print("columns to drop found...")
    df = df.drop(*col_to_drop)  
  
    return df, col_to_drop

### Reading the Dataframe

#### Reading the First 20 rows only

In [7]:
train_df_temp = spark.read.option(
    "header", 'true',
).csv(
    TRAIN_DATA_PATH,
).limit(
    20
)
train_labels_temp = spark.read.option(
    "header", 'true',
).csv(
    TRAIN_LABEL_PATH,
).limit(
    20
)

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

#### Define Schema Using Sampled Temporary Dataframe

In [8]:
## Known Datatypes: 

string_dtypes = ["customer_ID", 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
date_dtypes = ['S_2']
integer_dtypes = ['target']

In [9]:
train_schema = create_spark_schema(train_df_temp.columns)
label_schema = create_spark_schema(train_labels_temp.columns)

#### Remove Temp Datasets from Memory

In [10]:
train_df_temp.unpersist()
train_labels_temp.unpersist()

del train_df_temp
del train_labels_temp

#### Reading the Whole Dataset with the Inferred Schema

In [11]:
train_df = spark.read.option(
    "header", 
    "true",
).csv(
    TRAIN_DATA_PATH, 
    schema=train_schema
)
label_df = spark.read.option(
    "header", 
    "true",
).csv(
    TRAIN_LABEL_PATH, 
    schema=label_schema,
)

In [12]:
## Other categorization of the known dtypes
info_cols = ['customer_ID', 'S_2']
target_cols = ['target']
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']


# Define Numeric Columns
excluded = info_cols + cat_cols
num_cols = [col for col in train_df.columns if col not in excluded]

### Preprocessing of the Dataset

#### Dropping Null Columns

In [13]:
## Remove All Columns with More than 5% Missing Values
train_df, cols_to_drop = dropNullColumns(train_df, 0.05)

22/12/16 02:39:08 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

null counts calculated...




columns to drop found...


                                                                                

#### Remove Less Important Column S_2

In [14]:
## Remove the S_2 variable as the testing data and the training data are in different time periods 
train_df = train_df.drop("S_2")

#### Converting Categorical Columns to Numeric using StringIndexer

In [15]:
cat_columns_to_index = list(set(train_df.columns) & set(cat_cols))

In [16]:
cat_cols_indexed = add_suffix(cat_columns_to_index, "_index")

## Create StringIndexer Object
indexer = StringIndexer(
    inputCols=cat_columns_to_index,
    outputCols=cat_cols_indexed,
)
indexer.setHandleInvalid("keep")
indexer_model = indexer.fit(train_df)

train_df = indexer_model.transform(train_df)

                                                                                

#### Impute values for numerical columns

In [17]:
num_columns_to_impute = list(set(train_df.columns) & set(num_cols))

In [18]:
num_cols_imputed = add_suffix(num_columns_to_impute, "_imputed")

##Create Imputer
imputer = Imputer(
    inputCols=num_columns_to_impute,
    outputCols=num_cols_imputed,
)
imputer.setStrategy("median")

imputer_model = imputer.fit(train_df)

train_df = imputer_model.transform(train_df)

                                                                                

#### OneHotEncode the Categorical Columns

In [19]:
cat_cols_ohe = add_suffix(cat_cols_indexed, "_ohe")
https://github.com/yangsong24/Amex_credit_card_default_prediction.git
### Create Ohe Object
ohe = OneHotEncoder(
    inputCols = cat_cols_indexed,
    outputCols = cat_cols_ohe,
)

ohe_model = ohe.fit(train_df)

train_df = ohe_model.transform(train_df)

In [20]:
useful_cols = ["customer_ID"] + cat_cols_ohe + num_cols_imputed

### Remove Unnecessary Columns and Aggregate

In [21]:
train_df = train_df.select(*useful_cols)

In [22]:
new_num_cols = []
for num_col in num_cols_imputed:
    new_name = num_col.split("_")[0] + "_" + num_col.split("_")[1]
    new_num_cols.append(new_name)
    train_df = train_df.withColumnRenamed(num_col, new_name)
new_cat_cols = []
for cat_col in cat_cols_ohe:
    new_name = cat_col.split("_")[0] + "_" + cat_col.split("_")[1]
    new_cat_cols.append(new_name)
    train_df = train_df.withColumnRenamed(cat_col, new_name)

In [23]:
## Aggregation Functions
num_funcs = [
    (F.mean, "_mean"),
     (F.min, "_min"),
     (F.max, "_max"),
]

cat_funcs = [
    (F.count, "_count"),
    (F.last, "_last"),
    (F.countDistinct, "_nunique"),
]

In [24]:
agg_num_args = [
    func(col).alias(col + suffix) 
    for col, (func, suffix) in itertools.product(new_num_cols, num_funcs)]

agg_cols_args = [
    func(col).alias(col + suffix) 
    for col, (func, suffix) in itertools.product(new_cat_cols, cat_funcs)]

# Combine numeric and categoric agg arguments
agg_args = agg_num_args + agg_cols_args

In [25]:
train_df = train_df.groupBy("customer_ID").agg(*agg_args)

In [26]:
train_df = train_df.join(
    F.broadcast(label_df), 
    on="customer_ID",
)

In [27]:
va_model = VectorAssembler(
    inputCols=train_df.drop(
        "customer_ID",
        "target",
    ).columns,
    outputCol="features",
    handleInvalid="skip",
)

In [28]:
train_df = va_model.transform(
    train_df,
).select(
    [
        "customer_ID", 
        "features", 
        "target",
    ]
).persist(
    StorageLevel.DISK_ONLY,
)

                                                                                

### Train Test Split

In [29]:
train_split, test_split = train_df.randomSplit(weights = [0.8, 0.2], seed = 42)

### Fit Models

#### Logistic Regression

In [None]:
lr = LogisticRegression(
    featuresCol="features",
    labelCol="target",
)
lr_model = lr.fit(train_split)

22/12/16 03:49:38 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/12/16 03:49:38 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [None]:
lr_preds = lr_model.transform(test_split)

In [None]:
binEval = BinaryClassificationEvaluator(rawPredictionCol="prediction",labelCol="target",metricName="areaUnderROC")
multiEval = MulticlassClassificationEvaluator(labelCol = "target", predictionCol = "prediction")

In [None]:
print("AUCROC: ", binEval.evaluate(lr_preds))
print("Accuracy: ", multiEval.evaluate(lr_preds, {multiEval.metricName: "accuracy"}))
print("F1 Score: ", multiEval.evaluate(lr_preds, {multiEval.metricName: "f1"}))
print("Weighted Precision: ", multiEval.evaluate(lr_preds, {multiEval.metricName: "weightedPrecision"}))
print("Weighted Recall: ", multiEval.evaluate(lr_preds, {multiEval.metricName: "weightedRecall"}))

                                                                                

AUCROC:  0.8495909980592996


                                                                                

Accuracy:  0.8898278218092375


                                                                                

F1 Score:  0.8889484226843926


                                                                                

Weighted Precision:  0.8884047070122425




Weighted Recall:  0.8898278218092375


                                                                                