# Default Detection for American Express 

## Mackenzie Carter, Salvador R Sanchez Castro

### University of San Diego 

### Master of Science, Applied Data Science 

## MADS 502: Data Mining  

 The dataset contains aggregated profile features for each customer at each statement date. Features are anonymized and normalized, and fall into the following general categories: 
 
$D_*$ = Delinquency variables 

$S_*$ = Spend variables 

$P_*$ = Payment variables 

$B_*$ = Balance variables 

$R_*$ = Risk variables 

with the following features being categorical:  

['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68'] 


Note that the negative class has been subsampled for this dataset at 5%, and thus receives a 20x weighting in the scoring metric. 

***

# Section 1 Explore Data

In [1]:
#Import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from scipy import stats
import time
from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter

In [2]:
#Suppress all warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
#Read the training sample into a DataFrame and report how long the load took
start_time = time.time()
train_data = pd.read_csv('Train25.csv')
#train_data = pd.read_csv('train_data.csv')

elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)
train_data.head()

In [None]:
#Read the target labels into a DataFrame and report how long the load took
start_time = time.time()
train_labels = pd.read_csv('train_labels.csv')
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)
train_labels.head()

In [None]:
#Attach the target class to every statement row via the shared customer_ID key
train_data = train_data.merge(train_labels, on='customer_ID')
train_data.head()

In [None]:
#Define y (Target) as a single-column DataFrame
y = train_data['target'].to_frame()
y.head()

In [None]:
#Remove the target from the feature table (it now lives in y)
train_data = train_data.drop('target', axis=1)

In [None]:
#df Shape
train_data.shape

In [None]:
#Data types
train_data.info()

## Work with Non Numerical Data

In [None]:
#function for extracting objects
def getObjectCol(df):
    """Return a DataFrame holding only the object-dtype columns of ``df``."""
    return df.select_dtypes(include=['object'])

In [None]:
#function for extracting numerical
def getNumericCol(df):
    """Return a DataFrame holding only the float64 and int64 columns of ``df``."""
    numeric_dtypes = ['float64', 'int64']
    return df.select_dtypes(include=numeric_dtypes)

In [None]:
#Keep the identifier columns in their own frame
id_columns = ['Unnamed: 0', 'customer_ID']
train_data_ID = train_data.loc[:, id_columns]
train_data_ID.head()

In [None]:
#Segregate Categorical
#`target` has already been dropped from train_data, so selecting it there raises
#a KeyError; take it from y instead. It is needed as the hue of the plots below.
categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
train_data_Cat = pd.concat([y[['target']], train_data[categorical_cols]], axis=1)
train_data_Cat.head()

In [None]:
#View Data Type
train_data_Cat.info()

In [None]:
#Cast every column of the categorical frame to the pandas category dtype in one call
train_data_Cat = train_data_Cat.astype('category')

In [None]:
#View Data Type after transformation
train_data_Cat.info()

In [None]:
#View data: share of each target class per level of the B_* categorical features
sns.set_theme(style="whitegrid")
sns.set_context("poster")
fig, axes = plt.subplots(1, 2, figsize=(20,8), sharey=True)

#One stacked-proportion histogram per feature, each with its own palette
panels = zip(('B_30', 'B_38'), ("PuBu", "Purples"), axes)
for feature, palette, axis in panels:
    sns.histplot(train_data_Cat, x=feature, hue="target", stat="probability", multiple="fill",
                 shrink=.8, palette=palette, ax=axis)
plt.show()

In [None]:
#View data: share of each target class per level of D_63, D_64, D_66 and D_68
sns.set_theme(style="whitegrid")
sns.set_context("poster")
fig, axes = plt.subplots(1, 4, figsize=(20,8), sharey=True)

#One stacked-proportion histogram per feature, each with its own palette
panels = zip(('D_63', 'D_64', 'D_66', 'D_68'), ("PuBu", "Purples", "Greens", "Reds"), axes)
for feature, palette, axis in panels:
    sns.histplot(train_data_Cat, x=feature, hue="target", stat="probability", multiple="fill",
                 shrink=.8, palette=palette, ax=axis)
plt.show()

In [None]:
#View data: share of each target class per level of D_114, D_116 and D_117
sns.set_theme(style="whitegrid")
sns.set_context("poster")
fig, axes = plt.subplots(1, 3, figsize=(20,8), sharey=True)

#One stacked-proportion histogram per feature, each with its own palette
panels = zip(('D_114', 'D_116', 'D_117'), ("PuBu", "Purples", "Greens"), axes)
for feature, palette, axis in panels:
    sns.histplot(train_data_Cat, x=feature, hue="target", stat="probability", multiple="fill",
                 shrink=.8, palette=palette, ax=axis)

plt.show()

In [None]:
#View data: share of each target class per level of D_120 and D_126
sns.set_theme(style="whitegrid")
sns.set_context("poster")
fig, axes = plt.subplots(1, 2, figsize=(20,8), sharey=True)

#One stacked-proportion histogram per feature, each with its own palette
panels = zip(('D_120', 'D_126'), ("PuBu", "Purples"), axes)
for feature, palette, axis in panels:
    sns.histplot(train_data_Cat, x=feature, hue="target", stat="probability", multiple="fill",
                 shrink=.8, palette=palette, ax=axis)
plt.show()

In [None]:
#The plots are done, so remove the target from the categorical frame again
train_data_Cat = train_data_Cat.drop('target', axis=1)

In [None]:
#Data quality report

def _int_if_possible(level):
    """Render numeric category levels as ints (e.g. 1.0 -> 1); leave other labels as they are."""
    try:
        return int(level)
    except (TypeError, ValueError, OverflowError):
        return level

#Mode / second-mode table: one row per categorical feature.
#Rows are collected in a list and turned into a DataFrame once, because
#DataFrame.append no longer exists in pandas 2.x.
n_obs = len(train_data_Cat)
freq_rows = []
for col in train_data_Cat.columns:
    counts = train_data_Cat[col].value_counts()  # sorted by frequency, NaN excluded
    row = {'Feature': col}
    for label, rank in (('Mode', 0), ('2nd Mode', 1)):
        if rank < len(counts):
            hits = int(counts.iloc[rank])
            row[label] = _int_if_possible(counts.index[rank])
            row[label + ' Freq.'] = hits
            # Share of ALL rows (missing values included in the denominator)
            row[label + ' %'] = hits / n_obs if n_obs else np.nan
        else:
            # Feature has fewer than two levels: nothing to report for this rank
            row[label] = row[label + ' Freq.'] = row[label + ' %'] = np.nan
    freq_rows.append(row)

freqDF = pd.DataFrame(freq_rows, columns=['Feature','Mode','Mode Freq.','Mode %',
                                          '2nd Mode','2nd Mode Freq.','2nd Mode %'])
freqDF = freqDF.set_index('Feature')

#Nulls, Counts, Cardinality
NUllFeatures = train_data_Cat.isnull().mean().round(4).sort_values(ascending=False)
Count = train_data_Cat.count()
uni = train_data_Cat.nunique()

#Formatting: name each summary series, then put everything side by side
result = pd.concat([Count.rename("Count"), NUllFeatures.rename("% Miss."), uni.rename("Card.")], axis=1)
result = pd.concat([result, freqDF], axis=1)
result.style.format({'% Miss.': "{:.1%}",'Mode %': "{:.0%}",'2nd Mode %': "{:.0%}",\
                     'Count': "{:,}",'Card.': "{:,}",'Mode Freq.': "{:,}",\
                    '2nd Mode Freq.': "{:,}"})

In [None]:
result['Mode %']

In [None]:
#Drop attributes with Mode% Above 95% (dominated by a single level)
dominant_mode_cols = ['D_66', 'D_116']
train_data_Cat = train_data_Cat.drop(dominant_mode_cols, axis=1)

In [None]:
#Drop Id and Cat Columns from the main table in a single pass
id_cols = ['Unnamed: 0', 'customer_ID']
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
train_data = train_data.drop(columns=id_cols + cat_cols)

In [None]:
#Show object col
train_data_objects = getObjectCol(train_data)
train_data_objects.head()

In [None]:
#Parse the S_2 strings into datetimes
parsed_dates = pd.to_datetime(train_data_objects['S_2'])
train_data_objects = train_data_objects.assign(S_2=parsed_dates)
train_data_objects.info()

In [None]:
#Unique values on D_63
train_data_Cat['D_63'].unique()

In [None]:
#Unique values on D_64
train_data_Cat['D_64'].unique()

In [None]:
#show percentage of null values
def nullVals(df):
    """Return, per column of ``df``, the percentage of missing values rounded to 2 decimals."""
    missing_share = df.isnull().sum() / df.shape[0]
    return round(missing_share * 100, 2)

In [None]:
#Checking percantage of null values in objects
nullVals(train_data_Cat)

In [None]:
#dummies on objects
#Request float indicators explicitly: newer pandas returns bool dummies by
#default, and mixing bool with float columns makes the design matrix object
#dtype, which statsmodels refuses to fit.
train_data_Cat_dum = pd.get_dummies(train_data_Cat, dtype=float)
train_data_Cat_dum.head()

## Work with Numerical Data

In [None]:
#Show Numeric col
train_data_numeric = getNumericCol(train_data)

#Drop target only if it is still present: it has already been removed from
#train_data, so an unconditional drop raises a KeyError.
train_data_numeric = train_data_numeric.drop(columns=['target'], errors='ignore')
train_data_numeric

In [None]:
from sklearn.preprocessing import MinMaxScaler

scale = MinMaxScaler()

train_data_numeric[train_data_numeric.columns] = scale.fit_transform(train_data_numeric) 

In [None]:
train_data_numeric

In [None]:
#Checking percantage of null values in numeric
numericNulls = nullVals(train_data_numeric)
numericNulls

In [None]:
#Limit of nulls aceptable (pct)
x = 20

In [None]:
#Keep only the attributes whose share of nulls exceeds the reporting threshold
pctNulls = 5
numericNulls = numericNulls[numericNulls > pctNulls]

#To DF, largest share of missing values first
numericNullsDf = numericNulls.to_frame(name='numericNulls')\
                             .sort_values('numericNulls', ascending=False)

# figure for missing values
sns.set_theme(style="darkgrid")
sns.set(font_scale=1.5)
fig, ax = plt.subplots(figsize=(20,10))

sns.barplot(x=numericNullsDf.index,
            y=numericNullsDf.numericNulls,
            data=numericNullsDf,
            ax=ax)
ax.axhline(x, ls='--')  # dashed line at the acceptable-null limit x

ax.set(title='Null Values in Numeric Data',
       xlabel="Attribute", ylabel='% of Missing Values')

plt.xticks(rotation=45)
plt.show()

In [None]:
print('Number of columns with nulls above ' + \
      str(pctNulls) + '% is -> ' + str(numericNulls.shape[0]))

In [None]:
#Attributes whose share of nulls exceeds the acceptable limit x
over_limit = numericNullsDf['numericNulls'] > x
numericNullsDf_aboveXpct = numericNullsDf.loc[over_limit]
cols2Drop = numericNullsDf_aboveXpct.index
cols2Drop

In [None]:
#Drop variables with to many nulls
train_data_numeric_clean = train_data_numeric.drop(columns= cols2Drop)
train_data_numeric_clean

In [None]:
#Correlation Matrix
corr = train_data_numeric_clean.corr().abs()

In [None]:
#Top Correlations: flatten the matrix to (attribute, attribute, value) rows, strongest first
topCorr = corr.unstack().sort_values(ascending=False)

topCorrDF = topCorr.reset_index()
topCorrDF.columns = ['ATTR A','ATTR B','Corr.']

#Discard the diagonal (an attribute paired with itself); the row index is left untouched
is_self_pair = topCorrDF['ATTR A'] == topCorrDF['ATTR B']
topCorrDF = topCorrDF[~is_self_pair]
topCorrDF.head(10)


In [None]:
#High Corrlated values
topCorrDFHC = topCorrDF[topCorrDF['Corr.'] > 0.85]
topCorrDFHC.shape

In [None]:
# figure for correlations above the high-correlation cut-off
sns.set_theme(style="darkgrid")
sns.set(font_scale=1.5)
fig, ax = plt.subplots(figsize=(20,10))

sns.barplot(x=topCorrDFHC.index,
            y=topCorrDFHC['Corr.'],
            data=topCorrDFHC,
            ax=ax)

ax.set(title='High Correlation',
       xlabel="Attribute Pair Index", ylabel='Correlation')

plt.xticks(rotation=45)
plt.show()

In [None]:

#Add a running row counter (0, 1, 2, ...) to the high-correlation pairs
row_numbers = np.arange(len(topCorrDFHC))
topCorrDFHC = topCorrDFHC.assign(Running=row_numbers)
topCorrDFHC.head()

In [None]:
#Remove the mirrored duplicate of every highly correlated pair.
#Each pair appears twice in the unstacked matrix, as (A, B) and (B, A). Keeping
#the row whose first name sorts before the second retains exactly one of the
#two, regardless of how the sort ordered tied values (row-index parity does not
#guarantee that). The alphabetically last attribute of any correlated group is
#never in 'ATTR A', so it always survives the drop that follows.
topCorrDFHC = topCorrDFHC[topCorrDFHC['ATTR A'] < topCorrDFHC['ATTR B']]
topCorrDFHC.head()

In [None]:
#Drop one attribute ('ATTR A') of every highly correlated pair
redundant_cols = topCorrDFHC['ATTR A'].unique()
train_data_numeric_clean_HC = train_data_numeric_clean.drop(columns=redundant_cols)
train_data_numeric_clean_HC

In [None]:
#Standard deviation of every remaining numeric attribute, largest first
trainSD = train_data_numeric_clean_HC.std().sort_values(ascending=False).to_frame()
trainSD

In [None]:
#Low variation
SDMin = 0.1
trainSDLow = trainSD[trainSD[0] < SDMin]
trainSDLow.shape

In [None]:
#Low variation to drop
cols2DropLow = trainSDLow.index
cols2DropLow

In [None]:
#Drop variables low variation
train_data_numeric_clean_HC_HighVariation = train_data_numeric_clean_HC.drop(columns= cols2DropLow)
train_data_numeric_clean_HC_HighVariation

In [None]:
# figure: standard deviation per attribute with the low-variation cut-off
sns.set_theme(style="darkgrid")
fig, ax = plt.subplots(figsize=(20,10))

sns.barplot(x=trainSD.index,
            y=trainSD[0],
            data=trainSD,
            ax=ax)

ax.set(title='Standar Deviation',
       xlabel="Attribute", ylabel='STD')
ax.axhline(SDMin, ls='--')  # dashed line at the SDMin threshold

plt.xticks(rotation=45)
plt.show()

In [None]:
#Columns whose name contains 'D', with the target appended as the last column
delinquency_features = train_data_numeric_clean_HC.filter(regex='D')
Delinquency = pd.concat([delinquency_features, y], axis=1)
Delinquency.head()

In [None]:
#Figure config
sns.set_theme(style="darkgrid")
sns.set_context("poster")
fig, ax = plt.subplots(figsize=(20,10))

ax = sns.histplot(data=Delinquency, x="D_47", kde=True, hue="target")

#Format Axis
ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,}'))

plt.show()

In [None]:
N = 18
# Split the Delinquency columns into groups of N for separate boxplots.
# The first group starts at column 0 (the previous 1:N slice skipped it and
# returned only N-1 columns); the middle group is the next N columns.
First_n_column_Deli  = Delinquency.iloc[: , :N]
Middle_n_column_Deli  = Delinquency.iloc[: , N:2 * N]
# NOTE: the last group includes the `target` column, which is the final column of Delinquency
Last_n_column_Deli  = Delinquency.iloc[: , -N:]

In [None]:
#Boxplot of Delinquency first columns
#The frame is named First_n_column_Deli (capital F); the lowercase spelling raised a NameError.
fig, ax = plt.subplots(figsize=(20,10))
sns.boxplot(x="variable", y="value", data=pd.melt(First_n_column_Deli), ax=ax)

plt.show()

In [None]:
#Boxplot of the middle group of Delinquency columns (long format: one box per variable)
fig, ax = plt.subplots(figsize=(20,10))
middle_long = pd.melt(Middle_n_column_Deli)
sns.boxplot(x="variable", y="value", data=middle_long, ax=ax)

plt.show()

In [None]:
#Boxplot of the last group of Delinquency columns (long format: one box per variable)
fig, ax = plt.subplots(figsize=(20,10))
last_long = pd.melt(Last_n_column_Deli)
sns.boxplot(x="variable", y="value", data=last_long, ax=ax)

plt.show()

In [None]:
Risk = train_data_numeric_clean_HC.filter(regex='R')
Risk.head()

In [None]:
#Boxplot of the Risk columns (long format: one box per variable)
fig, ax = plt.subplots(figsize=(20,10))
risk_long = pd.melt(Risk)
sns.boxplot(x="variable", y="value", data=risk_long, ax=ax)

plt.show()

In [None]:
#Columns whose name contains 'S'
Spend = train_data_numeric_clean_HC.filter(regex='S')
#shape is an attribute (a tuple), not a method; calling it raises a TypeError
Spend.shape

In [None]:
Payment = train_data_numeric_clean_HC.filter(regex='P')
Payment.head()

In [None]:
Balance = train_data_numeric_clean_HC.filter(regex='B')
Balance.head()

In [None]:
Delinquency.describe()

# Modeling

***

## Logistic Regression

In [None]:
#Declare X variables

#Combine the numeric features with the dummy-encoded categorical variables:
#X0 uses the set with low-variation columns removed, X1 the set that keeps them
X0 = pd.concat([train_data_numeric_clean_HC_HighVariation, train_data_Cat_dum], axis=1)
X1 = pd.concat([train_data_numeric_clean_HC, train_data_Cat_dum], axis=1)

#Impute missing values with the column mean, then add the intercept column
X0 = sm.add_constant(X0.fillna(X0.mean()))
X1 = sm.add_constant(X1.fillna(X1.mean()))
X1

In [None]:
#Logictic Regression
logreg00 = sm.Logit(y, X0).fit()
logreg01 = sm.Logit(y, X1).fit()
logreg01.summary2()

In [None]:
def results_summary_to_dataframe(results):
    """Collect the key figures of a fitted statsmodels result in one DataFrame.

    Parameters
    ----------
    results : object
        Fitted results object exposing ``params``, ``pvalues`` and
        ``conf_int()`` (for example what ``sm.Logit(...).fit()`` returns).

    Returns
    -------
    pandas.DataFrame
        One row per model term with the columns ``coeff``, ``pvals``,
        ``conf_lower`` and ``conf_higher``, in that order.
    """
    # Compute the interval table once; column 0 is used as the lower bound
    # and column 1 as the upper bound.
    conf = results.conf_int()

    # Dict order fixes the column order, so no re-ordering step is needed.
    return pd.DataFrame({"coeff": results.params,
                         "pvals": results.pvalues,
                         "conf_lower": conf[0],
                         "conf_higher": conf[1]})

In [None]:
summaryDF = results_summary_to_dataframe(logreg01)
summaryDF

In [None]:
#Terms whose p-value exceeds the 0.02 cut-off are treated as not significant
not_significant = summaryDF['pvals'] > 0.02
summaryDF_05 = summaryDF.loc[not_significant]
cols2DropSum = summaryDF_05.index

In [None]:
#X2 is X1 - col with los high P Value
X2 = X1.drop(columns= cols2DropSum)

In [None]:
#Logictic Regression with significant variables 
logreg02 = sm.Logit(y, X2).fit()
logreg02.summary2()

# Long time

In [None]:
#Feature ranking with recursive feature elimination (RFE) to determine the top 15 variables
#from sklearn.linear_model import LogisticRegression
#logreg = LogisticRegression()

#from sklearn.feature_selection import RFE
#rfe = RFE(logreg, n_features_to_select=15)         # running RFE with 15 variables as output
#rfe = rfe.fit(X2, y)

#list(zip(X2.columns, rfe.support_, rfe.ranking_))

In [None]:
#Top Columns
#col_Top = X2.columns[rfe.support_]
#col_Top

In [None]:
#X3 is Top variables
#X3 = sm.add_constant(X2[col_Top])
#X3.head()

In [None]:
#Logictic Regression
#logreg03 = sm.Logit(y, X3).fit()
#logreg03.summary2()

In [None]:
# Getting the predicted values on the train set
y_pred00 = logreg00.predict(X0)
y_pred01 = logreg01.predict(X1)
y_pred02 = logreg02.predict(X2)
#y_pred03 = logreg03.predict(X3)
y_pred01

In [None]:
#Turn the predicted probabilities into 0/1 class labels using a 0.5 cut-off
prediction00 = y_pred00.gt(0.5).astype('int').to_frame('y_predict')

prediction01 = y_pred01.gt(0.5).astype('int').to_frame('y_predict')

prediction02 = y_pred02.gt(0.5).astype('int').to_frame('y_predict')

#prediction03 = (y_pred03 > 0.5).astype('int')
#prediction03 = prediction03.to_frame('y_predict')

prediction01

In [None]:
#More libraries
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
#Function for Model Evaluation
def ModelEval(y, prediction,level_0,level_1):
    """Plot the confusion matrix of a prediction and return its classification report.

    Parameters
    ----------
    y : array-like
        True class labels.
    prediction : array-like
        Predicted class labels for the same rows as ``y``.
    level_0, level_1 : str
        Display names of the two classes, used for the plot labels and the
        report rows.

    Returns
    -------
    pandas Styler
        The scikit-learn classification report (precision, recall, f1-score,
        support) as a formatted table.
    """
    target_names = [level_0, level_1]

    # Build the matrix from the predictions passed in; the earlier version read
    # the notebook global `prediction01`, so every model showed the same plot.
    cm = confusion_matrix(y, prediction)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=target_names)
    fig, ax = plt.subplots(figsize=(6,6))
    disp.plot(ax=ax)
    plt.grid(False)  # grid lines would cut through the matrix cells

    plt.show()

    #Report
    report = classification_report(y,
                                   prediction,
                                   digits=3,
                                   output_dict=True,
                                   target_names=target_names)
    # Transpose so that each class / average is a row and each metric a column
    reportDF = pd.DataFrame(report).T
    reportDF = round(reportDF,3)
    reportDF['support'] = reportDF['support'].astype('int')
    reportDF = reportDF.style.format({'precision': "{:.4}",
                                      'recall': "{:.4}",
                                      'f1-score': "{:.4}",
                                      'support': "{:,}"})
    return reportDF

In [None]:
#Report for model hige variation
reprot1 = ModelEval(y,prediction00,'Non-Default','Default')
reprot1

In [None]:
#Report for Original X
reprot1 = ModelEval(y,prediction01,'Non-Default','Default')
reprot1

In [None]:
reprot2 = ModelEval(y,prediction02,'Non-Default','Default')
reprot2

In [None]:
#reprot3 = ModelEval(y,prediction03,'Non-Default','Default')
#reprot3

## c5.0 Model

In [None]:
#Add Libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.tree import export_graphviz

In [None]:
#Initiate Decision tree
amexmodel0=DecisionTreeClassifier(criterion='entropy', max_depth=6, max_leaf_nodes=6)
amexmodel=DecisionTreeClassifier(criterion='entropy', max_depth=6, max_leaf_nodes=6)

In [None]:
#Fit tree
amexmodel0.fit(X0,y)
amexmodel.fit(X1,y)

In [None]:
#Re-fit the tree on X0, the feature set from which the low-variation columns were removed.
#NOTE(review): amexmodel0 was already fitted on X0 in the previous cell, so this repeats that fit.
amexmodel0.fit(X0,y)

In [None]:
#Predict 
amexc5predict0 = amexmodel0.predict(X0)
amexc5predict = amexmodel.predict(X1)
amexc5predict

In [None]:
#Evaluation
C50Report = ModelEval(y,amexc5predict,'Non-Default','Default')
C50Report

In [None]:
#Evaluation
C50Report0 = ModelEval(y,amexc5predict0,'Non-Default','Default')
C50Report0

In [None]:
from sklearn.tree import plot_tree

#Figure size
plt.figure(figsize=(20,10))

#Plot tree
plot_tree(amexmodel, \
          filled=True, rounded = True)
plt.show()

## Cart

In [None]:
#Initiate Decision tree
amexmodelcart=DecisionTreeClassifier(criterion='gini', max_depth=6, max_leaf_nodes=6)

In [None]:
#Fit tree
amexmodelcart.fit(X1,y)

In [None]:
#Predict 
amexcartpredict = amexmodelcart.predict(X1)
amexcartpredict

In [None]:
#Evaluation
CartReport = ModelEval(y,amexcartpredict,'Non-Default','Default')
CartReport

In [None]:
#Figure size
plt.figure(figsize=(20,10))

#Plot tree
plot_tree(amexmodelcart, \
          filled=True, rounded = True)
plt.show()

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfy = np.ravel(y)

In [None]:
#Grow the forest
rf01 = RandomForestClassifier(n_estimators = 20, criterion="gini", max_depth=6)

In [None]:
#Fit forest
rf01.fit(X1,rfy)

In [None]:
#Predict 
rfpredictions = rf01.predict(X1)
rfpredictions

In [None]:
#Evaluation
ForestReport = ModelEval(y,rfpredictions,'Non-Default','Default')
ForestReport

In [None]:
#Figure size
plt.figure(figsize=(20,10))

#Base estimator, single tree from forest
baseTree = rf01.estimators_[0]

#Plot tree
plot_tree(baseTree, \
          filled=True, rounded = True)
plt.show()

## XGBoost

In [None]:
from xgboost import XGBClassifier # XGBoost algorithm
xgb0 = XGBClassifier(max_depth = 4)
xgb = XGBClassifier(max_depth = 4)
xgb0.fit(X0, y)
xgb.fit(X1, y)


In [None]:
xgb_yhat0 = xgb0.predict(X0)
xgb_yhat = xgb.predict(X1)
xgb_yhat

In [None]:
X1.shape

In [None]:
#Evaluation low variation
XGBoost0Report = ModelEval(y,xgb_yhat0,'Non-Default','Default')
XGBoost0Report

In [None]:
#Evaluation
XGBoostReport = ModelEval(y,xgb_yhat,'Non-Default','Default')
XGBoostReport