In [1]:
import pandas as pd

In [3]:
# Load the personal-loan dataset from a CSV file located in the working directory.
df = pd.read_csv('PersonalLoan(1).csv')

# A bare expression on the last line of the cell renders the DataFrame as the cell output.
df

Unnamed: 0,Age,Experience,Income,ZIPCode,Family,CCAvg,Education,Mortgage,SecuritiesAccount,CDAccount,Online,CreditCard,PersonalLoan
0,25,1,49,91107,4,1.6,1,0,Yes,No,No,No,No
1,45,19,34,90089,3,1.5,1,0,Yes,No,No,No,No
2,39,15,11,94720,1,1.0,1,0,No,No,No,No,No
3,35,9,100,94112,1,2.7,2,0,No,No,No,No,No
4,35,8,45,91330,4,1.0,2,0,No,No,No,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,40,92697,1,1.9,3,0,No,No,Yes,No,No
4996,30,4,15,92037,4,0.4,1,85,No,No,Yes,No,No
4997,63,39,24,93023,2,0.3,3,0,No,No,No,No,No
4998,65,40,49,90034,3,0.5,2,0,No,No,Yes,No,No


In [19]:
# Drop ZIP Code for now
rvar_list = ['ZIPCode']
df_sample1 = df.drop(columns=rvar_list)

# cvar_list: columns treated as categorical; nvar_list: columns treated as numerical
cvar_list = ['Education', 'SecuritiesAccount', 'CDAccount', 'Online', 'CreditCard', 'PersonalLoan']
nvar_list = ['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Mortgage']

# Standardize the numerical variables (subtract the column mean, divide by the column std).
# NOTE(review): the mean/std are computed on the full dataset, i.e. before the
# train/test partition performed later in the notebook, so test rows influence the scaling.
df_sample2 = df_sample1.copy()
df_sample2[nvar_list] = (df_sample1[nvar_list] - df_sample1[nvar_list].mean())/df_sample1[nvar_list].std()

# Make the column types explicit: categorical vs. float
df_sample3 = df_sample2.copy()
df_sample3[cvar_list] = df_sample2[cvar_list].astype('category')
df_sample3[nvar_list] = df_sample2[nvar_list].astype('float64')

# Convert the categorical variables into dummies.
# dtype=int keeps the indicator columns as 0/1 integers regardless of the
# pandas version (newer releases default to bool indicators).
df_sample4 = pd.get_dummies(df_sample3, prefix_sep='_', dtype=int)

# Remove the redundant dummies (one level per categorical variable is dropped;
# for PersonalLoan the remaining column is PersonalLoan_Yes)
rdummies = ['Education_1', 'SecuritiesAccount_Yes', 'CDAccount_Yes', 'Online_Yes', 'CreditCard_Yes', 'PersonalLoan_No']
df_sample5 = df_sample4.drop(columns=rdummies)

In [21]:
# Data partition: split the preprocessed frame into a non-test and a test partition
from sklearn.model_selection import train_test_split

# Fraction of the rows reserved for the test partition
testpart_size = 0.2
df4partition = df_sample5

# random_state is fixed so that the same split is produced on every run
df_nontestData, df_testData = train_test_split(
    df4partition,
    test_size=testpart_size,
    random_state=1,
)

# Preview the first rows of the test partition
df_testData.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,Education_2,Education_3,SecuritiesAccount_No,CDAccount_No,Online_No,CreditCard_No,PersonalLoan_Yes
2764,-1.250824,-1.317114,0.222137,-1.216733,0.550486,0.47684,0,1,1,1,1,0,0
4767,-0.90188,-0.968316,-0.625068,0.525938,-0.593902,0.437514,0,0,0,1,1,1,0
3814,-0.989116,-0.968316,-0.8423,0.525938,-0.365024,-0.555468,0,0,1,1,1,1,0
3499,0.319423,0.252477,0.873833,-1.216733,-0.937218,2.256343,0,0,1,1,0,1,0
2735,-0.814644,-0.706717,-0.081988,0.525938,0.378828,1.06673,1,0,1,1,0,1,0


In [22]:
# Baseline: every observation in the test partition is charged a cost of 2,
# and every acceptance brings a gross gain of 7 (i.e. a net gain of 5 = 7 - 2).
cost = 2 * len(df_testData)
gain = 7 * df_testData['PersonalLoan_Yes'].sum()
profit = gain - cost
# Average over all observations of the test partition
avg_profit = profit / len(df_testData)
print("The final average profit would be {} dollar.".format(avg_profit))

The final average profit would be -1.3 dollar.


In [23]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Name of the dependent variable (target) column
DV = 'PersonalLoan_Yes'
# Predictors: every column of the non-test partition except the target
X = df_nontestData.drop(columns=[DV])
# Target values taken from the same partition
y = df_nontestData[DV]

# A user-defined function summary_coef
# to display the estimated coefficients of a model candidate obtained by the Logistic Regression analysis
def summary_coef(model_object, feature_names=None):
    """Return the fitted coefficients and intercept as a one-column DataFrame.

    Parameters
    ----------
    model_object : object
        Fitted model exposing ``coef_`` (reshapeable to one row with one
        entry per predictor) and ``intercept_``.
    feature_names : sequence of str, optional
        Predictor names, in the column order the model was fitted on.
        When omitted, the columns of the notebook-level predictor frame ``X``
        are used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per predictor followed by an ``'Intercept'`` row; the single
        column holds the estimated values.
    """
    if feature_names is None:
        # Fall back to the predictors of the notebook-level training frame
        feature_names = X.columns.values
    names = list(feature_names)

    # Build a single-row frame (one column per predictor), append the
    # intercept as an extra column, then flip it so predictors become rows.
    model_coef = pd.DataFrame(model_object.coef_.reshape(1, len(names)), columns=names)
    model_coef['Intercept'] = model_object.intercept_
    return model_coef.transpose()

import numpy as np

# Settings for the logistic regression with k-fold cross validation (k=5)
kfolds = 5

# Range of the penalty weight alpha; the corresponding C values are its reciprocals,
# so the smallest alpha yields the largest C and vice versa
min_alpha = 0.01
max_alpha = 100
min_C, max_C = 1/max_alpha, 1/min_alpha

# Number of candidate C values, evenly spaced between min_C and max_C
n_candidates = 5000
C_grid = np.linspace(min_C, max_C, num=n_candidates)
C_list = list(C_grid)

In [25]:
def profit_calculation_c(model, x_value, y_value, accept_profit=5, reject_profit=-2, d_cutoff=2/7):
    """Return the average net profit per observation under a SEND / NOT SEND rule.

    An observation is targeted (SEND) when the value in the second column of
    ``model.predict_proba(x_value)`` is strictly greater than ``d_cutoff``.
    A targeted observation whose actual class is 1 (Accept) contributes
    ``accept_profit``; a targeted observation whose actual class is 0 (Reject)
    contributes ``reject_profit``; every other observation contributes 0.

    The first three positional parameters allow the function to be passed as a
    ``scoring`` callable; the keyword parameters default to the payoffs of this
    scenario, so existing three-argument calls behave as before.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict_proba``.
    x_value : array-like
        Predictor values handed to ``model.predict_proba``.
    y_value : iterable
        Actual classes (0 or 1), one per observation.
    accept_profit : number, default 5
        Net profit of a SEND decision on an actual class 1 observation.
    reject_profit : number, default -2
        Net profit of a SEND decision on an actual class 0 observation.
    d_cutoff : float, default 2/7
        Decision cut-off applied to the predicted probability.

    Returns
    -------
    float
        Cumulative net profit divided by the number of observations.

    Raises
    ------
    ZeroDivisionError
        If ``y_value`` contains no observations.
    """
    # True means SEND, False means NOT SEND
    send_flags = model.predict_proba(x_value)[:, 1] > d_cutoff

    # Materialise the actual classes so they can be counted and iterated
    actual = list(y_value)
    n_obs = len(actual)

    cum_profit = 0
    for send, outcome in zip(send_flags, actual):
        if not send:
            continue  # NOT SEND: nothing gained, nothing lost
        if outcome == 1:
            cum_profit += accept_profit
        elif outcome == 0:
            cum_profit += reject_profit

    return cum_profit / n_obs

# Fit an L1-penalised logistic regression; the value of C is picked from C_list
# by k-fold cross validation, scored with the average net profit (profit_calculation_c)
clf_optimal_c = LogisticRegressionCV(
    Cs=C_list,
    cv=kfolds,
    scoring=profit_calculation_c,
    penalty='l1',
    solver='saga',
    max_iter=200,
    random_state=1,
    n_jobs=-1,
).fit(X, y)

# Calculate the average net profit over the test partition based on the final selected model

# X_test holds the predictor values of the test partition
X_test = df_testData.drop(columns=[DV])

# y_test_actual holds the actual values of the DV in the test partition
y_test_actual = df_testData[DV]

# Apply the model held by clf_optimal_c to the test partition and
# score it with the user-defined profit_calculation_c function
print(
    "The final average profit would be {} dollar.".format(
        profit_calculation_c(clf_optimal_c, X_test, y_test_actual)
    )
)

The final average profit would be 0.327 dollar.


In [26]:
def profit_calculation_e(model, x_value, y_value, accept_profit=20, reject_profit=-8, d_cutoff=8/28):
    """Return the average net profit per observation under a SEND / NOT SEND rule.

    An observation is targeted (SEND) when the value in the second column of
    ``model.predict_proba(x_value)`` is strictly greater than ``d_cutoff``.
    A targeted observation whose actual class is 1 (Accept) contributes
    ``accept_profit``; a targeted observation whose actual class is 0 (Reject)
    contributes ``reject_profit``; every other observation contributes 0.

    The first three positional parameters allow the function to be passed as a
    ``scoring`` callable; the keyword parameters default to the payoffs of this
    scenario (20 and -8), so existing three-argument calls behave as before.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict_proba``.
    x_value : array-like
        Predictor values handed to ``model.predict_proba``.
    y_value : iterable
        Actual classes (0 or 1), one per observation.
    accept_profit : number, default 20
        Net profit of a SEND decision on an actual class 1 observation.
    reject_profit : number, default -8
        Net profit of a SEND decision on an actual class 0 observation.
    d_cutoff : float, default 8/28
        Decision cut-off applied to the predicted probability
        (8/28 is the same value as 2/7).

    Returns
    -------
    float
        Cumulative net profit divided by the number of observations.

    Raises
    ------
    ZeroDivisionError
        If ``y_value`` contains no observations.
    """
    # True means SEND, False means NOT SEND
    send_flags = model.predict_proba(x_value)[:, 1] > d_cutoff

    # Materialise the actual classes so they can be counted and iterated
    actual = list(y_value)
    n_obs = len(actual)

    cum_profit = 0
    for send, outcome in zip(send_flags, actual):
        if not send:
            continue  # NOT SEND: nothing gained, nothing lost
        if outcome == 1:
            cum_profit += accept_profit
        elif outcome == 0:
            cum_profit += reject_profit

    return cum_profit / n_obs

# Same cross-validated L1 logistic regression as before, but candidates are
# scored with profit_calculation_e instead of profit_calculation_c
clf_optimal_e = LogisticRegressionCV(
    Cs=C_list,
    cv=kfolds,
    scoring=profit_calculation_e,
    penalty='l1',
    solver='saga',
    max_iter=200,
    random_state=1,
    n_jobs=-1,
).fit(X, y)

# Compare the two models: element-wise equality of their coefficient summaries
# (True wherever both models have exactly the same estimate)
summary_coef(clf_optimal_c) == summary_coef(clf_optimal_e)

Unnamed: 0,0
Age,True
Experience,True
Income,True
Family,True
CCAvg,True
Mortgage,True
Education_2,True
Education_3,True
SecuritiesAccount_No,True
CDAccount_No,True
