In [276]:
#Importing the required libraries
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , Activation
from tensorflow.keras import regularizers

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

# EDA 
In this section we will do EDA and do the nessacary preprocessing. Main thing we
will be concerned will be missing values, we will take care of them, and also check whether the target has balanced classes

In [250]:
#loading data
#column name data not given hence header=None
df = pd.read_csv('/content/sample_data/crx.data',header=None)

In [251]:
df.columns = "Gender, Age, Debt, Married, BankCustomer, EducationLevel, Ethnicity, YearsEmployed, PriorDefault, Employed, CreditScore, DriversLicense, Citizen, ZipCode, Income, ApprovalStatus".split(', ')

In [252]:
df.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,ApprovalStatus
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [253]:
#getting summary statistics
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          690 non-null    object 
 1   Age             690 non-null    object 
 2   Debt            690 non-null    float64
 3   Married         690 non-null    object 
 4   BankCustomer    690 non-null    object 
 5   EducationLevel  690 non-null    object 
 6   Ethnicity       690 non-null    object 
 7   YearsEmployed   690 non-null    float64
 8   PriorDefault    690 non-null    object 
 9   Employed        690 non-null    object 
 10  CreditScore     690 non-null    int64  
 11  DriversLicense  690 non-null    object 
 12  Citizen         690 non-null    object 
 13  ZipCode         690 non-null    object 
 14  Income          690 non-null    int64  
 15  ApprovalStatus  690 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB


In [254]:
#missing values are marked as '?', now we shall convert it to np.nan
df = df.replace('?',np.nan)

In [255]:
#checking for missing values
df.isnull().sum()

Gender            12
Age               12
Debt               0
Married            6
BankCustomer       6
EducationLevel     9
Ethnicity          9
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
DriversLicense     0
Citizen            0
ZipCode           13
Income             0
ApprovalStatus     0
dtype: int64

In [256]:
#the good news is that the target doesnt have any missing values.
#we will first impute the missing values of the mnumeric columns using the
#the mean of the column
df.fillna(df.mean(),inplace=True)

  after removing the cwd from sys.path.


In [257]:
#checking for missing values
df.isnull().sum()

Gender            12
Age               12
Debt               0
Married            6
BankCustomer       6
EducationLevel     9
Ethnicity          9
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
DriversLicense     0
Citizen            0
ZipCode           13
Income             0
ApprovalStatus     0
dtype: int64

In [258]:
#checking the unique values in the columns which have missing values
cols = ['Gender','Age','Married','BankCustomer','EducationLevel','Ethnicity','ZipCode']
for col in cols:
  print(df[col].value_counts())
  print()

b    468
a    210
Name: Gender, dtype: int64

22.67    9
20.42    7
24.50    6
18.83    6
25.00    6
        ..
48.25    1
28.33    1
18.75    1
18.50    1
36.42    1
Name: Age, Length: 349, dtype: int64

u    519
y    163
l      2
Name: Married, dtype: int64

g     519
p     163
gg      2
Name: BankCustomer, dtype: int64

c     137
q      78
w      64
i      59
aa     54
ff     53
k      51
cc     41
m      38
x      38
d      30
e      25
j      10
r       3
Name: EducationLevel, dtype: int64

v     399
h     138
bb     59
ff     57
j       8
z       8
dd      6
n       4
o       2
Name: Ethnicity, dtype: int64

00000    132
00120     35
00200     35
00160     34
00100     30
        ... 
00021      1
00393      1
00395      1
00093      1
00256      1
Name: ZipCode, Length: 170, dtype: int64



In [259]:
#casting column 1 to numeric
df['Age'] = df['Age'].astype('float')
df['Age'].dtype

dtype('float64')

In [260]:
df['Age'].fillna(df['Age'].mean(),inplace=True)
#checking for missing values
df.isnull().sum()

Gender            12
Age                0
Debt               0
Married            6
BankCustomer       6
EducationLevel     9
Ethnicity          9
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
DriversLicense     0
Citizen            0
ZipCode           13
Income             0
ApprovalStatus     0
dtype: int64

In [261]:
#we still see missing values in catergorical variables
#we will impute them now
for col in df.columns:
  #checking if the column is object type
  if df[col].dtype == 'object':
    df= df.fillna(df[col].value_counts().index[0])
#checking for missing values
df.isnull().sum()

Gender            0
Age               0
Debt              0
Married           0
BankCustomer      0
EducationLevel    0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
DriversLicense    0
Citizen           0
ZipCode           0
Income            0
ApprovalStatus    0
dtype: int64

In [262]:
#now we map the target attributes "+" -> 1 and "-" -> 0
mapper = {'+':1,
          '-':0}
df["ApprovalStatus"] = df["ApprovalStatus"].apply(lambda x:mapper[x])
print(df["ApprovalStatus"].dtype)

int64


In [263]:
#we are going to drop columns 11 and 13 which are zipcode and driverslicense
df = df.drop(["ZipCode"],axis=1)

In [264]:
#now we check the split of the target
df["ApprovalStatus"].value_counts()
#there seems to be only 83 difference, which is not that significant

0    383
1    307
Name: ApprovalStatus, dtype: int64

In [265]:
#creating dummy variable for the categorical fields
features = list(df.columns)[0:-1]
print(features)
for col in features:
  #checking if the column is object type
  if df[col].dtype == 'object':
    print(col)
    df = pd.concat([df,pd.get_dummies(df[col],prefix=col)],axis=1)
    df.drop(col,axis=1,inplace=True)
df.columns

['Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'EducationLevel', 'Ethnicity', 'YearsEmployed', 'PriorDefault', 'Employed', 'CreditScore', 'DriversLicense', 'Citizen', 'Income']
Gender
Married
BankCustomer
EducationLevel
Ethnicity
PriorDefault
Employed
DriversLicense
Citizen


Index(['Age', 'Debt', 'YearsEmployed', 'CreditScore', 'Income',
       'ApprovalStatus', 'Gender_a', 'Gender_b', 'Married_b', 'Married_l',
       'Married_u', 'Married_y', 'BankCustomer_b', 'BankCustomer_g',
       'BankCustomer_gg', 'BankCustomer_p', 'EducationLevel_aa',
       'EducationLevel_b', 'EducationLevel_c', 'EducationLevel_cc',
       'EducationLevel_d', 'EducationLevel_e', 'EducationLevel_ff',
       'EducationLevel_i', 'EducationLevel_j', 'EducationLevel_k',
       'EducationLevel_m', 'EducationLevel_q', 'EducationLevel_r',
       'EducationLevel_w', 'EducationLevel_x', 'Ethnicity_b', 'Ethnicity_bb',
       'Ethnicity_dd', 'Ethnicity_ff', 'Ethnicity_h', 'Ethnicity_j',
       'Ethnicity_n', 'Ethnicity_o', 'Ethnicity_v', 'Ethnicity_z',
       'PriorDefault_f', 'PriorDefault_t', 'Employed_f', 'Employed_t',
       'DriversLicense_f', 'DriversLicense_t', 'Citizen_g', 'Citizen_p',
       'Citizen_s'],
      dtype='object')

In [266]:
#now we seperate the features from the target
x_features = list(df.columns)
x_features.remove('ApprovalStatus')
X = df[x_features].values
y = df['ApprovalStatus']
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y)
encoded_Y = encoder.transform(y)

# Experimentation(Question 2)

*   In this Experimentation we will create 4 models according to our literature review.
*   The metric we will use for selecting the best model is F1-score.
*   This is not a imbalanced dataset so accuracy will also do
*   We will do a 5 fold cross validation for each model and this will be done 30 times to get the mean of the F1-score



In [267]:
#-------------------------------------------------------------------------------------------------------#
#model 0
model_0 = Sequential()
model_0.add(Dense(60,input_dim=X.shape[1],activation='tanh',kernel_initializer='random_normal'))
model_0.add(Dense(5,activation='tanh',kernel_initializer='random_normal'))
model_0.add(Dense(1,activation='sigmoid',kernel_initializer='random_normal'))
model_0.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy']
                )
#-------------------------------------------------------------------------------------------------------#
#model 1
model_1 = Sequential()
model_1.add(Dense(25,input_dim=X.shape[1],activation='tanh',kernel_initializer='random_normal'))
model_1.add(Dense(1,activation=None,kernel_initializer='random_normal'))
model_1.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy']
                )
#-------------------------------------------------------------------------------------------------------#
#model 2
model_2 = Sequential()
model_2.add(Dense(28,input_dim=X.shape[1],activation='relu',kernel_initializer='random_normal'))
model_2.add(Dense(1,activation=None,kernel_initializer='random_normal'))
model_2.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy']
                )
#-------------------------------------------------------------------------------------------------------#
#model 3
model_3 = Sequential()
model_3.add(Dense(22,input_dim=X.shape[1],activation='relu',kernel_initializer='random_normal'))
model_3.add(Dense(1,activation='sigmoid',kernel_initializer='random_normal'))
model_3.compile(loss=tf.keras.losses.MeanSquaredError(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy']
                )

models = [model_0,model_1,model_2,model_3]

In [268]:
n_split=5
for num,model in enumerate(models):
  f1_score_ls_means = []
  for i in range(30):
    f1_score_ls = []
    for train_index,test_index in KFold(n_split,shuffle=True,random_state=7).split(X):
      tf.keras.backend.clear_session()
      x_train,x_test=X[train_index],X[test_index]
      y_train,y_test=y[train_index],y[test_index]
      # print(x_train.shape)
      model.fit(x_train, y_train,epochs=20,verbose=0)
      predictions = (model.predict(x_test) > 0.5).astype("int32").ravel()
      f1_score_ls.append(f1_score(y_test, predictions))
    f1_score_ls_means.append(sum(f1_score_ls)/len(f1_score_ls))
  print(f"Average F1-Score of Model_{num}: ",sum(f1_score_ls_means)/len(f1_score_ls_means))

Average F1-Score of Model_0:  0.9532627359795771
Average F1-Score of Model_1:  0.9411191392177825
Average F1-Score of Model_2:  0.9174357619138604
Average F1-Score of Model_3:  0.9072997772273168


# Regularization (Question 4)
*   It seems like Model_0 is the best model out of the four models
*   We will use Model_0 for the rest of the experimentation on L1 and L2 regularization
*   We will select lambda to vary from 0.01 to 0.05



Model 0 is will now be updated with a kernel regularizer

In [280]:
regularizers_ls = [regularizers.L1,regularizers.L2]
lambda_values = [0.00001,0.0001,0.0002,0.0003,0.0004,0.0005]
for norm,regularizer in enumerate(regularizers_ls):
  for lambda_value in lambda_values:
    #model 0
    model_0 = Sequential()
    model_0.add(Dense(60,input_dim=X.shape[1],activation='tanh',kernel_initializer='random_normal',kernel_regularizer=regularizer(lambda_value)))
    model_0.add(Dense(5,activation='tanh',kernel_initializer='random_normal',kernel_regularizer=regularizer(lambda_value)))
    model_0.add(Dense(1,activation='sigmoid',kernel_initializer='random_normal'))
    model_0.compile(loss='binary_crossentropy',
                    optimizer=tf.keras.optimizers.Adam(),
                    metrics=['accuracy']
                    )
    n_split=5
    f1_score_ls_means = []
    for i in range(15):
      f1_score_ls = []
      for train_index,test_index in KFold(n_split,shuffle=True,random_state=7).split(X):
        tf.keras.backend.clear_session()
        x_train,x_test=X[train_index],X[test_index]
        y_train,y_test=y[train_index],y[test_index]
        # print(x_train.shape)
        model_0.fit(x_train, y_train,epochs=20,verbose=0)
        predictions = (model_0.predict(x_test) > 0.5).astype("int32").ravel()
        f1_score_ls.append(f1_score(y_test, predictions))
      f1_score_ls_means.append(sum(f1_score_ls)/len(f1_score_ls))
    print(f"Average F1-Score of Model_0 with L{norm+1} and lambda {lambda_value}: ",sum(f1_score_ls_means)/len(f1_score_ls_means))

Average F1-Score of Model_0 with L1 and lambda 1e-05:  0.9381757800722663
Average F1-Score of Model_0 with L1 and lambda 0.0001:  0.9264472359397449
Average F1-Score of Model_0 with L1 and lambda 0.0002:  0.9184415366897226
Average F1-Score of Model_0 with L1 and lambda 0.0003:  0.9198266231855766
Average F1-Score of Model_0 with L1 and lambda 0.0004:  0.9111218298864903
Average F1-Score of Model_0 with L1 and lambda 0.0005:  0.9080478485857164
Average F1-Score of Model_0 with L2 and lambda 1e-05:  0.9384078893274441
Average F1-Score of Model_0 with L2 and lambda 0.0001:  0.933451622805573
Average F1-Score of Model_0 with L2 and lambda 0.0002:  0.9366901571213432
Average F1-Score of Model_0 with L2 and lambda 0.0003:  0.9252603673623468
Average F1-Score of Model_0 with L2 and lambda 0.0004:  0.9248649476903492
Average F1-Score of Model_0 with L2 and lambda 0.0005:  0.9184443293338579
