In [18]:
# %%
import pandas as pd 

# Read the defaults dataset from the local data folder into the raw frame.
df = pd.read_csv("./data/defaults.csv")

# Build the working frame by leaving out the "ID" column; `df` keeps all columns.
dataPrep = df.drop(columns=["ID"])
print(dataPrep.dtypes)

# Column labels of the raw frame (still including "ID"); shown as the cell result.
all_cols = df.columns
all_cols

Credit Given          float64
Gender                 object
Education              object
Marital Status         object
Age                     int64
Pay Stat M1            object
Bill Amt M1             int64
Amt Paid M1             int64
Pay Stat M2            object
Bill Amt M2             int64
Amt Paid M2             int64
Pay Stat M3            object
Bill Amt M3             int64
Amt Paid M3             int64
Default Next Month     object
dtype: object


Index(['ID', 'Credit Given', 'Gender', 'Education', 'Marital Status', 'Age',
       'Pay Stat M1', 'Bill Amt M1', 'Amt Paid M1', 'Pay Stat M2',
       'Bill Amt M2', 'Amt Paid M2', 'Pay Stat M3', 'Bill Amt M3',
       'Amt Paid M3', 'Default Next Month'],
      dtype='object')

In [15]:
# %%

# Count the missing entries in every column of the working frame.
dataNull = dataPrep.isna().sum()
dataNull


Credit Given          16
Gender                12
Education             11
Marital Status         8
Age                    0
Pay Stat M1            0
Bill Amt M1            0
Amt Paid M1            0
Pay Stat M2            0
Bill Amt M2            0
Amt Paid M2            0
Pay Stat M3            0
Bill Amt M3            0
Amt Paid M3            0
Default Next Month     0
dtype: int64

In [19]:
# %%
# Fill missing values in the object-dtype columns with each column's most
# frequent value. mode() is evaluated over every column and only its first row
# is kept, so the printed Series also lists the numeric columns even though
# only the object-dtype columns are filled below.
mode = dataPrep.mode().iloc[0]
print(mode)
cols = dataPrep.select_dtypes(include=['object']).columns
print(cols)
object_part = dataPrep[cols]
dataPrep[cols] = object_part.fillna(value=mode)
dataPrep

Credit Given                50000.0
Gender                       Female
Education                University
Marital Status        Never Married
Age                              29
Pay Stat M1                       0
Bill Amt M1                       0
Amt Paid M1                       0
Pay Stat M2                       0
Bill Amt M2                       0
Amt Paid M2                       0
Pay Stat M3                       0
Bill Amt M3                       0
Amt Paid M3                       0
Default Next Month               No
Name: 0, dtype: object
Index(['Gender', 'Education', 'Marital Status', 'Pay Stat M1', 'Pay Stat M2',
       'Pay Stat M3', 'Default Next Month'],
      dtype='object')


Unnamed: 0,Credit Given,Gender,Education,Marital Status,Age,Pay Stat M1,Bill Amt M1,Amt Paid M1,Pay Stat M2,Bill Amt M2,Amt Paid M2,Pay Stat M3,Bill Amt M3,Amt Paid M3,Default Next Month
0,120000.0,Female,University,Never Married,26,0,3272,1000,0,3455,0,2,3261,2000,Yes
1,90000.0,Female,University,Never Married,34,0,14331,1000,0,14948,1000,0,15549,5000,No
2,50000.0,Female,University,Married,37,0,28314,1100,0,28959,1069,0,29547,1000,No
3,50000.0,Male,University,Married,57,0,20940,9000,0,19146,689,0,19131,679,No
4,50000.0,Male,Graduate School,Never Married,37,0,19394,1000,0,19619,1000,0,20024,800,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,180000.0,Female,Graduate School,Never Married,43,-1,22928,0,0,3129,2398,-1,2398,23149,No
1596,60000.0,Female,Graduate School,Never Married,24,0,15474,1000,0,14612,2000,0,16354,1000,No
1597,40000.0,Female,High School,Married,59,0,14548,2770,0,16274,613,2,15650,14,No
1598,140000.0,Female,University,Married,34,0,103337,4500,0,105942,4000,0,101830,4200,No


In [20]:
# %%

# Fill missing values in the float-dtype columns with the mean of each column.
cols = dataPrep.select_dtypes(include=['float']).columns
print(cols)
float_part = dataPrep[cols]
mean = float_part.mean()
print(mean)
dataPrep[cols] = float_part.fillna(value=mean)
dataPrep

Index(['Credit Given'], dtype='object')
Credit Given    157815.656566
dtype: float64


Unnamed: 0,Credit Given,Gender,Education,Marital Status,Age,Pay Stat M1,Bill Amt M1,Amt Paid M1,Pay Stat M2,Bill Amt M2,Amt Paid M2,Pay Stat M3,Bill Amt M3,Amt Paid M3,Default Next Month
0,120000.0,Female,University,Never Married,26,0,3272,1000,0,3455,0,2,3261,2000,Yes
1,90000.0,Female,University,Never Married,34,0,14331,1000,0,14948,1000,0,15549,5000,No
2,50000.0,Female,University,Married,37,0,28314,1100,0,28959,1069,0,29547,1000,No
3,50000.0,Male,University,Married,57,0,20940,9000,0,19146,689,0,19131,679,No
4,50000.0,Male,Graduate School,Never Married,37,0,19394,1000,0,19619,1000,0,20024,800,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,180000.0,Female,Graduate School,Never Married,43,-1,22928,0,0,3129,2398,-1,2398,23149,No
1596,60000.0,Female,Graduate School,Never Married,24,0,15474,1000,0,14612,2000,0,16354,1000,No
1597,40000.0,Female,High School,Married,59,0,14548,2770,0,16274,613,2,15650,14,No
1598,140000.0,Female,University,Married,34,0,103337,4500,0,105942,4000,0,101830,4200,No


In [11]:
# %%
# One-hot encode the non-numeric columns of the working frame. No `columns`
# list is passed, so pandas selects the columns to encode by dtype;
# drop_first=True omits the first level of every encoded column.
dataPrep = pd.get_dummies(data=dataPrep, drop_first=True)
dataPrep.columns

Index(['Credit Given', 'Age', 'Bill Amt M1', 'Amt Paid M1', 'Bill Amt M2',
       'Amt Paid M2', 'Bill Amt M3', 'Amt Paid M3', 'Gender_Male',
       'Education_High School', 'Education_Others', 'Education_University',
       'Marital Status_Never Married', 'Marital Status_Others',
       'Pay Stat M1_0', 'Pay Stat M1_2', 'Pay Stat M1_3', 'Pay Stat M1_>3',
       'Pay Stat M2_0', 'Pay Stat M2_2', 'Pay Stat M2_3', 'Pay Stat M2_>3',
       'Pay Stat M3_0', 'Pay Stat M3_2', 'Pay Stat M3_3', 'Pay Stat M3_>3',
       'Default Next Month_Yes'],
      dtype='object')

In [13]:
# %%
# Print the column labels of the prepared frame first, then those of the raw frame.
for frame in (dataPrep, df):
    print(frame.columns)

Index(['Credit Given', 'Age', 'Bill Amt M1', 'Amt Paid M1', 'Bill Amt M2',
       'Amt Paid M2', 'Bill Amt M3', 'Amt Paid M3', 'Gender_Male',
       'Education_High School', 'Education_Others', 'Education_University',
       'Marital Status_Never Married', 'Marital Status_Others',
       'Pay Stat M1_0', 'Pay Stat M1_2', 'Pay Stat M1_3', 'Pay Stat M1_>3',
       'Pay Stat M2_0', 'Pay Stat M2_2', 'Pay Stat M2_3', 'Pay Stat M2_>3',
       'Pay Stat M3_0', 'Pay Stat M3_2', 'Pay Stat M3_3', 'Pay Stat M3_>3',
       'Default Next Month_Yes'],
      dtype='object')
Index(['ID', 'Credit Given', 'Gender', 'Education', 'Marital Status', 'Age',
       'Pay Stat M1', 'Bill Amt M1', 'Amt Paid M1', 'Pay Stat M2',
       'Bill Amt M2', 'Amt Paid M2', 'Pay Stat M3', 'Bill Amt M3',
       'Amt Paid M3', 'Default Next Month'],
      dtype='object')
