# Data Cleaning and Preprocessing

### Importing Packages

In [1]:
# Importing all necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTENC
from sklearn.model_selection import train_test_split
import numpy as np

### Importing and Preview of Datasets

In [2]:
# Importing the raw "application_record" CSV file from Kaggle as a Pandas dataframe
application_record = pd.read_csv("source/application_record.csv")
application_record

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438552,6840104,M,N,Y,0,135000.0,Pensioner,Secondary / secondary special,Separated,House / apartment,-22717,365243,1,0,0,0,,1.0
438553,6840222,F,N,N,0,103500.0,Working,Secondary / secondary special,Single / not married,House / apartment,-15939,-3007,1,0,0,0,Laborers,1.0
438554,6841878,F,N,N,0,54000.0,Commercial associate,Higher education,Single / not married,With parents,-8169,-372,1,1,0,0,Sales staff,1.0
438555,6842765,F,N,Y,0,72000.0,Pensioner,Secondary / secondary special,Married,House / apartment,-21673,365243,1,0,0,0,,2.0


In [3]:
# Checking the number of unique IDs in the Application Record Dataset
application_record["ID"].nunique()

438510

In [4]:
# Importing the raw "credit_record" CSV file from Kaggle as a Pandas dataframe
credit_record = pd.read_csv("source/credit_record.csv")
credit_record

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C
...,...,...,...
1048570,5150487,-25,C
1048571,5150487,-26,C
1048572,5150487,-27,C
1048573,5150487,-28,C


In [5]:
# Checking the number of unique IDs in the Credit Record Dataset
print(credit_record["ID"].nunique()) 
# Only 45985 unique client IDs in the "credit_record" CSV file

45985


### Converting Multiclass Label to Binary Label

In [6]:
# Now, we observe that there are multiple rows per unique client ID in the "credit_record" CSV file.
# This is because for each unique client ID, there are records of his/her loan status for the current month, as well as previous months.
# Note that the STATUS column is formatted as following:
# 0: 1-29 days past due // 1: 30-59 days past due // 2: 60-89 days overdue 
# 3: 90-119 days overdue // 4: 120-149 days overdue // 5: Overdue or bad debts, write-offs for more than 150 days 
# C: paid off that month // X: No loan for the month
# Our group has the following consesus, listed below:
# We label a client "Bad" if he/she has/had a loan >= 90 days past due.
# We label a client "Good" if he/she has/had a loan <90 days past due.

# Firstly, we group the "credit_record" CSV by "ID" column 
# Secondly, for each unique client ID, we check if any value in the "STATUS" column = 3, 4 or 5
# If found, label the unique client ID as "Bad"
# Else if not found, label the unique client ID as "Good"
classified_credit_record = credit_record.groupby("ID")["STATUS"].apply(
    lambda x: "Bad" if any(x.isin(["3", "4", "5"])) else "Good"
).reset_index()
# After applying the .groupby() and .apply() functions, we created a Series with "ID" now being an index, and client classification as values.
# Thus, the .reset_index() function is necessary to convert "ID" back into a column and have a Pandas DataFrame (not a Series) to work with

classified_credit_record

Unnamed: 0,ID,STATUS
0,5001711,Good
1,5001712,Good
2,5001713,Good
3,5001714,Good
4,5001715,Good
...,...,...
45980,5150482,Good
45981,5150483,Good
45982,5150484,Good
45983,5150485,Good


In [7]:
# Name the column containing values "Good" and "Bad" as "TYPE"
classified_credit_record = classified_credit_record.rename(columns={"STATUS":"TYPE"})
print(classified_credit_record["TYPE"].value_counts()) 
# 45654 "Good" clients + 331 "Bad" clients = 45985 total clients, tallying with original number of unique client IDs in the "credit_record" CSV file

TYPE
Good    45654
Bad       331
Name: count, dtype: int64


### Merging Application Record with Credit Record

In [8]:
# Perform an INNER JOIN of the "application_record" and "filtered_credit_record" dataframes
# Firstly, this removes all Client IDs with a credit history but no application record.
# This is necessary because these data points are meaningless, containing only output ("TYPE" a.k.a credit history) but no input features at all.
# Secondly, this removes all Client IDs with an application record but no credit history.
# This is necessary because our Logistic Regression and Random Forest models must be trained on clients possessing both application record and credit history,
# so that our models are capable of predicting whether a new credit card applicant is going to be "good" (approve application) or "bad" (reject application) based on his/her input features.
merged_df = application_record.merge(classified_credit_record, on="ID", how="inner")

In [9]:
# Preview of the merged dataframe
merged_df

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,TYPE
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,Good
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,Good
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0,Good
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,Good
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36452,5149828,M,Y,Y,0,315000.0,Working,Secondary / secondary special,Married,House / apartment,-17348,-2420,1,0,0,0,Managers,2.0,Bad
36453,5149834,F,N,Y,0,157500.0,Commercial associate,Higher education,Married,House / apartment,-12387,-1325,1,0,1,1,Medicine staff,2.0,Bad
36454,5149838,F,N,Y,0,157500.0,Pensioner,Higher education,Married,House / apartment,-12387,-1325,1,0,1,1,Medicine staff,2.0,Bad
36455,5150049,F,N,Y,0,283500.0,Working,Secondary / secondary special,Married,House / apartment,-17958,-655,1,0,0,0,Sales staff,2.0,Good


### Data Cleaning

In [10]:
# Drop the unnecessary ID column, since we already ensure that all entries in merged_df are unique
merged_df = merged_df.drop(columns=['ID'])

# Impute missing "OCCUPATION_TYPE" values with "Unknown"
merged_df["OCCUPATION_TYPE"].fillna("Unknown", inplace=True)

# The "DAYS_BIRTH" column merely stores how many days a person has been alive, counting backwards from current day
# So, we create a more readable column called "AGE" in years, by performing days/365, and rounding down to nearest integer
merged_df["AGE"] = (-merged_df["DAYS_BIRTH"] / 365).astype(int)

# The "DAYS_EMPLOYED" column uses a very large positive number "365243" to encode those who are currently unemployed, which can cause model confusion with such large magnitudes
# To improve interpretability, we create a more readable column called "IS_EMPLOYED", by indicating whether a person is employed (Y), or unemployed (N)
merged_df["IS_EMPLOYED"] = merged_df["DAYS_EMPLOYED"].apply(lambda x: "Y" if x < 0 else "N")

# Then, we remove the "DAYS_BIRTH" and "DAYS_EMPLOYED" columns since we do not need them anymore
merged_df.drop(columns=["DAYS_BIRTH", "DAYS_EMPLOYED"], inplace=True)

# For the columns "FLAG_OWN_CAR", "FLAG_OWN_REALTY", "IS_EMPLOYED" we binary-encode their values as "Y"=1, and "N"=0
merged_df["FLAG_OWN_CAR"] = merged_df["FLAG_OWN_CAR"].map({"Y": 1, "N": 0})
merged_df["FLAG_OWN_REALTY"] = merged_df["FLAG_OWN_REALTY"].map({"Y": 1, "N": 0})
merged_df["IS_EMPLOYED"] = merged_df["IS_EMPLOYED"].map({"Y": 1, "N": 0})

# For the column "CODE_GENDER", we binary-encode their values as "M"=1, and "F"=0
merged_df["CODE_GENDER"] = merged_df["CODE_GENDER"].map({"M": 1, "F": 0})

# For the column "TYPE", we binary-encode their values as "Good"=1, and "Bad"=0
merged_df["TYPE"] = merged_df["TYPE"].map({"Good": 1, "Bad": 0})

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["OCCUPATION_TYPE"].fillna("Unknown", inplace=True)


In [11]:
# Preview of the leaned dataframe
merged_df

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,TYPE,AGE,IS_EMPLOYED
0,1,1,1,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,1,1,0,0,Unknown,2.0,1,32,1
1,1,1,1,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,1,1,0,0,Unknown,2.0,1,32,1
2,1,1,1,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Security staff,2.0,1,58,1
3,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,1.0,1,52,1
4,0,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,1.0,1,52,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36452,1,1,1,0,315000.0,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Managers,2.0,0,47,1
36453,0,0,1,0,157500.0,Commercial associate,Higher education,Married,House / apartment,1,0,1,1,Medicine staff,2.0,0,33,1
36454,0,0,1,0,157500.0,Pensioner,Higher education,Married,House / apartment,1,0,1,1,Medicine staff,2.0,0,33,1
36455,0,0,1,0,283500.0,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Sales staff,2.0,1,49,1


### Scaling and Normalisation

In [12]:
# Scale all 4 numerical columns "CNT_CHILDREN", "AMT_INCOME_TOTAL", "CNT_FAM_MEMBERS", "AGE" to have mean = 0, and standard deviation = 1
numerical_cols = ['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'CNT_FAM_MEMBERS', 'AGE']
scaler = StandardScaler() # Initialise the StandardScaler
merged_df[numerical_cols] = scaler.fit_transform(merged_df[numerical_cols]) # Standardise all numerical features

print(merged_df["TYPE"].value_counts()) 
# 36155 "Good"=1 clients, 302 "Bad"=0 clients == CLASS IMBALANCE

TYPE
1    36155
0      302
Name: count, dtype: int64


### Data Splitting

In [13]:
# Separate the Feature Variables (all other columns) from the Target Variable ("TYPE" column)
X = merged_df.drop(columns=["TYPE"])  # Feature Variables are stored in X
y = merged_df["TYPE"]  # Target Variable is stored in y

# Perform an 80-20 Train-Test split of merged_df
# "stratify=y", ensures that class distribution in the train and test sets matches the original dataset (before SMOTE)
# "random_state=42", ensures reproducibility of the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [14]:
# Identify categorical columns (list of column indices) within X
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
categorical_indices = [X.columns.get_loc(col) for col in categorical_features]

# Double check that the indices of the categorical columns within X are correct
print(categorical_indices)

[5, 6, 7, 8, 13]


### SMOTE

In [15]:
# To ensure class balance between "Good" and "Bad" clients, we perform Synthetic Minority Over-Sampling Technique for Numerical-Categorical Data (SMOTE-NC)
# SMOTE-NC creates new, synthetic data points for the minority class
# HOWEVER, we apply SMOTE-NC only to the training data, because we DO NOT want synthetic samples in test data
# This is because the test set should represent the actual class distribution
# ALSO, we apply SMOTE AFTER train-test split, to prevent synthetic data points from leaking information from the test data
# Laslty, unlike standard SMOTE, SMOTE-NC allows for specifying which columns are categorical
# This removes the need to for encoding of categorical columns into numerical columns
smote_nc = SMOTENC(categorical_features=categorical_indices, sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote_nc.fit_resample(X_train, y_train)

# Check class distribution after SMOTE-NC is executed, should be balanced 50-50 between TYPE = 1 and TYPE = 0 clients
print("Training set after SMOTE:")
print(y_train_resampled.value_counts(normalize=True) * 100)

Training set after SMOTE:
TYPE
1    50.0
0    50.0
Name: proportion, dtype: float64


### Exported Processed Datasets

In [16]:
# Save the resampled training set (after SMOTE-NC), comprising X_train_resampled and y_train_resampled as the TRAINING DATA for logistic regression and random forest models
# train_resampled = pd.DataFrame(X_train_resampled, columns=X.columns)
# train_resampled["TYPE"] = y_train_resampled  # Add target column back
# train_resampled.to_csv("C:/Users/65905/Downloads/train_dataset.csv", index=False)

# Save the original test set (unchanged), comprising the unchanged X_test and y_test as the TESTING DATA for evaluation of models
# test_set = pd.DataFrame(X_test, columns=X.columns)
# test_set["TYPE"] = y_test  # Add target column back
# test_set.to_csv("C:/Users/65905/Downloads/test_dataset.csv", index=False)