- **Based on the findings and insights from the data exploration, we will now proceed with data preprocessing.**

In [1]:
# Importing Required Libraries
import pandas as pd
import numpy as np
import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Defining the path to the datasets folder (a path relative to the current working directory)
data_dir = "datasets"
dataset_path = os.path.join(data_dir, "UCI_Credit_Card.csv")

# Loading the CSV file into a DataFrame
dataframe = pd.read_csv(dataset_path)

# Displaying the first few rows to confirm loading
dataframe.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [3]:
 # Renaming the target column
# Reassign instead of mutating in place, so the cell stays a plain expression and can be chained
dataframe = dataframe.rename(columns={'default.payment.next.month': 'default_payment_next_month'})

In [4]:
# Recode placeholder category values:
#   MARRIAGE  0        -> NaN (treated as missing)
#   EDUCATION 4, 5, 6  -> 0   (grouped together as 'others')
dataframe = (
    dataframe
    .replace({'MARRIAGE': {0: np.nan}})
    .replace({'EDUCATION': {4: 0, 5: 0, 6: 0}})
)

# Per-column count of missing values
dataframe.isna().sum()

ID                             0
LIMIT_BAL                      0
SEX                            0
EDUCATION                      0
MARRIAGE                      54
AGE                            0
PAY_0                          0
PAY_2                          0
PAY_3                          0
PAY_4                          0
PAY_5                          0
PAY_6                          0
BILL_AMT1                      0
BILL_AMT2                      0
BILL_AMT3                      0
BILL_AMT4                      0
BILL_AMT5                      0
BILL_AMT6                      0
PAY_AMT1                       0
PAY_AMT2                       0
PAY_AMT3                       0
PAY_AMT4                       0
PAY_AMT5                       0
PAY_AMT6                       0
default_payment_next_month     0
dtype: int64

- Now only the `MARRIAGE` column has missing values, which we will handle in the preprocessing step after splitting the data into training and testing sets.

**Handling Outliers**

In [5]:
# Continuous monetary columns: the credit limit plus six months of bill and payment amounts
numerical_columns = [
    'LIMIT_BAL',
    *(f'BILL_AMT{month}' for month in range(1, 7)),
    *(f'PAY_AMT{month}' for month in range(1, 7)),
]

# Function to detect Outliers in Numerical Variables using IQR Method
def detect_outliers(df, column):
    """Count the IQR-rule outliers in one column and report the fences used.

    A value is an outlier when it lies strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``. Missing values compare False on both
    sides and are therefore never counted.

    Args:
        df: DataFrame holding the column.
        column: Label of the numeric column to inspect.

    Returns:
        Tuple ``(outlier_count, lower_bound, upper_bound)``.
    """
    values = df[column]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread

    # Boolean mask of the rows outside the fences; summing it counts them
    outside_fences = values.lt(lower_bound) | values.gt(upper_bound)
    return int(outside_fences.sum()), lower_bound, upper_bound

# Creating a copy of the DataFrame so the experiment below does not modify the original
df1 = dataframe.copy()

# Filter column by column. Because df1 is reassigned inside the loop, the IQR bounds of
# each column are computed on the rows that survived the previous columns' filters,
# not on the full dataset. `outlier_count` is returned by the helper but not used here.
for col in numerical_columns:
    outlier_count, lower, upper = detect_outliers(df1, col)
    df1 = df1[(df1[col] >= lower) & (df1[col] <= upper)]
df1.reset_index(drop=True, inplace=True)

# Printing the shape of df1 after removing outliers
df1.shape

(14604, 25)

- If we remove outliers, we will lose around 50% of the data, so we will not remove outliers. Instead, we can apply log transformation for skewed numerical variables or use tree-based models which are robust to skewed distributions and outliers.

**Feature Engineering**

In [6]:
def initiate_feature_engineering(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Derive aggregate features and drop the columns they make redundant.

    Adds, in this order, ``Age_Groups`` (binned ``AGE``), ``Avg_Bill_Amt``,
    ``Avg_Pay_Amt``, ``Avg_Delay_Score`` and ``Average_Credit_Utilization_Ratio``.
    Removes ``AGE`` and, when present, ``ID``.

    All work happens on a copy, so the frame passed in is never modified, and no
    temporary helper columns are created (or dropped) along the way.

    Args:
        dataframe: Frame containing ``AGE``, ``LIMIT_BAL``, ``BILL_AMT1``..``BILL_AMT6``,
            ``PAY_AMT1``..``PAY_AMT6`` and ``PAY_0``, ``PAY_2``..``PAY_6``.

    Returns:
        A new DataFrame with the engineered columns appended.

    Raises:
        KeyError: If any of the required input columns is missing.
    """
    bill_amt_cols = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']
    pay_amt_cols = ['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
    delay_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

    # Work on a copy so the caller's DataFrame is left untouched
    engineered = dataframe.copy()

    # Age groups: left-closed bins [20, 25), [25, 30), ..., [60, inf).
    # Ages below 20 fall outside every bin and become NaN.
    engineered['Age_Groups'] = pd.cut(
        engineered['AGE'],
        bins=[20, 25, 30, 35, 40, 45, 50, 55, 60, np.inf],
        labels=['20-25', '25-30', '30-35', '35-40', '40-45', '45-50', '50-55', '55-60', '60+'],
        right=False,
    )
    engineered = engineered.drop(columns=['AGE'])

    # Row-wise six-month averages, rounded to 2 decimals
    engineered['Avg_Bill_Amt'] = engineered[bill_amt_cols].mean(axis=1).round(2)
    engineered['Avg_Pay_Amt'] = engineered[pay_amt_cols].mean(axis=1).round(2)
    engineered['Avg_Delay_Score'] = engineered[delay_cols].mean(axis=1).round(2)

    # Monthly utilization = bill amount / credit limit, then averaged over the six months.
    # Computed on a temporary frame so no 'UTIL_*' helper columns touch the result.
    monthly_utilization = engineered[bill_amt_cols].div(engineered['LIMIT_BAL'], axis=0)
    engineered['Average_Credit_Utilization_Ratio'] = monthly_utilization.mean(axis=1).round(2)

    # 'ID' is only a row identifier; drop it when present
    engineered = engineered.drop(columns=['ID'], errors='ignore')

    return engineered

# Rebind `dataframe` to the engineered frame. The function reads and then drops 'AGE',
# so re-running this cell without reloading the data fails on the missing column.
dataframe = initiate_feature_engineering(dataframe)
dataframe.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,...,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month,Age_Groups,Avg_Bill_Amt,Avg_Pay_Amt,Avg_Delay_Score,Average_Credit_Utilization_Ratio
0,20000.0,2,2,1.0,2,2,-1,-1,-2,-2,...,0.0,0.0,0.0,0.0,1,20-25,1284.0,114.83,-0.33,0.06
1,120000.0,2,2,2.0,-1,2,0,0,0,2,...,1000.0,1000.0,0.0,2000.0,1,25-30,2846.17,833.33,0.5,0.02
2,90000.0,2,2,2.0,0,0,0,0,0,0,...,1000.0,1000.0,1000.0,5000.0,0,30-35,16942.17,1836.33,0.0,0.19
3,50000.0,2,2,1.0,0,0,0,0,0,0,...,1200.0,1100.0,1069.0,1000.0,0,35-40,38555.67,1398.0,0.0,0.77
4,50000.0,1,2,1.0,-1,0,-1,0,0,0,...,10000.0,9000.0,689.0,679.0,0,55-60,18223.17,9841.5,-0.33,0.36


- We created 5 new features based on the existing features:
  - `Age_Groups`: Categorizing age into groups.
  - `Avg_Bill_Amt`: Average of the bill amounts for the last 6 months.
  - `Avg_Pay_Amt`: Average of the payment amounts for the last 6 months.
  - `Avg_Delay_Score`: Average delay score for the last 6 months.
  - `Average_Credit_Utilization_Ratio`: Average credit utilization ratio for the last 6 months. (Bill amount divided by the credit limit `LIMIT_BAL` for each month, then averaged)

- We also dropped 2 columns:
  - `AGE`: As we created age groups, the original `AGE` column is no longer needed.
  - `ID`: Unique identifier for each customer, which does not contribute to the prediction of default payment.

In [7]:
# Remove exact duplicate rows (leaves the data unchanged when there are none)
dataframe = dataframe.drop_duplicates()

# Shape of the DataFrame after feature engineering and de-duplication
dataframe.shape

(29875, 28)

- Now we have 29875 rows and 28 columns in the dataset after performing feature engineering and removing duplicate rows.

**Data Splitting**
- We will split the dataset into training and testing sets using a 75-25 split. This will allow us to train the model on a larger portion of the data and evaluate its performance on unseen data.

In [8]:
# Name of the label column; every other column is treated as a feature
TARGET_COLUMN = 'default_payment_next_month'

# Separate the label (y) from the feature matrix (X)
y = dataframe[TARGET_COLUMN]
X = dataframe.drop(columns=[TARGET_COLUMN])

# Report the shape of both parts
print(f"Shape of features (X): {X.shape}")
print(f"Shape of target (y): {y.shape}")

Shape of features (X): (29875, 27)
Shape of target (y): (29875,)


In [9]:
# Splitting the dataset into training and testing sets
from sklearn.model_selection import train_test_split

# 75-25 split; `stratify=y` keeps the class proportions of the target equal in both parts,
# and the fixed `random_state` makes the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1, stratify=y)

# Displaying the shapes of the training and testing sets
print(f"Shape of training features (X_train): {X_train.shape}")
print(f"Shape of testing features (X_test): {X_test.shape}")

Shape of training features (X_train): (22406, 27)
Shape of testing features (X_test): (7469, 27)


In [10]:
# Class proportions of the target in the training split (fractions, rounded to 2 decimals)
y_train.value_counts(normalize=True).round(2)

default_payment_next_month
0    0.78
1    0.22
Name: proportion, dtype: float64

In [11]:
# Class proportions of the target in the testing split (fractions, rounded to 2 decimals)
y_test.value_counts(normalize=True).round(2)

default_payment_next_month
0    0.78
1    0.22
Name: proportion, dtype: float64

- We have the same proportion of the target variable in both the training and testing sets, which is good for model evaluation.

**Data preprocessing**

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

def get_data_transformer_object():
    """Assemble the (unfitted) ColumnTransformer used to preprocess the features.

    Each column group is routed to its own two-step pipeline:
      * numerical columns -> median imputation, then standard scaling
      * nominal columns   -> most-frequent imputation, then one-hot encoding
                             (first level dropped, unknown levels ignored)
      * ordinal columns   -> most-frequent imputation, then ordinal encoding
                             (unknown levels encoded as -1)

    Returns:
        The assembled ColumnTransformer; it still has to be fitted by the caller.
    """
    months = range(1, 7)

    # Column groups, in the order they will appear in the transformed output
    numeric_features = (
        ['LIMIT_BAL']
        + [f'BILL_AMT{m}' for m in months]
        + [f'PAY_AMT{m}' for m in months]
        + ['Avg_Bill_Amt', 'Avg_Pay_Amt', 'Avg_Delay_Score', 'Average_Credit_Utilization_Ratio']
    )
    nominal_features = ['SEX', 'MARRIAGE']
    ordinal_features = ['EDUCATION', 'Age_Groups'] + [f'PAY_{m}' for m in (0, 2, 3, 4, 5, 6)]

    numeric_steps = Pipeline(steps=[
        ('imputation', SimpleImputer(strategy='median')),
        ('scaling', StandardScaler()),
    ])

    nominal_steps = Pipeline(steps=[
        ('imputation', SimpleImputer(strategy='most_frequent')),
        ('encoding', OneHotEncoder(handle_unknown='ignore', drop='first')),
    ])

    ordinal_steps = Pipeline(steps=[
        ('imputation', SimpleImputer(strategy='most_frequent')),
        ('encoding', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ])

    return ColumnTransformer([
        ("numerical_pipeline", numeric_steps, numeric_features),
        ("nominal_pipeline", nominal_steps, nominal_features),
        ("ordinal_pipeline", ordinal_steps, ordinal_features),
    ])

# Creating the preprocessor object
preprocessor = get_data_transformer_object()

# Fit on the training split only, then reuse the fitted transformer on the test split,
# so nothing is learned from the test data
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# Displaying the shape of the scaled training and testing data
print(f"Shape of scaled training features (X_train_scaled): {X_train_scaled.shape}")
print(f"Shape of scaled testing features (X_test_scaled): {X_test_scaled.shape}")

Shape of scaled training features (X_train_scaled): (22406, 28)
Shape of scaled testing features (X_test_scaled): (7469, 28)


- In this data preprocessing step, we use `ColumnTransformer` and `Pipeline` from `sklearn` to apply different preprocessing steps to different types of features:
    - For numerical features, we handled missing values by replacing them with the median of the column, and then scaled the numerical features using `StandardScaler`.
    - For nominal categorical features, we handled missing values by replacing them with the most frequent value (mode) in the column, and then applied `OneHotEncoder` to convert them into numerical format. In this case, we used `drop='first'` to avoid the dummy variable trap.
    - For ordinal categorical features, we handled missing values by replacing them with the most frequent value (mode) in the column, and then applied `OrdinalEncoder` to convert them into numerical format.

In [13]:
# Checking for missing values in the scaled training and testing data
# (total number of NaN entries across the whole transformed matrix)
print(f"Missing values in X_train_scaled: {np.isnan(X_train_scaled).sum()}")
print(f"Missing values in X_test_scaled: {np.isnan(X_test_scaled).sum()}")

Missing values in X_train_scaled: 0
Missing values in X_test_scaled: 0


**Handling class imbalance in training data**

In [14]:
# Applying SMOTE for balancing the training dataset
from imblearn.over_sampling import SMOTE

# Only the training split is resampled; the test split is not passed to SMOTE.
# NOTE(review): SMOTE is applied to the already encoded matrix, so synthetic rows may
# contain non-integer values in the one-hot / ordinal columns — confirm this is acceptable.
smote = SMOTE(sampling_strategy=0.8, random_state=1)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

# Displaying the class distribution in the balanced training data
y_train_balanced.value_counts(normalize=True).round(2)

default_payment_next_month
0    0.56
1    0.44
Name: proportion, dtype: float64

- We used `SMOTE` (Synthetic Minority Over-sampling Technique) to handle class imbalance in the training data. This technique generates synthetic samples for the minority class to balance the class distribution.
- We set the `sampling_strategy` to 0.8, which means we want the minority class to be 80% of the majority class after resampling.

In [15]:
# Append the target as the last column of the balanced training features
train_arr = np.column_stack((X_train_balanced, np.asarray(y_train_balanced)))

# Same layout for the test split: features first, target last
test_arr = np.column_stack((X_test_scaled, np.asarray(y_test)))

# Report the shape of both combined arrays
print(f"Shape of training array (train_arr): {train_arr.shape}")
print(f"Shape of testing array (test_arr): {test_arr.shape}")

Shape of training array (train_arr): (31411, 29)
Shape of testing array (test_arr): (7469, 29)


In [16]:
# Checking for missing values in the training and testing arrays
# (counts NaN entries over features and target together)
print(f"Missing values in train_arr: {np.isnan(train_arr).sum()}")
print(f"Missing values in test_arr: {np.isnan(test_arr).sum()}")

Missing values in train_arr: 0
Missing values in test_arr: 0


In [17]:
# True when no two rows of the training array (features + target) are identical
len(np.unique(train_arr, axis=0)) == len(train_arr)

True

In [18]:
# True when no two rows of the testing array (features + target) are identical
len(np.unique(test_arr, axis=0)) == len(test_arr)

True

**Converting numpy arrays to DataFrame**

In [19]:
# Output column names reported by the fitted preprocessor, in output order
feature_names = preprocessor.get_feature_names_out()

# Append the target name so the list lines up with the columns of train_arr / test_arr
all_column_names = [*feature_names, TARGET_COLUMN]

# Total number of columns, target included
print(f"Total columns: {len(all_column_names)}")

# Keep only the text after the last '__' (strips the "<transformer>__" prefix)
all_column_names = [prefixed.split('__')[-1] for prefixed in all_column_names]

# Numbered listing of the final column names, starting at 1
for i, name in enumerate(all_column_names, start=1):
    print(f"{i}. {name}")

Total columns: 29
1. LIMIT_BAL
2. BILL_AMT1
3. BILL_AMT2
4. BILL_AMT3
5. BILL_AMT4
6. BILL_AMT5
7. BILL_AMT6
8. PAY_AMT1
9. PAY_AMT2
10. PAY_AMT3
11. PAY_AMT4
12. PAY_AMT5
13. PAY_AMT6
14. Avg_Bill_Amt
15. Avg_Pay_Amt
16. Avg_Delay_Score
17. Average_Credit_Utilization_Ratio
18. SEX_2.0
19. MARRIAGE_2.0
20. MARRIAGE_3.0
21. EDUCATION
22. Age_Groups
23. PAY_0
24. PAY_2
25. PAY_3
26. PAY_4
27. PAY_5
28. PAY_6
29. default_payment_next_month


In [20]:
# Converting train_arr to DataFrame, labelling the columns with the names collected above
train_df = pd.DataFrame(train_arr, columns=all_column_names)

print(f"Shape of train_df: {train_df.shape}")
train_df.head()

Shape of train_df: (31411, 29)


Unnamed: 0,LIMIT_BAL,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,...,MARRIAGE_3.0,EDUCATION,Age_Groups,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,default_payment_next_month
0,-0.295787,-0.70042,-0.695766,-0.68168,-0.675634,-0.666399,-0.654807,-0.344494,-0.265643,-0.28412,...,0.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.910493,-0.014833,0.010673,0.029585,-0.350936,-0.335613,-0.331043,-0.212549,-0.198817,-0.203824,...,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
2,-0.372625,-0.672639,-0.671643,-0.671405,-0.661317,-0.666399,-0.654807,-0.313067,-0.233744,-0.23455,...,0.0,3.0,4.0,3.0,4.0,1.0,1.0,1.0,0.0,0.0
3,1.010463,-0.53714,-0.485143,-0.310075,-0.421837,-0.601452,-0.618208,0.557646,0.870398,0.037064,...,0.0,3.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,0.0
4,-0.987332,-0.452758,-0.415126,-0.370462,-0.330451,-0.293328,-0.264933,-0.224544,-0.176541,-0.230589,...,0.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0


In [21]:
# Converting test_arr to DataFrame, using the same column names as train_df
test_df = pd.DataFrame(test_arr, columns=all_column_names)

print(f"Shape of test_df: {test_df.shape}")
test_df.head()

Shape of test_df: (7469, 29)


Unnamed: 0,LIMIT_BAL,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,...,MARRIAGE_3.0,EDUCATION,Age_Groups,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,default_payment_next_month
0,-0.679978,0.338089,0.406213,0.447585,0.190638,0.26968,0.318631,-0.127805,-0.127536,-0.177059,...,0.0,2.0,1.0,4.0,2.0,2.0,2.0,2.0,2.0,1.0
1,1.702008,-0.70042,-0.695766,-0.68168,-0.675634,-0.666399,-0.654807,-0.344494,-0.265643,-0.28412,...,0.0,2.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.141008,-0.61485,-0.592801,-0.561566,-0.530616,-0.501611,-0.487252,-0.277022,-0.214632,-0.222345,...,0.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,1.0
3,-0.910493,-0.136706,-0.096817,-0.052011,-0.012742,0.080871,0.122413,-0.224544,-0.172086,-0.28412,...,0.0,3.0,0.0,4.0,4.0,4.0,4.0,3.0,3.0,1.0
4,1.010463,-0.696152,-0.691339,-0.429155,-0.439756,-0.511356,-0.476539,-0.325542,0.518315,-0.177059,...,0.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,0.0


**Saving Train and Test data into CSV files**

In [22]:
# Creating new directory to save csv output files (no error if it already exists)
csv_output_dir = "csv_outputs"
os.makedirs(csv_output_dir, exist_ok=True)

# Saving the train data into csv file (index=False: the row index is not written)
train_data_path = os.path.join(csv_output_dir, "train_data.csv")
train_df.to_csv(train_data_path, index=False)

# Saving the test data into csv file (index=False: the row index is not written)
test_data_path = os.path.join(csv_output_dir, "test_data.csv")
test_df.to_csv(test_data_path, index=False)

# Printing the paths where the data is saved
print(f"Train data saved to: {train_data_path}")
print(f"Test data saved to: {test_data_path}")

Train data saved to: csv_outputs\train_data.csv
Test data saved to: csv_outputs\test_data.csv


- Saved the preprocessed training and testing DataFrames to CSV files (`train_data.csv` and `test_data.csv` in the `csv_outputs` folder) for training and testing machine learning models. This allows us to quickly load the data without having to repeat the preprocessing steps.