# Loan - Data preparation

In [180]:
# import the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# Read 'Loan_data.csv'

In [181]:
# Location of the input CSV. Adjust these three values to your own machine.
# NOTE(review): presumably consumed by Read_csv.ipynb (run in the next cell) — confirm there.
inputfile = "Loan_data.csv"            # name of the CSV file to read
inputfolder = "Source"                 # folder holding the file (presumably relative to basepath)
basepath = r"C:\Users\T\Python_DA"     # root working directory

In [182]:
# Run the helper notebook that reads the CSV file.
# NOTE(review): later cells use `df`, which is not defined in this notebook —
# presumably Read_csv.ipynb creates it from basepath / inputfolder / inputfile.
%run Read_csv.ipynb

      Loan_ID  Gender Married Dependents     Education Self_Employed  \
0    LP001003    Male     Yes          1      Graduate            No   
1    LP001005    Male     Yes          0      Graduate           Yes   
2    LP001006    Male     Yes          0  Not Graduate            No   
3    LP001008    Male      No          0      Graduate            No   
4    LP001011    Male     Yes          2      Graduate           Yes   
..        ...     ...     ...        ...           ...           ...   
524  LP002978  Female      No          0      Graduate            No   
525  LP002979    Male     Yes         3+      Graduate            No   
526  LP002983    Male     Yes          1      Graduate            No   
527  LP002984    Male     Yes          2      Graduate            No   
528  LP002990  Female      No          0      Graduate           Yes   

     Applicant_Income  Coapplicant_Income  Loan_Amount  Loan_Amount_Term  \
0                4583              1508.0          128     

# Run the Utilities-Data_preparation file

In [183]:
# Run the utilities notebook.
# NOTE(review): the helpers called below (show_number_of_missing, x_y_split,
# add_categ_numeric_features_to_list, create_col_trans, transformer_*) are not
# defined in this notebook — presumably they come from this file; confirm.
%run Utilities-Data_preparation.ipynb

In [184]:
# Report, per column of df, how many values are missing (see the output below)
show_number_of_missing(df)

Loan_ID: 0 value(s) missing
Gender: 12 value(s) missing
Married: 2 value(s) missing
Dependents: 12 value(s) missing
Education: 0 value(s) missing
Self_Employed: 25 value(s) missing
Applicant_Income: 0 value(s) missing
Coapplicant_Income: 0 value(s) missing
Loan_Amount: 0 value(s) missing
Loan_Amount_Term: 0 value(s) missing
Credit_History: 0 value(s) missing
Property_Area: 0 value(s) missing
Loan_Approved: 0 value(s) missing


# Preparation

In [185]:
# Keep the name of the target (label) column in a variable for reuse below
label_column = "Loan_Approved"

In [186]:
# Encode the label: replace the value 'Y' with 1 and the value 'N' with 0
df[label_column] = df[label_column].replace({"Y": 1, "N": 0})

In [187]:
# Convert 'Loan_Approved' to int16 and print the first rows as a quick check
df[label_column] = df[label_column].astype('int16')
print(df[label_column].head())

0    0
1    1
2    1
3    1
4    1
Name: Loan_Approved, dtype: int16


In [188]:
# Split the dataset into the features (x) and the label (y)
x, y = x_y_split(df, label_column)

In [189]:
# Remove the unnecessary "Loan_ID" column from the x dataset.
# errors="ignore" keeps this cell re-runnable: x is reassigned from itself, so on a
# second execution "Loan_ID" is already gone and a plain drop would raise KeyError.
x = x.drop(columns=["Loan_ID"], errors="ignore")

In [190]:
# Collect the feature column names of x into a plain list and print them
x_columnlist = list(x.columns)
print("List of feature columns:", x_columnlist, sep="\n")

List of feature columns:
['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Applicant_Income', 'Coapplicant_Income', 'Loan_Amount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area']


# Build 'train' and 'test' for regression model

In [191]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
# and stratifying on y keeps the label proportions equal in both parts.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

# Report the shapes of the resulting parts
print(f"Train set size: {x_train.shape}, {y_train.shape}")
print(f"Test set size: {x_test.shape}, {y_test.shape}")

Train set size: (423, 11), (423,)
Test set size: (106, 11), (106,)


# Use 'add_categ_numeric_features_to_list' function from 'Utilities-Data_preparation' file

In [192]:
# Identify the categorical and numeric features in x_train using the
# "add_categ_numeric_features_to_list" utility function (it also prints the
# unique values of each categorical column, see the output below).
categorical_features, numeric_features = add_categ_numeric_features_to_list(x_train)

Unique values for 'Gender': ['Female' 'Male' nan]
Unique values for 'Married': ['No' 'Yes' nan]
Unique values for 'Dependents': ['0' '2' '1' '3+' nan]
Unique values for 'Education': ['Graduate' 'Not Graduate']
Unique values for 'Self_Employed': ['No' 'Yes' nan]
Unique values for 'Property_Area': ['Rural' 'Semiurban' 'Urban']


In [193]:
# Display the detected categorical feature names
categorical_features

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area']

In [194]:
# Display the detected numeric feature names
numeric_features

['Applicant_Income',
 'Coapplicant_Income',
 'Loan_Amount',
 'Loan_Amount_Term',
 'Credit_History']

# Replace missing values (Transformer)

In [195]:
# Column transformer named 'imp_categ', applied to the categorical features.
# NOTE(review): transformer_imp_unknown presumably fills missing values with the
# string 'unknown' (that category is listed for Dependents below) — confirm in the utilities.
col_trans_imp_categ = create_col_trans('imp_categ', transformer_imp_unknown, categorical_features)

In [196]:
# Column transformer named 'imp_numeric', applied to the numeric features.
# NOTE(review): transformer_imp_mean presumably imputes the column mean — confirm in the utilities.
col_trans_imp_numeric = create_col_trans('imp_numeric', transformer_imp_mean, numeric_features)

In [197]:
# x_train = col_trans_imp_categ.fit_transform(x_train)
# x_train = col_trans_imp_numeric.fit_transform(x_train)
# x_train

# Convert ORDINAL Categorical features to Numeric

## Ordinal variables are categories that have a logical order between them, meaning that values can be "scaled" according to a specific order.

In [198]:
# Columns to be ordinal-encoded. The order of this list must match, position by
# position, the per-feature category lists in `ordinal_categories`, because both
# are handed to the same transformer.
ordinal_features = ['Dependents', 'Education', 'Self_Employed', 'Property_Area']

## For these variables, we use OrdinalEncoder, which assigns ordinal numbers to them (e.g. 0, 1, 2...).

In [199]:
# Category order per ordinal feature; a category's position in its list is the
# number it should be encoded as (0, 1, 2, ...).
# NOTE(review): only Dependents lists 'unknown', although Self_Employed also has
# missing values in the data — presumably transformer_ordinal tolerates
# unlisted categories; confirm in the utilities.
category_order_by_feature = {
    'Dependents': ['0', '1', '2', '3+', 'unknown'],
    'Education': ['Not Graduate', 'Graduate'],
    'Self_Employed': ['Yes', 'No'],
    'Property_Area': ['Rural', 'Semiurban', 'Urban'],
}
# Keep the list-of-lists form, in the same order as `ordinal_features`
ordinal_categories = list(category_order_by_feature.values())

In [200]:
# Column transformer named 'ordinal': encodes the ordinal features using the
# explicit category orders defined above.
col_trans_ordinal = create_col_trans('ordinal', transformer_ordinal(ordinal_categories), ordinal_features)

In [201]:
# x_train = col_trans_ordinal.fit_transform(x_train)
# x_train

## ***The essence of ordinal variables is that the categories have a logical order, which is important to preserve for the model.***

# Convert NOMINAL Categorical features to Numeric

## Nominal variables are categories that cannot be logically sorted into an order — they are only names or labels.

In [202]:
# Columns to be one-hot encoded. The order of this list must match, position by
# position, the per-feature category lists in `known_categories`.
nominal_features = ["Gender", "Married"]

## For these, we use OneHotEncoder, which creates a separate binary column for each category. This is important because machine learning models can't interpret "nominal" data directly — it has to be converted into numbers first.

In [203]:
# Known categories per nominal feature, in the order of `nominal_features`.
# NOTE(review): 'unknown' is not listed although Gender and Married have missing
# values — presumably transformer_nominal ignores unlisted categories; confirm.
categories_by_nominal_feature = {
    "Gender": ["Male", "Female"],
    "Married": ["Yes", "No"],
}
known_categories = list(categories_by_nominal_feature.values())

In [204]:
# Column transformer named 'nominal': encodes the nominal features using the
# known categories defined above.
col_trans_nominal = create_col_trans('nominal', transformer_nominal(known_categories), nominal_features)

In [205]:
# x_train = col_trans_nominal.fit_transform(x_train)
# x_train

# Scale numeric features to be between 0 and 1.

## Numeric features are real numbers that can be used for operations (summing, multiplication, averaging, and so on).

In [206]:
# Numeric columns to scale.
# NOTE(review): this hard-coded list repeats the numeric_features value already
# returned by add_categ_numeric_features_to_list earlier in the notebook; keep
# the two in sync if the dataset's columns change.
numeric_features = ["Applicant_Income", "Coapplicant_Income", "Loan_Amount", "Loan_Amount_Term", "Credit_History"]

## These are usually normalized or scaled (e.g., between 0 and 1) for modeling so that all features are given equal weight.

In [207]:
# Column transformer named 'scale', applied to the numeric features.
# NOTE(review): transformer_scale_0_1 presumably rescales each column to the
# [0, 1] range (the section heading says so) — confirm in the utilities.
col_trans_scale = create_col_trans('scale', transformer_scale_0_1, numeric_features)

In [208]:
# x_train = col_trans_scale.fit_transform(x_train)
# x_train

# Create pipeline

In [209]:
# Chain the column transformers in the order they must run:
# imputation first, then the two encoders, scaling last.
preparation_steps = [
    col_trans_imp_categ,
    col_trans_imp_numeric,
    col_trans_ordinal,
    col_trans_nominal,
    col_trans_scale,
]
pipe = make_pipeline(*preparation_steps)

# Fit the pipeline on the "x_train" dataset and transform it.

In [210]:
# Fit every pipeline step on the training features and transform them in one call.
# NOTE(review): this rebinds x_train to the transformed data, so the cell cannot
# simply be re-run without re-creating the split first. x_test is not transformed
# in the cells shown here — presumably pipe.transform(x_test) happens later; confirm.
x_train = pipe.fit_transform(x_train)
x_train

Unnamed: 0,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Gender_Male,Gender_Female,Married_Yes,Married_No,Dependents,Education,Self_Employed,Property_Area
263,0.044675,0.049236,0.202401,0.72973,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
136,0.049685,0.042764,0.241852,0.72973,1.0,1.0,0.0,1.0,0.0,2.0,1.0,1.0,0.0
309,0.059988,0.108372,0.375643,0.72973,1.0,1.0,0.0,1.0,0.0,2.0,1.0,1.0,1.0
233,0.030550,0.065697,0.228130,0.72973,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
59,0.037823,0.090611,0.265866,0.72973,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,0.072913,0.000000,0.168096,0.72973,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
139,0.028312,0.098502,0.192110,0.72973,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
123,0.058256,0.024529,0.185249,0.72973,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
466,0.025838,0.074593,0.190395,0.72973,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0


In [211]:
# Completion message, printed once all cells above have run
print("\nData preparation has been made!")


Data preparation has been made!


- # Machine learning models typically work with numbers, which is why you need to convert text or category data into numbers. 
- # Scaling helps to give numeric variables a more equal weight, not dominated by a very large or very small value.
- # One-hot encoding preserves the separation of nominal variables without assuming an order.
- # And ordinal encoding preserves the relationship between ranked categories.