In [1]:
### Step 5- Feature Engineering and Preprocessing
import pandas as pd
df = pd.read_csv(r"loans_clean.csv")
df.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849.0,0.0,128.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583.0,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000.0,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
df.shape

(614, 13)

In [4]:
#Encode Target Variable
# To make machine to read Loan_Status
df['Loan_Status']=df['Loan_Status'].map({'Y':1,'N':0})
df['Loan_Status'].value_counts()

Loan_Status
1    422
0    192
Name: count, dtype: int64

In [5]:
#Encode Categorical Variable
df= pd.get_dummies(df,drop_first=True)
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Loan_ID_LP001003,Loan_ID_LP001005,Loan_ID_LP001006,Loan_ID_LP001008,...,Loan_ID_LP002990,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,5849.0,0.0,128.0,360.0,1.0,1,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True
1,4583.0,1508.0,128.0,360.0,1.0,0,True,False,False,False,...,False,True,True,True,False,False,False,False,False,False
2,3000.0,0.0,66.0,360.0,1.0,1,False,True,False,False,...,False,True,True,False,False,False,False,True,False,True
3,2583.0,2358.0,120.0,360.0,1.0,1,False,False,True,False,...,False,True,True,False,False,False,True,False,False,True
4,6000.0,0.0,141.0,360.0,1.0,1,False,False,False,True,...,False,True,False,False,False,False,False,False,False,True


In [None]:
##Categorical encoding
- Converted Gender, Married, Education, etc. into dummy variables
- Example: Gender_Male = 1 if Male else 0

In [7]:
#Feature Scaling 
##Machine learning algorithms (especially distance-based ones like Logistic Regression, KNN, SVM, etc.) are sensitive to feature magnitude.
##If one feature has much larger values than others (e.g., LoanAmount in thousands, ApplicantIncome in lakhs), it can dominate the model and distort learning.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
num_cols= ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']
df[num_cols]= scaler.fit_transform(df[num_cols])

In [None]:
##Scaling
- Applied StandardScaler to numeric columns
- Mean ~ 0, std ~ 1 now


In [8]:
##Feature Creation
df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df['DTI'] = df['LoanAmount'] / (df['TotalIncome']+1)


In [None]:
#Feature engineering
- TotalIncome = ApplicantIncome + CoapplicantIncome
- DTI = LoanAmount / TotalIncome
- Hypothesis: High DTI → less likely approved

In [9]:
##Split data into train and test
from sklearn.model_selection import train_test_split
X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train.shape, X_test.shape


((491, 629), (123, 629))

In [None]:
##Train-test split
- Training size: 80%
- Test size: 20%
- Stratified → keeps Loan_Status balance same in train/test


In [11]:
##Save Processed Data
train = pd.concat([X_train,y_train], axis=1)
test=pd.concat([X_test,y_test],axis=1)
train.to_csv("dptrain.csv",index=False)
test.to_csv("dptest.csv",index=False)


In [13]:
##Deliverables
#Notebook: 04-EDA.ipynb.
#Saved files: dptrain.csv, dptest.csv.
