In [17]:
# Step 1 - Load and Understand Data (Home Loan Eligibility Analysis)

In [16]:
import pandas as pd
df = pd.read_csv(r"C:\Users\USER\Downloads\Home Loan Eligibility.csv")
df.head()
#Load and Display first 5 rows

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [6]:
df.shape 
#Size of the Data - rows and columns

(614, 13)

In [7]:
df.info 
#Shows column names,types,missing values

<bound method DataFrame.info of       Loan_ID  Gender Married Dependents     Education Self_Employed  \
0    LP001002    Male      No          0      Graduate            No   
1    LP001003    Male     Yes          1      Graduate            No   
2    LP001005    Male     Yes          0      Graduate           Yes   
3    LP001006    Male     Yes          0  Not Graduate            No   
4    LP001008    Male      No          0      Graduate            No   
..        ...     ...     ...        ...           ...           ...   
609  LP002978  Female      No          0      Graduate            No   
610  LP002979    Male     Yes         3+      Graduate            No   
611  LP002983    Male     Yes          1      Graduate            No   
612  LP002984    Male     Yes          2      Graduate            No   
613  LP002990  Female      No          0      Graduate           Yes   

     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0               5849          

In [8]:
df.describe()
#Summary Statistics for numeric columns 

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [11]:
df['Loan_Status'].value_counts()
#Categorical columns


Loan_Status
Y    422
N    192
Name: count, dtype: int64

In [12]:
df['Credit_History'].value_counts()


Credit_History
1.0    475
0.0     89
Name: count, dtype: int64

In [13]:
df['Education'].value_counts()

Education
Graduate        480
Not Graduate    134
Name: count, dtype: int64

In [15]:
df['Property_Area'].value_counts()

Property_Area
Semiurban    233
Urban        202
Rural        179
Name: count, dtype: int64

In [None]:
# Step 1 - Load & Understand Data (Home Loan Eligibility)

### Dataset Overview
- Number of rows & columns:The dataset has 614 rows and 13 columns.
- Target Column:'Loan_Status' -> indicates if the applicant is eligible for the loan.
### Column information
- ApplicationIncome -> Monthly income of the applicant
- CoapplicantIncome -> Monthly income of the coapplicant
- LoanAmount -> Requested Loan Amount
- Loan_Amount_Term -> Duration of loan in months
- Credit_History -> Credit History (1- good, 0-bad)
- Loan_Status -> Target Variable (Y = eligible, N = not eligible)
### Data Summary
- Numeric Summary -> Mean,Median,Min,Max,Count for income.
- Categorical Summary 
    - Loan_Status -> Count of approved vs not approved
    - Credit_History -> Applicants with good or bad history
    - Education -> Graduate vs not graduate applicants
    - Property_Area -> Count of Individual Areas.
### Notes
- Income (Applicant + Coapplicant) may influence eligibility.  
- Credit history seems like a major factor.  
- Loan amount & term could affect repayment ability.  
- Missing values must be handled before modeling.
### Deliverables for Step 1
- Dataset loaded successfully.  
- First rows, column info, summary statistics, and categorical counts reviewed.  
- Clear understanding of what each column represents.  
- Identified 'Loan_Status' as the target column.


In [None]:
# Step 2 - Check Data Quality

In [18]:
df.isnull().sum()
#Checks for missing values

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [19]:
df.duplicated().sum()
#Checks for duplicates

0

In [26]:
df['Gender'].value_counts(dropna=False)
#Checks for unusual values in Categorical Values- "male", "MALE", "M","Unknown","?"

Gender
Male      489
Female    112
NaN        13
Name: count, dtype: int64

In [23]:
df['Married'].value_counts(dropna=False)

Married
Yes    398
No     213
NaN      3
Name: count, dtype: int64

In [24]:
df['Credit_History'].value_counts(dropna=False)

Credit_History
1.0    475
0.0     89
NaN     50
Name: count, dtype: int64

In [25]:
df['Loan_Status'].value_counts(dropna=False)
#Nan - (Not a Number) Missing or undefined data

Loan_Status
Y    422
N    192
Name: count, dtype: int64

In [27]:
df['ApplicantIncome'].describe()
#Checking for Outliers / Strange Values

count      614.000000
mean      5403.459283
std       6109.041673
min        150.000000
25%       2877.500000
50%       3812.500000
75%       5795.000000
max      81000.000000
Name: ApplicantIncome, dtype: float64

In [28]:
df['LoanAmount'].describe()

count    592.000000
mean     146.412162
std       85.587325
min        9.000000
25%      100.000000
50%      128.000000
75%      168.000000
max      700.000000
Name: LoanAmount, dtype: float64

In [None]:
#Outlier can be found by finding IQR -Interquartile Range
Outlier for Loan Amount
Q1=100
Q2=128
Q3=168
IQR=Q3-Q1
=168-100
IQR=68
Lower Bound = Q1-1.5*IQR
=100-102 
LB=-2
Upper Bound=Q3+1.5*IQR
=168+102
UB=270
As min value is 9 and max value is 700 ,The values above 270 are considered Outliers.

In [None]:
#Outlier can be found by finding IQR -Interquartile Range
Outlier for Applicant Income
Q1=2877.50
Q2=3812.50
Q3=5795.00
IQR=Q3-Q1
=5795.00-2877.50
IQR=2917.5
Lower Bound = Q1-1.5*IQR
=2877.5-4376.25 
LB=-1498.75 (Income cannot be negative, So it can be ignored)
Upper Bound=Q3+1.5*IQR
=5795+4376.25
UB=10171.25
As min value is 150.00 and max value is 81000 ,The values above 10171.25 are considered Outliers.

In [None]:
# Step 2 - Check Data Quality
### Missing Values
    -'LoanAmount' has 22 missing values.
    -'Credit_History' has 50 missing values.
    -'Gender' has 13 missing values.
    - Other columns are mostly complete.
->Need to impute or handle missing values.
### Duplicates
    - 0 Duplicates
    - Dataset has only unique rows.
### Unusual Values in Catagorical Columns
    -Gender -> 'Male', 'Female', plus 13 NaN.
    -Married -> 'Yes', 'No', some NaN.
    -'Credit_History' -> '1','0',some NaN.
    -'Loan_Status' -> Target column, values 'Y' and 'N' only.
### Outliers in Numeric Columns
    -ApplicantIncome -> Outlier Present(above 10171).
    -LoanAmount -> Outlier Present(above 270).
###Deliverables of Step 2
- Identified missing values in 'LoanAmount', 'Credit_History', and 'Gender'.  
- No duplicate rows found.  
- Detected unusual values in categorical columns.  
- Found extreme outliers in 'ApplicantIncome' and 'LoanAmount'.  