In [2]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_csv("loan.csv") # Original dataset columns

In [4]:
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [5]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


# Preprocessing And Data Cleaning

In [7]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [8]:
df = df.drop(['Loan_ID'], axis = 1)  #DROP UNNECESSARY COLUMN

In [9]:
#Fill null values of categorical values with mode
df['Gender'].fillna(df['Gender'].mode()[0],inplace=True)
df['Married'].fillna(df['Married'].mode()[0],inplace=True)
df['Dependents'].fillna(df['Dependents'].mode()[0],inplace=True)
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0],inplace=True)
df['Credit_History'].fillna(df['Credit_History'].mode()[0],inplace=True)
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0],inplace=True)

In [10]:
#Fill null values of numerical values with mode
df['LoanAmount'].fillna(df['LoanAmount'].mean(),inplace=True)

In [11]:
#Remove outliers
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]

  df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]


In [12]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.000000,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.000000,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.000000,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.000000,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
607,Male,Yes,2,Not Graduate,No,3987,1411.0,157.000000,360.0,1.0,Rural,Y
608,Male,Yes,0,Graduate,No,3232,1950.0,108.000000,360.0,1.0,Rural,Y
609,Female,No,0,Graduate,No,2900,0.0,71.000000,360.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.000000,360.0,1.0,Urban,Y


In [16]:
# Original dataset columns
columns = ["Gender", "Married", "Dependents", "Education", "Self_Employed",
           "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term",
           "Credit_History", "Property_Area", "Loan_Status"]

# Synthetic Data Generation

In [13]:
# Number of synthetic samples to generate
num_samples = 1000

In [14]:
# Create a synthetic dataset
synthetic_data = pd.DataFrame()

In [17]:
# Generate synthetic data for each column
for col in columns:
    if col in ["Gender", "Married", "Dependents", "Education", "Self_Employed", "Property_Area", "Loan_Status"]:
        unique_values = ['Male', 'Female'] if col == 'Gender' else ['Yes', 'No'] if col in ['Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status'] else ['0', '1', '2', '3+'] if col == 'Dependents' else ['Graduate', 'Not Graduate']
        synthetic_data[col] = np.random.choice(unique_values, num_samples)
    elif col == "ApplicantIncome" or col == "CoapplicantIncome" or col == "LoanAmount":
        synthetic_data[col] = np.random.randint(1000, 10000, num_samples)
    elif col == "Loan_Amount_Term":
        synthetic_data[col] = np.random.choice([360, 180, 240], num_samples)
    elif col == "Credit_History":
        synthetic_data[col] = np.random.choice([0, 1], num_samples, p=[0.2, 0.8])

In [19]:
# Display the synthetic dataset
#print(synthetic_data.head())
synthetic_data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,1,Yes,No,3973,9064,4443,240,1,Yes,Yes
1,Female,No,1,No,No,1816,6682,3842,360,1,Yes,No
2,Female,Yes,2,Yes,No,3395,4357,5043,180,0,Yes,No
3,Male,Yes,1,Yes,Yes,9666,2710,4675,360,1,No,Yes
4,Male,No,3+,No,No,8739,3088,2350,180,1,Yes,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...
995,Male,No,3+,No,No,4695,5523,7046,360,1,No,Yes
996,Female,No,3+,Yes,Yes,4026,5290,5739,360,1,No,No
997,Male,Yes,3+,No,No,8849,9614,3967,180,1,No,No
998,Male,No,0,No,No,5977,8616,6878,360,1,No,Yes


In [20]:
# Save the synthetic dataset to a CSV file
synthetic_data.to_csv("synthetic_dataset.csv", index=False)