In [6]:
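# Optional one-time setup: uncomment to install the notebook's dependencies into the kernel's environment.
# %pip install pandas numpy matplotlib seaborn scikit-learn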

In [7]:
# Optional: uncomment to snapshot the currently installed package versions into requirements.txt.
# !pip freeze > requirements.txt

In [8]:
# Optional: uncomment to recreate the environment from the pinned requirements.txt snapshot.
# %pip install -r requirements.txt

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
# Suppress every warning for the rest of the session so cell output stays
# compact. NOTE(review): this is a blanket filter, so deprecation warnings
# are hidden as well -- consider narrowing it while debugging.
warnings.filterwarnings(action="ignore")

# Plot styling: apply the matplotlib style sheet first, then make seaborn's
# "husl" palette the default color cycle (order matters, the palette call
# runs after the style sheet has been applied).
plt.style.use("seaborn-v0_8")
sns.set_palette(palette="husl")

In [10]:
# Generate a reproducible synthetic loan-approval dataset.
#
# Every draw comes from NumPy's legacy global RNG seeded below, so the ORDER
# of the np.random calls determines every value in the dataset. Do not
# reorder the dictionary entries or vectorize the scoring loop without
# accepting that the generated data will change.
np.random.seed(42)
n_samples = 1000

data = {
    'Loan_ID': [f'LP{i:06d}' for i in range(1, n_samples + 1)],
    'Gender': np.random.choice(['Male', 'Female'], n_samples, p=[0.6, 0.4]),
    'Married': np.random.choice(['Yes', 'No'], n_samples, p=[0.7, 0.3]),
    'Dependents': np.random.choice(['0', '1', '2', '3+'], n_samples, p=[0.4, 0.3, 0.2, 0.1]),
    'Education': np.random.choice(['Graduate', 'Not Graduate'], n_samples, p=[0.8, 0.2]),
    'Self_Employed': np.random.choice(['Yes', 'No'], n_samples, p=[0.15, 0.85]),
    # A normal draw can fall below zero, which is meaningless for an income.
    # Clipping AFTER the draw leaves the RNG stream (and all other values)
    # exactly as before.
    'ApplicantIncome': np.clip(
        np.random.normal(5000, 2000, n_samples).astype(int), 0, None
    ),
    'CoapplicantIncome': np.random.exponential(1500, n_samples).astype(int),
    # Same reasoning: a loan must be for a positive amount, so floor at 1.
    'LoanAmount': np.clip(
        np.random.normal(150, 50, n_samples).astype(int), 1, None
    ),
    'Loan_Amount_Term': np.random.choice([360, 180, 240, 300], n_samples, p=[0.8, 0.1, 0.05, 0.05]),
    'Credit_History': np.random.choice([1.0, 0.0], n_samples, p=[0.85, 0.15]),
    'Property_Area': np.random.choice(['Urban', 'Semiurban', 'Rural'], n_samples, p=[0.4, 0.35, 0.25])
}

# Derive the target with a simple additive rule: a good credit history is
# worth 3 points, each other favourable attribute 1 point; a total of 3 or
# more means approval. This stays a Python loop on purpose: each iteration
# consumes one uniform draw and, conditionally, one integer draw, so a
# vectorized version would consume the RNG differently.
loan_status = []
for credit, education, married, income, amount in zip(
    data['Credit_History'],
    data['Education'],
    data['Married'],
    data['ApplicantIncome'],
    data['LoanAmount'],
):
    score = sum((
        3 if credit == 1.0 else 0,
        1 if education == 'Graduate' else 0,
        1 if married == 'Yes' else 0,
        1 if income > 4000 else 0,
        1 if amount < 200 else 0,
    ))

    # Label noise: for roughly 10% of applicants replace the rule-based score
    # with a uniformly random one in [0, 5].
    if np.random.random() < 0.1:
        score = np.random.randint(0, 6)

    loan_status.append('Y' if score >= 3 else 'N')

data['Loan_Status'] = loan_status

# Create DataFrame. LoanAmount is cast to float up front because NaN is
# injected into it below: assigning NaN into an integer column relies on a
# silent upcast that pandas has deprecated. The resulting dtype (float64) is
# the same one the implicit upcast used to produce.
df = pd.DataFrame(data).astype({'LoanAmount': 'float64'})

# Introduce missing values in 5% of the rows, split across three columns.
# The two cut points are computed independently (n // 3 and 2 * n // 3), so
# for 50 rows the split is 16 / 17 / 17.
missing_indices = np.random.choice(df.index, size=int(0.05 * len(df)), replace=False)
first_cut = len(missing_indices) // 3
second_cut = 2 * len(missing_indices) // 3
df.loc[missing_indices[:first_cut], 'Gender'] = np.nan
df.loc[missing_indices[first_cut:second_cut], 'LoanAmount'] = np.nan
df.loc[missing_indices[second_cut:], 'Credit_History'] = np.nan

print(f"Dataset created with {len(df)} rows and {len(df.columns)} columns")
df.head()

Dataset created with 1000 rows and 13 columns


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP000001,Male,Yes,0,Graduate,No,3806,101,142.0,360,1.0,Semiurban,Y
1,LP000002,Female,Yes,0,Graduate,No,219,1195,152.0,360,1.0,Semiurban,Y
2,LP000003,Female,No,3+,Graduate,No,4175,1340,201.0,360,1.0,Urban,Y
3,LP000004,Male,No,0,Graduate,No,6826,1586,140.0,360,1.0,Rural,Y
4,LP000005,Male,No,0,Graduate,Yes,6075,442,121.0,360,1.0,Urban,Y
