In [29]:
import pandas as pd

In [30]:
# Read the training, test and sample-submission CSVs from the working directory
train_data, test_data, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('Training Dataset.csv', 'Test Dataset.csv', 'Sample_Submission.csv')
)

In [31]:
# Function to preprocess and feature engineer
def preprocess_data(data):
    """Impute missing values, add engineered features and one-hot encode.

    Parameters
    ----------
    data : pandas.DataFrame
        Loan records containing at least the columns 'Gender', 'Married',
        'Dependents', 'Education', 'Self_Employed', 'Property_Area',
        'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
        'ApplicantIncome' and 'CoapplicantIncome'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left unmodified) in which:
        * 'Gender', 'Married', 'Dependents', 'Self_Employed' have missing
          values filled with the column mode;
        * 'LoanAmount', 'Loan_Amount_Term', 'Credit_History' have missing
          values filled with the column median;
        * 'Total_Income', 'Loan_Amount_to_Income' and 'Loan_Term_Years'
          are added;
        * the six categorical columns are replaced by one-hot dummy columns
          (first level dropped).

    Raises
    ------
    KeyError
        If any of the required columns is absent from `data`.

    Notes
    -----
    Imputation statistics are computed from `data` itself, so each frame
    passed in is imputed with its own mode/median.
    """
    # Work on a copy so the caller's DataFrame is never mutated as a side effect
    data = data.copy()

    # Impute missing values for categorical columns with mode
    for col in ('Gender', 'Married', 'Dependents', 'Self_Employed'):
        modes = data[col].mode()
        # mode() is empty when every value is missing; there is nothing to
        # impute with, so leave the column as-is instead of raising.
        if not modes.empty:
            data[col] = data[col].fillna(modes.iloc[0])

    # Impute missing values for numerical columns with median
    for col in ('LoanAmount', 'Loan_Amount_Term', 'Credit_History'):
        data[col] = data[col].fillna(data[col].median())

    # Feature Engineering
    # Create a new feature for Total Income
    data['Total_Income'] = data['ApplicantIncome'] + data['CoapplicantIncome']

    # Create a new feature for Loan Amount to Total Income ratio.
    # A zero total income would silently produce +/-inf, which isnull()
    # checks do not detect; mask the zero denominator so the ratio becomes
    # NaN and shows up as missing instead.
    nonzero_income = data['Total_Income'].where(data['Total_Income'] != 0)
    data['Loan_Amount_to_Income'] = data['LoanAmount'] / nonzero_income

    # Create a new feature for Loan Term in Years
    data['Loan_Term_Years'] = data['Loan_Amount_Term'] / 12

    # Convert categorical variables to numerical using one-hot encoding
    data = pd.get_dummies(
        data,
        columns=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area'],
        drop_first=True,
    )

    return data


In [32]:
# Run the training frame through the shared preprocessing routine and
# rebind the name to the transformed result
train_data = train_data.pipe(preprocess_data)

In [33]:
# Turn the target labels into integers: 'Y' -> 1, 'N' -> 0
# (Series.map leaves NaN for any label not present in the mapping)
train_data = train_data.assign(
    Loan_Status=train_data['Loan_Status'].map({'Y': 1, 'N': 0})
)

In [34]:
# Separate the target column from the predictors; the Loan_ID column is
# dropped from the feature matrix along with the target
y_train = train_data['Loan_Status']
X_train = train_data.drop(['Loan_ID', 'Loan_Status'], axis=1)

In [35]:
# Apply the same preprocessing routine to the test frame and rebind the
# name to the transformed result
test_data = test_data.pipe(preprocess_data)

In [36]:
# Align the test frame with the training feature matrix: columns that exist
# only in X_train are added as all-zero columns, columns not in X_train are
# discarded, and the column order is made identical to X_train's.
missing_cols = set(X_train.columns).difference(test_data.columns)
test_data = test_data.reindex(columns=X_train.columns, fill_value=0)


In [37]:
# Report the per-column count of missing values for the training frame,
# then for the test frame
print(train_data.isna().sum(), test_data.isna().sum(), sep='\n')

Loan_ID                    0
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Loan_Status                0
Total_Income               0
Loan_Amount_to_Income      0
Loan_Term_Years            0
Gender_Male                0
Married_Yes                0
Dependents_1               0
Dependents_2               0
Dependents_3+              0
Education_Not Graduate     0
Self_Employed_Yes          0
Property_Area_Semiurban    0
Property_Area_Urban        0
dtype: int64
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Total_Income               0
Loan_Amount_to_Income      0
Loan_Term_Years            0
Gender_Male                0
Married_Yes                0
Dependents_1               0
Dependents_2               0
Dependents_3+              0
Education_Not Graduate     0
Self_Employed_Yes          0
P

In [38]:
# Print the dimensions of the feature matrix, the target vector and the
# aligned test frame on one line
print(*(obj.shape for obj in (X_train, y_train, test_data)))

(614, 17) (614,) (367, 17)
