## Data Preparation

In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import pickle

In [6]:
# Read the personal-loan CSV into the working DataFrame
csv_path = 'personal-loan.csv'
data = pd.read_csv(csv_path)

In [7]:
# Handle missing values
print(data.isnull().sum())  # Per-column count of missing values before imputation

# Numerical columns: replace missing entries with the column mean
num_cols = ['age', 'yrs_experience', 'family_size', 'income', 'mortgage_amt', 'credit_card_spend']
imputer = SimpleImputer(strategy='mean')
data[num_cols] = imputer.fit_transform(data[num_cols])

# Categorical columns: replace missing entries with the most frequent value.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on `data[col]`: that form is a chained in-place update,
# which raises a FutureWarning in pandas 2.x and leaves `data` unchanged once
# Copy-on-Write is enabled.
# NOTE(review): 'personal_loan' looks like the prediction target and
# 'share_trading_acct' is in neither column list -- confirm both are intended.
cat_cols = ['education_level', 'credit_card_acct', 'fixed_deposit_acct', 'online_acct', 'personal_loan']
for col in cat_cols:
    data[col] = data[col].fillna(data[col].mode()[0])


customer_id             0
age                   380
yrs_experience        364
family_size           385
education_level         0
income                377
mortgage_amt            0
credit_card_acct        0
credit_card_spend       0
share_trading_acct      0
fixed_deposit_acct      0
online_acct             0
personal_loan           0
dtype: int64


In [8]:
# Convert categorical columns to numerical using label encoding.
# Every column gets its own LabelEncoder, stored in `label_encoders` keyed by
# column name, so each column's fitted category -> integer mapping stays
# available afterwards (e.g. for inverse_transform or encoding new rows).
# Refitting one shared encoder would keep only the last column's mapping.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()  # after the loop, `le` is the encoder of the last column
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le


In [9]:
# Standardise the numerical columns with StandardScaler: learn the per-column
# statistics first, then apply them to the same columns.
# NOTE(review): the scaler is fitted on every row of `data`; if a train/test
# split is done later, consider fitting on the training rows only -- confirm.
scaler = StandardScaler().fit(data[num_cols])
data[num_cols] = scaler.transform(data[num_cols])

In [10]:
# Save the prepared DataFrame as a pickle file.
# The `with` block closes the file handle (flushing the write) as soon as the
# dump finishes or fails; passing a bare open() to pickle.dump never closes it
# explicitly.
with open('prepared_data.pkl', 'wb') as pkl_file:
    pickle.dump(data, pkl_file)

# Confirmation
print("Saved as prepared_data.pkl")

Saved as prepared_data.pkl
