In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score

In [None]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv("dataset/train.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Loan_ID is removed up front because it is not used as a model feature.
data = df.drop("Loan_ID", axis="columns")

In [None]:
# Summarise column dtypes and non-null counts.
data.info()

In [None]:
# Show the column labels that remain after dropping Loan_ID.
data.columns

In [None]:
# Names of the columns treated as categorical: every column that is not a
# NumPy numeric dtype.
# select_dtypes is used (as in the correlation cells further down) rather than
# comparing the dtype to "object", because pandas' dedicated string dtype does
# not compare equal to "object" and text columns stored that way would be missed.
categorical_data = data.select_dtypes(exclude=[np.number]).columns.tolist()
categorical_data

In [None]:
# Names of the numeric columns.
# select_dtypes(include=[np.number]) matches the selection used by the
# correlation cells further down, and avoids the `dtype != "object"` test,
# which would wrongly classify text columns stored with pandas' dedicated
# string dtype as numerical.
numerical_data = data.select_dtypes(include=[np.number]).columns.tolist()
numerical_data

In [None]:
# Numeric columns with fewer than 16 distinct entries are treated as discrete.
# Series.unique() keeps NaN as one of its entries, so a missing marker counts too.
discrete_numerical_data = []
for column in numerical_data:
    distinct_entries = data[column].unique()
    if len(distinct_entries) < 16:
        discrete_numerical_data.append(column)
discrete_numerical_data

In [None]:
# Numeric columns with 16 or more distinct entries are treated as continuous.
continuous_numerical_data = list(
    filter(lambda column: len(data[column].unique()) >= 16, numerical_data)
)
continuous_numerical_data

Data Visualisation

In [None]:
# Categorical data: one bar chart of category frequencies per column.
for column in categorical_data:
    frequencies = data[column].value_counts()
    axis = frequencies.plot(kind="bar")
    axis.set_xlabel(column)
    axis.set_ylabel("Counts")
    plt.show()

In [None]:
# Discrete numerical data: one bar chart of value frequencies per column.
for column in discrete_numerical_data:
    value_frequencies = data[column].value_counts()
    value_frequencies.plot(kind="bar")
    current_axes = plt.gca()
    current_axes.set_xlabel(column)
    current_axes.set_ylabel("Counts")
    plt.show()

In [None]:
# Continuous numerical data: one histogram per column.
for column in continuous_numerical_data:
    axis = sns.histplot(data[column])
    axis.set_xlabel(column + " Distribution")
    plt.show()

In [None]:
# Check for outliers: the histograms above are skewed, which suggests outliers
# are present, so draw one box plot per continuous column to make them visible.
for column in continuous_numerical_data:
    sns.boxplot(y=column, data=data)
    plt.show()

In [None]:
# The box plots above show that outliers are present in the continuous features, so missing values in those features are filled with the median (which is robust to outliers) rather than the mean.

Handling Missing Values

In [None]:
# Heatmap of the boolean null mask (True where a value is missing); colour bar hidden.
missing_mask = data.isna()
sns.heatmap(missing_mask, cbar=False)

In [None]:
# Categorical and discrete numerical columns are imputed with their most
# frequent value (the first entry returned by Series.mode()).
mode_fill_columns = categorical_data + discrete_numerical_data
most_frequent = {column: data[column].mode().iloc[0] for column in mode_fill_columns}
for column, fill_value in most_frequent.items():
    data[column] = data[column].fillna(fill_value)

In [None]:
# LoanAmount is a continuous column, so its missing entries are filled with the column median.
loan_amount_median = data["LoanAmount"].median()
data["LoanAmount"] = data["LoanAmount"].fillna(loan_amount_median)

In [None]:
# Pairwise correlation matrix, restricted to the numeric columns.
numeric_columns = data.select_dtypes(include=[np.number])
numeric_columns.corr()

In [None]:
# Annotated heatmap of the numeric-column correlation matrix.
correlation_matrix = data.select_dtypes(include=[np.number]).corr()
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)

Feature Engineering

In [None]:
# Encode each categorical column as integer codes with LabelEncoder.
# A separate encoder is fitted per column and stored in `label_encoders`
# (keyed by column name), so every column's class-to-code mapping remains
# available afterwards, e.g. for inverse_transform or for encoding new data.
# Refitting a single shared encoder on each column would keep only the
# mapping of the last column.
label_encoders = {}

for i in categorical_data:
    le = preprocessing.LabelEncoder()
    data[i] = le.fit_transform(data[i])
    label_encoders[i] = le
# After the loop `le` refers to the encoder fitted on the last categorical column.

In [None]:
# Inspect the first rows after encoding.
data.head()

In [None]:
# Inspect the last rows after encoding.
data.tail()

In [None]:
# Most CoapplicantIncome values are 0, so log-transforming that column on its
# own is not workable. Instead, combine both incomes into a single feature:
# TotalIncome = ApplicantIncome + CoapplicantIncome.
applicant_income = data["ApplicantIncome"]
coapplicant_income = data["CoapplicantIncome"]
data["TotalIncome"] = applicant_income.add(coapplicant_income)

In [None]:
# The two individual income columns are superseded by TotalIncome, so remove them.
# Reassigning the result (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second execution does not raise KeyError
# for columns that are already gone.
data = data.drop(columns=["ApplicantIncome", "CoapplicantIncome"], errors="ignore")

In [None]:
# Show the continuous column list before it is updated for TotalIncome.
continuous_numerical_data

In [None]:
# NOTE(review): this bare list literal looks like pasted output from the previous cell;
# it is evaluated and displayed but assigns nothing.
['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

In [None]:
# Keep the continuous column list in sync with the frame: the two income
# columns were replaced by TotalIncome.
# The list is rebuilt rather than edited with .remove(), so re-running this
# cell neither raises ValueError (names already removed) nor appends
# "TotalIncome" a second time.
replaced_income_columns = ("ApplicantIncome", "CoapplicantIncome")
continuous_numerical_data = [
    column
    for column in continuous_numerical_data
    if column not in replaced_income_columns
]
if "TotalIncome" not in continuous_numerical_data:
    continuous_numerical_data.append("TotalIncome")

In [None]:
# Confirm the updated continuous column list.
continuous_numerical_data

In [None]:
# Log transformation: apply the natural log to every continuous column and to
# Loan_Amount_Term, overwriting each column with its transformed values.
log_columns = [*continuous_numerical_data, "Loan_Amount_Term"]
for column in log_columns:
    transformed = np.log(data[column])
    data[column] = transformed

In [None]:
# Inspect the first rows after the log transformation.
data.head()

Model Building

In [None]:
# Splitting the data

In [None]:
# Separate the target column (Loan_Status) from the feature columns.
y = data["Loan_Status"]
X = data.drop(columns=["Loan_Status"])

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, random_state=42, test_size=0.33)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Logistic Regression
# Instantiate the classifier; no arguments are passed, so the library defaults apply.
model = LogisticRegression()

In [None]:
# Fit the classifier on the training split.
model.fit(X_train,y_train)

In [None]:
# Score the fitted model on the held-out split and report it as a percentage.
test_accuracy_pct = model.score(X_test, y_test) * 100
print("Accuracy of Logistic Regression Model is ", test_accuracy_pct)

In [None]:
# 5-fold cross-validation over the full feature matrix; report the mean score as a percentage.
score = cross_val_score(estimator=model, X=X, y=y, cv=5)
print("Cross validation is", score.mean()*100)

In [None]:
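# Optional: persist the trained model to disk (left disabled).
# import pickle
# # Open the file where the model will be stored; the context manager closes it afterwards.
# with open('model.pkl', 'wb') as file:
#     # Dump the fitted model to that file.
#     pickle.dump(model, file)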