# Analysing Data

In [113]:
import pandas as pd
import numpy as np

data=pd.read_csv('Train_Loan_Home.csv')
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,Loan_Amount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,58490,0.0,0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,45830,12064.0,122880,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,30000,0.0,63360,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,25830,18864.0,115200,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,60000,0.0,135360,360.0,1.0,Urban,Y


# Assigning features and targets 

In [114]:
columnsX=data.columns[1:-1]
columnsY=data.columns[-1]

# Checking the inappropriate values

In [115]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
Loan_Amount           0
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

# defining categorical cols, numerical cols to easily manipluate the values by leaving it null or replace that values with median value of that column

In [116]:

catCol=['Gependents']
numWithScalingCol=['Apender','Married','Education','Self_Employed','Property_Area','DplicantIncome','CoapplicantIncome','Loan_Amount','Loan_Amount_Term']
numerCol=['Credit_History']

# Importing Libarires for defing Machine Learing Pipeline


In [117]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import ensemble

In [118]:
numeric_transformer_scaling = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=99))])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_scaling, numWithScalingCol),
        ('num2', numeric_transformer, numerCol),
        ('cat', categorical_transformer, catCol)])

In [119]:

modelPipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', ensemble.RandomForestClassifier())])


In [120]:
modelPipeline.fit(data[columnsX],data[columnsY])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['ApplicantIncome',
                                                   'CoapplicantIncome',
                                                   'Loan_Amount',
                                                   'Loan_Amount_Term']),
                                                 ('num2',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value=99,
                                                    

# Predicted result from my ML pipeline


In [121]:
scores=modelPipeline.predict_proba(data[columnsX])[:,-1]

In [122]:
scores

array([0.89, 0.33, 0.91, 0.96, 0.99, 0.95, 0.91, 0.1 , 0.91, 0.21, 0.97,
       0.98, 0.99, 0.34, 0.86, 0.95, 0.85, 0.  , 0.31, 0.96, 0.04, 0.9 ,
       0.07, 0.06, 0.24, 0.96, 0.99, 0.94, 0.22, 0.93, 0.31, 0.29, 0.3 ,
       0.99, 0.14, 0.95, 0.06, 0.99, 0.76, 0.99, 0.25, 0.88, 0.91, 0.99,
       0.82, 0.89, 0.87, 0.94, 0.15, 0.97, 0.96, 0.98, 0.34, 0.31, 0.13,
       0.99, 0.95, 0.36, 0.9 , 0.93, 0.97, 0.96, 0.07, 0.09, 0.06, 0.35,
       0.09, 0.89, 0.93, 0.04, 0.94, 0.97, 0.91, 0.12, 0.95, 0.23, 0.25,
       0.23, 0.08, 0.89, 0.95, 0.93, 0.26, 0.26, 0.89, 1.  , 0.99, 1.  ,
       0.8 , 0.94, 1.  , 0.94, 0.94, 0.89, 0.82, 0.36, 0.99, 0.92, 0.89,
       0.98, 0.91, 0.9 , 0.9 , 0.89, 0.88, 0.95, 0.96, 0.25, 0.08, 0.92,
       1.  , 0.98, 0.04, 0.88, 0.94, 0.91, 0.99, 0.93, 0.28, 0.74, 0.8 ,
       0.96, 0.76, 0.99, 0.82, 0.94, 0.83, 0.86, 0.08, 0.35, 0.71, 0.74,
       0.93, 0.93, 0.93, 0.26, 0.14, 0.95, 0.07, 0.24, 0.34, 0.97, 0.89,
       1.  , 0.96, 0.97, 0.88, 0.92, 0.18, 0.99, 0.

In [123]:
data2=pd.read_csv('Test_Loan_Home.csv')

In [124]:
scores=modelPipeline.predict_proba(data2[columnsX])[:,-1]

In [125]:
scoreOutput={j:k for j,k in zip(data2['Loan_ID'],scores)}


In [126]:
import joblib

In [132]:
model = joblib.dump(modelPipeline,'modelPipeline.pkl')