In [88]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OneHotEncoder , RobustScaler
from category_encoders import BinaryEncoder
from sklearn.linear_model import LogisticRegression

In [89]:
# Load the frame produced by the EDA step.
# NOTE(review): read_pickle unpickles arbitrary Python objects, so only point
# it at a file you produced yourself or otherwise trust.
df = pd.read_pickle("Data_After_EDA.pkl")
# Bare last expression -> rich display of the loaded frame.
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Loan_Status,LoanAmount_log,Total_Income,EMI,Total_Income_log,Balance_Income
0,Male,No,0,Graduate,No,1.0,Urban,1,4.852030,5849.0,0.355556,8.674026,5493.444444
1,Male,Yes,1,Graduate,No,1.0,Rural,0,4.852030,6091.0,0.355556,8.714568,5735.444444
2,Male,Yes,0,Graduate,Yes,1.0,Urban,1,4.189655,3000.0,0.183333,8.006368,2816.666667
3,Male,Yes,0,Not Graduate,No,1.0,Urban,1,4.787492,4941.0,0.333333,8.505323,4607.666667
4,Male,No,0,Graduate,No,1.0,Urban,1,4.948760,6000.0,0.391667,8.699515,5608.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,1.0,Rural,1,4.262680,2900.0,0.197222,7.972466,2702.777778
610,Male,Yes,3,Graduate,No,1.0,Rural,1,3.688879,4106.0,0.222222,8.320205,3883.777778
611,Male,Yes,1,Graduate,No,1.0,Urban,1,5.533389,8312.0,0.702778,9.025456,7609.222222
612,Male,Yes,2,Graduate,No,1.0,Urban,1,5.231109,7583.0,0.519444,8.933664,7063.555556


In [90]:
# Column-wise preprocessing of the categorical inputs:
#   * "OHE": one-hot encode the listed columns, dropping the first level of each;
#   * "BE":  binary-encode "Dependents" and "Property_Area";
#   * every remaining column is passed through unchanged.
#
# The dense-output switch of OneHotEncoder is spelled `sparse_output` from
# scikit-learn 1.2 onwards and `sparse` before that (the old spelling was
# removed in 1.4). Try the current name first and fall back for older installs
# so the notebook runs on either side of the rename.
try:
    _one_hot = OneHotEncoder(sparse_output=False, drop="first")
except TypeError:
    _one_hot = OneHotEncoder(sparse=False, drop="first")

Encoder = ColumnTransformer(
    transformers=[
        ("OHE", _one_hot, ["Gender", "Married", "Education", "Self_Employed"]),
        ("BE", BinaryEncoder(), ["Dependents", "Property_Area"]),
    ],
    remainder="passthrough",
)

In [91]:
# Baseline pipeline: encode the categoricals, scale with RobustScaler,
# then fit a logistic regression.
steps = [
    ("Encoder", Encoder),
    ("Scaler", RobustScaler()),
    ("Model", LogisticRegression()),
]
pipeline = Pipeline(steps=steps)

In [92]:
# Target column on one side, every other column as model input on the other.
y = df["Loan_Status"]
X = df.drop(columns=["Loan_Status"])

In [93]:
# 5-fold cross-validation of the baseline pipeline; train scores are kept so
# the train/test gap can be inspected in the next cells.
results = cross_validate(
    estimator=pipeline,
    X=X,
    y=y,
    scoring="accuracy",
    cv=5,
    return_train_score=True,
)

In [94]:
# Training accuracy averaged over the cross-validation folds.
np.mean(results["train_score"])

0.811277492592129

In [95]:
# Held-out accuracy averaged over the cross-validation folds.
np.mean(results["test_score"])

0.8072504331600694

In [96]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [97]:
# Candidate estimators as (short name, estimator) pairs; each pair is used
# directly as the last step of a Pipeline in the comparison loop.
# The decision tree and the random forest draw random numbers while fitting,
# so their random_state is pinned to make the comparison repeatable between
# runs of the notebook.
models = [
    ("LR", LogisticRegression()),
    ("KNN", KNeighborsClassifier()),
    ("CART", DecisionTreeClassifier(random_state=42)),
    ("RF", RandomForestClassifier(random_state=42)),
    ("xg", XGBClassifier()),
]

In [98]:
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter - it also hides deprecation and
# convergence warnings raised by the models fitted below.
warnings.simplefilter("ignore")

In [99]:
# Score every candidate with the same preprocessing and 5-fold cross-validation,
# printing the mean train and test accuracy for each one.
for model in models:
    steps = [("Encoder", Encoder), ("Scaler", RobustScaler()), model]
    pipeline = Pipeline(steps=steps)
    scores = cross_validate(
        estimator=pipeline,
        X=X,
        y=y,
        scoring="accuracy",
        cv=5,
        return_train_score=True,
    )

    # model is a (name, estimator) pair; the name heads the report.
    print(model[0])
    print("Train_accuracy", np.mean(scores["train_score"]))
    print(10 * "-")
    print("Test_accuracy", np.mean(scores["test_score"]))
    print(20 * "-")
    print("\n")

LR
Train_accuracy 0.811277492592129
----------
Test_accuracy 0.8072504331600694
--------------------


KNN
Train_accuracy 0.805554025291098
----------
Test_accuracy 0.7353058776489404
--------------------


CART
Train_accuracy 1.0
----------
Test_accuracy 0.6945755031320806
--------------------


RF
Train_accuracy 1.0
----------
Test_accuracy 0.7778621884579502
--------------------


xg
Train_accuracy 1.0
----------
Test_accuracy 0.7713581234172998
--------------------




In [100]:
from sklearn.model_selection import GridSearchCV

In [101]:
# Search space for the grid search. The "Model__" prefix routes each entry to
# the pipeline step registered under the name "Model".
params = dict(
    Model__learning_rate=[0.01, 0.1, 0.2],
    Model__n_estimators=[110, 120, 130],  # number of trees (boosting rounds)
    Model__reg_alpha=[0, 0.1, 0.5],
)

In [102]:
# Pipeline tuned by the grid search: same preprocessing as before, with an
# XGBoost classifier as the final "Model" step.
steps = [
    ("Encoder", Encoder),
    ("Scaler", RobustScaler()),
    ("Model", XGBClassifier()),
]
pipeline = Pipeline(steps=steps)

In [103]:
# Exhaustive search over `params` with 10-fold cross-validation on accuracy,
# using every available core; train scores are recorded alongside test scores.
grid_search = GridSearchCV(
    pipeline,
    params,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
    return_train_score=True,
)
grid_search.fit(X, y)

In [104]:
# Parameter combination selected by the grid search (scored on accuracy).
grid_search.best_params_

{'Model__learning_rate': 0.01,
 'Model__n_estimators': 120,
 'Model__reg_alpha': 0}

In [105]:
# Mean training accuracy (across the CV folds) of the selected parameter
# combination. cv_results_["mean_train_score"] holds one entry per candidate,
# so taking .mean() of it would blend in the scores of all the rejected
# candidates; best_index_ picks the row that belongs to best_params_.
grid_search.cv_results_["mean_train_score"][grid_search.best_index_]

In [106]:
# Mean cross-validated (held-out) accuracy of the selected parameter
# combination. Averaging cv_results_["mean_test_score"] would instead report
# the mean over every candidate in the grid, not the model that is kept.
grid_search.best_score_

In [107]:
# Keep the best estimator found by the search. Because the searched estimator
# is the Pipeline, this is the whole fitted pipeline, not just the classifier.
final_model = grid_search.best_estimator_

In [108]:
# final_model is the fitted Pipeline, which has no feature_importances_ of its
# own; the attribute lives on the classifier registered as the "Model" step.
fitted_classifier = final_model.named_steps["Model"]
importance_values = fitted_classifier.feature_importances_

# The classifier sees the encoded columns, not the raw X.columns, so the labels
# must come from the fitted encoder. If one of the transformers cannot report
# its output names, fall back to positional labels of the right length.
try:
    feature_names = list(final_model.named_steps["Encoder"].get_feature_names_out())
except (AttributeError, ValueError):
    feature_names = [f"feature_{i}" for i in range(len(importance_values))]

# extract feature importances, convert into a Series
importances = pd.Series(importance_values, index=feature_names)

# plot the horizontal bar chart
importances.plot(kind='barh', figsize=(12,8))

In [109]:
import joblib

In [110]:
# Persist the fitted pipeline for the Streamlit app written further down.
joblib.dump(final_model , "Model.pkl")
# Persist the input column names so the app can rebuild a frame with the same layout.
joblib.dump(X.columns , "Inputs.pkl")

['Inputs.pkl']

In [111]:
# Peek at the first rows to recall the value formats the app has to reproduce.
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Loan_Status,LoanAmount_log,Total_Income,EMI,Total_Income_log,Balance_Income
0,Male,No,0,Graduate,No,1.0,Urban,1,4.85203,5849.0,0.355556,8.674026,5493.444444
1,Male,Yes,1,Graduate,No,1.0,Rural,0,4.85203,6091.0,0.355556,8.714568,5735.444444
2,Male,Yes,0,Graduate,Yes,1.0,Urban,1,4.189655,3000.0,0.183333,8.006368,2816.666667
3,Male,Yes,0,Not Graduate,No,1.0,Urban,1,4.787492,4941.0,0.333333,8.505323,4607.666667
4,Male,No,0,Graduate,No,1.0,Urban,1,4.94876,6000.0,0.391667,8.699515,5608.333333


In [112]:
# Input columns, in the order the pipeline was fitted on.
X.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Credit_History', 'Property_Area', 'LoanAmount_log', 'Total_Income',
       'EMI', 'Total_Income_log', 'Balance_Income'],
      dtype='object')

In [113]:
%%writefile app_3rd.py
import streamlit as st
import pandas as pd
import joblib
import numpy as np


# Artifacts written by the training notebook with joblib.dump:
# "Inputs.pkl" holds the training column names, "Model.pkl" the fitted pipeline.
# NOTE(review): joblib.load unpickles arbitrary objects - only load trusted files.
Inputs = joblib.load("Inputs.pkl")
Model = joblib.load("Model.pkl")

def derive_features(ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term):
    """Compute the engineered numeric inputs from the raw loan figures.

    The formulas match the values in the training frame: EMI is the loan
    amount divided by the term, and Balance_Income subtracts EMI * 1000 from
    the total income, so LoanAmount is expected in thousands.

    Args:
        ApplicantIncome: income of the applicant.
        CoapplicantIncome: income of the co-applicant (may be 0).
        LoanAmount: requested amount, in thousands.
        Loan_Amount_Term: repayment term (presumably in months - confirm).

    Returns:
        dict with the keys "LoanAmount_log", "Total_Income", "EMI",
        "Balance_Income" and "Total_Income_log".

    Raises:
        ValueError: if the loan amount, the term or the total income is not
            positive (the logarithm / division would be undefined).
    """
    if LoanAmount <= 0 or Loan_Amount_Term <= 0:
        raise ValueError("LoanAmount and Loan_Amount_Term must be positive")
    Total_Income = ApplicantIncome + CoapplicantIncome
    if Total_Income <= 0:
        raise ValueError("Total income must be positive")

    EMI = LoanAmount / Loan_Amount_Term
    return {
        "LoanAmount_log": float(np.log(LoanAmount)),
        "Total_Income": float(Total_Income),
        "EMI": EMI,
        "Balance_Income": Total_Income - (EMI * 1000),
        "Total_Income_log": float(np.log(Total_Income)),
    }


def prediction(Gender, Married, Dependents, Education, Self_Employed, Credit_History, Property_Area,
               LoanAmount_log, Total_Income, EMI, Balance_Income, Total_Income_log):
    """Run the saved pipeline on a single applicant and return the predicted class.

    All arguments are the already-engineered model inputs, one per training
    column (see derive_features for the numeric ones). The one-row frame that
    is fed to the model is also shown in the app.

    Returns:
        int: the class predicted by the loaded pipeline.
    """
    row = {
        "Gender": Gender,
        "Married": Married,
        "Dependents": Dependents,
        "Education": Education,
        "Self_Employed": Self_Employed,
        # The training frame stores Credit_History as a float (1.0), while the
        # select box hands back a string.
        "Credit_History": float(Credit_History),
        "Property_Area": Property_Area,
        "LoanAmount_log": LoanAmount_log,
        "Total_Income": Total_Income,
        "EMI": EMI,
        "Balance_Income": Balance_Income,
        "Total_Income_log": Total_Income_log,
    }
    # Building the frame from a record (instead of cell-by-cell .at writes on an
    # empty frame) lets pandas infer numeric dtypes; `columns=Inputs` restores
    # the column order used at training time.
    test_df = pd.DataFrame([row], columns=Inputs)
    st.dataframe(test_df)
    # The loaded pipeline is bound to `Model` in this file.
    result = Model.predict(test_df)[0]
    return int(result)


def main():
    """Render the input form and show the prediction when the button is pressed."""
    st.title("Predict loan approval")
    # Option values must match the category strings seen in the training data.
    Gender = st.selectbox("Gender" , ['Male', 'Female'])
    Married = st.selectbox("Married" , ['Yes', 'No'])
    # NOTE(review): assumes Dependents is stored as strings in the training
    # frame - confirm against the EDA step.
    Dependents = st.selectbox("Dependents" , ['0', '1', '2', '3'])
    Education = st.selectbox("Education" , ['Graduate', 'Not Graduate'])
    Self_Employed = st.selectbox("Self_Employed" , ['Yes', 'No'])
    Credit_History = st.selectbox("Credit_History" , ['0', '1'])
    Property_Area = st.selectbox("Property_Area", ['Urban', 'Rural', 'Semiurban'])

    # Slider defaults have to lie inside [min_value, max_value]; a default of 0
    # below the minimum is rejected by Streamlit.
    # NOTE(review): the amount is entered in thousands to match the training
    # features; the upper bounds below are placeholders - align them with the
    # real data limits.
    LoanAmount = st.slider("LoanAmount (in thousands)", min_value = 1, max_value = 700, value= 128, step=1 )
    Loan_Amount_Term = st.slider("Loan_Amount_Term", min_value = 1, max_value = 480, value= 360, step=1 )
    ApplicantIncome = st.slider("ApplicantIncome", min_value = 1000, max_value = 100000, value= 5000, step=1 )
    CoapplicantIncome = st.slider("CoapplicantIncome", min_value = 0, max_value = 100000, value= 0, step=1 )

    if st.button("predict"):
        engineered = derive_features(ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term)
        result = prediction(
            Gender, Married, Dependents, Education, Self_Employed, Credit_History, Property_Area,
            engineered["LoanAmount_log"], engineered["Total_Income"], engineered["EMI"],
            engineered["Balance_Income"], engineered["Total_Income_log"],
        )
        # Indexed by the predicted class.
        # NOTE(review): assumes Loan_Status 1 means approved (the usual Y -> 1
        # encoding) - confirm against the EDA step.
        label = ["Not-approved" , "Approved"]
        st.text(f"The Loan will be {label[result]}")


if __name__ == '__main__':
    main()

Writing app_3rd.py
