In [22]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split
import statsmodels.api as sm

from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.preprocessing import OrdinalEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score

from sklearn.compose import ColumnTransformer


from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import FunctionTransformer

from sklearn.preprocessing import LabelEncoder

In [3]:
df = pd.read_csv(r"../data/interim/2_feature_engineered/1_base_data.csv")

In [4]:
class CustomOrdinalEncoder(BaseEstimator,TransformerMixin):
    '''
    Converts categories into numbers for KNNImputing
    '''
    
    def __init__(self,feature_name=None):
        self.ord_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=1001)
        self.feature = feature_name
    
    def fit(self,X,y=None):
        na_indices = X[X.isna()].index.values
        not_na_rows = X[X.notna()]
        not_na_np = not_na_rows.to_numpy().reshape(-1,1)
        self.ord_encoder = self.ord_encoder.fit(not_na_np)
        return self
    
    def transform(self,X,y=None):
        na_indices = X[X.isna()].index.values
        not_na_rows = X[X.notna()]
        not_na_np = not_na_rows.to_numpy().reshape(-1,1)
        transformed_data = self.ord_encoder.transform(not_na_np)
        not_na_encode = pd.Series(transformed_data.flatten(),index = not_na_rows.index.values)
        return not_na_encode
    
    def inverse_transform(self,X,y=None):
        col = X.to_numpy().reshape(-1,1)
        return self.ord_encoder.inverse_transform(col)

In [12]:
cat_columns = ['Self_Employed',
 'Dependents',
 'Gender',
 'Married',
 'Education',
 'Property_Area',
 'Credit_History']

num_columns = ['LoanAmount','ApplicantIncome','TotalApplicantIncome']

In [55]:
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Loan_Status']),
                                                 df['Loan_Status'],stratify=df['Loan_Status'],random_state=60,train_size=0.6)

In [56]:
X_train = X_train.sort_index()
y_train = y_train.sort_index()
X_test = X_test.sort_index()
y_test = y_test.sort_index()

In [57]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Property_Area,TotalApplicantIncome
0,0,Male,No,0,Graduate,No,8.674026,,1.0,Urban,8.674026
1,1,Male,Yes,1,Graduate,No,8.430109,4.85203,1.0,Rural,8.714568
2,2,Male,Yes,0,Graduate,Yes,8.006368,4.189655,1.0,Urban,8.006368
7,7,Male,Yes,3+,Graduate,No,8.018296,5.062595,0.0,Semiurban,8.61975
12,12,Male,Yes,2,Graduate,No,8.03041,5.298317,1.0,Urban,9.321792


In [58]:
ordinal_encoders = []

for feature in cat_columns:
    encoder = CustomOrdinalEncoder(feature_name=feature)
    encoder = encoder.fit(X_train[feature])
    ordinal_encoders.append(encoder)
    X_train.loc[:,feature] = encoder.transform(X_train[feature])
    X_test.loc[:,feature] = encoder.transform(X_test[feature])

In [59]:
scaler = MinMaxScaler()
X_train.loc[:,num_columns] = scaler.fit_transform(X_train.loc[:,num_columns])
X_test.loc[:,num_columns] = scaler.transform(X_test.loc[:,num_columns])

In [60]:
imputer = KNNImputer(n_neighbors=1,) #impute with values from nearest neighbour
X_train = pd.DataFrame(imputer.fit_transform(X_train),columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test),columns=X_test.columns)

In [61]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Property_Area,TotalApplicantIncome
0,0.0,1.0,0.0,0.0,0.0,0.0,0.58227,0.480287,1.0,2.0,0.294755
1,1.0,1.0,1.0,1.0,0.0,0.0,0.543501,0.639957,1.0,0.0,0.305634
2,2.0,1.0,1.0,0.0,0.0,1.0,0.47615,0.480287,1.0,2.0,0.115596
3,7.0,1.0,1.0,3.0,0.0,0.0,0.478046,0.690715,0.0,1.0,0.280191
4,12.0,1.0,1.0,2.0,0.0,0.0,0.479972,0.747537,1.0,2.0,0.468577


In [62]:
X_train = pd.get_dummies(data=X_train,columns=cat_columns)
X_test = pd.get_dummies(data=X_test,columns=cat_columns)

In [63]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Model

In [64]:
params = {
#     'C':[0.05,0.5,0.8,1,1.5]
}

In [65]:
clf = LogisticRegression(fit_intercept=True,random_state=123,max_iter=1000)
model = GridSearchCV(clf,param_grid=params)

In [66]:
model = model.fit(X_train,y_train)

In [67]:
model.score(X_train,y_train)

0.8043478260869565

In [68]:
# model.score(X_test,y_test)

In [69]:
f1_score(y_test,model.predict(X_test))

0.8663101604278075

In [70]:
f1_score(y_train,model.predict(X_train))

0.8718861209964414