In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from typing import Optional
import pandas as pd
import numpy as np
from warnings import filterwarnings

filterwarnings("ignore")

In [2]:
class MoE:
    def __init__(self, models:Optional[list] = None)->None:
        """
        If any model requires probability to be set True initially, please set it True
        """
        self._models = None
        if models is None:
            self._models = [KNeighborsClassifier(), DecisionTreeClassifier(), SVC(probability=True)]
        else:
            self._models = models
        self._gate_router:RandomForestClassifier|None = None
    def _gate_fit(self, features:pd.DataFrame, labels:pd.Series)->None:
        self._gate_router = RandomForestClassifier() # using random forests as MoE
        model_labels = []
        for feature, label in zip(features.itertuples(index=False), labels):
            best_model = -1
            best_proba = -1
            for idx, model in enumerate(self._models):
                proba = model.predict_proba([feature])[0][label]
                if proba > best_proba:
                    best_model, best_proba = idx, proba
            model_labels.append(best_model)
        self._gate_router.fit(features, model_labels)
    def fit(self, features:pd.DataFrame, labels:pd.Series)->None:
        for model in self._models:
            model.fit(features, labels)
        self._gate_fit(features, labels)
    def _predict_gate(self, features:pd.DataFrame)->np.ndarray:
        return self._gate_router.predict(features)
    def predict(self, features:pd.DataFrame)->np.ndarray:
        gates = self._predict_gate(features)
        output = []
        for row, gate in zip(features.itertuples(index=False), gates):
            model = self._models[gate]
            output.append(model.predict([row]).item())
        return np.array(output)
    def predict_proba(self, features:pd.DataFrame)->np.ndarray:
        gates = self._predict_gate(features)
        output = []
        for row, gate in zip(features.itertuples(index=False), gates):
            model = self._models[gate]
            output.append(model.predict_proba([row])[0])
        return np.array(output)
    def scores(self, features:pd.DataFrame, y_true:pd.Series)->pd.DataFrame:
        model_scores = []
        names = []
        for model in self._models:
            y_pred = model.predict(features)
            score = accuracy_score(y_true, y_pred)
            model_scores.append(score)
            names.append(model.__class__.__name__)
        y_pred = self.predict(features)
        score = accuracy_score(y_true, y_pred)
        model_scores.append(score)
        names.append("MOE")
        return pd.DataFrame([model_scores], columns=names)
    def print_scores(self, features:pd.DataFrame, y_true:pd.Series)->None:
        scores = self.scores(features, y_true)
        print(scores)

<h2>Loading the data and preprocessing it</h2>

In [3]:
dataset = pd.read_csv("datasets/loan_data.csv")

dataset.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [4]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

for column in dataset.columns:
    if dataset[column].dtype == "object":
        dataset[column] = LabelEncoder().fit_transform(dataset[column])

X,y = dataset.drop(columns=["loan_status"]), dataset["loan_status"]

X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
X_train, X_test = pd.DataFrame(X_train, columns=X.columns), pd.DataFrame(X_test, columns=X.columns)
y_train, y_test = pd.Series(y_train, name="targets"), pd.Series(y_test, name="targets")

In [5]:
model = MoE() # using default models
model.fit(X_train, y_train)

In [6]:
y_pred_test = model.predict(X_test)
y_pred_test

array([0, 0, 0, ..., 0, 0, 0])

In [7]:
y_proba_test = model.predict_proba(X_test)
y_proba_test

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [8]:
model.print_scores(X_test, y_test)

   KNeighborsClassifier  DecisionTreeClassifier      SVC       MOE
0              0.890842                0.897778  0.90633  0.900673


<p>Here the MOE gets the best accuracy</p>