In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import itertools
from scipy import stats
from sklearn.preprocessing import QuantileTransformer, StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier


def removeOutlier(df:pd.DataFrame):
    """Drop out-of-range rows and rows with unusable category values.

    A row is kept only when all of the following hold:

    * ``age <= 60`` and ``campaign <= 20``
    * ``pdays`` is the sentinel 999 or at most 15
    * none of ``job``, ``marital``, ``housing``, ``loan`` is ``'unknown'``
    * ``default`` is not ``'yes'``
    * ``education`` is not ``'illiterate'``

    The ``pdays`` sentinel 999 is rewritten to -1 in the returned frame.

    Args:
        df: Frame containing the columns named above.

    Returns:
        A new, filtered DataFrame that keeps the original index labels.
        The frame passed in is not modified.
    """
    in_range = (df['age'] <= 60) & (df['campaign'] <= 20)
    # Explicit copy so the column assignment below never targets a slice of
    # the caller's frame (avoids SettingWithCopyWarning).
    df = df.loc[in_range].copy()
    # 999 is rewritten to -1 *before* the pdays filter so those rows survive it.
    df['pdays'] = df['pdays'].replace(999, -1)
    keep = (
        (df['pdays'] <= 15)
        & (df['job'] != 'unknown')
        & (df['marital'] != 'unknown')
        & (df['housing'] != 'unknown')
        & (df['loan'] != 'unknown')
        & (df['default'] != 'yes')
        # Spelled out in full: the previous literal 'illiterat' matched no row.
        & (df['education'] != 'illiterate')
    )
    # Return an independent frame so later in-place edits are unambiguous.
    return df.loc[keep].copy()

def convertNum2Cat(df:pd.DataFrame):
    """Encode the ``month`` and ``y`` text columns as integers, in place.

    Despite the name, this turns category labels into numbers: the month
    abbreviations ``'jan'`` .. ``'dec'`` become 1 .. 12 and the target labels
    ``'no'`` / ``'yes'`` become 0 / 1.  Any value that appears in neither
    table is left exactly as it is.

    Args:
        df: Frame with ``month`` and ``y`` columns; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    month_names = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                   'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    month_codes = {name: number for number, name in enumerate(month_names, start=1)}
    target_codes = {'no': 0, 'yes': 1}
    # An element-wise lookup yields integer columns directly.  The previous
    # ``Series.replace`` call only produced integers because pandas downcast
    # the object result, a behaviour pandas has deprecated.
    df['month'] = df['month'].map(lambda value: month_codes.get(value, value))
    df['y'] = df['y'].map(lambda value: target_codes.get(value, value))
    return df

def groupData(df:pd.DataFrame):
    """Collapse five columns of *df* into ``'yes'`` / ``'no'`` labels, in place.

    Column by column:

    * ``pdays``: 0..15 -> ``'yes'``, -1 -> ``'no'``
    * ``previous``: 1..15 -> ``'yes'``, 0 -> ``'no'``
    * ``education``: degree / professional course -> ``'yes'``;
      school levels and ``'unknown'`` -> ``'no'``
    * ``job``: working occupations -> ``'yes'``;
      retired / unemployed / student -> ``'no'``
    * ``marital``: married -> ``'yes'``; single / divorced -> ``'no'``

    Values not listed for a column are left unchanged.

    Args:
        df: Frame holding the five columns; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # (column, values relabelled 'yes', values relabelled 'no')
    binary_groups = [
        ('pdays', list(range(16)), [-1]),
        ('previous', list(range(1, 16)), [0]),
        (
            'education',
            ['university.degree', 'professional.course'],
            ['high.school', 'basic.9y', 'basic.4y', 'basic.6y', 'unknown'],
        ),
        (
            'job',
            ['admin.', 'blue-collar', 'technician', 'services', 'management',
             'entrepreneur', 'self-employed', 'housemaid'],
            ['retired', 'unemployed', 'student'],
        ),
        ('marital', ['married'], ['single', 'divorced']),
    ]
    for column, yes_values, no_values in binary_groups:
        # 'yes' group first, then 'no' group.
        relabelled = df[column].replace(yes_values, ['yes'] * len(yes_values))
        df[column] = relabelled.replace(no_values, ['no'] * len(no_values))
    return df
    
def correctSkewed(df:pd.DataFrame):
    """Reshape the skewed numeric columns to a normal distribution and split off ``y``.

    A ``QuantileTransformer`` (normal output, ``random_state=1``) is fitted on
    six columns of *df* and the transformed values are written back into *df*
    in place.

    Args:
        df: Frame containing the six skewed columns and the ``y`` column.

    Returns:
        A ``(df_x, df_y)`` tuple: *df* without its ``y`` column, and the
        ``y`` column on its own.
    """
    skewed_columns = ['age','emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
    normaliser = QuantileTransformer(output_distribution='normal', random_state=1)
    # NOTE(review): the transformer is fitted on every row handed in, so any
    # later train/test split sees values derived from the full data set.
    df[skewed_columns] = normaliser.fit_transform(df[skewed_columns].values)
    features = df.drop(columns=['y'])
    target = df['y']
    return features, target

def preprocess4Search(csv_path='bank-additional-full.csv'):
    """Load the semicolon-separated CSV and run the whole preprocessing chain.

    Steps, in order: drop the ``duration`` column, ``removeOutlier``,
    ``convertNum2Cat``, ``groupData``, ``correctSkewed``.

    Args:
        csv_path: Location of the input CSV.  Defaults to
            ``'bank-additional-full.csv'`` in the working directory, the path
            that used to be hard-coded.

    Returns:
        The ``(df_x, df_y)`` tuple produced by ``correctSkewed``: the feature
        frame and the target column.
    """
    df = pd.read_csv(csv_path, sep=';')
    df = df.drop(columns=['duration'])
    df = removeOutlier(df)
    df = convertNum2Cat(df)
    df = groupData(df)
    return correctSkewed(df)

In [None]:
def searchReduceNum():
    """Grid-search feature counts and MLP settings, then report on a hold-out split.

    The preprocessed data is split 90/10 (``random_state=1``).  A pipeline of
    per-type feature selection followed by an ``MLPClassifier`` is tuned with
    ``GridSearchCV`` (scoring ``'f1'``) on the training part.  The best
    parameters, a classification report and a confusion-matrix plot for the
    held-out part are then printed or shown.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    df_x, df_y = preprocess4Search()
    cols = df_x.columns
    # Numeric: non-object dtype with anything other than exactly two distinct
    # values.  Everything else is treated as categorical.
    num_data = [
        col for col in cols
        if df_x[col].nunique(dropna=False) != 2 and df_x[col].dtype != 'object'
    ]
    # Keep the frame's own column order.  A set difference would order the
    # names by string hash, which changes between interpreter runs and makes
    # the one-hot column layout (and so the fitted models) irreproducible.
    numeric_names = set(num_data)
    cat_data = [col for col in cols if col not in numeric_names]
    num_data = pd.Index(num_data)
    cat_data = pd.Index(cat_data)
    x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.1, random_state=1)

    k_chi2 = SelectKBest(chi2)
    k_anova = SelectKBest()  # default score function (ANOVA F-test)

    # Candidate architectures: one or two hidden layers, 1..8 neurons each.
    # NOTE(review): combinations() yields strictly increasing tuples only, so
    # two-layer shapes such as (4, 4) or (8, 2) are never tried.  Confirm this
    # is intended.
    hidden_layer_options = [
        sizes
        for depth in (1, 2)
        for sizes in itertools.combinations(range(1, 9), depth)
    ]

    ##### Declare model and model parameters
    model = MLPClassifier(max_iter=10000, random_state=1, learning_rate='adaptive')
    number_num_feature = list(range(1, len(num_data) + 1))
    number_cat_feature = list(range(1, len(cat_data) + 1))
    # The "passthrough" placeholders are replaced by the selectors in param_grid.
    num_tran = Pipeline(steps=[("reduce_num_dim", "passthrough"), ("scaler", StandardScaler())])
    # Selection runs after one-hot encoding, so its k counts encoded columns.
    cat_tran = Pipeline([("one_hot", OneHotEncoder()), ("reduce_cat_dim", "passthrough")])
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", num_tran, num_data),
            ("cat", cat_tran, cat_data),
        ]
    )
    pipe = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("classify", model),
        ]
    )

    param_grid = [
        {
            "preprocessor__num__reduce_num_dim" : [k_anova],
            "preprocessor__num__reduce_num_dim__k" : number_num_feature,
            "preprocessor__cat__reduce_cat_dim" : [k_chi2],
            "preprocessor__cat__reduce_cat_dim__k" : number_cat_feature,
            "classify__hidden_layer_sizes" : hidden_layer_options,
            "classify__activation" : ['identity', 'logistic', 'tanh', 'relu'],
            "classify__solver" : ['lbfgs', 'sgd', 'adam']
        }
    ]

    grid = GridSearchCV(pipe, n_jobs=-1, param_grid=param_grid, scoring='f1', verbose=5)
    grid.fit(x_train, y_train)

    print(grid.best_params_)
    y_pred = grid.predict(x_test)
    print("Classification report \n=======================")
    print(classification_report(y_true=y_test, y_pred=y_pred))
    print("Confusion matrix \n=======================")
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
    plt.show()

    return grid