In [None]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from sklearn.neural_network import MLPClassifier
import os
import sys

# Put the parent of the current working directory on the import path so that
# the project-local `src` package can be imported from this notebook.
ROOT = os.path.abspath(os.pardir)
if sys.path.count(ROOT) == 0:
    sys.path.append(ROOT)

from src.config import config
from src.E2EPipeline import E2EPipeline

# Show every column when a DataFrame is rendered (None removes the column cap).
pd.set_option('display.max_columns', None)

# Load IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell runs, so edits to the project's .py files (e.g. under src/) are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Load the raw assessment workbook.
# The path is built from ROOT (the project root computed in the setup cell)
# rather than a machine-specific absolute path, so the notebook runs from any
# checkout of the repository.
# NOTE(review): assumes the workbook lives at <ROOT>/data/Assessment.xlsx and
# that the notebook is started from a direct sub-folder of the repo — confirm.
DATA_PATH = os.path.join(ROOT, "data", "Assessment.xlsx")

df_raw = pd.read_excel(DATA_PATH,
                       engine='openpyxl',
                       sheet_name=1)  # sheet_name is 0-indexed: 1 selects the second sheet
df_raw.head()

In [None]:
# Run the project's preprocessing step on the raw frame and preview the result.
# NOTE(review): the meaning of the second positional argument (True) is defined
# in src.E2EPipeline and is not visible from this notebook — confirm.
# NOTE: df_raw is overwritten here, so re-running this cell without re-running
# the load cell passes already-preprocessed data to preprocess() again.
pipe = E2EPipeline()
df_raw = pipe.preprocess(df_raw, True)
df_raw.head()

In [None]:
# Remove the row cap so the per-column report below is shown in full.
pd.set_option('display.max_rows', None)

# Fraction of missing values per column: the mean of the boolean null mask
# equals (number of nulls) / (number of rows).
df_raw.isna().mean(axis=0)

In [None]:
# Features: every column except the index/id column and the target.
df_X = df_raw.drop(columns=[config.index_col, config.target_name])
# Target: kept as a single-column DataFrame (not a Series).
df_y = df_raw[[config.target_name]]

In [None]:
# Two-stage stratified split with a fixed seed:
#   1. hold out 10% of all rows as the test set;
#   2. hold out 10% of the remaining rows as the validation set.
X, X_test, y, y_test = train_test_split(
    df_X, df_y,
    test_size=0.1, stratify=df_y,
    shuffle=True, random_state=1,
)
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.1, stratify=y,
    shuffle=True, random_state=1,
)

display(f"train set size: {X_train.shape}, val set size: {X_val.shape}, test set size: {X_test.shape}")

In [None]:
# Preview the first rows of the training features (before one-hot encoding).
X_train.head()

In [None]:
# Columns to one-hot encode: every ordinal/nominal column listed in the project
# config, minus the target and the two tag columns removed below.
categorical = [col for col in (config.ordinal + config.nominal) if col != config.target_name]
categorical.remove('HL_tag')
categorical.remove('AL_tag')

display(f"the categorical variables that need one hot encoding are: {categorical}")

# Fit the encoder on the full feature frame so that every category occurring in
# any split is known to it; with handle_unknown='error' an unseen category
# would raise at transform time.
# Only fit() is needed here: the encoded output for the full frame is not used.
enc = OneHotEncoder(handle_unknown='error', sparse_output=False, drop=None)
enc.fit(df_X.loc[:, categorical])
feature_labels = enc.get_feature_names_out()


def _one_hot_encode(frame):
    """Return ``frame`` with its categorical columns replaced by one-hot columns.

    The non-categorical columns are kept (with the index reset to 0..n-1) and
    the encoded columns, named by ``feature_labels``, are appended on the right.
    Resetting the index is what lets the two parts be concatenated row by row.

    Relies on the notebook-level ``enc``, ``categorical`` and ``feature_labels``.
    """
    encoded = pd.DataFrame(enc.transform(frame[categorical]), columns=feature_labels)
    remaining = frame.drop(categorical, axis=1).reset_index(drop=True)
    return pd.concat([remaining, encoded], axis=1)


# Combined train + validation features.
X = _one_hot_encode(X)

# for training data
X_train = _one_hot_encode(X_train)
display("training data after onehot encoding:", X_train.head())

X_val = _one_hot_encode(X_val)
display("val data after onehot encoding:", X_val.head())

X_test = _one_hot_encode(X_test)
display("test data after onehot encoding:", X_test.head())

In [None]:
# Columns discarded based on feature selection; the names must match the
# encoded frames exactly, so they are reproduced verbatim.
useless = [
    'C_HSE_OFFICE',
    'ANN_TRN_AMT / AVG_TRN_AMT',
    'C_HSE_COMMERICAL BUILDING',
    'AVG_TRN_AMT / ANN_TRN_AMT',
    'C_HSE_INDUSTRIAL BUILDING',
    'C_HSE_HOTEL/ SERVICE APARTMENT',
]

# Remove the same columns from every split so the feature sets stay aligned.
X = X.drop(columns=useless)
X_train = X_train.drop(columns=useless)
X_val = X_val.drop(columns=useless)
X_test = X_test.drop(columns=useless)

In [None]:
# Encode the target labels as integers. The encoder is fitted on `y` (the
# combined train + validation labels) and then reused for each split.
# Every result is a fresh single-column DataFrame named "Y" with a 0..n-1 index.
le = preprocessing.LabelEncoder()

y = pd.DataFrame({"Y": le.fit_transform(y[config.target_name])})

y_train = pd.DataFrame({"Y": le.transform(y_train[config.target_name])})
display("training after label encoding:", y_train.head())

y_val = pd.DataFrame({"Y": le.transform(y_val[config.target_name])})
display("y_val after label encoding:", y_val.head())

y_test = pd.DataFrame({"Y": le.transform(y_test[config.target_name])})
display("y_test after label encoding:", y_test.head())

In [None]:
# Upsampling: rebalance the training split with SMOTE (synthetic samples built
# from each point's 5 nearest neighbours). Only the training data is resampled;
# the validation and test splits are left untouched.
from imblearn.over_sampling import SMOTE

sm = SMOTE(k_neighbors=5, random_state=1)

X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
# Preview the first rows of the training features after SMOTE resampling.
X_train.head()

In [None]:
%%time
clf = MLPClassifier(random_state=1, max_iter=100).fit(X_train, y_train)

In [None]:
# Smoke test: predict the class of the first test row only.
clf.predict(X_test.iloc[:1])

In [None]:
%%time
# predict on test set
preds = clf.predict(X_test)

# 0.9228875406664908
f1_score(y_test, preds)