In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

In [2]:
df = pd.read_csv("data/parenting_dataset_asli.csv", index_col="NO")
df.drop(columns=["Responden"], inplace=True)
df.head()

Unnamed: 0_level_0,AE1,AE2,AE3,AE4,AE5,AE6,AE7,AE8,AE9,AE10,...,PE2,PE3,PE4,PE5,PE6,PE7,PE8,PE9,PE10,Parenting_style
NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,5,2,5,1,4,1,4,1,5,...,4,3,1,4,1,2,4,1,1,Authoration
2,1,4,2,4,2,5,1,4,5,5,...,2,3,2,3,4,3,2,2,1,Authorative
3,2,4,4,5,5,4,2,5,5,5,...,4,3,4,4,4,4,4,3,3,Authorative
4,1,3,2,4,2,4,1,3,2,5,...,5,2,4,3,3,3,5,5,4,Permissive
5,1,1,3,3,1,1,1,5,3,3,...,5,1,4,3,4,3,4,4,3,Permissive


In [3]:
df.Parenting_style.value_counts()

Authorative    182
Authoration    163
Permissive     155
Name: Parenting_style, dtype: int64

In [4]:
X = df.drop(columns="Parenting_style")
y = df.Parenting_style

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((400, 52), (100, 52), (400,), (100,))

In [5]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from jcopml.tuning import grid_search_params as gsp
from jcopml.tuning import random_search_params as rsp
from jcopml.tuning.space import Integer, Real

In [6]:
gsp.xgb_params

{'algo__max_depth': [3, 6, 10],
 'algo__colsample_bytree': [0.4, 0.6, 0.8],
 'algo__n_estimators': [100, 150, 200],
 'algo__subsample': [0.4, 0.6, 0.8],
 'algo__gamma': [1, 5, 10],
 'algo__learning_rate': [0.01, 0.1, 1],
 'algo__reg_alpha': [0.01, 0.1, 10],
 'algo__reg_lambda': [0.01, 0.1, 10]}

In [7]:
preprocessor = ColumnTransformer([
    ('numeric', num_pipe(poly=2), X_train.columns), #jangan lupa scalling ya, karena semua data numeric gaada pake categoric jadi dibuang aja categoric nya
])

pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', XGBClassifier(n_jobs=-1, random_state=42))
])

model = GridSearchCV(pipeline, gsp.xgb_params, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

Fitting 3 folds for each of 6561 candidates, totalling 19683 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 2216.9min


KeyboardInterrupt: 

In [20]:
save_model(model.best_estimator_, "pakis_model.h5")

Model is pickled as model/pakis_model.h5
