# 1. Import Common Packages

In [None]:
# Import TensorFlow and display the installed version as the cell output.
import tensorflow

tensorflow.__version__

'2.13.0'

In [None]:
# Instantiate a default Keras SGD optimizer; presumably a smoke test that tensorflow.keras is usable.
tensorflow.keras.optimizers.SGD()

<keras.src.optimizers.sgd.SGD at 0x7e09e40a23b0>

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

# 2. Import CSV Data

In [None]:
# Template: replace the "____" placeholders with the CSV path, the column to use
# as the index, and the column(s) that should be parsed as dates.
df = pd.read_csv("____________", index_col="___________", parse_dates=["____________"])
df.head()

# 3. Dataset Splitting

 stratified shuffle

In [None]:
## stratified shuffle
# Template: X is every column except the target (fill in the target column name).
X = df.drop(columns="___________")
# NOTE(review): as written y is only a string placeholder; presumably it should be
# replaced by the target column taken from df — confirm before running.
y = "_____________"

# 80/20 split stratified on y, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

Shuffle

In [None]:
## Shuffle
# Template: X is every column except the target (fill in the target column name).
X = df.drop(columns="___________")
# NOTE(review): as written y is only a string placeholder; presumably it should be
# replaced by the target column taken from df — confirm before running.
y = "_____________"

# 80/20 split without stratification, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# 4. Preprocessor

## common

In [None]:
#COMMON
# Template: route the listed numeric columns through jcopml's num_pipe() with its
# defaults and the listed categorical columns through cat_pipe() with one-hot
# encoding. Replace the "____" placeholders with the actual column names.
preprocessor = ColumnTransformer([
    ('numeric', num_pipe(), ["______________"]),
    ('categoric', cat_pipe(encoder='onehot'), ["_____________"]),
])

## Advanced Sample

In [None]:
# ADVANCED
# Note: the gsp, rsp, and bsp recommended parameter sets cannot be used in advanced mode.
# You should specify your own parameter grid / interval when tuning.
# Template: each column group gets its own imputation / polynomial / scaling /
# transform / encoder settings; replace the "____" placeholders with column names.
preprocessor = ColumnTransformer([
    ('numeric1', num_pipe(impute='mean', poly=2, scaling='standard', transform='yeo-johnson'), ["______________"]),
    ('numeric2', num_pipe(impute='median', poly=2, scaling='robust'), ["______________"]),
    ('categoric1', cat_pipe(encoder='ordinal'), ["_____________"]),
    ('categoric2', cat_pipe(encoder='onehot'), ["_____________"])
])

# 5. Supervised Regression

In [None]:
# Menu of regression pipelines: keep only the snippet you need. Every snippet
# rebinds `pipeline`, so running the whole cell leaves the last one (ElasticNet).
# Each pipeline is the shared `preprocessor` followed by an estimator step named 'algo'.
#KNN
from sklearn.neighbors import KNeighborsRegressor
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', KNeighborsRegressor())
])

#SVM
from sklearn.svm import SVR
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', SVR(max_iter=500))
])


#RF
from sklearn.ensemble import RandomForestRegressor
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', RandomForestRegressor(n_jobs=-1, random_state=42))
])


#XGBOOST
from xgboost import XGBRegressor
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', XGBRegressor(n_jobs=-1, random_state=42))
])


#LINEAR REGRESSION
from sklearn.linear_model import LinearRegression
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', LinearRegression())
])


#ELASTIC NET
from sklearn.linear_model import ElasticNet
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', ElasticNet())
])

# 5. Supervised Classification

In [None]:
# Menu of classification pipelines: keep only the snippet you need. Every snippet
# rebinds `pipeline`, so running the whole cell leaves the last one (LogisticRegression).
# Each pipeline is the shared `preprocessor` followed by an estimator step named 'algo'.
#KNN
from sklearn.neighbors import KNeighborsClassifier
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', KNeighborsClassifier())
])


#SVM
from sklearn.svm import SVC
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', SVC(max_iter=500))
])


#RF
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', RandomForestClassifier(n_jobs=-1, random_state=42))
])


#XGBOOST
from xgboost import XGBClassifier
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', XGBClassifier(n_jobs=-1, random_state=42))
])


#LOGISTIC
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', LogisticRegression(solver='lbfgs', n_jobs=-1, random_state=42))
])

# 6. Tuning

In [None]:
from skopt.space import Real, Categorical, Integer

# Search space for the pipeline's 'algo' step. Keys use the '<step>__<param>'
# convention; regularisation strengths and the learning rate are sampled on a
# log-uniform scale, everything else uniformly.
params = dict(
    algo__max_depth=Integer(-1, 12, transform='identity'),
    algo__learning_rate=Real(0.01, 1, prior='log-uniform', transform='identity'),
    algo__colsample_bytree=Real(0.1, 1, transform='identity'),
    algo__subsample=Real(0.2, 0.8, transform='identity'),
    algo__num_leaves=Integer(20, 100, transform='identity'),
    algo__reg_alpha=Real(0.001, 10, prior='log-uniform', transform='identity'),
    algo__reg_lambda=Real(0.001, 10, prior='log-uniform', transform='identity'),
)

In [None]:
#GRID SEARCH CV
# Template: grid search over the pipeline's hyperparameters. The quoted
# placeholder after `gsp.` is not valid Python as written: replace it with the
# name of a parameter grid from jcopml's grid_search_params, and fill in cv and scoring.
from sklearn.model_selection import GridSearchCV
from jcopml.tuning import grid_search_params as gsp

model = GridSearchCV(pipeline, gsp."_______________", cv="___", scoring='___', n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

# Best hyperparameters, then: train score, best cross-validation score, test score.
print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

In [None]:
#RANDOMIZED SEARCH CV
# Template: randomized search over the pipeline's hyperparameters. The quoted
# placeholder after `rsp.` is not valid Python as written: replace it with the
# name of a parameter distribution set from jcopml's random_search_params, and
# fill in cv, scoring and n_iter.
from sklearn.model_selection import RandomizedSearchCV
from jcopml.tuning import random_search_params as rsp

model = RandomizedSearchCV(pipeline, rsp."_______________", cv="___", scoring='___', n_iter="___", n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

# Best hyperparameters, then: train score, best cross-validation score, test score.
print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

In [None]:
#BAYESIAN
# Template: Bayesian hyperparameter search with scikit-optimize. The quoted
# placeholder after `bsp.` is not valid Python as written: replace it with the
# name of a search space from jcopml's bayes_search_params, and fill in cv,
# scoring and n_iter.
from skopt import BayesSearchCV
from jcopml.tuning import bayes_search_params as bsp

model = BayesSearchCV(pipeline, bsp."_______________", cv="___", scoring="__", n_iter="___", n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

# Best hyperparameters, then: train score, best cross-validation score, test score.
print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

In [None]:
# Halving Search
# Template: successive-halving randomized search. Replace the `_____` placeholder
# with the parameter distributions to sample from.

# The experimental import must run before HalvingRandomSearchCV can be imported.
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
from jcopml.tuning import random_search_params as rsp

model_hs = HalvingRandomSearchCV(pipeline,_____,cv = 3,n_candidates = 300,scoring="balanced_accuracy",min_resources ="exhaust",n_jobs = -1,verbose = -1,factor=2 )
model_hs.fit(X_train,y_train)

# Report on the search fitted in this cell (model_hs), not on a `model` object
# left over from another tuning cell.
# Best hyperparameters, then: train score, best cross-validation score, test score.
print(model_hs.best_params_)
print(model_hs.score(X_train, y_train),model_hs.best_score_, model_hs.score(X_test, y_test))

# 7. Save Model

In [None]:
# Template: persist the fitted search object with jcopml's save_model
# (fill in the output file name).
save_model(model, "__________.pkl")

# Alternative: persist only the best estimator found by the search.
save_model(model.best_estimator_, "__________.pkl")

# Spot check Algorithm

In [None]:
# Spot check: score several untuned classifiers behind the shared preprocessor.
# These names are imported here because no earlier cell imports them.
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Labels for the two scores recorded per model, in the order [test, train].
# NOTE: the "train" entry holds the mean cross-validation score computed on the
# training split, not a score of the refitted model on its own training data.
set_data = ["test","train"]
models = {
        "KNN" : KNeighborsClassifier(),
        "SVM":SVC(),
        "Random Forest":RandomForestClassifier(random_state = 42,n_jobs = -1),
        "Logistic Regression" : LogisticRegression(random_state = 42),
        "LGBM" : LGBMClassifier(random_state = 42),
        "XGB" : XGBClassifier(random_state = 42)
        }

scorer = "accuracy"
num_cv = 5
cv = StratifiedKFold(n_splits = num_cv,shuffle = True,random_state = 42)

# Collect rows in a list and build the frame once, instead of growing it with
# pd.concat inside the loop.
records = []
for m in models:
    pipeline = Pipeline([
        ('prep', preprocessor),
        ('algo', models[m])
    ])
    spot_check = cross_val_score(pipeline,X_train,y_train,cv = cv,scoring = scorer,n_jobs= -1 ).mean()
    # The fitted pipeline is deliberately not bound to `model`, so a tuned search
    # object of that name from another cell is not overwritten.
    pipeline.fit(X_train,y_train)
    # pipeline.score uses the estimator's default scorer, not `scorer`.
    score = pipeline.score(X_test,y_test)
    records.extend(zip([m] * 2, set_data, [score, spot_check]))

df_model = pd.DataFrame(records, columns = ["model","set_data","score"])

In [None]:
# Grouped bar chart of the spot-check scores in df_model, one bar group per model
# and one bar per `set_data` label.
# NOTE(review): assumes `plt` (matplotlib.pyplot) and `sns` (seaborn) are already imported.
# NOTE(review): seaborn >= 0.12 deprecates `ci=` in favour of `errorbar=None` — switch
# once the minimum seaborn version allows it.
plt.figure(figsize=(10, 5))
plots = sns.barplot(x="model", y="score", data=df_model, ci=None,hue = "set_data")
plots.set_title("Mean Score of Model")
# Label every hue group rather than assuming exactly two bar containers.
for container in plots.containers:
    plots.bar_label(container,fmt = "%.3f")
plt.yticks(np.arange(0,1.1,step = 0.1))
plt.ylabel("accuracy")
plt.legend();

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


# Quick baseline: 5-fold cross-validated accuracy (mean and standard deviation)
# of several untuned classifiers behind the shared preprocessor.
models = {"KNN":KNeighborsClassifier(),
        "SVM":SVC(),
        "RF":RandomForestClassifier(random_state = 42,n_jobs = -1),
        "XGB":XGBClassifier(n_jobs=-1, random_state=42),
        "LGBM":LGBMClassifier(random_state=42,n_jobs=-1)}

scorer = "accuracy"
# The loop variable is `name` rather than `model`, so a fitted `model` object in
# the notebook namespace is not replaced by a dictionary key string.
for name in models:
    pipeline = Pipeline([
        ('prep', preprocessor),
        ('algo', models[name])
    ])
    spot_check = cross_val_score(pipeline,X_train,y_train,cv = 5,scoring = scorer,n_jobs= -1 )
    print(f"{name}:{round(spot_check.mean(),3)}({round(spot_check.std(),3)})")

# EDA



## Missing Value

In [None]:
# Proportion of missing vs. present values per column, as filled horizontal bars.
# Replace the `___` placeholder with the DataFrame to inspect.
# sns.displot is figure-level and creates its own figure, so no plt.figure() call
# precedes it: that call only produced an extra empty figure. The plot size is
# controlled through displot's own height/aspect arguments.
sns.displot(
    data=___.isna().melt(value_name="missing"),
    y="variable",
    hue="missing",
    multiple="fill",
    aspect=2
)
plt.title("Missing Value Proportion Each Feature");

## Correlation Matrix

In [None]:
# Spearman rank-correlation heatmap. numeric_only=True restricts the correlation
# to numeric columns, so the cell also works on pandas >= 2.0, where non-numeric
# columns are no longer dropped silently and raise an error instead.
sns.heatmap(df.corr(method="spearman", numeric_only=True),cmap = "bwr" ,vmin = -1,vmax=1,annot = True,cbar = False,fmt = ".2f");

## Plot Numerical Data


In [None]:
# Plot numerical data (classification task): one 20-bin histogram per int64/float64
# column of df_set1, filled left-to-right, top-to-bottom on a fixed 7x8 grid.
# NOTE(review): df_set1 is not defined in this notebook section — confirm its source.
numerical = df_set1.select_dtypes(include=['int64', 'float64']).columns

fig, axes = plt.subplots(7,8,figsize = (20,10))
fig.subplots_adjust(hspace=1, wspace=0.5)
row, cols = 0, 0

for position, item in enumerate(numerical):
    # Map the running position to its grid cell (8 panels per row).
    row, cols = divmod(position, 8)
    ax = sns.histplot(x=item, data=df_set1, ax=axes[row, cols], bins=20)
    ax.set_ylabel("")


In [None]:
# Plot KDE per feature, split by target class
def plot_kde(data,start,end,row_num,col_num,title,target ):
    """Overlay per-class KDE curves for a positional range of feature columns.

    The rows of `data` are split into two groups by comparing `data[target]` with
    the two placeholder values (`____` and `_____`, to be filled in). The target
    column is dropped and the columns at positions `start`..`end` (inclusive) are
    kept. Every int64/float64 column among them gets one panel on a
    `row_num` x `col_num` grid, filled left-to-right, top-to-bottom.

    Parameters
    ----------
    data : DataFrame holding the features and the target column.
    start, end : first and last column position (after dropping the target) to plot.
    row_num, col_num : grid shape; panels are indexed as axes[row, col], so both
        presumably need to be greater than 1.
    title : figure super-title.
    target : name of the target column.
    """
    def _class_columns(label):
        # Rows of one class, without the target, restricted to the requested columns.
        subset = data[data[target] == label].drop(target,axis = 1)
        return subset.iloc[:,start:end+1]

    df_set_pos = _class_columns(____)
    df_set_neg = _class_columns(_____)

    numeric_cols = df_set_pos.select_dtypes(include=['int64', 'float64']).columns
    fig, axes = plt.subplots(row_num,col_num,figsize = (10,8))
    fig.subplots_adjust(hspace=0.7, wspace=0.3)
    fig.suptitle(title,fontsize = 20)

    for position, item in enumerate(numeric_cols):
        grid_row, grid_col = divmod(position, col_num)
        panel = axes[grid_row,grid_col]
        # Both class curves share the same axes so they can be compared directly.
        sns.kdeplot(x = item,data = df_set_pos,ax = panel,color = "____" ,label = "____")
        sns.kdeplot(x = item,data = df_set_neg,ax = panel,color = "_____" ,label = "____")
        panel.legend()
        panel.set_ylabel("")

plot_kde(df,0,4,3,2,'Distribusi IPS berdasarkan Keterlambatan',"TERLAMBAT")

In [None]:
#Plot Histogram,KDE,Boxplot
def num_eda(data,feature,target,bins ):
    """Draw a 2x2 overview of one numeric feature for a classification target.

    Panels: (top-left) histogram with KDE over all rows; (top-right) overlaid
    histograms with KDE for the two target groups; (bottom-left) box plot over
    all rows; (bottom-right) horizontal box plot per target value.

    Parameters
    ----------
    data : DataFrame holding `feature` and `target`.
    feature : name of the numeric column to plot.
    target : name of the target column.
    bins : bin specification forwarded to seaborn's histplot.

    The `____` comparison values and legend labels are template placeholders.
    """
    positive_rows = data[data[target] == ____].drop(target,axis = 1)
    negative_rows = data[data[target] == ____].drop(target,axis = 1)

    fig, axes = plt.subplots(2,2,figsize = (18,4))
    (hist_all, hist_by_class), (box_all, box_by_class) = axes

    # Overall distribution.
    sns.histplot(x = feature,data = data,ax = hist_all,bins = bins,kde = True,edgecolor = "k",color = "orange")
    hist_all.grid(linestyle='--', linewidth=0.5, color='gray')
    hist_all.set_title(f"{feature} Distribution")

    # Distribution per target group, drawn on the same axes.
    sns.histplot(x = feature,data = positive_rows,ax = hist_by_class,bins = bins,label ="____",kde = True,color = "green",linewidth = 0 )
    sns.histplot(x = feature,data = negative_rows,ax = hist_by_class,label = "Not - ______",bins = bins,kde = True,color = "red",linewidth = 0)
    hist_by_class.grid(linestyle='--', linewidth=0.1, color='gray')
    hist_by_class.set_title(f"{feature} Distribution by Target Class")
    hist_by_class.legend()

    # Overall box plot.
    sns.boxplot(x = feature,data = data,ax = box_all,color = "orange")
    box_all.grid(linestyle='--', linewidth=0.5, color='gray')

    # Box plot per target value.
    sns.boxplot(x = feature,y = target,data = data,ax = box_by_class,orient = "h",palette = ["red","green"])
    box_by_class.grid(linestyle='--', linewidth=0.1, color='gray')
    box_by_class.legend()

## Plot Categorical Data

In [None]:
#plot categorical Data (classification Task)
# One count plot per object-dtype column on a fixed 3x4 grid, coloured by the
# column named "y". The first panel is drawn horizontally, the others vertically.
# NOTE(review): hue="y" hard-codes the target column name — adjust for other datasets.
categorical = df.select_dtypes(include=["object"]).columns

fig, axes = plt.subplots(3, 4, figsize=(20, 10))
fig.suptitle('Categorical Data')
row = 0
cols = 0
for item in categorical:
    # Logical `and`, not bitwise `&`: `row == 0 & cols == 0` parses as
    # `row == (0 & cols) == 0`, which is true for every panel of the first row.
    if row == 0 and cols == 0:
        sns.countplot(y=item, hue="y", data=df, ax=axes[row][cols]).set_xlabel("")

    else:
        sns.countplot(x=item, hue="y", data=df, ax=axes[row][cols]).set_ylabel("")
    cols += 1
    if cols == 4:
        cols = 0
        row += 1

In [None]:
#plot categorical Data (classification Task)
def cat_eda(data,feature,target):
    """Draw two count plots for one categorical feature.

    Left panel: counts per category of `feature`. Right panel: the same counts
    split by `target`. Every bar is annotated with its count.

    Parameters
    ----------
    data : DataFrame holding `feature` and `target`.
    feature : name of the categorical column to plot.
    target : name of the column used as hue in the right panel.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    ax1 = sns.countplot(x=feature, data=data, ax=axes[0],color = "orange",edgecolor = "k")
    for container in ax1.containers:
        ax1.bar_label(container)
    ax1.set_title(f"{feature} Distribution")

    ax2 = sns.countplot(x = feature,hue = target, data = data,ax = axes[1],palette = ["r",'lime'],edgecolor = "k")
    ax2.set_title(f"{feature} Distribution with target")
    # Label every hue group instead of assuming exactly two target classes.
    for container in ax2.containers:
        ax2.bar_label(container)

In [None]:
# function for training
def train_model_ts(list_model,X_train,y_train,X_test,y_test,metric,cv,scorer,pipeline,groups):
    """Fit every candidate model inside a copy of `pipeline` and tabulate its scores.

    For each entry of `list_model` the estimator replaces the pipeline's model
    step, is cross-validated on the training data, refitted on the full training
    data, and scored with `metric` on both the test and the training split.

    Parameters
    ----------
    list_model : mapping of display name -> unfitted estimator.
    X_train, y_train, X_test, y_test : train / test splits.
    metric : callable invoked as metric(y_true, y_pred, squared=False).
        NOTE(review): presumably sklearn's mean_squared_error; the `squared`
        keyword is deprecated/removed in recent scikit-learn releases — confirm.
    cv : cross-validation splitter or fold count passed to cross_val_score.
    scorer : scoring name or callable passed to cross_val_score.
    pipeline : template pipeline; it is deep-copied, never modified.
    groups : group labels forwarded to cross_val_score (may be None).

    Returns
    -------
    DataFrame with columns model_name, set_data ("test", "cv", "train"), score
    and model (the fitted pipeline, repeated on each of the three rows).
    """
    # Local imports: no import of these names is visible elsewhere in the notebook.
    from copy import deepcopy
    from sklearn.model_selection import cross_val_score

    set_data = ["test","cv","train"]
    # Rows are collected here and turned into a frame once at the end, instead
    # of concatenating a new frame on every iteration.
    records = []

    for m in list_model:
        pipeline_copy = deepcopy(pipeline)
        try:
            pipeline_copy.set_params(model = list_model[m])
        except ValueError:
            # No parameter/step called "model": fall back to replacing the final
            # step, whatever its name (e.g. the 'algo' step used in this notebook).
            pipeline_copy.set_params(**{pipeline_copy.steps[-1][0]: list_model[m]})
        spot_check = cross_val_score(pipeline_copy,X_train,y_train,cv = cv,scoring = scorer,n_jobs= -1,groups = groups ).mean()
        model = pipeline_copy.fit(X_train,y_train)
        score = metric(y_test,model.predict(X_test),squared = False)
        score_train = metric(y_train,model.predict(X_train),squared = False)
        records.extend(zip([m] * 3, set_data, [score,spot_check,score_train], [model] * 3))
        print(f"model {m} selesai di training")
        print(f"score test {score}")
        print(f"score cv {spot_check}")
        print(f"score train {score_train}")
        print("=====================================")

    return pd.DataFrame(records, columns = ["model_name","set_data","score","model"])


#function for feature selection
# The workflow is pipeline-based, so this is a modified version of scikit-learn's
# RFECV that works with a pipeline; apart from that it follows the same idea.
def rfecv(X, y, pipeline,min_features_to_select=3, cv = 3,step=3,scoring_metric="f1",scoring_decimals=3,random_state=42,groups = None):
    """Recursive feature elimination with cross-validation for a pipeline.

    Starting from all columns produced by the pipeline's pre-final steps, the
    pipeline is cross-validated, the squared `feature_importances_` of the
    best-scoring fold's final estimator are used to drop the least important
    features, and the process repeats until fewer than `min_features_to_select`
    features remain.

    Parameters
    ----------
    X, y : training data. X must be a DataFrame (its `.columns` are read).
    pipeline : sklearn Pipeline whose pre-final steps output a DataFrame (the code
        reads `.columns` and calls `.drop`) and whose final step exposes
        `feature_importances_` after fitting.
    min_features_to_select : smallest feature count that is still evaluated.
    cv, groups, scoring_metric : forwarded to cross_validate.
    step : number of features removed per round (clamped near the end so that
        `min_features_to_select` features are evaluated).
    scoring_decimals : round the mean score to this many decimals; None disables rounding.
    random_state : currently unused; kept for interface compatibility.

    Returns
    -------
    (best_features, best_score, ranks, scores)
        best_features : columns of X not removed at the last round that reached
            the best score.
            NOTE(review): this compares X.columns with the names of the transformed
            columns, so it presumably assumes the preprocessing keeps column names.
        best_score : max(scores).
        ranks : removed features, most recently removed first.
        scores : mean CV score of every round, in evaluation order.

    Raises
    ------
    ValueError : if there are fewer features than `min_features_to_select`, so
        that no round can be evaluated.
    """
    # Local imports: no import of these names is visible elsewhere in the notebook.
    from copy import deepcopy
    from time import time

    from sklearn.model_selection import cross_validate
    from sklearn.preprocessing import FunctionTransformer

    # Initialize survivors and ranked list. The preprocessing steps are fitted on
    # the X / y passed in, not on the notebook globals X_train / y_train.
    estimator = deepcopy(pipeline)
    estimator.steps.pop(-1)
    survivors = estimator.fit_transform(X, y).columns.tolist()
    if len(survivors) < min_features_to_select:
        raise ValueError(
            "rfecv needs at least min_features_to_select=%i features, got %i"
            % (min_features_to_select, len(survivors))
        )
    ranks = []
    scores = []
    # removed_counts[k] = how many features had been removed when round k was scored.
    removed_counts = []
    # While the survivor list is longer than min_features_to_select
    while len(survivors) >= min_features_to_select:
        print(ranks)
        removed_counts.append(len(ranks))
        # Drop the already eliminated columns right before the final estimator.
        remove_column_transformer = FunctionTransformer(lambda x: x.drop(ranks, axis=1))
        estimator = deepcopy(pipeline)
        estimator.steps.insert(-1, ('remove_column_transformer', remove_column_transformer))

        # Train and get the scores, cross_validate clones
        # the model internally, so this does not modify
        # the estimator passed to this function
        print("[%.2f] evaluating %i features ..." % (time(), len(survivors)))
        cv_result = cross_validate(estimator, X, y,
                                cv=cv,
                                groups = groups,
                                scoring=scoring_metric,
                                return_estimator=True)
        # Append the mean performance to the score history
        score = np.mean(cv_result["test_score"])
        if scoring_decimals is None:
            scores.append(score)
        else:
            scores.append(round(score, scoring_decimals))
        print("[%.2f] ... score %f." % (time(), scores[-1]))

        # Get feature weights from the model fitted
        # on the best fold and square the weights as described
        # in the paper. If the estimator is a Pipeline,
        # we get the weights from the last element.
        best_estimator = cv_result["estimator"][np.argmax(cv_result["test_score"])]
        if isinstance(best_estimator, Pipeline):
            weights = best_estimator[-1].feature_importances_
        else:
            weights = best_estimator.feature_importances_
        weights = list(np.power(weights, 2))

        # Remove step features (but respect min_features_to_select)
        for _ in range(max(min(step, len(survivors) - min_features_to_select), 1)):

            # Find the feature with the smallest ranking criterion
            # and update the ranks and survivors
            idx = np.argmin(weights)
            ranks.insert(0, survivors.pop(idx))
            weights.pop(idx)

    # Calculate the best set of surviving features: take the last round that
    # reached the maximum score and the features removed before that round.
    # The recorded count is used instead of `last_max_idx * step`, because the
    # final rounds may remove fewer than `step` features.
    ranks_reverse = list(reversed(ranks))
    last_max_idx = len(scores) - np.argmax(list(reversed(scores))) - 1
    removed_features = set(ranks_reverse[:removed_counts[last_max_idx]])
    best_features = [f for f in X.columns if f not in removed_features]

    # Return ranks and scores
    return best_features, max(scores), ranks, scores