# imports

In [1]:
import pandas as pd
import numpy as np; np.random.seed(0)
import matplotlib.pyplot as plt; plt.rcParams['font.family'] = 'Malgun Gothic'
import seaborn as sns; sns.set_theme(font='Malgun Gothic')

import warnings; warnings.filterwarnings(action='ignore')
pd.set_option('display.max_rows', 100, 'display.max_columns', 100, 'max_colwidth', None)

In [None]:
from scipy.stats import skew, kurtosis
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA

In [2]:
from sklearn import datasets

# sampling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

# preprocessing
from sklearn.impute import SimpleImputer, MissingIndicator, KNNImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, PowerTransformer, QuantileTransformer
from sklearn.preprocessing import Binarizer
from sklearn.decomposition import PCA

# estimators
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
from xgboost import XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

# ensemble
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, VotingClassifier, VotingRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor

# clustering
from sklearn.cluster import KMeans, DBSCAN

# neural
import tensorflow as tf; tf.random.set_seed(0)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error, mean_squared_log_error, mean_absolute_error
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.metrics import silhouette_samples, silhouette_score

# model_selection
from sklearn.model_selection import train_test_split, TimeSeriesSplit, KFold, StratifiedKFold, GridSearchCV

# tools
import pycaret.classification as pycla
import pycaret.regression as pyreg
import pycaret.clustering as pyclu
import pycaret.utils as pyuti
import optuna

# functions

In [3]:
# functions

# --------------------------------------------------
def check_outliers(X_train, features, rate=1.5):
    """Locate IQR-rule outliers for each requested column.

    A value counts as an outlier when it is below ``Q1 - rate * IQR`` or
    above ``Q3 + rate * IQR``, where Q1/Q3 are the 25th/75th percentiles of
    the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame holding the columns to inspect.
    features : iterable
        Column labels of ``X_train`` to check.
    rate : float, default 1.5
        Multiplier applied to the IQR when building the two fences.

    Returns
    -------
    dict
        Maps each column label to the ``pandas.Index`` of the rows whose
        value in that column falls outside the fences.
    """
    outlier_index_by_col = {}

    for col in features:
        values = X_train[col]

        # nanpercentile ignores missing values; plain np.percentile returns NaN
        # as soon as one NaN is present, which makes both comparisons below
        # False for every row and silently reports "no outliers".
        q1, q3 = np.nanpercentile(values, [25, 75])
        iqr = q3 - q1

        lower_fence = q1 - (rate * iqr)
        upper_fence = q3 + (rate * iqr)

        # Missing values compare False on both sides, so they are never flagged.
        is_outlier = (values < lower_fence) | (values > upper_fence)
        outlier_index_by_col[col] = X_train[is_outlier].index

    return outlier_index_by_col
# --------------------------------------------------


# --------------------------------------------------
def scores(y_test, y_pred):
    """Print accuracy, macro-averaged F1/precision/recall and the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.

    Nothing is returned; the metrics are only written to stdout.
    """
    macro = {'average': 'macro'}

    acc   = accuracy_score(y_test, y_pred)
    f1_m  = f1_score(y_test, y_pred, **macro)
    prec  = precision_score(y_test, y_pred, **macro)
    rec   = recall_score(y_test, y_pred, **macro)
    table = confusion_matrix(y_test, y_pred)

    # Two-line report: accuracy/F1 on the first line, precision/recall on the second.
    report = (
        f'accuracy  = {acc:.6f},  '
        f'f1 score  = {f1_m:.6f},  \n'
        f'precision = {prec:.6f},  '
        f'recall    = {rec:.6f},'
    )
    print(report)
    print(table)

    # Optional heatmap rendering of the confusion matrix (disabled):
    # plt.figure(figsize=(12, 9))
    # plt.title('confusion matrix')
    # plt.xlabel('Predict')
    # plt.ylabel('Actual')
    # sns.heatmap(table, annot=True, linewidths=1, cmap='Blues', annot_kws={"size": 14})
    # plt.show()
# --------------------------------------------------

# --------------------------------------------------
def curves(y_test, probas_pred):
    """Plot the ROC curve and precision/recall/F1 against threshold, then print the AUC.

    Only column 1 of ``probas_pred`` is used as the score, so this is a
    binary-classification helper.

    Parameters
    ----------
    y_test : array-like
        True labels.
    probas_pred : 2-D array, indexed as ``probas_pred[:, 1]``
        Per-class scores; presumably the output of ``predict_proba`` —
        confirm against callers.

    Nothing is returned; a two-panel figure is shown and the AUC is printed.
    """
    positive_scores = probas_pred[:, 1]

    fprs, tprs, _ = roc_curve(y_test, positive_scores)
    precisions, recalls, thresholds = precision_recall_curve(y_test, positive_scores)

    # F1 = 2PR / (P + R). Where P + R == 0 the raw expression is 0/0 (NaN, and
    # the warning is easy to miss), so those points are defined as 0 instead.
    pr_sum = precisions + recalls
    f1_scores = np.divide(
        2 * (precisions * recalls),
        pr_sum,
        out=np.zeros_like(pr_sum, dtype=float),
        where=pr_sum > 0,
    )

    fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(16, 6))

    # NOTE(review): plt.gray() only switches the default image colormap and has
    # no visible effect on these line plots; kept so the global matplotlib
    # state after the call matches the previous implementation.
    plt.gray()

    # left panel: ROC curve with the chance diagonal
    ax_roc.set_title('ROC curve')
    ax_roc.set_xlabel('FPR(1- specificity)')
    ax_roc.set_ylabel('TPR')
    ax_roc.plot(fprs, tprs, label='ROC', linestyle='solid')
    ax_roc.plot([0, 1], [0, 1], label='50%', color='gray', linestyle=':')
    ax_roc.legend()

    # right panel: precision / recall / F1 as a function of the threshold.
    # The metric arrays are sliced to len(thresholds) so x and y have equal length.
    n_thresholds = thresholds.shape[0]
    ax_pr.set_title('precision recall curve')
    ax_pr.set_xlabel('threshold')
    ax_pr.set_ylabel('scores')
    ax_pr.plot(thresholds, precisions[:n_thresholds], label='precision', linestyle=':')
    ax_pr.plot(thresholds, recalls[:n_thresholds],    label='recall',    linestyle='--')
    ax_pr.plot(thresholds, f1_scores[:n_thresholds],  label='f1',        linestyle='solid')
    # valid linestyle = '-', '--', '-.', ':', 'None', ' ', '', 'solid', 'dashed', 'dashdot', 'dotted'
    ax_pr.legend()

    plt.show()

    print('AUC = ', roc_auc_score(y_test, positive_scores))
# --------------------------------------------------

In [4]:
# Build the iris frame in one expression: feature columns first, then the
# dataset's 'target' array appended as the last column.
iris = datasets.load_iris()
df = pd.DataFrame(data=iris['data'], columns=iris['feature_names']).assign(target=iris['target'])

# classification metrics

In [5]:
# classification metrics : INPUT TRAIN & TARGET
# Fits every estimator in `classification_models` on one stratified hold-out
# split and collects accuracy / macro F1 / macro precision / macro recall / AUC
# plus feature importances into `classification_df`.
train  = df
target = 'target'

# split ------------------------------------------------------
X = train.drop(target, axis=1)
y = train[target]
# No random_state is passed, so the split draws from NumPy's global RNG
# (seeded in the imports cell); re-running only this cell gives a new split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y
)

# model loop -------------------------------------------------
classification_result_list = []
classification_feature_importance_list = []
classification_models = [
    LogisticRegression        (),
    DecisionTreeClassifier    (),
    BaggingClassifier         (),
    RandomForestClassifier    (),
    GradientBoostingClassifier(),
    XGBClassifier             (verbosity=0),
    XGBRFClassifier           (verbosity=0),
    LGBMClassifier            (),
    CatBoostClassifier        (verbose=0),
]
classification_models_fitted = []

for classification_model in classification_models:

    classification_model.fit(X_train, y_train)

    # fitted models appended (same objects as in classification_models)
    classification_models_fitted.append(classification_model)

    y_pred      = classification_model.predict(X_val)
    probas_pred = classification_model.predict_proba(X_val)

    # scores -----------------------------------------------------
    accuracy  = accuracy_score (y_val, y_pred, )
    f1        = f1_score       (y_val, y_pred, average='macro')
    precision = precision_score(y_val, y_pred, average='macro')
    recall    = recall_score   (y_val, y_pred, average='macro')

    if probas_pred.shape[1] > 2: # multi-class: one-vs-rest AUC over all probability columns
        auc   = roc_auc_score  (y_val, probas_pred, multi_class='ovr')
    else:                        # binary: score of the last (positive) column
        auc   = roc_auc_score  (y_val, probas_pred[:, -1],           )

    classification_model_score = [classification_model.__class__.__name__, accuracy, f1, precision, recall, auc]

    # Only a missing attribute is tolerated here (getattr's default covers
    # AttributeError); any other failure now surfaces instead of being hidden
    # by a bare `except:`. Estimators without importances get an all-zero
    # placeholder row so the result table stays rectangular.
    importances = getattr(classification_model, 'feature_importances_', None)
    if importances is None:
        importances = np.zeros(len(X.columns))
    classification_feature_importance_list.append(importances)

    classification_result_list.append(classification_model_score)

    print('fitted :', classification_model.__class__.__name__)

print('-' * 50)
print('classification_models_fitted : list ready')
print('-' * 50)

# result df: one row per estimator, metric columns followed by one importance column per feature
classification_df = pd.DataFrame(classification_result_list, columns=['estimator', 'accuracy', 'F1', 'precision', 'recall', 'AUC'])
classification_feature_importance_df = pd.DataFrame(classification_feature_importance_list, columns=X.columns)
classification_df = pd.concat([classification_df, classification_feature_importance_df], axis=1)
classification_df

fitted : LogisticRegression
fitted : DecisionTreeClassifier
fitted : BaggingClassifier
fitted : RandomForestClassifier
fitted : GradientBoostingClassifier
fitted : XGBClassifier
fitted : XGBRFClassifier
fitted : LGBMClassifier
fitted : CatBoostClassifier
--------------------------------------------------
classification_models_fitted : list ready
--------------------------------------------------


Unnamed: 0,estimator,accuracy,F1,precision,recall,AUC,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,LogisticRegression,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
1,DecisionTreeClassifier,0.966667,0.966583,0.969697,0.966667,0.975,0.016667,0.0,0.527976,0.455357
2,BaggingClassifier,0.933333,0.93266,0.944444,0.933333,1.0,0.0,0.0,0.0,0.0
3,RandomForestClassifier,0.966667,0.966583,0.969697,0.966667,1.0,0.097348,0.025451,0.420446,0.456755
4,GradientBoostingClassifier,0.966667,0.966583,0.969697,0.966667,1.0,0.006405,0.011945,0.267045,0.714605
5,XGBClassifier,0.933333,0.93266,0.944444,0.933333,1.0,0.019696,0.017986,0.592,0.370319
6,XGBRFClassifier,0.933333,0.93266,0.944444,0.933333,1.0,0.019255,0.100157,0.386219,0.49437
7,LGBMClassifier,0.966667,0.966583,0.969697,0.966667,1.0,314.0,313.0,463.0,302.0
8,CatBoostClassifier,0.966667,0.966583,0.969697,0.966667,1.0,6.544351,11.203605,35.594006,46.658038


In [6]:
# # classification cross_validate : INPUT TRAIN & TARGET
# # ============================================================
# train_metrics  = iris_df
# target_metrics = 'target'
# # ============================================================

# # split ------------------------------------------------------
# X_metrics = train_metrics.drop(target_metrics, axis=1)
# y_metrics = train_metrics[target_metrics]

# # scoring= --------------------------------------------------
# classification_scoring = {
#     'accuracy':'accuracy',
#     'f1_macro':'f1_macro',
#     'precision':'precision_macro',
#     'recall':'recall_macro',
#     'roc_auc_ovr':'roc_auc_ovr',
# }

# # cv loop ----------------------------------------------------
# classification_cv_list   = []
# classification_cv_models = [
#     LogisticRegression        (random_state=0),
#     DecisionTreeClassifier    (random_state=0),
#     BaggingClassifier         (random_state=0),
#     RandomForestClassifier    (random_state=0),
#     GradientBoostingClassifier(random_state=0),
#     XGBClassifier             (random_state=0, verbosity=0),
#     XGBRFClassifier           (random_state=0, verbosity=0),
#     LGBMClassifier            (random_state=0),
#     CatBoostClassifier        (random_state=0, verbose=0),
# ]

# for classification_cv_model in classification_cv_models:
    
#     cv_results = cross_validate(
#         classification_cv_model,
#         X_metrics, y_metrics,
#         scoring=classification_scoring,
#         cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
#         return_estimator=True,
#     )

#     # scores -----------------------------------------------------
#     accuracy  = cv_results['test_accuracy'].mean()
#     f1        = cv_results['test_f1_macro'].mean()
#     precision = cv_results['test_precision'].mean()
#     recall    = cv_results['test_recall'].mean()
#     auc       = cv_results['test_roc_auc_ovr'].mean()

#     classification_cv_score = [
#         classification_cv_model.__class__.__name__, accuracy, f1, precision, recall, auc]

#     classification_cv_list.append(classification_cv_score)

#     classification_cv_df = pd.DataFrame(
#         classification_cv_list, columns=['cv_estimator', 'accuracy', 'F1', 'precision', 'recall', 'AUC'])

#     # feature importances-----------------------------------------
#     try:
        
#         cv_fi_list = []
#         for idx, estimator in enumerate(cv_results['estimator']):
#             fi = pd.DataFrame(estimator.feature_importances_, 
#                             index=X_metrics.columns, columns=['importance'])
#             # fi = fi.sort_values('importance', ascending=False)
            
#             cv_fi_list.append(estimator.feature_importances_.tolist())
#         fi_means = np.array(cv_fi_list).mean(axis=0)
        
#         fi_df = pd.DataFrame(fi_means, columns=['importance'], index=X_metrics.columns)
#         # fi_df = fi_df.sort_values(by='importance', ascending=False)
        
#         fig, ax = plt.subplots(figsize=(12, 3))
#         ax.set_title(classification_cv_model.__class__.__name__)
#         sns.set_theme(style='whitegrid')
#         sns.set_color_codes('pastel')
#         sns.barplot(x='importance', y=X_metrics.columns, data=fi_df)
#         plt.show()
        
#     except:
#         pass

# classification_cv_df

In [7]:
# # classification GridsearchCV : INPUT TRAIN & TARGET
# # ============================================================
# train_metrics  = iris_df
# target_metrics = 'target'
# # ============================================================

# # split ------------------------------------------------------
# X_metrics = train_metrics.drop(target_metrics, axis=1)
# y_metrics = train_metrics[target_metrics]

# # param_grid= ------------------------------------------------
# hyper_param = {
#     'random_state':[0, 1, 2],
#     # 'n_estimators':[100],
#     # 'min_samples_split':[2],
#     # 'min_samples_leaf':[1],
# }

# # scoring= ---------------------------------------------------
# classification_scoring = {
#     'accuracy':'accuracy',
#     'f1_macro':'f1_macro',
#     'precision':'precision_macro',
#     'recall':'recall_macro',
#     'roc_auc_ovr':'roc_auc_ovr',
# }

# # GridSearchCV loop ------------------------------------------
# classification_gscv_list   = []
# classification_gscv_models = [
#     LogisticRegression        (random_state=0),
#     DecisionTreeClassifier    (random_state=0),
#     BaggingClassifier         (random_state=0),
#     RandomForestClassifier    (random_state=0),
#     GradientBoostingClassifier(random_state=0),
#     XGBClassifier             (random_state=0, verbosity=0),
#     XGBRFClassifier           (random_state=0, verbosity=0),
#     LGBMClassifier            (random_state=0),
#     CatBoostClassifier        (random_state=0, verbose=0),
# ]

# for classification_gscv_model in classification_gscv_models:
    
#     gscv = GridSearchCV(
#         classification_gscv_model,
#         param_grid=hyper_param,
#         scoring=classification_scoring,
#         refit='f1_macro',
#         cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
#     )
    
#     gscv.fit(X_metrics, y_metrics)
    
#     classification_gscv_score = [
#         gscv.estimator.__class__.__name__,
#         gscv.best_score_,
#         gscv.best_params_,]
    
#     classification_gscv_list.append(classification_gscv_score)

# classification_gscv_df = pd.DataFrame(
#     classification_gscv_list, columns=['gscv_estimator', 'best_score_', 'best_params_'])
# classification_gscv_df

# regression metrics

In [8]:
# Build the Boston housing frame: feature columns plus the dataset's 'target' array.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn — confirm the pinned version.
# NOTE(review): this rebinds `df`, replacing the iris frame created earlier.
boston = datasets.load_boston()
df = pd.DataFrame(data=boston['data'], columns=boston['feature_names']).assign(target=boston['target'])

In [9]:
# Show the frame as the cell output; the repr is abbreviated because the frame
# has more rows than display.max_rows (set to 100 in the imports cell).
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [11]:
# regression metrics : INPUT TRAIN & TARGET
# Fits every estimator in `regression_models` on one hold-out split and
# collects MSE / RMSE / MSLE / MAE into `regression_df`.
train  = df
target = 'target'

# split ------------------------------------------------------
X = train.drop(target, axis=1)
y = train[target]
# No random_state is passed, so the split draws from NumPy's global RNG
# (seeded in the imports cell); re-running only this cell gives a new split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True,
)

# model loop -------------------------------------------------
regression_result_list   = []
regression_models = [
    LinearRegression         (),
    DecisionTreeRegressor    (),
    Ridge                    (),
    Lasso                    (),
    ElasticNet               (),
    BaggingRegressor         (),
    RandomForestRegressor    (),
    GradientBoostingRegressor(),
    XGBRegressor             (verbosity=0),
    XGBRFRegressor           (verbosity=0),
    LGBMRegressor            (),
    CatBoostRegressor        (verbose=0),
]
regression_models_fitted = []

for regression_model in regression_models:

    regression_model.fit(X_train, y_train)

    # fitted models appended (same objects as in regression_models)
    regression_models_fitted.append(regression_model)

    y_pred = regression_model.predict(X_val)

    # errors -----------------------------------------------------
    mse  = mean_squared_error    (y_val, y_pred)
    # RMSE derived from the MSE instead of `squared=False`, which newer
    # scikit-learn releases deprecate and then remove.
    rmse = np.sqrt(mse)
    # MSLE is rejected by scikit-learn (ValueError) when a model predicts
    # negative values; record NaN for that model rather than aborting the
    # whole comparison. Shape/type problems would already have raised in the
    # MSE call above, so this only absorbs the value-range rejection.
    try:
        msle = mean_squared_log_error(y_val, y_pred)
    except ValueError:
        msle = np.nan
    mae  = mean_absolute_error   (y_val, y_pred)

    regression_model_score = [regression_model.__class__.__name__, mse, rmse, msle, mae]
    regression_result_list.append(regression_model_score)

    print('fitted :', regression_model.__class__.__name__)

print('-' * 50)
print('regression_models_fitted : list ready')
print('-' * 50)

# result df: one row per estimator
regression_df = pd.DataFrame(regression_result_list, columns=['estimator', 'MSE', 'RMSE', 'MSLE', 'MAE'])
regression_df

fitted : LinearRegression
fitted : DecisionTreeRegressor
fitted : Ridge
fitted : Lasso
fitted : ElasticNet
fitted : BaggingRegressor
fitted : RandomForestRegressor
fitted : GradientBoostingRegressor
fitted : XGBRegressor
fitted : XGBRFRegressor
fitted : LGBMRegressor
fitted : CatBoostRegressor
--------------------------------------------------
regression_models_fitted : list ready
--------------------------------------------------


Unnamed: 0,estimator,MSE,RMSE,MSLE,MAE
0,LinearRegression,29.260195,5.409269,0.113617,3.668639
1,DecisionTreeRegressor,27.184118,5.213839,0.046761,3.407843
2,Ridge,30.144524,5.490403,0.143062,3.71186
3,Lasso,33.914349,5.823603,0.141266,4.022421
4,ElasticNet,33.297027,5.770358,0.136855,3.993194
5,BaggingRegressor,15.167646,3.894566,0.025914,2.502843
6,RandomForestRegressor,14.736465,3.83881,0.025125,2.442088
7,GradientBoostingRegressor,10.589213,3.254107,0.021359,2.301365
8,XGBRegressor,10.506135,3.241317,0.01991,2.26954
9,XGBRFRegressor,18.46452,4.297036,0.029104,2.712314


# regression GridSearchCV

In [16]:
# regression GridsearchCV : INPUT TRAIN & TARGET
# Cross-validated grid search for a single estimator, refit on the MAE scorer.
train  = df
target = 'target'

# features / label -------------------------------------------
X = train.drop(columns=target)
y = train[target]

# estimator under search -------------------------------------
estimator = CatBoostRegressor(verbose=0)

# param_grid= : left empty, so only the estimator's own settings are evaluated
hyper_param = {
    # 'n_estimators':[100],
}

# scoring= : negated errors (larger is better); other scorers kept for reference
regression_scoring = {
    # 'MSE':'neg_mean_squared_error',
    # 'RMSE':'neg_root_mean_squared_error',
    # 'MSLE':'neg_mean_squared_log_error',
    'MAE':'neg_mean_absolute_error',
}

# GridSearchCV -----------------------------------------------
gscv = GridSearchCV(
    estimator=estimator,
    param_grid=hyper_param,
    scoring=regression_scoring,
    refit='MAE',
    cv=5,
    verbose=2,
)
gscv.fit(X, y)

# best estimator, its score with the sign flipped back to a positive error, and its params
for summary_item in (gscv.best_estimator_, -gscv.best_score_, gscv.best_params_):
    print(summary_item)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
<catboost.core.CatBoostRegressor object at 0x000002A37F1C90D0>
3.0490128911729757
{}


# optuna

In [None]:
# # Optuna
# def objective_xgbr(trial):
    
#     param = {
#         'n_estimators'    :trial.suggest_int       ('n_estimators', 100, 1000),
#         'max_depth'       :trial.suggest_int       ('max_depth', 8, 16),
#         'min_child_weight':trial.suggest_int       ('min_child_weight', 1, 50),
#         'gamma'           :trial.suggest_int       ('gamma', 1, 3),
#         'learning_rate'   :0.01,
#         'lambda'          :trial.suggest_loguniform('lambda', 1e-3, 10.0),
#         'alpha'           :trial.suggest_loguniform('alpha', 1e-3, 10.0),
#         'random_state'    :0
#         }
    
#     xgbr = XGBRegressor(**param)
#     xgbr.fit(X_train, y_train, verbose=False)
#     y_pred = xgbr.predict(X_test)
    
#     score = mean_squared_error(y_pred, y_test, squared=False)
#     return score

# study = optuna.create_study()
# study.optimize(objective_xgbr, n_trials=100)
# print(study.best_params)

# xgbr = XGBRegressor(**study.best_params)
# xgbr.fit(X_train, y_train, verbose=False)
# y_pred = xgbr.predict(X_test)

# score = mean_squared_error(y_pred, y_test, squared=False)
# print(score)

In [None]:
# # OptunaSearchCV
# rfr = RandomForestRegressor(random_state=0)

# param_distributions = {
#     'n_estimators':     optuna.distributions.IntUniformDistribution(100, 3000), 
#     'max_depth':        optuna.distributions.IntUniformDistribution(1,   200), 
#     'min_samples_split':optuna.distributions.IntUniformDistribution(2,   40), 
#     'min_samples_leaf': optuna.distributions.IntUniformDistribution(1,   20), 
#     }
    
    
# optuna_search = optuna.integration.OptunaSearchCV(rfr, param_distributions, 
#                                                   cv=5, n_trials=300, random_state=0, 
#                                                   scoring='neg_root_mean_squared_error', verbose=1)
# optuna_search.fit(X_train_test_met, y_train_test_met)

# print(optuna_search.best_score_)
# print(optuna_search.best_estimator_)
# print(optuna_search.best_params_)

# optuna_search.predict(X_sub)

# deep learning