In [4]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

dataset = load_breast_cancer()

scaler = StandardScaler()
dataset_scaled = scaler.fit_transform(dataset.data)

X_train, X_test, y_train, y_test = train_test_split(dataset_scaled, dataset.target, test_size=0.3, random_state=0)


# LogisticRegression

In [5]:
from sklearn.metrics import accuracy_score, roc_auc_score

lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
lr_preds = lr_clf.predict(X_test)
lr_preds_proba = lr_clf.predict_log_proba(X_test)[:,1]

print(f'accuracy : {accuracy_score(y_test, lr_preds)}, roc_auc : {roc_auc_score(y_test, lr_preds)} ')

accuracy : 0.9766081871345029, roc_auc : 0.9715608465608465 


# solvers

In [6]:
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']

for solver in solvers:
    lr_clf = LogisticRegression(solver=solver, max_iter=600)
    lr_clf.fit(X_train, y_train)
    lr_preds = lr_clf.predict(X_test)
    lr_preds_proba = lr_clf.predict_log_proba(X_test)[:,1]

    print(f' solver {solver} :: accuracy : {accuracy_score(y_test, lr_preds)}, roc_auc : {roc_auc_score(y_test, lr_preds)} ')

 solver lbfgs :: accuracy : 0.9766081871345029, roc_auc : 0.9715608465608465 
 solver liblinear :: accuracy : 0.9824561403508771, roc_auc : 0.9794973544973544 
 solver newton-cg :: accuracy : 0.9766081871345029, roc_auc : 0.9715608465608465 
 solver sag :: accuracy : 0.9824561403508771, roc_auc : 0.9794973544973544 
 solver saga :: accuracy : 0.9824561403508771, roc_auc : 0.9794973544973544 


liblinear, saga : l1, l2
lbfgs, newton-cg, sag : l2

C: alpha 역수 > 작을 수록 규제가 크다

# GridSearchCV

In [26]:

from sklearn.model_selection import GridSearchCV

# params = {
#     'solver': ['liblinear', 'lbfgs'],
#     'penalty': ['l2', 'l1'],
#     'C' : [0.01, 0.1, 1, 5, 10, ]
# }
params = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 5, 10, 100]},
    {'solver': ['lbfgs', 'newton-cg'], 'penalty': ['l2'], 'C': [0.1, 1, 5, 10, 100]}
]
lr_clf = LogisticRegression(max_iter=1000)

grid_clf = GridSearchCV(lr_clf, param_grid=params, scoring='accuracy', cv=3)
grid_clf.fit(dataset_scaled, dataset.target)
print(f'Best parameter : {grid_clf.best_params_}, Best_accuracy {grid_clf.best_score_}')

Best parameter : {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}, Best_accuracy 0.9789102385593614


Best parameter : {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}, Best_accuracy 0.9648565859092174
Best parameter : {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}, Best_accuracy 0.9789102385593614

accuracy : 0.9532163742690059, roc_auc : 0.9497354497354497 
accuracy : 0.9707602339181286, roc_auc : 0.9636243386243386 

In [27]:
best_param = grid_clf.best_params_

best_lr_clf = LogisticRegression(**best_param)
best_lr_clf.fit(X_train, y_train)

lr_preds = best_lr_clf.predict(X_test)
lr_preds_proba = best_lr_clf.predict_log_proba(X_test)[:,1]

print(f'accuracy : {accuracy_score(y_test, lr_preds)}, roc_auc : {roc_auc_score(y_test, lr_preds)} ')

accuracy : 0.9707602339181286, roc_auc : 0.9636243386243386 


# RandomForestRegressor

In [33]:
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore') 

# 보스턴 데이터 세트 로드
boston = load_boston()
bostonDF = pd.DataFrame(boston.data, columns = boston.feature_names)

bostonDF['PRICE'] = boston.target
y_target = bostonDF['PRICE']
X_data = bostonDF.drop(['PRICE'], axis=1,inplace=False)

rf = RandomForestRegressor(random_state=0, n_estimators=1000)
eval_model(rf,X_data, y_target )

model : RandomForestRegressor(n_estimators=1000, random_state=0) avg_r2 : 0.6266886861376235  avg_rmse : 4.422538982804892


In [35]:
def eval_model(model, X_data, y_target, cv=5):
    neg_mse_scores = cross_val_score(model, X_data, y_target, scoring="neg_mean_squared_error", cv = 5)
    rmse_scores  = np.sqrt(-1 * neg_mse_scores)
    avg_rmse = np.mean(rmse_scores)

    r2_scores = cross_val_score(model,  X_data, y_target, scoring='r2', cv=5)
    avg_r2 = np.mean(r2_scores)
    print(f'model : {model} avg_r2 : {avg_r2}  avg_rmse : {avg_rmse}')


In [36]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

dt_reg = DecisionTreeRegressor(random_state=0, max_depth=4)
rf_reg = RandomForestRegressor(random_state=0, n_estimators=1000)
gb_reg = GradientBoostingRegressor(random_state=0, n_estimators=1000)
xgb_reg = XGBRegressor(n_estimators=1000)
lgb_reg = LGBMRegressor(n_estimators=1000)

# 트리 기반의 회귀 모델을 반복하면서 평가 수행 
models = [dt_reg, rf_reg, gb_reg, xgb_reg, lgb_reg]
for model in models:  
    eval_model(model, X_data, y_target)

model : DecisionTreeRegressor(max_depth=4, random_state=0) avg_r2 : 0.17632964818238434  avg_rmse : 5.977957424580515
model : RandomForestRegressor(n_estimators=1000, random_state=0) avg_r2 : 0.6266886861376235  avg_rmse : 4.422538982804892
model : GradientBoostingRegressor(n_estimators=1000, random_state=0) avg_r2 : 0.6599942147734554  avg_rmse : 4.26899822168126
model : XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=1000, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
   