In [71]:
import pandas as pd
# Load the Titanic training CSV into `df`.
# NOTE(review): this is an absolute Google Drive path; presumably Drive must be mounted first — confirm.
df = pd.read_csv('/content/drive/MyDrive/kdt_240424/m5_머신러닝/dataset/train.csv')

# Preview the first rows.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [70]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

# Titanic preprocessing
def preprocess_titanic(df):
    """Build the numeric feature matrix used by the baseline model.

    Imputes ``Age`` with its median and ``Embarked`` with its mode, one-hot
    encodes ``Sex`` and ``Embarked`` (first level dropped) and keeps only the
    selected feature columns. The input frame is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Pclass``, ``Age``, ``SibSp``, ``Parch``, ``Fare``,
        ``Sex`` and ``Embarked``.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``Pclass, Age, SibSp, Parch, Fare, Sex_male,
        Embarked_Q, Embarked_S`` in that order.
    """
    # Work on a copy so the caller's frame keeps its raw values.
    df = df.copy()

    # 1. Missing values. Assign the result back instead of calling
    #    fillna(inplace=True) on a column selection, which is not guaranteed
    #    to write through to the parent frame.
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

    # 2. Categorical encoding. Pinning the levels yields the same dummy
    #    columns even when a level does not occur in this particular frame.
    df['Sex'] = pd.Categorical(df['Sex'], categories=['female', 'male'])
    df['Embarked'] = pd.Categorical(df['Embarked'], categories=['C', 'Q', 'S'])
    df = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)

    # 3. Feature selection
    df = df[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S']]

    return df

# Preprocess the raw frame into the model feature matrix
df_processed = preprocess_titanic(df)

# Features and target
X = df_processed
y = df['Survived']

# Split into training and test data (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Handle class imbalance (SMOTE); only the training split is resampled
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Model training and evaluation helper
def evaluate(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print test-split metrics.

    Prints the classification report followed by accuracy, precision,
    recall, F1 and ROC AUC (four decimals each) and a trailing blank line.
    ROC AUC is computed from column 1 of ``model.predict_proba``.

    Returns ``None``; ``model`` is fitted in place.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    metrics = {
        'accuracy': accuracy_score(y_test, predictions),
        'precision': precision_score(y_test, predictions),
        'recall': recall_score(y_test, predictions),
        'f1': f1_score(y_test, predictions),
        'roc_auc': roc_auc_score(y_test, positive_scores),
    }
    report = classification_report(y_test, predictions)

    # Assemble every output line first, then emit them with a single print.
    lines = [
        f"Classification Report:\n {report}",
        f"정확도 : {metrics['accuracy']:.4f}",
        f"정밀도 :  {metrics['precision']:.4f}",
        f"재현율 : {metrics['recall']:.4f}",
        f"f1 스코어 : {metrics['f1']:.4f}",
        f"Roc Auc : {metrics['roc_auc']:.4f}",
        "",
    ]
    print("\n".join(lines))

# Random Forest hyper-parameter search and evaluation
# Grid size: 4 * 4 * 3 * 3 = 144 candidate combinations.
param_rf = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

random_forest_clf = RandomForestClassifier(random_state=42)
# 5-fold grid search fitted on the SMOTE-resampled training data.
# NOTE(review): SMOTE ran before this cross-validation, so synthetic samples can
# end up in the validation folds — confirm whether resampling should be done per fold.
grid_search_rf = GridSearchCV(estimator=random_forest_clf, param_grid=param_rf, cv=5, n_jobs=-1, verbose=1)
grid_search_rf.fit(X_train_smote, y_train_smote)
best_rf = grid_search_rf.best_estimator_

print(f"최적 파라미터 Random Forest의 평가")
# evaluate() fits best_rf again on the resampled training data and scores it on the test split.
evaluate(best_rf, X_train_smote, X_test, y_train_smote, y_test)
print(f"최적 파라미터: {grid_search_rf.best_params_}")


Fitting 5 folds for each of 144 candidates, totalling 720 fits
최적 파라미터 Random Forest의 평가
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.85      0.83       157
           1       0.77      0.74      0.76       111

    accuracy                           0.80       268
   macro avg       0.80      0.79      0.79       268
weighted avg       0.80      0.80      0.80       268

정확도 : 0.8022
정밀도 :  0.7736
재현율 : 0.7387
f1 스코어 : 0.7558
Roc Auc : 0.8662

최적 파라미터: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [54]:
import re
# Derive a 'Title' column: the first run of ASCII letters in Name that is
# immediately followed by a period (e.g. "Mr", "Mrs").
# NOTE(review): findall(...)[0] raises IndexError if a name contains no such token.
df['Title'] = df['Name'].apply(lambda x: re.findall(r'\b([a-zA-Z]+)\.', x)[0])

In [None]:
# Cross-tabulate Title against Sex to see how each title is distributed by sex.
pd.crosstab(df["Title"], df["Sex"])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [55]:
# Collapse infrequent titles into three buckets: "MaleRare", "FemaleRare",
# and "Rare" for "Dr" (which the crosstab shows for both sexes).
df['Title'] = df['Title'].replace(["Capt", "Col", "Don", "Jonkheer", "Major", "Rev", "Sir"], "MaleRare")
df['Title'] = df['Title'].replace(["Countess", "Lady", "Mlle", "Mme", "Ms"], "FemaleRare")
df['Title'] = df['Title'].replace(["Dr"], "Rare")

In [None]:
# Preview the frame, now including the Title column.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [None]:
# Count passengers per title after bucketing.
df.Title.value_counts()

Title
Mr            517
Miss          182
Mrs           125
Master         40
MaleRare       14
Rare            7
FemaleRare      6
Name: count, dtype: int64

In [56]:
# Missing-value handling: statistics used by fill_age() to impute Age
# Overall mean age, the fallback for titles other than Mr/Mrs/Miss/Master.
mean_age = df['Age'].mean()

# Mean age within each Title group.
title_mean_age =df.groupby('Title')['Age'].mean()

def fill_age(row):
    """Return the age to store for one passenger row.

    A non-null ``Age`` is returned unchanged. A missing ``Age`` is replaced
    by the module-level ``title_mean_age`` entry for the row's ``Title`` when
    that title is Mr, Mrs, Miss or Master, and by the module-level
    ``mean_age`` for any other title.
    """
    age = row['Age']
    # Guard clause: nothing to impute when the age is present.
    if not pd.isnull(age):
        return age

    title = row['Title']
    if title in ('Mr', 'Mrs', 'Miss', 'Master'):
        return title_mean_age[title]
    return mean_age

# Apply to the DataFrame: fill_age is called once per row (axis=1) and the result overwrites Age
df['Age'] = df.apply(fill_age, axis=1)

In [57]:
# Impute missing Embarked values with the most frequent value.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is not guaranteed to write through to `df`.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
# Remaining null count per column.
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Title            0
dtype: int64


In [None]:
# Remove Fare outliers (1.5 * IQR rule)
Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)

IQR = Q3 - Q1

# Rows whose Fare lies strictly below Q1 - 1.5*IQR or strictly above Q3 + 1.5*IQR.
fare_outlier = df[(df['Fare'] < (Q1 - 1.5 * IQR)) | (df['Fare'] > (Q3 + 1.5 * IQR))]

df = df.drop(fare_outlier.index)

# Age binning is not repeated in this cell. The later cell that builds
# 'age_group' also drops 'Age'; running both in sequence raised
# KeyError: 'Age' on a top-to-bottom run.

In [58]:
# Drop columns that are not needed (modifies df in place)
df.drop(columns = ['PassengerId','Name','Ticket','Cabin'], inplace=True)

In [59]:
# Bin Age into six labelled groups, then drop the raw Age column in place.
# NOTE(review): 'Yound Adult' looks like a typo for 'Young Adult'; it is a runtime label, so it is left unchanged here.
bins = [0,5,12,18,35,60,100]
labels = ['Infant','Child','Teenager','Yound Adult','Adult','Senior']
df['age_group'] = pd.cut(df['Age'], bins=bins, labels = labels)
df.drop(columns=['Age'], inplace = True)

In [60]:
# Preview the frame after column removal and age binning.
df.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked,Title,age_group
0,0,3,male,1,0,7.25,S,Mr,Yound Adult
1,1,1,female,1,0,71.2833,C,Mrs,Adult
2,1,3,female,0,0,7.925,S,Miss,Yound Adult
3,1,1,female,1,0,53.1,S,Mrs,Yound Adult
4,0,3,male,0,0,8.05,S,Mr,Yound Adult


In [61]:

# Imported in this cell because no earlier cell shown in the notebook imports LabelEncoder.
from sklearn.preprocessing import LabelEncoder

# Replace each listed categorical column with integer codes.
category_column = ['age_group','Sex','Embarked']
for column in category_column:
    le = LabelEncoder()
    # fit_transform is the one-call equivalent of fit() followed by transform().
    df[column] = le.fit_transform(df[column])

In [65]:
# List the columns currently in df.
df.columns

Index(['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked',
       'Title', 'age_group'],
      dtype='object')

In [67]:
# Remove the Title column from df in place.
df.drop(columns='Title',inplace=True)

In [None]:
# Reload the training CSV, rebinding `df` to a fresh frame.
df = pd.read_csv('/content/drive/MyDrive/kdt_240424/m5_머신러닝/dataset/train.csv')


# Preprocessing function
def transform_features(df):
    """Turn the raw Titanic frame into an all-numeric modelling frame.

    Steps, in order:
      1. derive ``Title`` from ``Name`` and bucket infrequent titles;
      2. drop ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin``;
      3. impute ``Embarked`` (mode) and ``Age`` (mean);
      4. drop rows whose ``Fare`` is a 1.5 * IQR outlier;
      5. bin ``Age`` into ``age_group`` and drop ``Age``;
      6. label-encode ``age_group``, ``Sex``, ``Embarked`` and ``Title``.

    The title/sex crosstab and the per-column null counts are printed along
    the way. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least ``PassengerId``, ``Name``, ``Sex``,
        ``Age``, ``Ticket``, ``Fare``, ``Cabin`` and ``Embarked``.

    Returns
    -------
    pandas.DataFrame
        New frame without the dropped columns and outlier rows, with the
        four categorical columns replaced by integer codes.
    """
    # Function-scope import so the function works even when the cell that
    # imports LabelEncoder has not been run yet.
    from sklearn.preprocessing import LabelEncoder

    # Work on a copy: the caller's frame must keep its raw columns.
    df = df.copy()

    # Derived feature: first run of letters followed by a period in Name.
    # Names without such a token fall into the "Rare" bucket instead of
    # raising IndexError.
    df['Title'] = df['Name'].str.extract(r'\b([a-zA-Z]+)\.', expand=False).fillna("Rare")
    print(pd.crosstab(df["Title"], df["Sex"]))

    df['Title'] = df['Title'].replace(["Capt", "Col", "Don", "Jonkheer", "Major", "Rev", "Sir"], "MaleRare")
    df['Title'] = df['Title'].replace(["Countess", "Lady", "Mlle", "Mme", "Ms"], "FemaleRare")
    df['Title'] = df['Title'].replace(["Dr"], "Rare")

    # Drop columns that are not needed
    df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

    # Impute missing values. Results are assigned back because
    # fillna(inplace=True) on a column selection is not guaranteed to write
    # through to the frame.
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    print(df.isnull().sum())

    # Remove Fare outliers (strictly outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR)
    Q1 = df['Fare'].quantile(0.25)
    Q3 = df['Fare'].quantile(0.75)
    IQR = Q3 - Q1
    fare_outlier = df[(df['Fare'] < (Q1 - 1.5 * IQR)) | (df['Fare'] > (Q3 + 1.5 * IQR))]
    df = df.drop(fare_outlier.index)

    # Bin Age into six labelled groups, then drop the raw column
    bins = [0, 5, 12, 18, 35, 60, 100]
    labels = ['Infant', 'Child', 'Teenager', 'Yound Adult', 'Adult', 'Senior']
    df['age_group'] = pd.cut(df['Age'], bins=bins, labels=labels)
    df = df.drop(columns=['Age'])

    # Integer-encode the categorical columns
    category_column = ['age_group', 'Sex', 'Embarked', 'Title']
    for column in category_column:
        df[column] = LabelEncoder().fit_transform(df[column])
    return df
# Run the preprocessing function and rebind df to the frame it returns
df = transform_features(df)

In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler,OneHotEncoder, LabelEncoder
import re
import warnings
warnings.filterwarnings('ignore')

# df = pd.read_csv('/content/drive/MyDrive/kdt_240424/m5_머신러닝/dataset/train.csv')


# # 전처리 함수
# def transform_features(df):

#     df_numeric = df.select_dtypes(include='number')
#     print(df_numeric.corr()['Survived'])

#     df['Title'] = df['Name'].apply(lambda x: re.findall(r'\b([a-zA-Z]+)\.', x)[0])
#     print(pd.crosstab(df["Title"], df["Sex"]))

#     # 필요없는 열 제거
#     df.drop(columns = ['PassengerId','Name','Ticket','Cabin'], inplace=True)


#     df['Title'] = df['Title'].replace(["Capt", "Col", "Don", "Jonkheer", "Major", "Rev", "Sir"], "MaleRare")
#     df['Title'] = df['Title'].replace(["Countess", "Lady", "Mlle", "Mme", "Ms"], "FemaleRare")
#     df['Title'] = df['Title'].replace(["Dr"], "Rare")

#     # 결측치 제거
#     df['Embarked'].fillna(df['Embarked'].mode()[0], inplace = True)
#     df['Age'].fillna(df.Age.mean(), inplace=True)
#     df['Fare'].fillna(0,inplace=True)
#     print(df.isnull().sum())


    # 파생변수 생성
    # df['family_size'] = df['SibSp']+df['Parch']+1
    # 필요없는 열 제거
    # df.drop(columns=['SibSp','Parch'],inplace=True)


    # # age_group을 6개 범주로 확대
    # bins = [0,5,12,18,35,60,100]
    # labels = ['Infant','Child','Teenager','Yound Adult','Adult','Senior']
    # df['age_group'] = pd.cut(df['Age'], bins=bins, labels = labels)
    # df.drop(columns=['Age'], inplace = True)




    # category_column = ['age_group','Sex','Embarked','Title']
    # for column in category_column:
    #     le = LabelEncoder()
    #     le = le.fit(df[column])
    #     df[column] = le.transform(df[column])
    # return df

# Classification evaluation helper
def evaluate(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and print its test-set metrics.

    Output, in order: the classification report, then accuracy, precision,
    recall, F1 and ROC AUC formatted to four decimals, then an empty line.
    ROC AUC uses column 1 of ``model.predict_proba``. Returns ``None``.
    """
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    y_score = model.predict_proba(X_test)[:, 1]

    report = classification_report(y_test, y_hat)

    # (printed prefix, value) pairs, in print order.
    labelled_scores = [
        ('정확도 : ', accuracy_score(y_test, y_hat)),
        ('정밀도 :  ', precision_score(y_test, y_hat)),
        ('재현율 : ', recall_score(y_test, y_hat)),
        ('f1 스코어 : ', f1_score(y_test, y_hat)),
        ('Roc Auc : ', roc_auc_score(y_test, y_score)),
    ]

    print(f"Classification Report:\n {report}")
    for prefix, score in labelled_scores:
        print(f"{prefix}{score:.4f}")
    print()

# Preprocessing call is disabled here; this cell uses whatever `df` the earlier cells left in the kernel.
# df = transform_features(df)
print("전처리 완료")

print("="*80)

# Features and target
X = df.drop('Survived', axis=1)
y = df['Survived']


# Split into training and test data (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3, random_state = 42)

# Scaling: the scaler is fitted on the training split only and then applied to the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Modelling and evaluation: three classifiers with default hyper-parameters

decision_tree_clf = DecisionTreeClassifier(random_state=10)
print(f"Decision Tree의 평가")
evaluate(decision_tree_clf, X_train, X_test, y_train, y_test)

random_forest_clf = RandomForestClassifier(random_state=10)
print(f"Random Forest의 평가")
evaluate(random_forest_clf, X_train, X_test, y_train, y_test)

logistic_regression_clf = LogisticRegression(max_iter=2000,random_state=10)
print(f"Logistic Regression의 평가")
evaluate(logistic_regression_clf, X_train, X_test, y_train, y_test)


print("평가 완료")
print("="*80)

# Hyper-parameter grids, one per model
param_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth':[2,3,5,10,12],
    'min_samples_split':[2,3,5],
    'min_samples_leaf':[1,5,8,10]

}

param_lr = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10, 50, 100],
    'solver': ['liblinear', 'saga']
}
param_rf = {
    'n_estimators':[10,100,200],
    'max_features' : ['sqrt', 'log2'],
    'max_depth':[2,3,5,10,12],
    'min_samples_split':[2,3,5],
    'min_samples_leaf':[1,5,8,10]
}


# Decision Tree tuning: 5-fold grid search, then evaluate() refits the best estimator and scores it on the test split
grid_search_dt = GridSearchCV(estimator=decision_tree_clf, param_grid=param_dt, cv=5, n_jobs=-1, verbose=0)
grid_search_dt.fit(X_train, y_train)
best_dt = grid_search_dt.best_estimator_
print(grid_search_dt.best_params_)
print(f"최적 파라미터 Decision Tree의 평가")
evaluate(best_dt, X_train, X_test, y_train, y_test)

# Random Forest tuning (same procedure)
grid_search_rf = GridSearchCV(estimator=random_forest_clf, param_grid=param_rf, cv=5, n_jobs=-1, verbose=0)
grid_search_rf.fit(X_train, y_train)
best_rf = grid_search_rf.best_estimator_
print(grid_search_rf.best_params_)
print(f"최적 파라미터 Random Forest의 평가")
evaluate(best_rf, X_train, X_test, y_train, y_test)

# Logistic Regression tuning (same procedure)
grid_search_lr = GridSearchCV(estimator=logistic_regression_clf, param_grid=param_lr, cv=5, n_jobs=-1, verbose=0)
grid_search_lr.fit(X_train, y_train)
best_lr = grid_search_lr.best_estimator_
print(grid_search_lr.best_params_)
print(f"최적 파라미터 Logistic Regression의 평가")
evaluate(best_lr, X_train, X_test, y_train, y_test)

print("최적화 평가 완료")
print("=" * 80)

전처리 완료
Decision Tree의 평가
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.84      0.83       157
           1       0.77      0.74      0.75       111

    accuracy                           0.80       268
   macro avg       0.79      0.79      0.79       268
weighted avg       0.80      0.80      0.80       268

정확도 : 0.7985
정밀도 :  0.7664
재현율 : 0.7387
f1 스코어 : 0.7523
Roc Auc : 0.7798

Random Forest의 평가
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.84      0.82       157
           1       0.76      0.70      0.73       111

    accuracy                           0.78       268
   macro avg       0.78      0.77      0.77       268
weighted avg       0.78      0.78      0.78       268

정확도 : 0.7836
정밀도 :  0.7573
재현율 : 0.7027
f1 스코어 : 0.7290
Roc Auc : 0.8489

Logistic Regression의 평가
Classification Report:
               precision    recall  f1-score   support

    

KeyboardInterrupt: 

In [72]:
from sklearn.preprocessing import LabelEncoder

# Null-handling function
def fillna(df):
    """Fill missing values in ``Age``, ``Cabin``, ``Embarked`` and ``Fare``.

    ``Age`` gets the column mean, ``Cabin`` and ``Embarked`` get the
    placeholder ``'N'``, and ``Fare`` gets ``0``. The columns of ``df`` are
    overwritten and the same frame object is returned.

    Each result is assigned back to its column: calling
    ``fillna(inplace=True)`` on a column selection is not guaranteed to
    write through to the parent frame.
    """
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df

# Remove attributes that the machine-learning algorithms do not need
def drop_features(df):
    """Drop ``PassengerId``, ``Name`` and ``Ticket`` from ``df`` in place.

    Returns the same frame object that was passed in.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

# Perform label encoding.
def format_features(df):
    """Shorten ``Cabin`` to its first character and label-encode three columns.

    ``Cabin``, ``Sex`` and ``Embarked`` are each replaced by the integer
    codes of a freshly fitted ``LabelEncoder``. ``df`` is modified and the
    same object is returned.
    """
    # Keep only the leading character of each cabin string.
    df['Cabin'] = df['Cabin'].str[:1]
    for column in ('Cabin', 'Sex', 'Embarked'):
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])
    return df

# Entry point that chains the preprocessing helpers defined above
def transform_features(df):
    """Run ``fillna``, then ``drop_features``, then ``format_features`` on ``df``.

    Each helper receives the previous helper's return value; the final
    result is returned.
    """
    return format_features(drop_features(fillna(df)))


In [73]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

# User-defined helper
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and print its test-set metrics.

    Prints the confusion matrix, then accuracy, precision, recall, F1 and
    ROC AUC (four decimals each), then an empty line. ROC AUC uses column 1
    of ``model.predict_proba``. Returns ``None``.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    positive_proba = model.predict_proba(X_test)[:, 1]

    # Printed label -> value, kept in print order.
    scores = {
        '정확도': accuracy_score(y_test, predicted),
        '정밀도': precision_score(y_test, predicted),
        '재현율': recall_score(y_test, predicted),
        'F1 스코어': f1_score(y_test, predicted),
        'ROC AUC': roc_auc_score(y_test, positive_proba),
    }
    confusion = confusion_matrix(y_test, predicted)

    print(f'오차 행렬:\n{confusion}')
    for label, value in scores.items():
        print(f'{label}: {value:.4f}')
    print('')

In [74]:
# Reload the original data, then extract the feature set and the label set.
# The reload is needed because earlier cells rebind `df` to a frame whose
# Name/Ticket/Cabin columns are already dropped, and transform_features()
# expects the raw columns.
df = pd.read_csv('/content/drive/MyDrive/kdt_240424/m5_머신러닝/dataset/train.csv')

y_titanic_df = df['Survived']
X_titanic_df= df.drop('Survived',axis=1)

X_titanic_df = transform_features(X_titanic_df)

In [75]:
from sklearn.model_selection import train_test_split
# Unstratified variant, kept for reference:
# X_train, X_test, y_train, y_test=train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=11)
# Hold out 20% for testing with a fixed seed, stratified on the label.
X_train, X_test, y_train, y_test=train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=11, stratify=y_titanic_df)

In [76]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create scikit-learn classifiers for Decision Tree, Random Forest and Logistic Regression
dt_clf = DecisionTreeClassifier(random_state=10)
rf_clf = RandomForestClassifier(random_state=10)
lr_clf = LogisticRegression(max_iter=2000, random_state=10)
# Fit each classifier on the training split and print its test-split metrics.
print('dt_clf 학습')
print('='*12)
train_and_evaluate(dt_clf, X_train, X_test, y_train, y_test)
print('rf_clf 학습')
print('='*12)
train_and_evaluate(rf_clf, X_train, X_test, y_train, y_test)
print('lr_clf 학습')
print('='*12)
train_and_evaluate(lr_clf, X_train, X_test, y_train, y_test)

dt_clf 학습
오차 행렬:
[[89 21]
 [19 50]]
정확도: 0.7765
정밀도: 0.7042
재현율: 0.7246
F1 스코어: 0.7143
ROC AUC: 0.7667

rf_clf 학습
오차 행렬:
[[92 18]
 [14 55]]
정확도: 0.8212
정밀도: 0.7534
재현율: 0.7971
F1 스코어: 0.7746
ROC AUC: 0.8837

lr_clf 학습
오차 행렬:
[[96 14]
 [18 51]]
정확도: 0.8212
정밀도: 0.7846
재현율: 0.7391
F1 스코어: 0.7612
ROC AUC: 0.8862



In [77]:
from sklearn.model_selection import GridSearchCV

# Decision tree grid: 5 * 3 * 4 = 60 parameter combinations.
parameters = {'max_depth':[2,3,5,10,12],
             'min_samples_split':[2,3,5], 'min_samples_leaf':[1,5,8,10]}

# 5-fold grid search over dt_clf, scored by accuracy.
grid_dclf = GridSearchCV(dt_clf , param_grid=parameters , scoring='accuracy' , cv=5)
grid_dclf.fit(X_train , y_train)

print('GridSearchCV 최적 하이퍼 파라미터 :',grid_dclf.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_dclf.best_score_))
best_dclf = grid_dclf.best_estimator_

# train_and_evaluate() fits best_dclf on X_train again before scoring it on the test split.
train_and_evaluate(best_dclf, X_train, X_test, y_train, y_test)


GridSearchCV 최적 하이퍼 파라미터 : {'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 2}
GridSearchCV 최고 정확도: 0.7992
오차 행렬:
[[99 11]
 [12 57]]
정확도: 0.8715
정밀도: 0.8382
재현율: 0.8261
F1 스코어: 0.8321
ROC AUC: 0.9028



In [78]:
from sklearn.model_selection import GridSearchCV

# Random forest grid: 3 * 5 * 3 * 4 = 180 parameter combinations.
parameters = {'n_estimators':[10,100,200], 'max_depth':[2,3,5,10,12],
             'min_samples_split':[2,3,5], 'min_samples_leaf':[1,5,8,10]}

# 5-fold grid search over rf_clf, scored by accuracy.
grid_rfclf = GridSearchCV(rf_clf , param_grid=parameters , scoring='accuracy' , cv=5)
grid_rfclf.fit(X_train , y_train)


print('GridSearchCV 최적 하이퍼 파라미터 :',grid_rfclf.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_rfclf.best_score_))
best_rfclf = grid_rfclf.best_estimator_

# train_and_evaluate() fits best_rfclf on X_train again before scoring it on the test split.
train_and_evaluate(best_rfclf, X_train, X_test, y_train, y_test)

GridSearchCV 최적 하이퍼 파라미터 : {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
GridSearchCV 최고 정확도: 0.8217
오차 행렬:
[[98 12]
 [15 54]]
정확도: 0.8492
정밀도: 0.8182
재현율: 0.7826
F1 스코어: 0.8000
ROC AUC: 0.8942



In [79]:
from sklearn.model_selection import GridSearchCV

# Logistic regression grid: only C is searched.
param_grid = {
    'C': [0.1, 1, 10, 50, 100]
}
# 5-fold grid search over lr_clf. Unlike the two searches above, no `scoring` argument is passed here.
grid_lrclf = GridSearchCV(lr_clf , param_grid=param_grid , cv=5, verbose=0)
grid_lrclf.fit(X_train , y_train)

print('GridSearchCV 최적 하이퍼 파라미터 :',grid_lrclf.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_lrclf.best_score_))
best_lrclf = grid_lrclf.best_estimator_

# train_and_evaluate() fits best_lrclf on X_train again before scoring it on the test split.
train_and_evaluate(best_lrclf, X_train, X_test, y_train, y_test)

GridSearchCV 최적 하이퍼 파라미터 : {'C': 10}
GridSearchCV 최고 정확도: 0.7950
오차 행렬:
[[96 14]
 [18 51]]
정확도: 0.8212
정밀도: 0.7846
재현율: 0.7391
F1 스코어: 0.7612
ROC AUC: 0.8870

