In [3]:
import pandas as pd
import numpy as np
import warnings
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

warnings.filterwarnings("ignore") # 오류 문구 무시

In [4]:
wine = load_wine()

In [8]:
wine.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

In [13]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [14]:
X, y = wine["data"], wine["target"]
display(X.shape)
display(y.shape)

(178, 13)

(178,)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

## Best Model

In [16]:
def best_model(dt=None, rf=None, svm=None, sgd=None, lr=None):
    """_summary_
    
    Args:
        dt (_type_, optional): _description_. Defaults to None.
        rf (_type_, optional): _description_. Defaults to None.
        svm (_type_, optional): _description_. Defaults to None.
        sgd (_type_, optional): _description_. Defaults to None.
        lr (_type_, optional): _description_. Defaults to None.
        
    Output:
        best parameters
        best precision score
        best recall score
    """
    ###################
    ## Libary Import
    ###################
    from sklearn.datasets import load_digits
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn import metrics

    ###################
    ## Decision Trees
    ###################
    if dt:
        from sklearn.tree import DecisionTreeClassifier
        
        clf = DecisionTreeClassifier()
        params = {
            'max_depth':[1, 2, 3], 
            'min_samples_split':[2, 3]
        }
        grid_search = GridSearchCV(clf, param_grid=params, verbose=1, cv=3, refit=True, n_jobs=1)
        grid_search.fit(X_train, y_train)
        
    ###################
    ## Random Forest
    ###################
    elif rf:
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier()
        params = {
            'bootstrap': [True],
            'max_depth': [80, 90, 100, 110],
            'max_features': [2, 3],
            'min_samples_leaf': [3, 4, 5],
            'min_samples_split': [8, 10, 12],
            'n_estimators': [100, 200, 300, 1000]
        }
        grid_search = GridSearchCV(estimator=rfc, param_grid=params, cv=3, n_jobs=1, verbose=1, refit=True)
        grid_search.fit(X_train, y_train)
    
    ###################
    ## SVM
    ###################
    elif svm:
        from sklearn import svm
        
        clf = svm.SVC()
        params = {
            'C':[0.001,0.01,0.1,1,10,100],
            'gamma':[0.001,0.01,0.1,1,10,100] 
        }

        grid_search = GridSearchCV(clf, param_grid=params, cv=3, refit=True, n_jobs=1)
        grid_search.fit(X_train, y_train)
        
    ###################
    ## SGD
    ###################
    elif sgd:
        from sklearn.linear_model import SGDClassifier
        sgdc = SGDClassifier()
        params = {
            "loss" : ["hinge", "log", "squared_hinge", "modified_huber"],
            "alpha" : [0.0001, 0.001, 0.01, 0.1],
            "penalty" : ["l2", "l1", "none"]
            }
        
        grid_search = GridSearchCV(sgdc, param_grid=params, cv=3, verbose=1, n_jobs=1, refit=True)
        grid_search.fit(X_train, y_train)
        
    #######################
    ## Logistic Regression
    #######################
    elif lr:
        from sklearn.linear_model import LogisticRegression
        
        clf = LogisticRegression()
        params={
            "C":np.logspace(-3,3,7), 
            "penalty":["l1","l2"]
        }
        grid_search = GridSearchCV(clf, param_grid=params, cv=3, verbose=1, n_jobs=1, refit=True)
        grid_search.fit(X_train, y_train)

    ####################
    ## Best Parameters
    ####################
    estimator = grid_search.best_estimator_
    pred = estimator.predict(X_test)
    
    
    precision_score = metrics.precision_score(y_test, pred, average='macro')
    recall_score = metrics.recall_score(y_test, pred, average='macro')
    f1_score = metrics.f1_score(y_test, pred, average='macro')
    
    print('GridSearchCV Best Parameters: {}'.format(grid_search.best_params_))
    print('GridSearchCV Best F1-Score: {0:0.4f}'.format(f1_score))
    
    print('GridSearchCV Best Precision Score: {0:0.4f}'.format(precision_score))
    print('GridSearchCV Best Recall Score: {0:0.4f}'.format(recall_score)) 

In [19]:
# Desicion Tree
best_model(dt=True)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
GridSearchCV Best Parameters: {'max_depth': 3, 'min_samples_split': 3}
GridSearchCV Best F1-Score: 0.9469
GridSearchCV Best Precision Score: 0.9583
GridSearchCV Best Recall Score: 0.9411


In [20]:
# Random Forest
best_model(rf=True)

Fitting 3 folds for each of 288 candidates, totalling 864 fits
GridSearchCV Best Parameters: {'bootstrap': True, 'max_depth': 80, 'max_features': 2, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 100}
GridSearchCV Best F1-Score: 0.9804
GridSearchCV Best Precision Score: 0.9778
GridSearchCV Best Recall Score: 0.9841


In [21]:
# SVM
best_model(svm=True)

GridSearchCV Best Parameters: {'C': 1, 'gamma': 0.001}
GridSearchCV Best F1-Score: 0.7261
GridSearchCV Best Precision Score: 0.7260
GridSearchCV Best Recall Score: 0.7268


In [22]:
# SGD
best_model(sgd=True)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
GridSearchCV Best Parameters: {'alpha': 0.01, 'loss': 'modified_huber', 'penalty': 'l1'}
GridSearchCV Best F1-Score: 0.6550
GridSearchCV Best Precision Score: 0.6714
GridSearchCV Best Recall Score: 0.6650


In [23]:
# Logistic Regression
best_model(lr=True)

Fitting 3 folds for each of 14 candidates, totalling 42 fits
GridSearchCV Best Parameters: {'C': 100.0, 'penalty': 'l2'}
GridSearchCV Best F1-Score: 0.9500
GridSearchCV Best Precision Score: 0.9500
GridSearchCV Best Recall Score: 0.9507


## Conclusion

### Best Model is Desicion Tree!