# [Choose the best algorithms and parameters.]

| | 観点 | 考慮点 | 解決策 |
|---- | :---- | :---- | :---- |
| 1 | アルゴリズムの選定 | 他にもっと高い正解率を出せるアルゴリズムがあるのではないか | 各アルゴリズムの正解率を比較する。|
| 2 | アルゴリズムの評価 | データに関して、さまざまなパターンで行っても安定して良い結果をえられるか | クロスバリデーション。|

###  1. Compare the accuracy rate of each algorithm.

In [5]:
#! From 03. Ayame code.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
from sklearn.utils.testing import all_estimators # *UPDATE* #

#!
#! Read iris data.
#!
iris_data = pd.read_csv("iris.csv", encoding="utf-8")

#!
#! Separate iris data into labels and input data.
#!
# Separate by CSV header name
y = iris_data.loc[:, "Name"]
x = iris_data.loc[:, ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]]

#!
#! Divide into learning and testing.
#!
# Separate 80% for learning and 20% for testing.(Shuffle=Sort the original data randomly.)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, train_size = 0.8, shuffle = True)

#!
#! Learn.
#!
# *UPDATE* #
# *** Get all Classifier algorithms. *** #
warnings.filterwarnings('ignore')
allAlgorithms = all_estimators(type_filter = 'classifier')

for(name, algorithm) in allAlgorithms:
    #print(name)
    
    if name in {"CheckingClassifier", "ClassifierChain", "MultiOutputClassifier", "OneVsOneClassifier", "OneVsOneClassifier", \
               "OneVsRestClassifier", "OutputCodeClassifier"}:
        continue
    
    #! Create an object for each algorithm.
    clf = algorithm()
    
    #! fit= Machine learning.
    #print(x_train)
    #print(y_train)
    clf.fit(x_train, y_train)
    #! Evaluation.(accuracy_score()=Calculation of accuracy rate.)
    y_predict = clf.predict(x_test)
    #print(y_predict)
    print(name, "の正解率 : ", accuracy_score(y_test, y_predict))    


AdaBoostClassifier の正解率 :  0.9
BaggingClassifier の正解率 :  0.9333333333333333
BernoulliNB の正解率 :  0.26666666666666666
CalibratedClassifierCV の正解率 :  0.8
ComplementNB の正解率 :  0.6666666666666666
DecisionTreeClassifier の正解率 :  0.8333333333333334
DummyClassifier の正解率 :  0.26666666666666666
ExtraTreeClassifier の正解率 :  0.8333333333333334
ExtraTreesClassifier の正解率 :  0.9
GaussianNB の正解率 :  0.8666666666666667
GaussianProcessClassifier の正解率 :  0.9333333333333333
GradientBoostingClassifier の正解率 :  0.9
HistGradientBoostingClassifier の正解率 :  0.9
KNeighborsClassifier の正解率 :  0.9
LabelPropagation の正解率 :  0.8666666666666667
LabelSpreading の正解率 :  0.8666666666666667
LinearDiscriminantAnalysis の正解率 :  0.9333333333333333
LinearSVC の正解率 :  0.9
LogisticRegression の正解率 :  0.8666666666666667
LogisticRegressionCV の正解率 :  0.8333333333333334
MLPClassifier の正解率 :  0.9
MultinomialNB の正解率 :  0.8666666666666667
NearestCentroid の正解率 :  0.8666666666666667
NuSVC の正解率 :  0.9
PassiveAggressiveClassifier の正解率 :  0.9
Perce

TypeError: __init__() missing 1 required positional argument: 'estimators'

### ・Cross validation
   - Divide the data into three groups, A, B and C.
   - A and B are learning data, C is evaluation data, and the accuracy rate is shown.
   - B and C are learning data, A is evaluation data, and the accuracy rate is shown.
   - C and A are learning data, B is evaluation data, and the correct answer rate is shown.

In [6]:
import pandas as pd
from sklearn.utils.testing import all_estimators # *UPDATE* #
from sklearn.model_selection import KFold
import warnings
from sklearn.model_selection import cross_val_score

#!
#! Read iris data.
#!
iris_data = pd.read_csv("iris.csv", encoding="utf-8")

#!
#! Separate iris data into labels and input data.
#!
# Separate by CSV header name
y = iris_data.loc[:, "Name"]
x = iris_data.loc[:, ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]]

#!
#! Learn.
#!
# *UPDATE* #
# *** Get all Classifier algorithms. *** #
warnings.filterwarnings('ignore')
allAlgorithms = all_estimators(type_filter = 'classifier')

#! Object for K-split cross validation.
kfold_cv = KFold(n_splits = 5, shuffle=True)

for(name, algorithm) in allAlgorithms:
    #print(name)
    
    # Insert TRY because some errors occur.
    try:
        #! Create an object for each algorithm.
        clf = algorithm()
        
        #! Target class with score method.
        if hasattr(clf, "score"):
            #!
            #! Cross validation.
            #!
            # clf: Classifier, x: Input data, y: Label, cv: Cross validation object.
            score = cross_val_score(clf, x, y, cv = kfold_cv)
            #print(y_predict)
            print(name, "の正解率 : ")    
            print(score)    
    except Exception as error:
        print(str(error))
    

AdaBoostClassifier の正解率 : 
[0.93333333 0.93333333 0.86666667 0.86666667 1.        ]
BaggingClassifier の正解率 : 
[1.         1.         0.83333333 0.96666667 0.9       ]
BernoulliNB の正解率 : 
[0.23333333 0.3        0.23333333 0.3        0.26666667]
CalibratedClassifierCV の正解率 : 
[0.93333333 0.86666667 0.9        0.96666667 0.96666667]
could not convert string to float: 'Iris-setosa'
__init__() missing 1 required positional argument: 'base_estimator'
ComplementNB の正解率 : 
[0.66666667 0.5        0.76666667 0.73333333 0.66666667]
DecisionTreeClassifier の正解率 : 
[0.9        0.93333333 0.93333333 1.         0.9       ]
DummyClassifier の正解率 : 
[0.33333333 0.3        0.23333333 0.3        0.43333333]
ExtraTreeClassifier の正解率 : 
[0.96666667 0.9        0.96666667 0.86666667 0.93333333]
ExtraTreesClassifier の正解率 : 
[0.96666667 0.93333333 0.9        0.96666667 0.96666667]
GaussianNB の正解率 : 
[1.         0.96666667 0.96666667 0.93333333 0.9       ]
GaussianProcessClassifier の正解率 : 
[0.93333333 0.93333333 

### ・Find the best parameters.
   - Grid search: 
      - High parameter tuning method.
      - A method in which the correct answer rates are compared for all patterns of specified parameters,<br>
        and a combination of parameters having the highest correct answer rate is selected.

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#! Read iris data.
iris_data = pd.read_csv("iris.csv", encoding="utf-8")

#! Separate iris data into labels and input data.
# Separate by CSV header name
y = iris_data.loc[:, "Name"]
x = iris_data.loc[:, ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]]

#! Divide into learning and testing.
# Separate 80% for learning and 20% for testing.(Shuffle=Sort the original data randomly.)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, train_size = 0.8, shuffle = True)

#!
#! Specify the parameters used in grid search.
#!
parameters = [
    {"C": [1, 10, 100, 1000], "kernel":["linear"]},
    {"C": [1, 10, 100, 1000], "kernel":["rbf"], "gamma": [0.001, 0.0001]},
    {"C": [1, 10, 100, 1000], "kernel":["sigmoid"], "gamma": [0.001, 0.0001]},
             ]
#!
#! Perform a grid search.
#!
# Object for K-split cross validation.
kfold_cv = KFold(n_splits = 5, shuffle=True)
# Object for grid search.
clf = GridSearchCV(SVC(), parameters, cv = kfold_cv)

# Perform grid search by fit().
clf.fit(x_train, y_train)
print("<<Optimal parameters>>", clf.best_estimator_)
print("--------------------------------------------")
print("<<grid_scores>>\n", clf.cv_results_.keys())

#!
#! Evaluate with optimal parameters.
#!
y_predict = clf.predict(x_test)
print("<<Correct answer rate at evaluation>>", accuracy_score(y_test, y_predict))

<<Optimal parameters>> SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)
--------------------------------------------
<<grid_scores>>
 dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_C', 'param_kernel', 'param_gamma', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])
<<Correct answer rate at evaluation>> 0.9666666666666667
