In [1]:
import pandas as pd

df = pd.read_csv('sonar.all-data')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 61 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0.0200  207 non-null    float64
 1   0.0371  207 non-null    float64
 2   0.0428  207 non-null    float64
 3   0.0207  207 non-null    float64
 4   0.0954  207 non-null    float64
 5   0.0986  207 non-null    float64
 6   0.1539  207 non-null    float64
 7   0.1601  207 non-null    float64
 8   0.3109  207 non-null    float64
 9   0.2111  207 non-null    float64
 10  0.1609  207 non-null    float64
 11  0.1582  207 non-null    float64
 12  0.2238  207 non-null    float64
 13  0.0645  207 non-null    float64
 14  0.0660  207 non-null    float64
 15  0.2273  207 non-null    float64
 16  0.3100  207 non-null    float64
 17  0.2999  207 non-null    float64
 18  0.5078  207 non-null    float64
 19  0.4797  207 non-null    float64
 20  0.5783  207 non-null    float64
 21  0.5071  207 non-null    float64
 22  0.

In [2]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(df.iloc[:, -1])
y = le.transform(df.iloc[:, -1])
X = df.iloc[:, :-1]

print(X.shape)
print(y.sum())

(207, 60)
96


In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, 
                                                    stratify=y,
                                                    random_state=0)
X_train.shape

(165, 60)

In [4]:
# 挑選 K-NN 超參數
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

for n in range(1, 6):
    knn = KNeighborsClassifier(n_neighbors=n)
    score_lst = cross_val_score(estimator=knn, 
                                X=X_train, y=y_train, 
                                cv=5, n_jobs=-1)
    print('5-fold CV accuracy (n_neighbors=%d) = %.3f, std = %.3f' % 
      (n, np.mean(score_lst), np.std(score_lst)))

clf1 = KNeighborsClassifier(n_neighbors=1)

5-fold CV accuracy (n_neighbors=1) = 0.764, std = 0.023
5-fold CV accuracy (n_neighbors=2) = 0.758, std = 0.027
5-fold CV accuracy (n_neighbors=3) = 0.794, std = 0.052
5-fold CV accuracy (n_neighbors=4) = 0.739, std = 0.085
5-fold CV accuracy (n_neighbors=5) = 0.715, std = 0.078


In [5]:
# 挑選隨機森林超參數
from sklearn.ensemble import RandomForestClassifier

for n in range(20, 201, 20):
    rf = RandomForestClassifier(n_estimators=n)
    score_lst = cross_val_score(estimator=rf, 
                                X=X_train, y=y_train, 
                                cv=5, n_jobs=-1)
    print('5-fold CV accuracy (n_neighbors=%d) = %.3f, std = %.3f' % 
      (n, np.mean(score_lst), np.std(score_lst)))

clf2 = RandomForestClassifier(n_estimators=120)

5-fold CV accuracy (n_neighbors=20) = 0.770, std = 0.049
5-fold CV accuracy (n_neighbors=40) = 0.806, std = 0.087
5-fold CV accuracy (n_neighbors=60) = 0.800, std = 0.068
5-fold CV accuracy (n_neighbors=80) = 0.830, std = 0.056
5-fold CV accuracy (n_neighbors=100) = 0.812, std = 0.045
5-fold CV accuracy (n_neighbors=120) = 0.800, std = 0.041
5-fold CV accuracy (n_neighbors=140) = 0.788, std = 0.086
5-fold CV accuracy (n_neighbors=160) = 0.830, std = 0.062
5-fold CV accuracy (n_neighbors=180) = 0.806, std = 0.083
5-fold CV accuracy (n_neighbors=200) = 0.836, std = 0.071


In [6]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

clf3 = GaussianNB()
en_clf = VotingClassifier(estimators=[
    ('k-NN', clf1), ('rf', clf2), ('gnb', clf3)], 
                           voting='soft')
models = (clf1, clf2, clf3, en_clf)
names = {'K-NN', 'Random Forest', 'Naive Bayes', 'Voting'}

In [7]:
scores = {}
for name, model in zip(names, models):
    model.fit(X_train, y_train)
    scores[(name, 'Train score')] = model.score(X_train, y_train)
    scores[(name, 'Test score')] = model.score(X_test, y_test)
    
pd.Series(scores).unstack()

Unnamed: 0,Test score,Train score
K-NN,0.619048,0.727273
Naive Bayes,0.738095,1.0
Random Forest,0.761905,1.0
Voting,0.880952,1.0


In [8]:
en_clf = VotingClassifier(estimators=[
    ('k-NN', clf1), ('rf', clf2), ('gnb', clf3)], 
                           voting='soft', weights=[2,3,1])
models = (clf1, clf2, clf3, en_clf)
names = {'K-NN', 'Random Forest', 'Naive Bayes', 'Voting'}

In [9]:
scores = {}
for name, model in zip(names, models):
    model.fit(X_train, y_train)
    scores[(name, 'Train score')] = model.score(X_train, y_train)
    scores[(name, 'Test score')] = model.score(X_test, y_test)
    
pd.Series(scores).unstack()

Unnamed: 0,Test score,Train score
K-NN,0.619048,0.727273
Naive Bayes,0.880952,1.0
Random Forest,0.761905,1.0
Voting,0.880952,1.0
