## [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [1]:
import numpy as np 
import pandas as pd
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import cross_val_score
# Folder that holds the competition CSV files.
data_path = 'data/'

# The three files carry no header row, so pandas must not consume the first
# line as column names; columns end up labelled 0..n-1.
train, train_labels, test = (
    pd.read_csv(data_path + csv_name, header=None)
    for csv_name in ('train_1.csv', 'trainLabels_1.csv', 'test_1.csv')
)

In [2]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the labelled rows for a quick accuracy check; the fixed
# seed keeps the split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    train_labels,
    test_size=0.30,
    random_state=101,
)
# Show the shapes of the four pieces as the cell output.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((700, 40), (300, 40), (700, 1), (300, 1))

In [3]:
# Baseline comparison: fit each candidate classifier on the raw (unscaled)
# training split and print its accuracy on the hold-out split.
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Printed label -> unfitted estimator. Insertion order is the print order.
# Estimators that draw random numbers are seeded so the reported scores can
# be reproduced after a kernel restart.
baseline_models = {
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=99),
    'Logistic Regression': LogisticRegression(solver='saga', random_state=99),
    'SVM': SVC(gamma='auto'),
    'Decision Tree': DecisionTreeClassifier(random_state=99),
    'XGBoost': XGBClassifier(),
}

# y_train is a one-column DataFrame; the estimators expect a flat 1-D array.
y_train_flat = y_train.values.ravel()

for label, estimator in baseline_models.items():
    estimator.fit(x_train, y_train_flat)
    print(label, accuracy_score(y_test, estimator.predict(x_test)))

Naive Bayes 0.8066666666666666
KNN 0.9166666666666666
Random Forest 0.86
Logistic Regression 0.82
SVM 0.9033333333333333
Decision Tree 0.7233333333333334
XGBoost 0.87


* Feature Scaling (per-sample normalization with `Normalizer`)

In [4]:
from sklearn.preprocessing import StandardScaler, Normalizer

# NOTE(review): Normalizer rescales each sample (row) on its own rather than
# each feature column -- confirm this is the intended kind of "scaling".
norm = Normalizer()

# Normalized versions of the hold-out split.
x_norm_train, x_norm_test = norm.fit_transform(x_train), norm.transform(x_test)

# Normalized copy of the complete labelled set, used for cross-validation.
norm_train_data = norm.fit_transform(train)

In [5]:
# Same model line-up as the baseline, this time on the normalized features.
# For every model two numbers are printed under the same label:
#   1. accuracy on the normalized hold-out split,
#   2. mean 10-fold cross-validated accuracy on the full normalized train set.
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Printed label -> unfitted estimator. Insertion order is the print order.
# Estimators that draw random numbers are seeded so the reported scores can
# be reproduced after a kernel restart.
normalized_models = {
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=99),
    'Logistic Regression': LogisticRegression(solver='saga', random_state=99),
    'SVM': SVC(gamma='auto'),
    'Decision Tree': DecisionTreeClassifier(random_state=99),
    'XGBoost': XGBClassifier(),
}

# Labels arrive as one-column DataFrames; flatten them once for reuse.
y_train_flat = y_train.values.ravel()
all_labels_flat = train_labels.values.ravel()

for label, estimator in normalized_models.items():
    estimator.fit(x_norm_train, y_train_flat)
    print(label, accuracy_score(y_test, estimator.predict(x_norm_test)))
    print(label, cross_val_score(estimator, norm_train_data, all_labels_flat, cv=10).mean())

Naive Bayes 0.8
Naive Bayes 0.808
KNN 0.8633333333333333
KNN 0.9019999999999999
Random Forest 0.8633333333333333
Random Forest 0.8699999999999999
Logistic Regression 0.8066666666666666
Logistic Regression 0.8220000000000001
SVM 0.7866666666666666
SVM 0.808
Decision Tree 0.79
Decision Tree 0.7889999999999999
XGBoost 0.86
XGBoost 0.8710000000000001


* PCA (keep the first 12 principal components)

In [7]:
from sklearn.decomposition import PCA

# Project the hold-out split with its own PCA and store the result under new
# names. Overwriting x_train / x_test in place would make this cell
# non-idempotent: a second run would apply PCA to already-projected data.
pca_split = PCA(n_components=12)
x_pca_train = pca_split.fit_transform(x_train)
x_pca_test = pca_split.transform(x_test)

# Separate PCA fitted on the complete labelled set; its projection feeds the
# cross-validation cell and its variance ratios are the ones reported below.
pca = PCA(n_components=12)
pca_train_data = pca.fit_transform(train)
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

[0.25054403 0.2055048  0.08026473 0.05033658 0.04895951 0.04489903
 0.0417078  0.03127932 0.0230978  0.02100061 0.01619478 0.01264102]


In [9]:
# Mean 10-fold cross-validated accuracy of every candidate classifier on the
# PCA-projected training data (pca_train_data).
# NOTE(review): pca_train_data comes from a PCA fitted on all of `train`, so
# each fold's held-out rows already influenced the projection. Wrap PCA and
# the model in a Pipeline if a leakage-free estimate is required.
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Printed label -> unfitted estimator. Insertion order is the print order.
# Estimators that draw random numbers are seeded so the reported scores can
# be reproduced after a kernel restart.
pca_models = {
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=99),
    'Logistic Regression': LogisticRegression(solver='saga', random_state=99),
    'SVM': SVC(gamma='auto'),
    'Decision Tree': DecisionTreeClassifier(random_state=99),
    'XGBoost': XGBClassifier(),
}

# Labels arrive as a one-column DataFrame; flatten them once for reuse.
all_labels_flat = train_labels.values.ravel()

for label, estimator in pca_models.items():
    print(label, cross_val_score(estimator, pca_train_data, all_labels_flat, cv=10).mean())

Naive Bayes 0.8400000000000001
KNN 0.907
Random Forest 0.9029999999999999
Logistic Regression 0.8240000000000001
SVM 0.905
Decision Tree 0.8029999999999999
XGBoost 0.866


In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.mixture import GaussianMixture
from sklearn.svm import SVC

# Stack the labelled and unlabelled rows. The mixture model below is fitted
# without labels, so it can use every available sample.
x_all = np.r_[train, test]
print('x_all shape :', x_all.shape)

# USING THE GAUSSIAN MIXTURE MODEL
# Try every covariance structure with 1..6 components and keep the fitted
# model with the lowest AIC. (These variables used to be called bic /
# lowest_bic although the value computed has always been gmm.aic.)
lowest_aic = np.inf  # np.infty was removed in NumPy 2.0
aic_scores = []
best_gmm = None
n_components_range = range(1, 7)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Seeded so that the selected model is reproducible between runs.
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cv_type,
                              random_state=99)
        gmm.fit(x_all)
        aic_scores.append(gmm.aic(x_all))
        if aic_scores[-1] < lowest_aic:
            lowest_aic = aic_scores[-1]
            best_gmm = gmm

# best_gmm is already fitted on x_all. Refitting it here would be redundant
# and, with an unseeded model, could yield a different mixture from the one
# that won the selection above.
# Per-component membership probabilities become the new feature matrices.
gmm_train = best_gmm.predict_proba(train)
gmm_test = best_gmm.predict_proba(test)


# Random forest tuned on the GMM membership probabilities (gmm_train)
rfc = RandomForestClassifier(random_state=99)

# Candidate values explored exhaustively by the grid search
n_estimators = [10, 50, 100, 200, 400]
max_depth = [3, 10, 20, 40]
param_grid = {'n_estimators': n_estimators, 'max_depth': max_depth}

grid_search_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search_rfc.fit(gmm_train, train_labels.values.ravel())
rfc_best = grid_search_rfc.best_estimator_

print('Random Forest Best Score', grid_search_rfc.best_score_)
print('Random Forest Best Parmas', grid_search_rfc.best_params_)

# Score the winning configuration once more with a fresh 10-fold CV.
rfc_cv_scores = cross_val_score(rfc_best, gmm_train, train_labels.values.ravel(), cv=10)
print('Random Forest Accuracy', rfc_cv_scores.mean())

# K-nearest neighbours tuned on the GMM membership probabilities (gmm_train)
knn = KNeighborsClassifier()

# Neighbour counts explored by the grid search
n_neighbors = [3, 5, 6, 7, 8, 9, 10]
param_grid = {'n_neighbors': n_neighbors}

grid_search_knn = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search_knn.fit(gmm_train, train_labels.values.ravel())
knn_best = grid_search_knn.best_estimator_

print('KNN Best Score', grid_search_knn.best_score_)
print('KNN Best Params', grid_search_knn.best_params_)

# Score the winning configuration once more with a fresh 10-fold CV.
knn_cv_scores = cross_val_score(knn_best, gmm_train, train_labels.values.ravel(), cv=10)
print('KNN Accuracy', knn_cv_scores.mean())

# Support vector classifier tuned on the GMM membership probabilities (gmm_train)
svc = SVC()

# Two sub-grids: gamma is only searched together with the RBF kernel.
linear_grid = {'kernel': ['linear'], 'C': [1, 10, 100]}
rbf_grid = {'kernel': ['rbf'], 'C': [1, 10, 100], 'gamma': [0.05, 0.0001, 0.01, 0.001]}
parameters = [linear_grid, rbf_grid]

grid_search_svm = GridSearchCV(
    estimator=svc,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search_svm.fit(gmm_train, train_labels.values.ravel())
svm_best = grid_search_svm.best_estimator_

print('SVM Best Score', grid_search_svm.best_score_)
print('SVM Best Params', grid_search_svm.best_params_)

# Score the winning configuration once more with a fresh 10-fold CV.
svm_cv_scores = cross_val_score(svm_best, gmm_train, train_labels.values.ravel(), cv=10)
print('SVM Accuracy', svm_cv_scores.mean())

x_all shape : (10000, 40)
Random Forest Best Score 0.996
Random Forest Best Parmas {'max_depth': 3, 'n_estimators': 10}
Random Forest Accuracy 0.9960000000000001
KNN Best Score 0.996
KNN Best Params {'n_neighbors': 3}
KNN Accuracy 0.9960000000000001
SVM Best Score 0.996
SVM Best Params {'C': 1, 'kernel': 'linear'}
SVM Accuracy 0.9960000000000001


In [11]:
# Fit the tuned random forest on the GMM features of the whole labelled set
# and predict a label for every test row.
rfc_best.fit(gmm_train, train_labels.values.ravel())
pred = rfc_best.predict(gmm_test)

# Submission table: 1-based row ids in 'Id', predicted label in 'Solution'.
# The frame index also starts at 1, but it is not written to the file.
n_rows = len(pred)
rfc_best_pred = pd.DataFrame(
    {'Id': np.arange(1, n_rows + 1), 'Solution': pred},
    index=pd.RangeIndex(1, n_rows + 1),
)

rfc_best_pred.to_csv('Submission_GMM_RFC.csv', index=False)