In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from MyProcess import *
import copy

In [5]:
df_train_raw = pd.read_csv('train.csv')
df_test_raw = pd.read_csv('test.csv')
df_all_raw = pd.concat([df_train_raw, df_test_raw], axis=0)

df_train = copy.copy(df_train_raw)
df_test = copy.copy(df_test_raw)
df_all = copy.copy(df_all_raw)

(418, 11)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
process = myProcessor(df_train)
df = process.preprocess_df()
X_train, X_test, y_train, y_test = process.get_splited_dataset(_test_size=0.9)
n_features = X_train.shape[1]
X_train

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,C,Q,S,1,2,3,Master,Miss,Mr,Mrs,else
107,3,1,0,0,-0.489394,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
306,1,0,0,0,1.585323,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
214,3,1,1,0,-0.489897,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
691,3,0,0,1,-0.375874,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
296,3,1,0,0,-0.500377,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
852,3,0,1,1,-0.339069,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
134,2,1,0,0,-0.384258,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
726,2,0,3,0,-0.223284,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
517,3,1,0,0,-0.159901,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## 알고리즘 별 작업
1. 그리드 서치를 SBS 또는 PCA를 이용해서 차원 축소 및 선택
2. 그리드 서치, k-중첩 교차검증을 통해서 최적의 하이퍼 파라미터 선택

알고리즘 리스트
### 모델 생성 o (모수방법)
- 퍼셉트론
- 아달린
- 로지스틱회귀
- svm
- 커널 svm (svc)
### 모델 생성 x (비모수방법)
- 결정트리
- 랜덤포레스트
- knn

각각의 작업을 마치고나서 최종적으로 위 코드를 기반으로 module을 만들어서 전처리된 데이터 프레임에 대해서 여러 알고리즘 학습작업 반복작업을 아주 간소화 할것이다.

### 기대하는 작업 ###
df의 전처리 과정은 모든 데이터마다 다르고, 또 같은 데이터에서도 해석에 따라 다르겠지만, 전처리가 완료된 데이터 프레임에 대해서는 이 모듈을 다른 데이터셋에도 재활용할수있게 하고싶다.


# 퍼셉트론 PCA

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# 파이프라인 구성
pipe_pca = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', Perceptron())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_pca = {
    'pca__n_components': [x for x in range(1, n_features)],
    'classifier__penalty': [None, 'l2', 'l1', 'elasticnet'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__max_iter': [1000],
    'classifier__tol': [1e-3],
}

# 그리드 서치 객체 생성
grid_pca = GridSearchCV(pipe_pca, param_grid_pca, cv=5, scoring='accuracy')

# 모델 학습
grid_pca.fit(X_train.values, y_train.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with PCA:")
print(grid_pca.best_params_)
print("Best cross-validation accuracy with PCA:")
print(grid_pca.best_score_)


Best parameters with PCA:
{'classifier__alpha': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'classifier__tol': 0.001, 'pca__n_components': 5}
Best cross-validation accuracy with PCA:
0.8038095238095238


# 퍼셉트론 SBS

In [7]:
from sklearn.feature_selection import SequentialFeatureSelector

# 퍼셉트론 모델 정의
perceptron = Perceptron(max_iter=1000, tol=1e-3)

# SBS 객체 생성
sbs = SequentialFeatureSelector(perceptron, n_features_to_select=8, direction='backward', cv=5)

# 파이프라인 구성
pipe_sbs = Pipeline([
    ('feature_selection', sbs),
    ('classifier', Perceptron())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_sbs = {
    'classifier__penalty': [None, 'l2', 'l1', 'elasticnet'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__max_iter': [1000],
    'classifier__tol': [1e-3],
}

# 그리드 서치 객체 생성
grid_sbs = GridSearchCV(pipe_sbs, param_grid_sbs, cv=5, scoring='accuracy')

# 모델 학습
grid_sbs.fit(X_train.values, y_train.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with SBS:")
print(grid_sbs.best_params_)
print("Best cross-validation accuracy with SBS:")
print(grid_sbs.best_score_)


Best parameters with SBS:
{'classifier__alpha': 0.01, 'classifier__max_iter': 1000, 'classifier__penalty': 'elasticnet', 'classifier__tol': 0.001}
Best cross-validation accuracy with SBS:
0.7895238095238095


# 아달린 PCA

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# 파이프라인 구성
pipe_pca = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('regressor', SGDRegressor())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_pca = {
    'pca__n_components': [x for x in range(1, n_features)],
    'regressor__loss': ['squared_error'],
    'regressor__penalty': ['l2', 'l1', 'elasticnet'],
    'regressor__alpha': [0.0001, 0.001, 0.01],
    'regressor__max_iter': [1000],
    'regressor__tol': [1e-3],
}

# 그리드 서치 객체 생성
grid_pca = GridSearchCV(pipe_pca, param_grid_pca, cv=5, scoring='neg_mean_squared_error')

# 모델 학습
grid_pca.fit(X_train.values, y_train.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with PCA:")
print(grid_pca.best_params_)
print("Best cross-validation MSE with PCA:")
print(-grid_pca.best_score_)


Best parameters with PCA:
{'pca__n_components': 14, 'regressor__alpha': 0.01, 'regressor__loss': 'squared_error', 'regressor__max_iter': 1000, 'regressor__penalty': 'l1', 'regressor__tol': 0.001}
Best cross-validation MSE with PCA:
0.14923212540326036


  _data = np.array(data, dtype=dtype, copy=copy,


# 아달린 SBS

In [9]:
from sklearn.feature_selection import SequentialFeatureSelector

# 아달린 모델 정의 (SGDRegressor 사용)
adaline = SGDRegressor(loss='squared_error', max_iter=1000, tol=1e-3)

# SBS 객체 생성
sbs = SequentialFeatureSelector(adaline, n_features_to_select=8, direction='backward', cv=5)

# 파이프라인 구성
pipe_sbs = Pipeline([
    ('feature_selection', sbs),
    ('regressor', SGDRegressor())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_sbs = {
    'regressor__penalty': ['l2', 'l1', 'elasticnet'],
    'regressor__alpha': [0.0001, 0.001, 0.01],
    'regressor__max_iter': [1000],
    'regressor__tol': [1e-3],
}

# 그리드 서치 객체 생성
grid_sbs = GridSearchCV(pipe_sbs, param_grid_sbs, cv=5, scoring='neg_mean_squared_error')

# 모델 학습
grid_sbs.fit(X_train.values, y_train.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with SBS:")
print(grid_sbs.best_params_)
print("Best cross-validation MSE with SBS:")
print(-grid_sbs.best_score_)


Best parameters with SBS:
{'regressor__alpha': 0.01, 'regressor__max_iter': 1000, 'regressor__penalty': 'l2', 'regressor__tol': 0.001}
Best cross-validation MSE with SBS:
0.17408406674676244


# 로지스틱 PCA

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# 파이프라인 구성
pipe_pca = Pipeline([
    ('pca', PCA()),
    ('classifier', LogisticRegression())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_pca = {
    'pca__n_components': [x for x in range(1, n_features)],
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2', 'elasticnet'],
    'classifier__solver': ['saga', 'liblinear'],
    'classifier__max_iter': [500, 1000, 2000],
}

# 그리드 서치 객체 생성
grid_pca = GridSearchCV(pipe_pca, param_grid_pca, cv=5, scoring='accuracy')

# 모델 학습
grid_pca.fit(X_train.values, y_train.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with PCA:")
print(grid_pca.best_params_)
print("Best cross-validation accuracy with PCA:")
print(grid_pca.best_score_)




Best parameters with PCA:
{'classifier__C': 10, 'classifier__max_iter': 500, 'classifier__penalty': 'l1', 'classifier__solver': 'saga', 'pca__n_components': 10}
Best cross-validation accuracy with PCA:
0.8180952380952382


2250 fits failed out of a total of 6750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1125 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/yujin/anaconda3/envs/machine_learning/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/yujin/anaconda3/envs/machine_learning/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/yujin/anaconda3/envs/machine_learning/lib/python3.9/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/Users/yujin/anac

로지스틱 SBS

In [11]:
from sklearn.feature_selection import SequentialFeatureSelector

# 로지스틱 회귀 모델 정의
logreg = LogisticRegression(max_iter=1000, solver='saga')

# SBS 객체 생성
sbs = SequentialFeatureSelector(logreg, n_features_to_select=8, direction='backward', cv=5)

# 파이프라인 구성
pipe_sbs = Pipeline([
    ('feature_selection', sbs),
    ('classifier', LogisticRegression())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_sbs = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2', 'elasticnet'],
    'classifier__solver': ['saga', 'elasticnet'],
}

# 그리드 서치 객체 생성
grid_sbs = GridSearchCV(pipe_sbs, param_grid_sbs, cv=5, scoring='accuracy')

# 모델 학습
grid_sbs.fit(X_train.values, y_train.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with SBS:")
print(grid_sbs.best_params_)
print("Best cross-validation accuracy with SBS:")
print(grid_sbs.best_score_)


100 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/yujin/anaconda3/envs/machine_learning/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/yujin/anaconda3/envs/machine_learning/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/yujin/anaconda3/envs/machine_learning/lib/python3.9/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/Users/yujin/anaconda

Best parameters with SBS:
{'classifier__C': 1, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
Best cross-validation accuracy with SBS:
0.8333333333333334


# SVC PCA

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# 파이프라인 구성
pipe_pca = Pipeline([
    ('pca', PCA()),
    ('classifier', SVC())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_pca = {
    'pca__n_components': [x for x in range(1, n_features)],
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__kernel': ['linear'],
}

# 그리드 서치 객체 생성
grid_pca = GridSearchCV(pipe_pca, param_grid_pca, cv=5, scoring='accuracy')

# 모델 학습
grid_pca.fit(X_train.values, y_train.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with PCA:")
print(grid_pca.best_params_)
print("Best cross-validation accuracy with PCA:")
print(grid_pca.best_score_)


Best parameters with PCA:
{'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 7}
Best cross-validation accuracy with PCA:
0.8047619047619048


# SVM SBS

In [13]:
from sklearn.feature_selection import SequentialFeatureSelector

# SVM 모델 정의
svm = SVC(kernel='linear')

# SBS 객체 생성
sbs = SequentialFeatureSelector(svm, n_features_to_select=8, direction='backward', cv=5)

# 파이프라인 구성
pipe_sbs = Pipeline([
    ('feature_selection', sbs),
    ('classifier', SVC())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_sbs = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__kernel': ['linear'],
}

# 그리드 서치 객체 생성
grid_sbs = GridSearchCV(pipe_sbs, param_grid_sbs, cv=5, scoring='accuracy')

# 모델 학습
grid_sbs.fit(X_train.values, y_train.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with SBS:")
print(grid_sbs.best_params_)
print("Best cross-validation accuracy with SBS:")
print(grid_sbs.best_score_)


Best parameters with SBS:
{'classifier__C': 10, 'classifier__kernel': 'linear'}
Best cross-validation accuracy with SBS:
0.8047619047619048


# 커널 SVM PCA

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# 파이프라인 구성
pipe_pca = Pipeline([
    ('pca', PCA()),
    ('classifier', SVC())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_pca = {
    'pca__n_components': [x for x in range(1, n_features)],
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__kernel': ['rbf', 'poly', 'sigmoid'],
    'classifier__gamma': [0.001, 0.01, 0.1, 1],
}

# 그리드 서치 객체 생성
grid_pca = GridSearchCV(pipe_pca, param_grid_pca, cv=5, scoring='accuracy')

# 모델 학습
grid_pca.fit(X_train.values, y_train.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with PCA:")
print(grid_pca.best_params_)
print("Best cross-validation accuracy with PCA:")
print(grid_pca.best_score_)


Best parameters with PCA:
{'classifier__C': 1, 'classifier__gamma': 1, 'classifier__kernel': 'poly', 'pca__n_components': 11}
Best cross-validation accuracy with PCA:
0.86


# 커널 SVM SBS

In [15]:
from sklearn.feature_selection import SequentialFeatureSelector

# SVM 모델 정의
svm = SVC()

# SBS 객체 생성
sbs = SequentialFeatureSelector(svm, n_features_to_select=8, direction='backward', cv=5)

# 파이프라인 구성
pipe_sbs = Pipeline([
    ('feature_selection', sbs),
    ('classifier', SVC())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_sbs = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__kernel': ['rbf', 'poly', 'sigmoid'],
    'classifier__gamma': [0.001, 0.01, 0.1, 1],
}

# 그리드 서치 객체 생성
grid_sbs = GridSearchCV(pipe_sbs, param_grid_sbs, cv=5, scoring='accuracy')

# 모델 학습
grid_sbs.fit(X_train.values, y_train.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with SBS:")
print(grid_sbs.best_params_)
print("Best cross-validation accuracy with SBS:")
print(grid_sbs.best_score_)


Best parameters with SBS:
{'classifier__C': 1, 'classifier__gamma': 1, 'classifier__kernel': 'poly'}
Best cross-validation accuracy with SBS:
0.819047619047619


# 결정트리 PCA

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# 파이프라인 구성
pipe_pca = Pipeline([
    ('pca', PCA()),
    ('classifier', DecisionTreeClassifier())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_pca = {
    'pca__n_components': [x for x in range(1, n_features)],
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [None, 5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
}

# 그리드 서치 객체 생성
grid_pca = GridSearchCV(pipe_pca, param_grid_pca, cv=5, scoring='accuracy')

# 모델 학습
grid_pca.fit(X_train.values, y_train.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with PCA:")
print(grid_pca.best_params_)
print("Best cross-validation accuracy with PCA:")
print(grid_pca.best_score_)

Best parameters with PCA:
{'classifier__criterion': 'entropy', 'classifier__max_depth': 15, 'classifier__min_samples_split': 2, 'pca__n_components': 4}
Best cross-validation accuracy with PCA:
0.8866666666666667


# 결정트리 SBS

In [17]:
from sklearn.feature_selection import SequentialFeatureSelector

# 결정 트리 모델 정의
dtree = DecisionTreeClassifier()

# SBS 객체 생성
sbs = SequentialFeatureSelector(dtree, n_features_to_select=10, direction='backward', cv=5)

# 파이프라인 구성
pipe_sbs = Pipeline([
    ('feature_selection', sbs),
    ('classifier', DecisionTreeClassifier())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_sbs = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [None, 5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
}

# 그리드 서치 객체 생성
grid_sbs = GridSearchCV(pipe_sbs, param_grid_sbs, cv=5, scoring='accuracy')

# 모델 학습
grid_sbs.fit(X_train.values, y_train.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with SBS:")
print(grid_sbs.best_params_)
print("Best cross-validation accuracy with SBS:")
print(grid_sbs.best_score_)


Best parameters with SBS:
{'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_split': 5}
Best cross-validation accuracy with SBS:
0.8590476190476191


# 랜덤포레스트 PCA

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# 파이프라인 구성
pipe_pca = Pipeline([
    ('pca', PCA()),
    ('classifier', RandomForestClassifier())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_pca = {
    'pca__n_components': [x for x in range(1, n_features)],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_samples_split': [2, 5, 10],
}

# 그리드 서치 객체 생성
grid_pca = GridSearchCV(pipe_pca, param_grid_pca, cv=5, scoring='accuracy', n_jobs=-1)

# 모델 학습
grid_pca.fit(X_train.values, y_train.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with PCA:")
print(grid_pca.best_params_)
print("Best cross-validation accuracy with PCA:")
print(grid_pca.best_score_)


Best parameters with PCA:
{'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50, 'pca__n_components': 13}
Best cross-validation accuracy with PCA:
0.86


  _data = np.array(data, dtype=dtype, copy=copy,


# 랜덤포레스트 SBS

In [20]:
from sklearn.feature_selection import SequentialFeatureSelector

# 랜덤 포레스트 모델 정의
rf = RandomForestClassifier()

# SBS 객체 생성
sbs = SequentialFeatureSelector(rf, n_features_to_select=10, direction='backward', cv=5)

# 파이프라인 구성
pipe_sbs = Pipeline([
    ('feature_selection', sbs),
    ('classifier', RandomForestClassifier())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_sbs = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_samples_split': [2, 5, 10],
}

# 그리드 서치 객체 생성
grid_sbs = GridSearchCV(pipe_sbs, param_grid_sbs, cv=5, scoring='accuracy', n_jobs=-1)

# 모델 학습
grid_sbs.fit(X_train.values, y_train.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with SBS:")
print(grid_sbs.best_params_)
print("Best cross-validation accuracy with SBS:")
print(grid_sbs.best_score_)


KeyboardInterrupt: 

# KNN PCA

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# 파이프라인 구성
pipe_pca = Pipeline([
    ('pca', PCA()),
    ('classifier', KNeighborsClassifier())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_pca = {
    'pca__n_components': [x for x in range(1, n_features)],
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__metric': ['euclidean', 'manhattan'],
}

# 그리드 서치 객체 생성
grid_pca = GridSearchCV(pipe_pca, param_grid_pca, cv=5, scoring='accuracy', n_jobs=-1)

# 모델 학습
grid_pca.fit(X_train.values, y_train.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with PCA:")
print(grid_pca.best_params_)
print("Best cross-validation accuracy with PCA:")
print(grid_pca.best_score_)


# KNN SBS

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

# KNN 모델 정의
knn = KNeighborsClassifier()

# SBS 객체 생성
sbs = SequentialFeatureSelector(knn, n_features_to_select=10, direction='backward', cv=5)

# 파이프라인 구성
pipe_sbs = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', sbs),
    ('classifier', KNeighborsClassifier())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_sbs = {
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__metric': ['euclidean', 'manhattan'],
}

# 그리드 서치 객체 생성
grid_sbs = GridSearchCV(pipe_sbs, param_grid_sbs, cv=5, scoring='accuracy', n_jobs=-1)

# 모델 학습
grid_sbs.fit(X_train.values, y_train.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with SBS:")
print(grid_sbs.best_params_)
print("Best cross-validation accuracy with SBS:")
print(grid_sbs.best_score_)


## 알고리즘 간의 작업
1. 중첩교차검증을 통해 상위 모델들만 따로 선택
2. 선택된 모델들을 통해 앙상블 학습 진행
3. 실제 데이터 예측

- 데이터 전처리를 원하는대로 바꿔가며 위 과정을 최적의 결과가 나올때까지 반복