# 교차 검증 단순화
- scikit-learn의 model_selection 모듈 내에 모델 검증 관련 기능 활용
- 교차 검증 데이터 기반, 검증 결과 처리

## <hr>1. 모듈 로딩 및 데이터 준비

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load both datasets in one pass:
#   fish.csv -> fishDF, used for the regression task
#   iris.csv -> irisDF, used for the classification task
fishDF, irisDF = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/fish.csv", "../data/iris.csv")
)

In [3]:
fishDF.head() # Treated as a regression problem; Weight is the target

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


In [4]:
irisDF.head() # Classification problem

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


## <hr>2. 데이터 준비 => 피쳐 & 타겟 분리

In [6]:
# Target  : Weight
# Features: Length, Diagonal, Height, Width
#
# Select the columns by label instead of by position. With positional
# indexing a change in the CSV column order (or an extra leading column)
# would silently pick the wrong target/features; label-based selection
# raises a KeyError instead.
fish_targetSR = fishDF["Weight"]
fish_featureSDF = fishDF[["Length", "Diagonal", "Height", "Width"]]

In [9]:
# Preview the first two rows of the target and of the features to confirm the split.
fish_targetSR.head(2), fish_featureSDF.head(2)

(0    242.0
 1    290.0
 Name: Weight, dtype: float64,
    Length  Diagonal  Height   Width
 0    25.4      30.0   11.52  4.0200
 1    26.3      31.2   12.48  4.3056)

In [7]:
# Split the iris data as well: the last column is the target and every
# column before it is a feature.
iris_featureSDF, iris_targetSR = irisDF.iloc[:, :-1], irisDF.iloc[:, -1]

In [8]:
# Preview the first two rows of the target and of the features to confirm the split.
iris_targetSR.head(2), iris_featureSDF.head(2)

(0    Setosa
 1    Setosa
 Name: variety, dtype: object,
    sepal.length  sepal.width  petal.length  petal.width
 0           5.1          3.5           1.4          0.2
 1           4.9          3.0           1.4          0.2)

## <hr>3. 데이터 전처리
- feature scaling

In [17]:
# Split each dataset into a training part and a test part (test_size=0.2).
from sklearn.model_selection import train_test_split

# Fish data (regression target): plain random split.
fish_X_train, fish_X_test, fish_y_train, fish_y_test = train_test_split(
    fish_featureSDF,
    fish_targetSR,
    random_state=5,
    test_size=0.2,
)

# Iris data (class labels): additionally stratified on the target.
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
    iris_featureSDF,
    iris_targetSR,
    random_state=5,
    stratify=iris_targetSR,
    test_size=0.2,
)

In [19]:
from sklearn.preprocessing import StandardScaler

# Fish: fit the scaler on the training features only, then apply the same
# fitted scaler to both the training and the test features.
fish_Scaler = StandardScaler().fit(fish_X_train)
scaled_fish_X_train, scaled_fish_X_test = (
    fish_Scaler.transform(split) for split in (fish_X_train, fish_X_test)
)

# Iris: same procedure with its own scaler instance.
iris_Scaler = StandardScaler().fit(iris_X_train)
scaled_iris_X_train, scaled_iris_X_test = (
    iris_Scaler.transform(split) for split in (iris_X_train, iris_X_test)
)

## <hr> 4. 학습 

### 4-1 생선 무게 예측 모델

In [21]:
# 모듈 로딩
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict
# 다 보고 싶으면 validate

In [22]:
# Train through cross-validation.
# Required: a model instance, the training feature data, the training label data.

lr_model = LinearRegression()

In [33]:
# Cross-validate and keep everything cross_validate can report: fit/score
# times, the fitted estimator of every fold, and train + validation scores
# for both requested metrics.
# NOTE(review): the features passed here were standardized with a scaler
# fitted on the whole training split, so each validation fold also
# contributed to the scaling statistics.
result = cross_validate(
    estimator=lr_model,
    X=scaled_fish_X_train,
    y=fish_y_train,
    scoring=("r2", "neg_mean_squared_error"),
    return_estimator=True,
    return_train_score=True,
)

print(result)

# This is regression, so the score of interest is R^2.

{'fit_time': array([0.00099826, 0.00099993, 0.00099993, 0.        , 0.00107145]), 'score_time': array([0.00199986, 0.        , 0.        , 0.0010004 , 0.00094247]), 'estimator': [LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression()], 'test_r2': array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905]), 'train_r2': array([0.87426416, 0.88779401, 0.88061108, 0.90297504, 0.89833592]), 'test_neg_mean_squared_error': array([ -8767.84902315, -17815.75093903, -12344.87825138, -22006.47049028,
       -39450.52608702]), 'train_neg_mean_squared_error': array([-16078.44783606, -13972.57866943, -15268.42472495, -13223.98109532,
       -10586.01039978])}


In [36]:
# Tabulate the cross-validation output: one row per fold, one column per key of `result`.
resultDF = pd.DataFrame(result)
resultDF

Unnamed: 0,fit_time,score_time,estimator,test_r2,train_r2,test_neg_mean_squared_error,train_neg_mean_squared_error
0,0.000998,0.002,LinearRegression(),0.921047,0.874264,-8767.849023,-16078.447836
1,0.001,0.0,LinearRegression(),0.843854,0.887794,-17815.750939,-13972.578669
2,0.001,0.0,LinearRegression(),0.885924,0.880611,-12344.878251,-15268.424725
3,0.0,0.001,LinearRegression(),0.64672,0.902975,-22006.47049,-13223.981095
4,0.001071,0.000942,LinearRegression(),0.790319,0.898336,-39450.526087,-10586.0104


index는 교차 검증 fold 번호 (cv를 따로 지정하지 않아 5-fold → 0~4)

In [35]:
# Pick the fold estimator with the highest validation R^2 instead of
# hard-coding fold 0, so the selection stays correct when the data, the
# random split or the CV settings change.
best_fold = resultDF["test_r2"].idxmax()
best_model = resultDF.loc[best_fold, "estimator"]
# Show the coefficients and intercept learned by that fold's estimator.
best_model.coef_, best_model.intercept_

(array([ 373.98470744, -159.77931033,   90.53431501,   50.22123874]),
 408.52250924970195)

In [38]:
# Cross-validation returning only the per-fold validation scores.
cross_val_score(
    estimator=lr_model,
    X=scaled_fish_X_train,
    y=fish_y_train,
)

array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905])

In [None]:
# Cross-validation returning only the predictions.
cross_val_predict(
    estimator=lr_model,
    X=scaled_fish_X_train,
    y=fish_y_train,
)

이젠 나도 잘 모르겠다

C는 규제 강도의 역수 → 값이 클수록 규제가 약함 (Ridge/Lasso의 alpha는 값이 클수록 규제가 강하므로 서로 반대되는 개념)
커널(kernel)은 데이터를 다른(보통 더 높은) 차원의 특징 공간에서 바라보도록 변환하는 방식

# 교차검증과 튜닝까지 한번에 진행
- 단점 : 파라미터 조합 수 × fold 수만큼 학습을 반복하므로 시간이 오래 걸림

In [39]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [40]:
from sklearn.linear_model import LogisticRegression

In [47]:
# Base classifier for the grid search; the liblinear solver is used with
# both penalties listed below.
est = LogisticRegression(solver="liblinear", max_iter=10000)

# Search space: only the regularization penalty type is tuned.
params = {"penalty": ["l1", "l2"]}

In [48]:
# Cross-validated grid search over `params`; training-fold scores are
# recorded alongside the validation scores.
gscv = GridSearchCV(
    estimator=est,
    param_grid=params,
    return_train_score=True,
)

gscv.fit(X=scaled_iris_X_train, y=iris_y_train)

In [51]:
# Collect the grid search's `cv_results_` into a DataFrame for inspection.
cv_resultDF = pd.DataFrame(gscv.cv_results_)

In [53]:
# Display the results table: one row per parameter candidate.
cv_resultDF

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.001051,8e-05,0.0002,0.000401,l1,{'penalty': 'l1'},0.875,1.0,0.958333,0.958333,...,0.941667,0.042492,1,0.9375,0.927083,0.9375,0.947917,0.9375,0.9375,0.006588
1,0.000808,0.000404,0.000328,0.000417,l2,{'penalty': 'l2'},0.875,0.958333,0.958333,0.958333,...,0.916667,0.052705,2,0.90625,0.916667,0.916667,0.927083,0.90625,0.914583,0.007795


In [54]:
gscv.best_params_ # Best result was obtained with penalty='l1'

{'penalty': 'l1'}

In [55]:
# Score of the best candidate; equals the mean_test_score of the rank-1 row in cv_resultDF.
gscv.best_score_

0.9416666666666668

In [56]:
# Keep the best estimator found by the grid search.
# NOTE: this rebinds `best_model`, which earlier held a LinearRegression
# estimator taken from the fish cross-validation results.
best_model=gscv.best_estimator_

# 데이터에 적합한 모델 찾기

In [57]:
from sklearn.utils.discovery import all_estimators

In [64]:
# Survey every scikit-learn classifier: build each one with its default
# constructor arguments and try to fit it on the scaled iris training data.
# Fitting alone does not rank the candidates; compare scores (e.g. with
# cross_val_score) before choosing one.
models = all_estimators(type_filter="classifier")

for model_name, model in models:
    try:
        fitted = model().fit(scaled_iris_X_train, iris_y_train)
    except Exception as e:
        # Deliberately broad: this is a best-effort survey, so one failing
        # estimator must not stop the loop. Some classifiers cannot be built
        # without a required (base) estimator argument and some reject the
        # negative values present in standardized features. Print the
        # estimator name and exception type so each failure can be traced
        # back to its estimator.
        print(f"[FAILED] {model_name}: {type(e).__name__}: {e}")
    else:
        print(fitted)

AdaBoostClassifier()
BaggingClassifier()
BernoulliNB()
CalibratedClassifierCV()
Negative values in data passed to CategoricalNB (input X)
__init__() missing 1 required positional argument: 'base_estimator'
Negative values in data passed to ComplementNB (input X)
DecisionTreeClassifier()
DummyClassifier()
ExtraTreeClassifier()
ExtraTreesClassifier()
GaussianNB()
GaussianProcessClassifier()
GradientBoostingClassifier()
HistGradientBoostingClassifier()
KNeighborsClassifier()
LabelPropagation()
LabelSpreading()
LinearDiscriminantAnalysis()
LinearSVC()
LogisticRegression()
LogisticRegressionCV()




MLPClassifier()
__init__() missing 1 required positional argument: 'estimator'
Negative values in data passed to MultinomialNB (input X)
NearestCentroid()
NuSVC()
__init__() missing 1 required positional argument: 'estimator'
__init__() missing 1 required positional argument: 'estimator'
__init__() missing 1 required positional argument: 'estimator'
PassiveAggressiveClassifier()
Perceptron()
QuadraticDiscriminantAnalysis()
RadiusNeighborsClassifier()
RandomForestClassifier()
RidgeClassifier()
RidgeClassifierCV()
SGDClassifier()
SVC()
__init__() missing 1 required positional argument: 'estimators'
__init__() missing 1 required positional argument: 'estimators'




In [None]:
# TODO: handling each estimator's exception case individually is left as an exercise.