##### Cross Validation (CV)
- scikit-learn의 model_selection 모듈에 포함된 모델 검증 관련 기능 활용
- 교차 검증 데이터 기반 검증 결과 처리

In [1]:
# 1. Load Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the two CSV files into DataFrames; both paths are relative, so they
# resolve against the directory the notebook kernel is running in.
fishDF = pd.read_csv('../../DATA/fish.csv')
irisDF = pd.read_csv('../../DATA/iris.csv')

In [2]:
# 1-2. Check Data
# Show the first five rows and the (rows, columns) shape of each frame,
# separated by a dashed rule.
print(
    f"{fishDF.head()} {fishDF.shape}",
    '-' * 50,
    f"{irisDF.head()} {irisDF.shape}",
    sep='\n',
)

  Species  Weight  Length  Diagonal   Height   Width
0   Bream   242.0    25.4      30.0  11.5200  4.0200
1   Bream   290.0    26.3      31.2  12.4800  4.3056
2   Bream   340.0    26.5      31.1  12.3778  4.6961
3   Bream   363.0    29.0      33.5  12.7300  4.4555
4   Bream   430.0    29.0      34.0  12.4440  5.1340 (159, 6)
--------------------------------------------------
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa (150, 5)


<hr>

2. Split feature and target

In [3]:
# Weight is the regression target.
# Note: keeping Species as a feature was considered, but it is dropped
# together with Weight, so only the numeric measurements remain as features.
fish_target = fishDF['Weight']
fish_feature = fishDF.drop(columns=['Weight', 'Species'])
print(fish_target.head(2), fish_feature.head(2), sep='\n')

0    242.0
1    290.0
Name: Weight, dtype: float64
   Length  Diagonal  Height   Width
0    25.4      30.0   11.52  4.0200
1    26.3      31.2   12.48  4.3056


In [4]:
# species is the classification target; every remaining column is a feature.
iris_feature = irisDF.drop(columns='species')
iris_target = irisDF['species']
print(iris_target.head(2), iris_feature.head(2), sep='\n')

0    setosa
1    setosa
Name: species, dtype: object
   sepal_length  sepal_width  petal_length  petal_width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           1.4          0.2


<hr>

3. Data Preprocessing
1) feature scaling
2) data normalization

In [5]:
# 1. Split data into train and test sets (test_size=0.2 -> 20% held out)
from sklearn.model_selection import train_test_split

# 1) fish -> train, test : Regression
fish_X_train, fish_X_test, fish_y_train, fish_y_test = train_test_split(
    fish_feature, fish_target, random_state=5, test_size=0.2
)

# 2) iris -> train, test : Classification
#    stratify on the target so both splits keep the class proportions
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
    iris_feature, iris_target, stratify=iris_target, random_state=5, test_size=0.2
)

In [6]:
# 2. Scaling
# Each scaler is fitted on the training split only; that same fitted scaler
# is then used to transform both the training and the test split.
from sklearn.preprocessing import StandardScaler

fish_scaler = StandardScaler().fit(fish_X_train)
fish_X_train_scaled = fish_scaler.transform(fish_X_train)
fish_X_test_scaled = fish_scaler.transform(fish_X_test)

iris_scaler = StandardScaler().fit(iris_X_train)
iris_X_train_scaled = iris_scaler.transform(iris_X_train)
iris_X_test_scaled = iris_scaler.transform(iris_X_test)

<hr>

4. Train Model

In [7]:
# 0. Set Model instance
from sklearn.linear_model import LinearRegression

# Unfitted linear regression estimator created with its default settings.
lr_model = LinearRegression()

In [8]:
# 1. Predict Weight of fish
# 1) Use CV
from sklearn.model_selection import cross_validate

# cross_validate returns a dict of per-fold results:
# - cv=5                    : number of folds
# - return_train_score=True : also report the score on each fold's training part
# - return_estimator=True   : keep the estimator fitted on each fold
result = cross_validate(
    estimator=lr_model,
    X=fish_X_train_scaled,
    y=fish_y_train,
    cv=5,
    return_train_score=True,
    return_estimator=True,
)
resultDF = pd.DataFrame(result)
resultDF

Unnamed: 0,fit_time,score_time,estimator,test_score,train_score
0,0.135994,0.004013,LinearRegression(),0.921047,0.874264
1,0.0,0.0,LinearRegression(),0.843854,0.887794
2,0.0,0.002397,LinearRegression(),0.885924,0.880611
3,0.002023,0.0,LinearRegression(),0.64672,0.902975
4,0.0,0.0,LinearRegression(),0.790319,0.898336


In [9]:
# best model
# Select the estimator from the fold with the highest validation score
# ('test_score') rather than hard-coding fold 0, so the choice remains
# correct when the data, the split, or the number of folds changes.
best_model = resultDF.loc[resultDF['test_score'].idxmax(), 'estimator']
print(best_model.coef_)
print(best_model.intercept_)

[ 373.98470744 -159.77931033   90.53431501   50.22123874]
408.522509249702


In [10]:
# When only the per-fold scores are needed: cross_val_score
from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    estimator=lr_model,
    X=fish_X_train_scaled,
    y=fish_y_train,
    cv=5,
)
scores

array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905])

In [11]:
# When only the predictions are needed: cross_val_predict
from sklearn.model_selection import cross_val_predict

predictions = cross_val_predict(
    estimator=lr_model,
    X=fish_X_train_scaled,
    y=fish_y_train,
    cv=5,
)
predictions

array([ 9.09792517e+01,  9.85612151e+01,  3.87029719e+02,  1.13011547e+02,
        6.81676563e+02,  2.82456988e+02,  5.34379642e+02,  3.61848302e+02,
        6.12934598e+02,  1.70756130e+02,  5.53222970e+02,  1.69433076e+01,
       -2.53895688e+01,  8.14926155e+02,  6.97225129e+01,  3.38157931e+02,
        4.76306355e+02,  7.67659158e+02,  6.55686457e+02,  1.80300946e+02,
        8.45315559e+02,  2.92145322e+02,  6.08539351e+02,  9.02782406e+02,
        6.99788981e+02,  9.40316876e+02,  7.47628344e+02,  3.28419355e+02,
        7.89622699e+02,  9.09130831e+02, -1.98986854e+02,  1.81089559e+02,
        6.36731679e+02, -1.09209894e+02,  3.57087822e+02,  7.88250361e+02,
        3.25180589e+02,  6.56473977e+02, -2.37032025e+02,  4.55882834e+01,
        9.57130255e+01, -2.10830505e+02,  1.28969696e+02, -2.21199132e+02,
       -1.10282630e+02,  6.39911566e+02,  2.12288357e+02,  2.41098815e+02,
        2.61932359e+02, -2.58301758e+02,  2.93250859e+01,  8.87950700e+02,
        2.46460034e+02,  

In [12]:
# Tuning : GridSearchCV

<hr>

### GridSearchCV

In [13]:
# Decision boundary
# - Support vectors: the samples that help determine where the boundary line is drawn
# - ...plus a large number of other parameters

In [14]:
# 0. set module
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

In [15]:
# Set model
est = LogisticRegression(max_iter=10000, solver='liblinear')

# Set params
# The grid is a list of sub-grids, one per solver family, because not every
# solver supports every penalty; a single solver x penalty cross-product would
# schedule fits that can only fail. Notes on the values:
# - solver names must match scikit-learn exactly: 'newton-cg' uses a hyphen,
#   and 'auto' is not a LogisticRegression solver
# - each solver is listed once (duplicates only repeat identical fits)
# - 'elasticnet' is paired with 'saga' only and needs an explicit l1_ratio
params = [
    {'solver': ['lbfgs', 'newton-cg', 'sag'], 'penalty': ['l2']},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2']},
    {'solver': ['saga'], 'penalty': ['l1', 'l2']},
    # 0.5 is only a starting point for the L1/L2 mix; widen this list to tune it
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5]},
]

In [16]:
# Set GridSearchCV
# return_train_score=True asks the search to record training scores as well.
gscv = GridSearchCV(
    estimator=est,
    param_grid=params,
    return_train_score=True,
)


<hr>

#### 데이터에 적합한 모델 찾기

In [17]:
from sklearn.utils import all_estimators

In [18]:
all_estimators(type_filter='classifier')    # list every (name, class) pair for classifiers

[('AdaBoostClassifier', sklearn.ensemble._weight_boosting.AdaBoostClassifier),
 ('BaggingClassifier', sklearn.ensemble._bagging.BaggingClassifier),
 ('BernoulliNB', sklearn.naive_bayes.BernoulliNB),
 ('CalibratedClassifierCV', sklearn.calibration.CalibratedClassifierCV),
 ('CategoricalNB', sklearn.naive_bayes.CategoricalNB),
 ('ClassifierChain', sklearn.multioutput.ClassifierChain),
 ('ComplementNB', sklearn.naive_bayes.ComplementNB),
 ('DecisionTreeClassifier', sklearn.tree._classes.DecisionTreeClassifier),
 ('DummyClassifier', sklearn.dummy.DummyClassifier),
 ('ExtraTreeClassifier', sklearn.tree._classes.ExtraTreeClassifier),
 ('ExtraTreesClassifier', sklearn.ensemble._forest.ExtraTreesClassifier),
 ('GaussianNB', sklearn.naive_bayes.GaussianNB),
 ('GaussianProcessClassifier',
  sklearn.gaussian_process._gpc.GaussianProcessClassifier),
 ('GradientBoostingClassifier',
  sklearn.ensemble._gb.GradientBoostingClassifier),
 ('HistGradientBoostingClassifier',
  sklearn.ensemble._hist_gradi

In [19]:
# Try every classifier returned by all_estimators: build each one with its
# default constructor, fit it on the scaled iris training data, and print
# either the fitted estimator or the error message it raised.
models = all_estimators(type_filter='classifier')

for name, model in models:
    # Some classifiers cannot be created or fitted this way (for example they
    # need constructor arguments or reject negative inputs), so the exception
    # is caught and reported instead of aborting the whole loop.
    try:
        outcome = model().fit(iris_X_train_scaled, iris_y_train)
    except Exception as err:
        outcome = err
    print(outcome)

# Note: ValueError and similar failures are handled by the try/except above.

AdaBoostClassifier()
BaggingClassifier()
BernoulliNB()
CalibratedClassifierCV()
Negative values in data passed to CategoricalNB (input X)
__init__() missing 1 required positional argument: 'base_estimator'
Negative values in data passed to ComplementNB (input X)
DecisionTreeClassifier()
DummyClassifier()
ExtraTreeClassifier()
ExtraTreesClassifier()
GaussianNB()
GaussianProcessClassifier()
GradientBoostingClassifier()
HistGradientBoostingClassifier()
KNeighborsClassifier()
LabelPropagation()
LabelSpreading()
LinearDiscriminantAnalysis()
LinearSVC()
LogisticRegression()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegressionCV()
MLPClassifier()
__init__() missing 1 required positional argument: 'estimator'
Negative values in data passed to MultinomialNB (input X)
NearestCentroid()
NuSVC()
__init__() missing 1 required positional argument: 'estimator'
__init__() missing 1 required positional argument: 'estimator'
__init__() missing 1 required positional argument: 'estimator'
PassiveAggressiveClassifier()
Perceptron()
QuadraticDiscriminantAnalysis()
RadiusNeighborsClassifier()
RandomForestClassifier()
RidgeClassifier()
RidgeClassifierCV()
SGDClassifier()
SVC()
__init__() missing 1 required positional argument: 'estimators'
__init__() missing 1 required positional argument: 'estimators'


