### **Chapter 01. 검증 세트**

In [2]:
import pandas as pd

df = pd.read_csv("wine_csv_data.csv")

df.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [3]:
df.info() # information -> info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   alcohol  6497 non-null   float64
 1   sugar    6497 non-null   float64
 2   pH       6497 non-null   float64
 3   class    6497 non-null   float64
dtypes: float64(4)
memory usage: 203.2 KB


In [4]:
df.describe() # 평균, 표준편차, 사분위수 등

Unnamed: 0,alcohol,sugar,pH,class
count,6497.0,6497.0,6497.0,6497.0
mean,10.491801,5.443235,3.218501,0.753886
std,1.192712,4.757804,0.160787,0.430779
min,8.0,0.6,2.72,0.0
25%,9.5,1.8,3.11,1.0
50%,10.3,3.0,3.21,1.0
75%,11.3,8.1,3.32,1.0
max,14.9,65.8,4.01,1.0


In [5]:
df.columns

Index(['alcohol', 'sugar', 'pH', 'class'], dtype='object')

In [6]:
x = df[['alcohol', 'sugar', 'pH']]
x.head()

Unnamed: 0,alcohol,sugar,pH
0,9.4,1.9,3.51
1,9.8,2.6,3.2
2,9.8,2.3,3.26
3,9.8,1.9,3.16
4,9.4,1.9,3.51


In [7]:
y = df['class']
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: class, dtype: float64

In [8]:
x = x.to_numpy()
y = y.to_numpy()

print(x.shape, y.shape)

(6497, 3) (6497,)


In [9]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    random_state=42)

# 분할 기본값: 0.25 (25%) 만큼을 시험용 데이터 세트로 쪼개줌!
print(x_train.shape, x_test.shape) # 4872:1625 = 3:1

(4872, 3) (1625, 3)


In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.2, # 기본값 = 0.25
                                                    # train_size=0.8 # (1-test_size)
                                                    random_state=42)

print(x_train.shape, x_test.shape) # 5197:1300 = 4:1 (20%)

(5197, 3) (1300, 3)


In [11]:
x_sub, x_val, y_sub, y_val = train_test_split(x_train, y_train,
                                              test_size=0.2,
                                              random_state=42)

print(x_sub.shape, x_val.shape) # 4157 + 1040 = 5197

(4157, 3) (1040, 3)


In [12]:
from sklearn.preprocessing import StandardScaler # 변환기 클래스

ss = StandardScaler()

ss.fit(x_sub) # x_sub의 mean, std 계산

x_sub_scaled = ss.transform(x_sub)
x_val_scaled = ss.transform(x_val)

In [13]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression() # 특성의 개수 3개 -> coef_ = 3개, intercept_ = 1개

lr.fit(x_sub_scaled, y_sub)

print("학습용:", lr.score(x_sub_scaled, y_sub)) # 정확도!! (분류 문제라서) (회귀 문제일 때는 R^2)
print("시험용:", lr.score(x_val_scaled, y_val))

# 과소적합(Underfitting)에 빠졌군!!

학습용: 0.7789271108972817
시험용: 0.7826923076923077


In [14]:
lr.coef_

array([[ 0.50386868,  1.63789847, -0.67214448]])

In [15]:
# LogisticRegression 알고리즘의 하이퍼 파라미터
# C: 규제 강도 컨트롤함!
# penalty: 'L1', 'L2' 어느 규제를 적용할 지 선택

### **Chapter 02. 교차 검증**

In [16]:
from sklearn.preprocessing import StandardScaler # 변환기 클래스

ss = StandardScaler()

ss.fit(x_train) # mean, std 계산

x_train_scaled = ss.transform(x_train)
x_test_scaled = ss.transform(x_test)

In [17]:
from sklearn.model_selection import cross_validate

lr = LogisticRegression()

# 알아서 검증 데이터셋 만들어 줌 -> x_train_scaled를 사용
scores = cross_validate(lr, x_train_scaled, y_train) # k-fold (k의 기본값 = 5)

print(scores)
'''
{'fit_time': array([0.67020321, 0.23636603, 0.05385613, 0.03888988, 0.02194071]), 
'score_time': array([0.00498605, 0.00398993, 0.00199533, 0.00199461, 0.00299168]), 
'test_score': array([0.77403846, 0.775     , 0.80558229, 0.76997113, 0.78248316])}
'''

{'fit_time': array([0.04787183, 0.01994848, 0.01895046, 0.01595116, 0.01648402]), 'score_time': array([0.00213385, 0.00516009, 0.00199366, 0.00186706, 0.00199437]), 'test_score': array([0.77403846, 0.775     , 0.80558229, 0.76997113, 0.78248316])}


"\n{'fit_time': array([0.67020321, 0.23636603, 0.05385613, 0.03888988, 0.02194071]), \n'score_time': array([0.00498605, 0.00398993, 0.00199533, 0.00199461, 0.00299168]), \n'test_score': array([0.77403846, 0.775     , 0.80558229, 0.76997113, 0.78248316])}\n"

In [18]:
import numpy as np

print(np.mean(scores['test_score']))

print(sum(scores['test_score'])/len(scores['test_score']))

0.7814150070333901
0.7814150070333901


In [19]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold() # 분할기 (Splitter)

scores = cross_validate(lr, x_train_scaled, y_train,
                        cv=skf) # 또는 cv=StratifiedKFold()

print(np.mean(scores['test_score']))

0.7814150070333901


In [20]:
scores = cross_validate(LogisticRegression(), x_train_scaled, y_train,
                        cv=StratifiedKFold(n_splits=10, # n_splits = (k-fold의 k값! 기본값=5)
                                           shuffle=True, # shuffle = 샘플을 무작위로 섞는 옵션, 기본값=False
                                           random_state=42 # random_state = 42 (실제 환경에서는 사용 X, 수업용 목적임!)
                                          )
                       )

print(scores['test_score'])
print(np.mean(scores['test_score']))

[0.775      0.76923077 0.77692308 0.75384615 0.78076923 0.79038462
 0.80192308 0.7938343  0.78612717 0.77263969]
0.7800678079146287


### **Chapter 03. 하이퍼 파라미터 튜닝**

In [21]:
from sklearn.model_selection import GridSearchCV

parameters = {'C':[1e-3, 1e-2, 1e-1, 1e0, 1, 2, 3, 4]}

gs = GridSearchCV(LogisticRegression(), parameters,
                  n_jobs=-1 # 기본값=1, -1로 지정하면 모든 core 자원을 활용하기 때문에 빠른 결과를 얻을 수 있다!
                  # k=5 <- 5-fold CV가 기본값으로 적용됨!
                 )

# CPU -> 계산(연산)을 담당하고 있을 것임!
# GPU -> 지금 이용하지 않고 있음!

In [22]:
gs.fit(x_train_scaled, y_train)

In [23]:
lr = gs.best_estimator_

print("학습용:", lr.score(x_train_scaled, y_train)) # 정확도!! (분류 문제라서) (회귀 문제일 때는 R^2)
print("시험용:", lr.score(x_test_scaled, y_test))

학습용: 0.7808350971714451
시험용: 0.7776923076923077


In [35]:
gs.best_params_ # parameter --> params

{'C': 1, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.1}

In [25]:
gs.cv_results_

{'mean_fit_time': array([0.33773313, 0.02193522, 0.02236362, 0.01864095, 0.02336936,
        0.0179069 , 0.03050947, 0.01512265]),
 'std_fit_time': array([0.20691288, 0.01628114, 0.01882168, 0.00730021, 0.01417311,
        0.00953239, 0.01749091, 0.00231461]),
 'mean_score_time': array([0.00270991, 0.00218139, 0.00253215, 0.00217662, 0.00211115,
        0.01296496, 0.00338955, 0.00243883]),
 'std_score_time': array([0.0006221 , 0.00067323, 0.00044208, 0.00047135, 0.00045364,
        0.01201836, 0.00101855, 0.00045533]),
 'param_C': masked_array(data=[0.001, 0.01, 0.1, 1.0, 1, 2, 3, 4],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.001},
  {'C': 0.01},
  {'C': 0.1},
  {'C': 1.0},
  {'C': 1},
  {'C': 2},
  {'C': 3},
  {'C': 4}],
 'split0_test_score': array([0.75769231, 0.76442308, 0.77211538, 0.77403846, 0.77403846,
        0.77403846, 0.77403846, 0.77403846]),
 'split1_test_score': arr

In [26]:
print(gs.cv_results_['mean_test_score']) # 8개 <-- C의 경우의 수

[0.75678296 0.77294847 0.7812227  0.78141501 0.78141501 0.78141501
 0.78179999 0.78141519]


In [27]:
parameters = {'C':[1e-3, 1e-2, 1e-1, 1e0, 1, 2, 3, 4], # 8
              'penalty': ['l1', 'l2'], # 2
              'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1], # 5
              'solver': ['liblinear', 'saga']} # 기울기 값을 최소화시키는 방향으로 진행하는데 # 2

gs = GridSearchCV(LogisticRegression(), parameters,
                  n_jobs=-1 # 기본값=1, -1로 지정하면 모든 core 자원을 활용하기 때문에 빠른 결과를 얻을 수 있다!
                  # k=5 <- 5-fold CV가 기본값으로 적용됨!
                 )

In [28]:
gs.fit(x_train_scaled, y_train)

lr = gs.best_estimator_

print("학습용:", lr.score(x_train_scaled, y_train)) # 정확도!! (분류 문제라서) (회귀 문제일 때는 R^2)
print("시험용:", lr.score(x_test_scaled, y_test))

학습용: 0.7823744467962286
시험용: 0.7761538461538462


In [29]:
print(gs.best_params_)

{'C': 1, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.1}


In [30]:
# 랜덤 서치!!!
from scipy.stats import uniform, randint
    
ran_generator1 = randint(0, 10) # 정수값 난수 생성기 객체를 반환해줌!

print(ran_generator1.rvs(100))

[8 9 2 7 0 1 3 6 8 5 7 6 9 6 1 3 5 4 1 3 6 4 1 8 5 6 8 7 2 3 9 6 3 7 5 9 3
 2 0 6 5 4 8 2 5 1 9 6 6 3 4 6 0 2 2 8 1 1 5 9 7 5 5 1 3 9 1 2 2 8 8 0 8 2
 6 0 5 3 1 7 5 0 1 1 6 4 3 2 9 3 7 4 0 5 4 2 8 2 6 8]


In [31]:
ran_generator2 = uniform(0, 1) # 실수값 난수 생성기 객체
print(ran_generator2.rvs(10))

[0.4903867  0.09936075 0.16461333 0.24705178 0.58301854 0.17812905
 0.96267349 0.9709747  0.58189271 0.17694309]


In [32]:
from sklearn.model_selection import RandomizedSearchCV

parameters = {'C': randint(1,10),
              'penalty': ['l1', 'l2'],
              'tol': uniform(1e-5, 1e-1),
              'solver': ['liblinear', 'saga']}

rs = RandomizedSearchCV(LogisticRegression(), parameters,
                       n_jobs=-1)

In [33]:
rs.fit(x_train_scaled, y_train)

lr = rs.best_estimator_

print("학습용:", lr.score(x_train_scaled, y_train)) # 정확도!! (분류 문제라서) (회귀 문제일 때는 R^2)
print("시험용:", lr.score(x_test_scaled, y_test))

학습용: 0.7817971906869348
시험용: 0.7769230769230769


In [34]:
print(rs.best_params_)

{'C': 7, 'penalty': 'l2', 'solver': 'liblinear', 'tol': 0.0838207398469326}
