# 로지스틱 회귀
* 모델 1
* upsampling + MinMax 적용한 데이터 사용
* 평가지표 비교하여 랜덤포레스트가 더 성능이 좋다고 평가됨

In [1]:
# 라이브러리 로드
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from scipy.special import softmax

In [3]:
# 데이터 로딩
df = pd.read_csv('./MinMax_model01.csv', encoding='utf-8')
df


Unnamed: 0,일시,WIND_SPEED,GUST_WIND,SIG_WAVE,MAX_WAVE,HPA,특보강도
0,2012-08-02 04:00:00,0.334646,0.409742,0.107143,0.128205,0.463588,1.0
1,2018-06-15 18:00:00,0.350394,0.438395,0.107143,0.115385,0.529307,1.0
2,2018-04-24 12:00:00,0.188976,0.375358,0.089286,0.089744,0.467140,1.0
3,2018-03-22 06:00:00,0.094488,0.300860,0.285714,0.282051,0.536412,1.0
4,2018-03-17 08:00:00,0.192913,0.249284,0.107143,0.115385,0.870337,1.0
...,...,...,...,...,...,...,...
199504,2018-03-20 10:00:00,0.578740,0.756447,0.392857,0.410256,0.735346,2.0
199505,2012-04-03 07:00:00,0.212598,0.469914,0.196429,0.217949,0.197158,2.0
199506,2018-07-03 20:00:00,0.622047,0.767908,0.410714,0.435897,0.111901,2.0
199507,2018-03-20 09:00:00,0.531496,0.704871,0.357143,0.371795,0.721137,2.0


In [4]:
# obj -> datetime
df['일시'] = pd.to_datetime(df['일시'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199509 entries, 0 to 199508
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   일시          199509 non-null  datetime64[ns]
 1   WIND_SPEED  199509 non-null  float64       
 2   GUST_WIND   199509 non-null  float64       
 3   SIG_WAVE    199509 non-null  float64       
 4   MAX_WAVE    199509 non-null  float64       
 5   HPA         199509 non-null  float64       
 6   특보강도        199509 non-null  float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 10.7 MB


In [5]:
# 인덱스
df = df.set_index('일시')
df

Unnamed: 0_level_0,WIND_SPEED,GUST_WIND,SIG_WAVE,MAX_WAVE,HPA,특보강도
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-08-02 04:00:00,0.334646,0.409742,0.107143,0.128205,0.463588,1.0
2018-06-15 18:00:00,0.350394,0.438395,0.107143,0.115385,0.529307,1.0
2018-04-24 12:00:00,0.188976,0.375358,0.089286,0.089744,0.467140,1.0
2018-03-22 06:00:00,0.094488,0.300860,0.285714,0.282051,0.536412,1.0
2018-03-17 08:00:00,0.192913,0.249284,0.107143,0.115385,0.870337,1.0
...,...,...,...,...,...,...
2018-03-20 10:00:00,0.578740,0.756447,0.392857,0.410256,0.735346,2.0
2012-04-03 07:00:00,0.212598,0.469914,0.196429,0.217949,0.197158,2.0
2018-07-03 20:00:00,0.622047,0.767908,0.410714,0.435897,0.111901,2.0
2018-03-20 09:00:00,0.531496,0.704871,0.357143,0.371795,0.721137,2.0


In [6]:
# 코드
print(pd.unique(df['특보강도']))

[1. 0. 2.]


In [7]:
# 데이터셋 구성
X= df.drop(['특보강도'], axis=1)
y= df['특보강도']

* 훈련 시작~

In [8]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=123)

In [9]:
X

Unnamed: 0_level_0,WIND_SPEED,GUST_WIND,SIG_WAVE,MAX_WAVE,HPA
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-08-02 04:00:00,0.334646,0.409742,0.107143,0.128205,0.463588
2018-06-15 18:00:00,0.350394,0.438395,0.107143,0.115385,0.529307
2018-04-24 12:00:00,0.188976,0.375358,0.089286,0.089744,0.467140
2018-03-22 06:00:00,0.094488,0.300860,0.285714,0.282051,0.536412
2018-03-17 08:00:00,0.192913,0.249284,0.107143,0.115385,0.870337
...,...,...,...,...,...
2018-03-20 10:00:00,0.578740,0.756447,0.392857,0.410256,0.735346
2012-04-03 07:00:00,0.212598,0.469914,0.196429,0.217949,0.197158
2018-07-03 20:00:00,0.622047,0.767908,0.410714,0.435897,0.111901
2018-03-20 09:00:00,0.531496,0.704871,0.357143,0.371795,0.721137


In [10]:
y

일시
2012-08-02 04:00:00    1.0
2018-06-15 18:00:00    1.0
2018-04-24 12:00:00    1.0
2018-03-22 06:00:00    1.0
2018-03-17 08:00:00    1.0
                      ... 
2018-03-20 10:00:00    2.0
2012-04-03 07:00:00    2.0
2018-07-03 20:00:00    2.0
2018-03-20 09:00:00    2.0
2018-03-22 03:00:00    2.0
Name: 특보강도, Length: 199509, dtype: float64

In [11]:
# 표준화 전처리
ss = StandardScaler()
ss.fit(x_train)
train_scaled = ss.transform(x_train)
test_scaled = ss.transform(x_test)


In [12]:
# k - 최근접 이웃 분류기의 확률 예측
kn = KNeighborsClassifier(n_neighbors=3)
kn.fit(train_scaled, y_train)
print(kn.score(train_scaled, y_train))
print(kn.score(test_scaled, y_test))

0.9922685095265246
0.9853892035486943


In [13]:
# 샘플 예측
proba = kn.predict_proba(test_scaled[:5])
print(np.round(proba,decimals=4))

[[1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]]


In [14]:
# 로지스틱 회귀 훈련
lr = LogisticRegression(C=20, max_iter=1000)
lr.fit(train_scaled, y_train)
print(lr.score(train_scaled, y_train))
print(lr.score(test_scaled,y_test))

0.8232471006910724
0.8245200741817452


In [15]:
model = LogisticRegression()
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
print(model.coef_)

[[ -4.21358554  -6.14233443 -12.38416326 -14.50739884   1.74007717]
 [ -1.04494319   1.49447083  -5.00383613   9.8827669   -0.94387622]
 [  5.25852873   4.64786359  17.38799939   4.62463194  -0.79620096]]


In [17]:
print(model.score(x_train, y_train))

0.8230090158952929


In [18]:
# 5개의 샘플 예측
print(lr.predict(test_scaled[:5]))

[0. 0. 0. 0. 2.]


In [19]:
# 10개의 샘플 예측 확률
proba = lr.predict_proba(test_scaled[:10])
print(np.round(proba, decimals=3)) # 반올림의미

[[0.82  0.179 0.001]
 [0.765 0.233 0.002]
 [0.644 0.347 0.009]
 [0.968 0.032 0.   ]
 [0.    0.007 0.993]
 [0.078 0.874 0.048]
 [0.    0.237 0.763]
 [0.143 0.834 0.023]
 [0.208 0.757 0.035]
 [0.    0.032 0.968]]


In [20]:
# 클래스 정보
print(lr.classes_)

[0. 1. 2.]


In [21]:
# 로지스틱 회귀가 학습한 계수
print(lr.coef_, lr.intercept_)

[[-0.68156831 -1.20469453 -2.79945335 -1.87985098  0.30106139]
 [-0.18168921  0.29441077 -1.79046171  2.66901212 -0.16171918]
 [ 0.86325752  0.91028376  4.58991506 -0.78916113 -0.13934221]] [-2.43526861  1.952231    0.48303761]


In [22]:
# 선형 방적식은 어떨까?
print(lr.coef_.shape, lr.intercept_.shape)

# 열은 5개, 행은 3개

(3, 5) (3,)


* softmax

In [23]:
# 소프트 맥스 => 다중분류에서 출력 결과를 정규화해서 합이 1이 되도록 함
# z값 출력 (z1~z3까지의 값)
decisions = lr.decision_function(test_scaled[:5])
print(np.round(decisions, decimals=2))

[[  2.78   1.25  -4.03]
 [  2.38   1.19  -3.57]
 [  1.63   1.01  -2.64]
 [  4.41   0.99  -5.4 ]
 [-11.48   3.3    8.19]]


In [24]:
# 소프트맥스 함수
# 앞서 했던 proba랑 비교 => 일치
proba = softmax(decisions, axis=1)
print(np.round(proba, decimals=3))


[[0.82  0.179 0.001]
 [0.765 0.233 0.002]
 [0.644 0.347 0.009]
 [0.968 0.032 0.   ]
 [0.    0.007 0.993]]


## 성능지표

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [26]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
pred = lr.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
# 로지스틱 회귀 평가지표
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.88      0.89      0.88     13343
         1.0       0.73      0.75      0.74     13273
         2.0       0.87      0.83      0.85     13286

    accuracy                           0.82     39902
   macro avg       0.82      0.82      0.82     39902
weighted avg       0.82      0.82      0.82     39902



In [28]:
cm = confusion_matrix(y_test, pred)

In [29]:
# 오차행렬
cm

array([[11915,  1422,     6],
       [ 1689,  9902,  1682],
       [    0,  2223, 11063]], dtype=int64)