# 다항 로지스틱 회귀모형 실습

__[예제]__   
iris 데이터의 품종(Species, `target` 열)을 분류하는 다항 로지스틱 회귀분석을 실시하고 오분류표를 만들어라.

In [1]:
import pandas as pd

# Load the iris data set from the local CSV file
iris = pd.read_csv('./data/iris.csv')

# Label: the 'target' column; features: every remaining column
y = iris['target']
X = iris.drop(columns=['target'])

In [16]:
# Count how many samples fall into each class of the target column
y.value_counts()

Iris-setosa        50
Iris-virginica     50
Iris-versicolor    50
Name: target, dtype: int64

In [18]:
# Split the data into a training set and an evaluation (test) set
from sklearn.model_selection import train_test_split

# stratify=y samples each class in proportion to y, so the 1:1:1 class
# balance of the full data (50:50:50) carries over to both splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    stratify=y,
    random_state=123,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(105, 4) (45, 4) (105,) (45,)


In [20]:
# Check the per-class sample counts in the test split
y_test.value_counts()

Iris-setosa        15
Iris-virginica     15
Iris-versicolor    15
Name: target, dtype: int64

## sklearn 모델 생성

* sklearn의 `LogisticRegression`은 기본적으로 l2 패널티(규제)를 적용하여, 전통적 통계모델에서 문제가 되는 다중공선성의 영향을 내부적으로 완화해준다.
* 독립변수 간의 상관성이 높으면 l2 패널티가 해당 변수들의 회귀계수를 0에 가깝게 축소하므로, 변수를 삭제하는 것과 비슷한 효과를 보인다(단, 계수가 정확히 0이 되지는 않는다).

In [23]:
# Fit the model on the training split
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and fitting are chained.
# NOTE(review): the recorded run of this cell reported that the solver reached
# its iteration limit; consider raising max_iter or scaling the features.
model = LogisticRegression().fit(X_train, y_train)
print(model)

LogisticRegression()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 모델 평가

In [24]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

# Predict the class label of every test-set sample
predicted = model.predict(X_test)

# Build the confusion matrix and wrap it in a labelled DataFrame.
# The row/column labels are hard-coded; this presumably relies on the classes
# being ordered setosa, versicolor, virginica -- verify against model.classes_.
# NOTE(review): 'prediected_setosa' is misspelled; it is kept unchanged here
# because the recorded output of this cell uses the same label.
cm = confusion_matrix(y_true=y_test, y_pred=predicted)
cmtb = pd.DataFrame(
    data=cm,
    index=['setosa', 'versicolor', 'virginica'],
    columns=['prediected_setosa', 'predicted_versicolor', 'predicted_virginica'],
)

cmtb

Unnamed: 0,prediected_setosa,predicted_versicolor,predicted_virginica
setosa,15,0,0
versicolor,0,14,1
virginica,0,0,15


In [28]:
 # Display the predicted label for each test-set sample
 model.predict(X_test)

array(['Iris-versicolor', 'Iris-virginica', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',
       'Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-versicolor',
       'Iris-setosa', 'Iris-virginica', 'Iris-virginica', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-setosa', 'Iris-versicolor',
       'Iris-virginica', 'Iris-setosa', 'Iris-versicolor',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-setosa', 'Iris-virginica',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',
       'Iris-versicolor'], dtype=object)

In [29]:
# Overall accuracy on the test set
print('Accuracy Score: ', accuracy_score(y_true=y_test, y_pred=predicted))

print('\n')
# Build and print the per-class classification report
class_report = classification_report(y_true=y_test, y_pred=predicted)
print(class_report)

Accuracy Score:  0.9777777777777777


                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        15
Iris-versicolor       1.00      0.93      0.97        15
 Iris-virginica       0.94      1.00      0.97        15

       accuracy                           0.98        45
      macro avg       0.98      0.98      0.98        45
   weighted avg       0.98      0.98      0.98        45



In [32]:
# One-vs-rest ROC AUC computed from the predicted class probabilities
roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')

0.9985185185185186

In [33]:
# Print the built-in help text for roc_auc_score
help(roc_auc_score)

Help on function roc_auc_score in module sklearn.metrics._ranking:

roc_auc_score(y_true, y_score, *, average='macro', sample_weight=None, max_fpr=None, multi_class='raise', labels=None)
    Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
    from prediction scores.
    
    Note: this implementation can be used with binary, multiclass and
    multilabel classification, but some restrictions apply (see Parameters).
    
    Read more in the :ref:`User Guide <roc_metrics>`.
    
    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_classes)
        True labels or binary label indicators. The binary and multiclass cases
        expect labels with shape (n_samples,) while the multilabel case expects
        binary label indicators with shape (n_samples, n_classes).
    
    y_score : array-like of shape (n_samples,) or (n_samples, n_classes)
        Target scores.
    
        * In the binary case, it corresponds to an array 

## 다항 로지스틱 회귀 계수 해석 


In [8]:
# Inspect the fitted intercepts and regression coefficients of the model
print('Intercept: \n', model.intercept_)
print('Coefficient: \n', model.coef_)

Intercept: 
 [  9.42940015   2.10066833 -11.53006848]
Coefficient: 
 [[-0.45747705  0.87262687 -2.30840796 -0.96053751]
 [ 0.37578083 -0.19466078 -0.16297032 -0.75289644]
 [ 0.08169622 -0.6779661   2.47137828  1.71343395]]


In [9]:
# Compute odds ratios by exponentiating the fitted coefficients
import numpy as np
np.exp(model.coef_)

array([[ 0.63287835,  2.39318921,  0.09941941,  0.38268713],
       [ 1.45612796,  0.82311383,  0.8496164 ,  0.47100035],
       [ 1.08512612,  0.50764845, 11.83875275,  5.54798029]])

In [10]:
# Tabulate the exponentiated coefficients: rows labelled by model.classes_, columns by the feature names
pd.DataFrame(np.exp(model.coef_), columns=X_train.columns, index = model.classes_)

Unnamed: 0,sepal length,sepal width,petal length,petal width
Iris-setosa,0.632878,2.393189,0.099419,0.382687
Iris-versicolor,1.456128,0.823114,0.849616,0.471
Iris-virginica,1.085126,0.507648,11.838753,5.54798



* <b>다른 변수가 일정할 때, sepal width가 1단위 증가하면 Iris-setosa로 분류될 오즈(odds)가 2.393189배가 된다. (확률 자체가 2.393189배가 되는 것은 아니다.)</b>