## 로지스틱 회귀
로지스틱 회귀는 선형 방정식의 결과(z)에 로지스틱(시그모이드) 함수를 적용하여 0~1 사이의 확률로 변환하는 분류 모델입니다.

In [1]:
import pandas as pd

In [3]:
# Download the fish dataset (CSV) from the short URL into a DataFrame.
fish = pd.read_csv('https://bit.ly/fish_csv_data')
# Preview the first rows: one species label plus five measurement columns.
fish.head()

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


In [4]:
# Distinct species labels, in order of first appearance in the data.
fish['Species'].unique()

array(['Bream', 'Roach', 'Whitefish', 'Parkki', 'Perch', 'Pike', 'Smelt'],
      dtype=object)

In [12]:
# Feature matrix: the five measurement columns, converted to a NumPy array.
fish_input = fish.loc[:, [
    'Weight',
    'Length',
    'Diagonal',
    'Height',
    'Width',
]].to_numpy()
# Target vector: the species label of every row.
fish_target = fish.loc[:, 'Species'].to_numpy()

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Split features and labels into train/test partitions. No test_size is
# given, so scikit-learn's default split ratio applies; random_state=42 pins
# the shuffle so the partitions are the same on every run.
train_input, test_input, train_target, test_target = train_test_split(fish_input, fish_target, random_state=42)

In [15]:
from sklearn.preprocessing import StandardScaler

In [17]:
# Learn the scaling statistics from the training set only, then apply that
# same fitted scaler to both partitions so the test data never influences it.
ss = StandardScaler().fit(train_input)
train_scaled, test_scaled = (
    ss.transform(features) for features in (train_input, test_input)
)

In [19]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: 3-nearest-neighbours classifier trained on the scaled features.
kn = KNeighborsClassifier(n_neighbors=3).fit(train_scaled, train_target)
# Displayed as (train accuracy, test accuracy).
tuple(
    kn.score(features, labels)
    for features, labels in (
        (train_scaled, train_target),
        (test_scaled, test_target),
    )
)

(0.8907563025210085, 0.85)

In [20]:
# Class labels as stored by the fitted classifier. The output shows them in
# alphabetical order, which differs from the order of appearance in the data.
kn.classes_

array(['Bream', 'Parkki', 'Perch', 'Pike', 'Roach', 'Smelt', 'Whitefish'],
      dtype=object)

In [21]:
# Predicted species for the first five test samples.
kn.predict(test_scaled[:5])

array(['Perch', 'Smelt', 'Pike', 'Perch', 'Perch'], dtype=object)

In [23]:
# Per-class probabilities for the same five test samples; the columns line up
# with kn.classes_. With n_neighbors=3 the values shown are multiples of 1/3,
# so this model can only express coarse probabilities.
kn.predict_proba(test_scaled[:5])

array([[0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.66666667, 0.        , 0.33333333,
        0.        , 0.        ],
       [0.        , 0.        , 0.66666667, 0.        , 0.33333333,
        0.        , 0.        ]])

In [31]:
# Boolean mask over the training labels: True where the sample is a Bream or
# a Smelt. Used to carve out a two-class (binary) problem.
bream_smelt_indexes = (
    (train_target == 'Bream')
    | (train_target == 'Smelt')
)

In [32]:
# Apply the same mask to features and labels so the two stay row-aligned.
train_bream_smelt, target_bream_smelt = (
    train_scaled[bream_smelt_indexes],
    train_target[bream_smelt_indexes],
)

In [33]:
from sklearn.linear_model import LogisticRegression

# Binary logistic regression on the Bream/Smelt subset, default settings.
lr = LogisticRegression()
lr.fit(train_bream_smelt, target_bream_smelt)

In [34]:
# Predicted labels for the first five Bream/Smelt training samples.
lr.predict(train_bream_smelt[:5])

array(['Bream', 'Smelt', 'Bream', 'Bream', 'Bream'], dtype=object)

In [36]:
# Learned linear equation of the binary model: one weight per input feature
# (five in the output) plus a single intercept.
lr.coef_, lr.intercept_

(array([[-0.4037798 , -0.57620209, -0.66280298, -1.01290277, -0.73168947]]),
 array([-2.16155132]))

In [37]:
# Raw linear scores (z) for the first five samples. In the output shown, the
# single positive score coincides with the 'Smelt' prediction and the negative
# scores with 'Bream'.
lr.decision_function(train_bream_smelt[:5])

array([-6.02927744,  3.57123907, -5.26568906, -4.24321775, -6.0607117 ])

In [43]:
# Multiclass logistic regression on the full scaled training set.
# NOTE: this rebinds `lr`, replacing the binary Bream/Smelt model fitted in an
# earlier cell; re-running earlier `lr` cells after this one inspects this
# model instead.
lr = LogisticRegression(C=20, max_iter=1000).fit(train_scaled, train_target)
# Displayed as (train accuracy, test accuracy).
tuple(
    lr.score(features, labels)
    for features, labels in (
        (train_scaled, train_target),
        (test_scaled, test_target),
    )
)

(0.9327731092436975, 0.925)

In [44]:
# Predicted species for the first five training samples (multiclass model).
lr.predict(train_scaled[:5])

array(['Bream', 'Pike', 'Smelt', 'Perch', 'Parkki'], dtype=object)

In [45]:
# Class probabilities for the same five samples: one row per sample, one
# column per class (seven in the output). The largest entry in each row
# matches the predicted label, with columns in alphabetical class order.
lr.predict_proba(train_scaled[:5])

array([[9.97709415e-01, 6.11184257e-04, 7.29882059e-08, 7.59625494e-08,
        1.83505462e-05, 1.67625181e-15, 1.66090169e-03],
       [4.39369141e-07, 1.80985046e-11, 1.12121706e-03, 9.98859133e-01,
        1.79354649e-05, 1.08335570e-06, 1.91698291e-07],
       [1.62946420e-09, 2.33096777e-03, 5.07228560e-02, 9.87551986e-06,
        5.66183569e-03, 9.41270774e-01, 3.68921800e-06],
       [5.13685561e-06, 5.06675335e-04, 8.60130195e-01, 5.14595531e-04,
        1.34899273e-01, 4.25714309e-04, 3.51840974e-03],
       [9.00040183e-03, 7.83200767e-01, 6.60234680e-03, 1.53751575e-04,
        1.87400783e-01, 1.05527869e-04, 1.35364219e-02]])

In [48]:
# Raw linear scores (z), one per class for each sample. The class with the
# largest z in a row is the predicted class; the probabilities from
# predict_proba are consistent with a softmax over each row of these scores.
lr.decision_function(train_scaled[:5])

array([[ 13.07725831,   5.67943945,  -3.35341645,  -3.31347386,
          2.17370031, -20.94266464,   6.67915689],
       [ -2.30948011, -12.406746  ,   5.53510528,  12.32730428,
          1.39971527,  -1.40700141,  -3.1388973 ],
       [-11.87102256,   2.30252028,   5.38261337,  -3.16145958,
          3.18998491,   8.3034676 ,  -4.14610402],
       [ -6.04989434,  -1.45846505,   5.97850357,  -1.44295427,
          4.12594817,  -1.632567  ,   0.47942891],
       [  0.13076642,   4.59688626,  -0.17907765,  -3.93891993,
          3.16674674,  -4.31528301,   0.53888116]])

In [49]:
# Learned parameters of the multiclass model: the output shows a 7 x 5 weight
# matrix (one row per class, one column per feature) and seven intercepts,
# i.e. one linear equation per class.
lr.coef_, lr.intercept_

(array([[-1.49001259, -1.02909653,  2.59342992,  7.70358183, -1.20070797],
        [ 0.19618191, -2.01072007, -3.77974344,  6.50492519, -1.99481478],
        [ 3.562807  ,  6.34355461, -8.48969786, -5.75757213,  3.79306162],
        [-0.10458363,  3.60320663,  3.93067948, -3.61740078, -1.7506979 ],
        [-1.40060998, -6.0750259 ,  5.25969918, -0.87219289,  1.86043812],
        [-1.38528461,  1.49213609,  1.39225441, -5.67734711, -4.40097409],
        [ 0.6215019 , -2.32405484, -0.9066217 ,  1.71600589,  3.69369499]]),
 array([-0.09204689, -0.26289902,  3.25100925, -0.14740759,  2.65498221,
        -6.78787045,  1.38423249]))