In [None]:
import pandas as pd
import numpy as np

In [None]:
# Data collection: read the fish measurements CSV from the remote URL.
df = pd.read_csv('https://bit.ly/fish_csv_data')
# Cell output: the distinct values found in the 'Species' column.
df['Species'].unique()

In [None]:
# Positional split of the frame, written as iloc[rows, cols]:
#   column 0          -> y, the label vector (length M)
#   columns 1 .. end  -> X, the feature matrix to train on (M x N)
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

In [None]:
from sklearn.model_selection import train_test_split

# Split features and labels into training and test partitions.
# random_state=42 pins the shuffle so the same split is produced on every run.
(train_input, test_input,
 train_target, test_target) = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardization: the scaler is fitted on the training split only, and that
# same fitted transform is then applied to both the training and test splits.
scaler = StandardScaler().fit(train_input)

train_scaled, test_scaled = scaler.transform(train_input), scaler.transform(test_input)

In [116]:
from sklearn.neighbors import KNeighborsClassifier

# KNN classifier using the 3 nearest neighbours, trained on the standardized
# training split.
kn = KNeighborsClassifier(n_neighbors=3).fit(train_scaled, train_target)

# Cell output: (training score, test score)
(
    kn.score(train_scaled, train_target),
    kn.score(test_scaled, test_target),
)

(0.8907563025210085, 0.85)

In [None]:
# Predicted class labels from the KNN model for the first five standardized test samples.
kn.predict(test_scaled[:5])

In [None]:
# Per-class probabilities from the KNN model for the first five test samples.
proba = kn.predict_proba(test_scaled[:5])
# Print the class labels so the probability columns can be matched to species.
print(kn.classes_)
# Cell output: probabilities rounded to three decimal places.
np.round(proba, 3)

In [None]:
# The sigmoid function maps any value, negative or positive, to a value
# between 0 and 1.
# Example: evaluate it on a grid that starts at -5 and advances in steps of
# 0.1 (np.arange excludes the stop value 5), then plot the curve.
import numpy as np
import matplotlib.pyplot as plt

z = np.arange(start=-5, stop=5, step=0.1)
phi = 1 / (1 + np.exp(-z))

plt.plot(z, phi)
plt.xlabel('z')
plt.ylabel('phi')
plt.show()

In [109]:
# Logistic regression => a value between 0 and 1 (a probability).

# Binary classification: keep only the training rows labelled 'Bream' or
# 'Smelt'. The boolean mask is built from train_target and then used to filter
# both the standardized features and the labels.
bream_smelt_indexes = train_target.isin(['Bream', 'Smelt'])
train_bs, target_bs = train_scaled[bream_smelt_indexes], train_target[bream_smelt_indexes]

In [97]:
from sklearn.linear_model import LogisticRegression

# Binary logistic regression with default hyperparameters, trained on the
# Bream/Smelt subset (train_bs, target_bs).
lr = LogisticRegression()
# fit(...) is the last expression, so the fitted estimator is this cell's output.
lr.fit(train_bs, target_bs)

In [100]:
# Predicted labels from the binary model for the first five Bream/Smelt training samples.
lr.predict(train_bs[:5])

array(['Bream', 'Smelt', 'Bream', 'Bream', 'Bream'], dtype=object)

In [107]:
# Class probabilities for the same five samples: one row per sample, one column
# per class (columns follow the order of lr.classes_).
lr.predict_proba(train_bs[:5])

array([[0.99760007, 0.00239993],
       [0.02737325, 0.97262675],
       [0.99486386, 0.00513614],
       [0.98585047, 0.01414953],
       [0.99767419, 0.00232581]])

In [122]:
# Multiclass classification: train on the full standardized training split.
# NOTE: this rebinds `lr`, replacing the binary model fitted earlier.
# max_iter=1000 sets the solver's iteration limit; C is the inverse of the
# regularization strength, so C=20 regularizes more weakly than the default.
lr = LogisticRegression(C=20, max_iter=1000).fit(train_scaled, train_target)

# Training score on the first line, test score on the second.
print(lr.score(train_scaled, train_target), lr.score(test_scaled, test_target), sep='\n')

0.9327731092436975
0.925


In [128]:
# Per-class probabilities from the multiclass model for the first five test samples.
proba = lr.predict_proba(test_scaled[:5])
# Class labels, printed so each probability column can be matched to a species.
print(lr.classes_)
# Cell output: probabilities rounded to three decimal places.
np.round(proba, decimals=3)

['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']


array([[0.   , 0.014, 0.842, 0.   , 0.135, 0.007, 0.003],
       [0.   , 0.003, 0.044, 0.   , 0.007, 0.946, 0.   ],
       [0.   , 0.   , 0.034, 0.934, 0.015, 0.016, 0.   ],
       [0.011, 0.034, 0.305, 0.006, 0.567, 0.   , 0.076],
       [0.   , 0.   , 0.904, 0.002, 0.089, 0.002, 0.001]])

In [136]:
# decision_function returns the raw per-class scores (z values) for the first
# five test samples. These are NOT probabilities: entries can be negative and
# rows do not sum to 1.
result = lr.decision_function(test_scaled[:5])
# Cell output: scores rounded to two decimal places.
np.round(result, decimals=2)


array([[ -6.51,   1.04,   5.17,  -2.76,   3.34,   0.35,  -0.63],
       [-10.88,   1.94,   4.78,  -2.42,   2.99,   7.84,  -4.25],
       [ -4.34,  -6.24,   3.17,   6.48,   2.36,   2.43,  -3.87],
       [ -0.69,   0.45,   2.64,  -1.21,   3.26,  -5.7 ,   1.26],
       [ -6.4 ,  -1.99,   5.82,  -0.13,   3.5 ,  -0.09,  -0.7 ]])

In [138]:
from scipy.special import softmax

# Apply softmax along axis=1 (across the class scores of each sample) to turn
# the decision_function scores in `result` into per-class probabilities.
proba = softmax(result, axis=1)
# Cell output: probabilities rounded to three decimal places.
np.round(proba, decimals=3)

array([[0.   , 0.014, 0.842, 0.   , 0.135, 0.007, 0.003],
       [0.   , 0.003, 0.044, 0.   , 0.007, 0.946, 0.   ],
       [0.   , 0.   , 0.034, 0.934, 0.015, 0.016, 0.   ],
       [0.011, 0.034, 0.305, 0.006, 0.567, 0.   , 0.076],
       [0.   , 0.   , 0.904, 0.002, 0.089, 0.002, 0.001]])