In [2]:
import pandas as pd
fish = pd.read_csv('https://bit.ly/fish_csv_data')
fish.head() # 데이터 프레임의 첫 번째 행 부터 5개 추출

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


In [3]:
# pandas.unique : 명시된 컬럼에 중복 제거
print(pd.unique(fish['Species']))

['Bream' 'Roach' 'Whitefish' 'Parkki' 'Perch' 'Pike' 'Smelt']


In [5]:
# Specties : 타켓 데이터(정답 데이터), 그 외 - 학습 데이터'
import numpy as np
fish_input = fish[['Weight','Length','Diagonal','Height','Width']].to_numpy()
print(fish_input[:5])

[[242.      25.4     30.      11.52     4.02  ]
 [290.      26.3     31.2     12.48     4.3056]
 [340.      26.5     31.1     12.3778   4.6961]
 [363.      29.      33.5     12.73     4.4555]
 [430.      29.      34.      12.444    5.134 ]]


In [9]:
fish_target = fish['Species'].to_numpy()  # 타겟 데이터(정답 데이터)

In [10]:
from sklearn.model_selection import train_test_split # 학습 세트, 테스트 세트

train_input, test_input, train_target, test_target = train_test_split(
    fish_input, fish_target)

In [11]:
# 데이터 전처리 -  표준 점수 변환
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(train_input)
train_scaled = ss.transform(train_input) # 훈련 세트 훈련 데이터를 표준 점수로 변환
test_scaled = ss.transform(test_input) # 테스트 세트 데이터를 표준 점수로 변환
print(train_scaled[:5])

[[-1.06211083 -1.47342696 -1.50690043 -1.51558146 -1.84515612]
 [-0.80188647 -0.59352996 -0.66168384 -0.81003437 -0.24649461]
 [-0.49209556 -0.41021809 -0.49264052 -0.40609868 -0.40600889]
 [ 1.11073566  0.96462097  0.84280168  0.76619318  1.72186471]
 [ 0.45074634  0.50634129  0.6906627   1.38155344  0.67687023]]


In [12]:
# k-최근접 이웃 분류 - 분류, 확율 예측
from sklearn.neighbors import KNeighborsClassifier

kn = KNeighborsClassifier(n_neighbors=3)
kn.fit(train_scaled, train_target) # 학습

print(kn.score(train_scaled, train_target)) # 훈련 세트 정확도
print(kn.score(test_scaled, test_target)) # 테스트 세트 정확도

0.907563025210084
0.725


In [13]:
# 클래스 목록 (분류 항목) - 알파벳 순서대로
print(kn.classes_)

['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']


In [14]:
print(kn.predict(test_scaled[:5]))

['Perch' 'Perch' 'Pike' 'Perch' 'Bream']


In [16]:
# 분류 기준 확류 kn.predict_proba(..)
proba = kn.predict_proba(test_scaled[:5])
print(np.round(proba, decimals=3))

[[0.    0.    1.    0.    0.    0.    0.   ]
 [0.    0.    1.    0.    0.    0.    0.   ]
 [0.    0.    0.    1.    0.    0.    0.   ]
 [0.    0.    0.667 0.    0.333 0.    0.   ]
 [1.    0.    0.    0.    0.    0.    0.   ]]


In [17]:
distances, indexes = kn.kneighbors(test_scaled[2:3])
print(train_target[indexes]) # 확률로 보기에는 조금 어색 -> 로지스틱 회귀


[['Pike' 'Pike' 'Pike']]


In [20]:
# 로지스틱 회귀 - 이진 분류(도미, 방어)
bream_smelt_indexes = (train_target == 'Bream') | (train_target == 'Smelt')
train_bream_smelt = train_scaled[bream_smelt_indexes]
target_bream_smelt = train_target[bream_smelt_indexes]
print(target_bream_smelt[:5])

['Smelt' 'Bream' 'Smelt' 'Bream' 'Bream']


In [21]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(train_bream_smelt, target_bream_smelt) # 훈련

In [22]:
print(lr.predict(train_bream_smelt[:5]))

['Smelt' 'Bream' 'Smelt' 'Bream' 'Bream']


In [23]:
# 확률 predict_proba
print(np.round(lr.predict_proba(train_bream_smelt[:5]), decimals=3))

[[0.034 0.966]
 [0.993 0.007]
 [0.029 0.971]
 [0.999 0.001]
 [0.98  0.02 ]]


In [24]:
print(lr.classes_)

['Bream' 'Smelt']


In [25]:
# 가중치(기울기)와 절편 
print(lr.coef_, lr.intercept_)

[[-0.45602524 -0.62529069 -0.70501653 -0.99724969 -0.76490452]] [-2.04668198]


In [26]:
# 학습을 통해서 구한 가중치와 절편으로 방정식의 결과 값을 구하는 함수 decision_function
decisions = lr.decision_function(train_bream_smelt[:5])
print(decisions)  # 결과값을 0~1 사이로 변환 - 시그모이드 함수

[ 3.34415863 -4.95126763  3.50209638 -7.15721827 -3.91747867]


In [27]:
# 시그모이드 함수로 값을 0~1 사이로 변환
from scipy.special import expit  # 시그모이드 함수 
print(np.round(expit(decisions), decimals=3))

[0.966 0.007 0.971 0.001 0.02 ]


In [28]:
# 로지스틱 회귀 - 다중 분류 / 확률 - 소프트맥스 함수
lr = LogisticRegression(C=20, max_iter=1000)
lr.fit(train_scaled, train_target) # 학습 

print(lr.score(train_scaled, train_target)) # 학습 세트
print(lr.score(test_scaled, test_target)) # 테스트 세트

0.9411764705882353
0.85


In [29]:
print(lr.predict(test_scaled[:5]))

['Perch' 'Perch' 'Pike' 'Perch' 'Bream']


In [30]:
print(lr.classes_)

['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']


In [31]:
proba = lr.predict_proba(test_scaled[:5])
print(np.round(proba, decimals=3))

[[0.007 0.    0.892 0.001 0.    0.    0.099]
 [0.    0.003 0.832 0.    0.157 0.001 0.007]
 [0.    0.    0.071 0.903 0.02  0.    0.006]
 [0.001 0.003 0.759 0.009 0.191 0.    0.037]
 [0.999 0.    0.    0.    0.    0.    0.001]]


In [32]:
print(lr.coef_, lr.intercept_)

[[-1.05659403 -0.78484375  2.34110536  7.67743913 -0.66059656]
 [-0.60231575 -2.19715217 -3.33095998  6.91253761 -2.09953576]
 [ 4.23271714  5.36213529 -8.81266516 -6.5326481   5.02078486]
 [ 0.07478801  3.44596281  3.58708453 -3.00506475 -1.62803208]
 [-3.68573608 -5.20003838  4.49822882 -0.39932598  2.66874327]
 [-1.40337848  0.78559524  1.04617534 -5.09557389 -4.41623152]
 [ 2.44051919 -1.41165905  0.67103108  0.44263598  1.11486778]] [ 0.16065003 -0.29694411  2.8535189  -0.0896373   1.5520009  -6.95472602
  2.77513761]


In [33]:
# decision_function
decisions = lr.decision_function(test_scaled[:5])
print(np.round(decisions, decimals=3))

[[  4.203  -5.234   9.056   2.439  -0.21  -17.116   6.862]
 [ -5.875  -0.258   5.369  -2.153   3.703  -1.338   0.553]
 [ -4.006  -8.448   3.851   6.395   2.589  -1.691   1.31 ]
 [ -3.048  -1.324   4.239  -0.196   2.86   -3.757   1.225]
 [ 15.446   5.129  -3.322  -0.941  -1.726 -22.518   7.932]]


In [34]:
from scipy.special import softmax

proba = softmax(decisions, axis=1)  # 행별로(axis=1)
print(np.round(proba, decimals=3))

[[0.007 0.    0.892 0.001 0.    0.    0.099]
 [0.    0.003 0.832 0.    0.157 0.001 0.007]
 [0.    0.    0.071 0.903 0.02  0.    0.006]
 [0.001 0.003 0.759 0.009 0.191 0.    0.037]
 [0.999 0.    0.    0.    0.    0.    0.001]]
