In [1]:
import pandas as pd
fish = pd.read_csv('https://bit.ly/fish_csv_data')
fish.head() # 데이터 프레임의 첫번째 행부터 5개 추출

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


In [2]:
# pandas unique : 명시된 컬럼에 중복 제거
print(pd.unique(fish['Species']))

['Bream' 'Roach' 'Whitefish' 'Parkki' 'Perch' 'Pike' 'Smelt']


In [5]:
# Species : 타겟 데이터(정답 데이터), 그외 - 학습 데이터
import numpy as np
fish_input = fish[['Weight', 'Length', 'Diagonal', 'Height', 'Width']].to_numpy()
print(fish_input[:5])

[[242.      25.4     30.      11.52     4.02  ]
 [290.      26.3     31.2     12.48     4.3056]
 [340.      26.5     31.1     12.3778   4.6961]
 [363.      29.      33.5     12.73     4.4555]
 [430.      29.      34.      12.444    5.134 ]]


In [6]:
fish_target = fish['Species'].to_numpy()  # 타겟 데이터(정답 데이터)

In [7]:
from sklearn.model_selection import train_test_split  # 학습 세트, 테스트 세트 분리

train_input, test_input, train_target, test_target = train_test_split(
    fish_input, fish_target
)

In [8]:
# 데이터 전처리 - 표준 점수 변환 
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(train_input)
train_scaled = ss.transform(train_input)  # 훈련 세트 훈련 데이터를 표준 점수로 변환
test_scaled = ss.transform(test_input) # 테스트 세트 데이터를 표준 점수로 변환
print(train_scaled[:5])

[[-0.43640294 -0.10450739 -0.21942844 -0.41551352  0.06656912]
 [ 3.46762594  3.19775616  3.10840273  0.41084745  1.78141367]
 [-0.65949031 -0.47244762 -0.55136045 -0.65148955 -0.42849823]
 [ 1.4458967   0.99931329  1.17638821  2.27051087  1.35161376]
 [ 0.81846349  0.58538054  0.76785651  1.68192887  0.66851263]]


In [9]:
# k-최근접 이웃 분류 - 분류, 확율 예측
from sklearn.neighbors import KNeighborsClassifier

kn = KNeighborsClassifier(n_neighbors=3)
kn.fit(train_scaled, train_target) # 학습

print(kn.score(train_scaled, train_target)) # 훈련 세트 정확도
print(kn.score(test_scaled, test_target)) # 테스트 세트 정확도

0.8991596638655462
0.725


In [10]:
# 클래스 목록 (분류 항목) - 알파벳 순서대로 
print(kn.classes_)

['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']


In [11]:
print(kn.predict(test_scaled[:5]))

['Perch' 'Perch' 'Perch' 'Pike' 'Perch']


In [13]:
# 분류 기준 확률 kn.predict_proba(..)
proba = kn.predict_proba(test_scaled[:5])
print(np.round(proba, decimals=3))


[[0.    0.    1.    0.    0.    0.    0.   ]
 [0.    0.    0.667 0.    0.333 0.    0.   ]
 [0.    0.    0.667 0.    0.333 0.    0.   ]
 [0.    0.    0.    1.    0.    0.    0.   ]
 [0.    0.    0.667 0.    0.333 0.    0.   ]]


In [19]:
distances, indexes = kn.kneighbors(test_scaled[2:3])
print(train_target[indexes])  # 확률로 보기에는 조금 어색 -> 로지스틱 회귀 

[['Perch' 'Perch' 'Roach']]


In [21]:
# 로지스틱 회귀 - 이진 분류(도미, 방어)
bream_smelt_indexes = (train_target == 'Bream') | (train_target == 'Smelt')
train_bream_smelt = train_scaled[bream_smelt_indexes]
target_bream_smelt = train_target[bream_smelt_indexes]
print(target_bream_smelt[:5])

['Bream' 'Bream' 'Smelt' 'Bream' 'Smelt']


In [22]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(train_bream_smelt, target_bream_smelt) # 훈련

In [23]:
print(lr.predict(train_bream_smelt[:5]))

['Bream' 'Bream' 'Smelt' 'Bream' 'Smelt']


In [25]:
# 확률 predict_proba
print(np.round(lr.predict_proba(train_bream_smelt[:5]), decimals=3))

[[1.    0.   ]
 [0.997 0.003]
 [0.033 0.967]
 [0.999 0.001]
 [0.026 0.974]]


In [26]:
print(lr.classes_)

['Bream' 'Smelt']


In [27]:
# 가중치(기울기)와 절편 
print(lr.coef_, lr.intercept_)

[[-0.43373578 -0.61832734 -0.70804417 -1.04167419 -0.77285945]] [-2.31732276]


In [28]:
# 학습을 통해서 구한 가중치와 절편으로 방정식의 결과 값을 구하는 함수 decision_function
decisions = lr.decision_function(train_bream_smelt[:5])
print(decisions)  # 결과값을 0~1 사이로 변환 - 시그모이드 함수


[-7.80503747 -5.84664097  3.38479691 -7.57358917  3.6207215 ]


In [30]:
# 시그모이드 함수로 값을 0~1 사이로 변환
from scipy.special import expit  # 시그모이드 함수 
print(np.round(expit(decisions), decimals=3))

[0.    0.003 0.967 0.001 0.974]


In [35]:
# 로지스틱 회귀 - 다중 분류 / 확률 - 소프트맥스 함수
lr = LogisticRegression(C=20, max_iter=1000)
lr.fit(train_scaled, train_target) # 학습 

print(lr.score(train_scaled, train_target)) # 학습 세트
print(lr.score(test_scaled, test_target)) # 테스트 세트

0.9159663865546218
0.9


In [36]:
print(lr.predict(test_scaled[:5]))

['Perch' 'Perch' 'Perch' 'Pike' 'Roach']


In [37]:
print(lr.classes_)

['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']


In [38]:
proba = lr.predict_proba(test_scaled[:5])
print(np.round(proba, decimals=3))

[[0.001 0.    0.982 0.    0.002 0.    0.014]
 [0.001 0.003 0.744 0.007 0.195 0.001 0.049]
 [0.    0.006 0.548 0.002 0.41  0.002 0.032]
 [0.    0.    0.003 0.992 0.001 0.001 0.003]
 [0.001 0.023 0.292 0.002 0.625 0.001 0.055]]


In [39]:
print(lr.coef_, lr.intercept_)

[[-1.67531636 -1.18232654  2.60225622  7.836148   -0.71792213]
 [-0.04467367 -1.76217426 -3.60487685  6.41822983 -2.12484656]
 [ 3.25256818  4.92528423 -8.72162749 -6.06027811  5.73842698]
 [-0.01599182  3.2372586   3.39947125 -3.00346623 -1.44586178]
 [-2.08345062 -5.64523948  4.68975793 -1.04122175  2.67132802]
 [-1.27745593  1.28394227  0.94720644 -4.7868861  -4.23761722]
 [ 1.84432023 -0.85674481  0.68781249  0.63747436  0.11649269]] [-0.07698997 -0.39531641  2.52357374 -0.51524849  1.93799793 -6.13056161
  2.6565448 ]


In [40]:
# decision_function
decisions = lr.decision_function(test_scaled[:5])
print(np.round(decisions, decimals=3))

[[  1.76   -4.287   8.583   0.656   2.584 -13.644   4.347]
 [ -3.07   -1.411   4.065  -0.558   2.729  -3.103   1.349]
 [ -4.293  -0.559   3.882  -1.781   3.592  -1.892   1.052]
 [ -3.436  -8.312   1.535   7.456   0.874   0.191   1.69 ]
 [ -3.073   0.275   2.829  -2.157   3.589  -2.629   1.165]]


In [41]:
from scipy.special import softmax

proba = softmax(decisions, axis=1)  # 행별로(axis=1)
print(np.round(proba, decimals=3))

[[0.001 0.    0.982 0.    0.002 0.    0.014]
 [0.001 0.003 0.744 0.007 0.195 0.001 0.049]
 [0.    0.006 0.548 0.002 0.41  0.002 0.032]
 [0.    0.    0.003 0.992 0.001 0.001 0.003]
 [0.001 0.023 0.292 0.002 0.625 0.001 0.055]]
