##### 목표 : 생선 분류 모델
- 데이터 : fish.csv
- feature : 5개; Weight, Length, Diagonal,  Height, Width
- target : 1개; Species
- 방법 : 지도학습 + 다중 분류

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [2]:
# Load Data
data_file = '../../DATA/fish.csv'
fishDF = pd.read_csv(data_file)
fishDF.head(), fishDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Species   159 non-null    object 
 1   Weight    159 non-null    float64
 2   Length    159 non-null    float64
 3   Diagonal  159 non-null    float64
 4   Height    159 non-null    float64
 5   Width     159 non-null    float64
dtypes: float64(5), object(1)
memory usage: 7.6+ KB


(  Species  Weight  Length  Diagonal   Height   Width
 0   Bream   242.0    25.4      30.0  11.5200  4.0200
 1   Bream   290.0    26.3      31.2  12.4800  4.3056
 2   Bream   340.0    26.5      31.1  12.3778  4.6961
 3   Bream   363.0    29.0      33.5  12.7300  4.4555
 4   Bream   430.0    29.0      34.0  12.4440  5.1340,
 None)

In [3]:
# 2. Seperate feature and target
featureDF = fishDF.drop('Species', axis=1)
targetSR = fishDF['Species']
targetSR.nunique()  # 7

# 1) Count each class
targetSR.value_counts() / targetSR.shape[0]     # ratios of Species

Species
Perch        0.352201
Bream        0.220126
Roach        0.125786
Pike         0.106918
Smelt        0.088050
Parkki       0.069182
Whitefish    0.037736
Name: count, dtype: float64

In [4]:
# 2-2. 학습용/테스트용 데이터셋 준비
X_train, X_test, y_train, y_test = train_test_split(featureDF, targetSR, 
                                                    random_state=11, 
                                                    stratify=targetSR)

print(f'[Train Dataset] {X_train.shape}, {y_train.shape}')
print(f'[Test Dataset] {X_test.shape}, {y_test.shape}')

[Train Dataset] (119, 5), (119,)
[Test Dataset] (40, 5), (40,)


In [5]:
# 2-3. 모듈 인스턴스 생성 및 학습
model = LogisticRegression(max_iter=20000, solver='liblinear')
model.fit(X_train, y_train)

In [6]:
# 학습 후 결정된 모델 파라미터 확인
print('classes_ :', model.classes_)                     # target의 종류
print('feature_name_in_ :', model.feature_names_in_)    # feature 4개
print('n_iter_ :', model.n_iter_)                       # 8개
# The n_iter_ attribute is used to store the number of iterations a model has been trained for.
print('coef_ :', model.coef_)                           # 가중치는 5*7개
print('intercept_ :', model.intercept_)                 # 절편 3개

classes_ : ['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']
feature_name_in_ : ['Weight' 'Length' 'Diagonal' 'Height' 'Width']
n_iter_ : [20 22 19 18 17 16 19]
coef_ : [[ 1.31151754e-02 -1.64944474e+00  8.28009603e-01  1.41621596e+00
  -4.15067211e-01]
 [-2.10617657e-02  3.33701594e-01 -9.64909143e-01  2.19381184e+00
   2.66611705e-02]
 [-1.97453975e-03  2.60616873e+00 -2.66412260e+00 -7.93176726e-03
   1.91659551e+00]
 [ 1.01422059e-02  2.55168743e-01  1.51461260e-01 -1.94779290e+00
  -8.36602128e-01]
 [-9.89829706e-03 -1.72578825e+00  1.53807538e+00 -5.12880032e-01
   1.65750894e+00]
 [-7.29426634e-02  3.82049401e-01  1.62783679e-01 -1.55364795e+00
  -5.97839461e-01]
 [ 5.67536044e-03 -5.15807250e-01  2.50622960e-01 -2.45458510e-01
   8.38223029e-01]]
intercept_ : [-0.27362899  0.07982094 -0.34682853 -1.23222237 -1.32590576  0.41907035
 -0.35145235]


##### 4) 성적 평가

In [7]:
# 성적 확인 => 좋다
print('[Train Dataset]', model.score(X_train, y_train))
print('[Test Dataset]', model.score(X_test, y_test))

[Train Dataset] 0.9495798319327731
[Test Dataset] 0.975


##### 5) 모델 활용

In [8]:
# 
y_pre = model.predict(X_test.iloc[[0]])

y_pre, y_test[:1]   # 예측값 일치

(array(['Bream'], dtype=object),
 1    Bream
 Name: Species, dtype: object)

In [9]:
pd.DataFrame(model.predict_proba(X_test.iloc[[0]]))

Unnamed: 0,0,1,2,3,4,5,6
0,0.504331,0.310863,0.000376,2.252092e-07,0.172952,6.179376e-13,0.011478


In [10]:
# 5개 데이터에 대한 생선 분류 예측
pd.DataFrame(np.round(model.predict_proba(X_test.iloc[:5]), 4), columns=model.classes_)

Unnamed: 0,Bream,Parkki,Perch,Pike,Roach,Smelt,Whitefish
0,0.5043,0.3109,0.0004,0.0,0.173,0.0,0.0115
1,0.1575,0.7304,0.0443,0.0,0.0573,0.0,0.0104
2,0.7717,0.0244,0.0009,0.0,0.1796,0.0,0.0234
3,0.0008,0.0887,0.7186,0.0023,0.1554,0.0042,0.0299
4,0.0003,0.0211,0.7531,0.0089,0.176,0.0094,0.0313


In [11]:
result = model.predict_proba(X_test.iloc[:5]).argmax(axis=1)    
# .argmax() 의미 : a method used in Python to find the index of the maximum value in a NumPy array.
result

array([0, 1, 0, 2, 2], dtype=int64)

In [12]:
# 정답 비교
print(model.classes_[result])
print('정답 :', y_test[:5].to_list())

['Bream' 'Parkki' 'Bream' 'Perch' 'Perch']
정답 : ['Bream', 'Parkki', 'Bream', 'Perch', 'Perch']


In [13]:
# [ OvR : OneVsRestClassifier ]
# : One vs Rest 였다!

# 파일 만들기 : '03_OvR.ipynb'
# fin = open('03_OvR.ipynb', 'w')   ㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠ

In [14]:
# fin. 인줄 알았지만 아니어따

6) 모델 성능 평가
- 정확도
- 정밀도
- 재현율
- F1 스코어
- Confusion Metrics
- Classification Report

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix, classification_report

In [16]:
# classification_report : 

y_pre = model.predict(X_test)
print(classification_report(y_test, y_pre))

              precision    recall  f1-score   support

       Bream       1.00      1.00      1.00         9
      Parkki       1.00      1.00      1.00         3
       Perch       0.93      1.00      0.97        14
        Pike       1.00      1.00      1.00         4
       Roach       1.00      1.00      1.00         5
       Smelt       1.00      1.00      1.00         4
   Whitefish       0.00      0.00      0.00         1

    accuracy                           0.97        40
   macro avg       0.85      0.86      0.85        40
weighted avg       0.95      0.97      0.96        40



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
# feature, target 재지정
featureDF = fishDF.drop['species']

TypeError: 'method' object is not subscriptable

In [38]:
# 
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
targetSR = encoder.fit_transform(targetSR)

X_train, X_test, y_train, y_test = train_test_split(featureDF, targetSR, test_size=0.3, random_state=42)

In [39]:
model.fit(X_train, y_train)

In [40]:
# 

f1_score(y_test, model.predict(X_test), average='weighted')

0.9087962962962962

In [36]:
recall_score(y_test, model.predict(X_test), average='micro')

ValueError: Found input variables with inconsistent numbers of samples: [48, 111]

In [21]:
confusion_matrix(y_test, model.predict(X_test))

array([[ 9,  0,  0,  0,  0,  0,  0],
       [ 0,  3,  0,  0,  0,  0,  0],
       [ 0,  0, 14,  0,  0,  0,  0],
       [ 0,  0,  0,  4,  0,  0,  0],
       [ 0,  0,  0,  0,  5,  0,  0],
       [ 0,  0,  0,  0,  0,  4,  0],
       [ 0,  0,  1,  0,  0,  0,  0]], dtype=int64)

- f1_score() 사용 설명

In [None]:
# Actual Condition <-> Predicted condition
# : both Positive = True Positive
#   predict만 긍정 = Falsse Positive
# : Negative도 동일

# - 긍정/부정을 각각의 입장에서 바라봄
# [ 재현율 : 정답지 입장 ] : 진짜 정답은 몇 개인가
#                           TP / (TP + FN)

# [ 정밀도 : 모듈 입장 ] : 내가 긍정이라 한 것 중 몇 개 맞췄나
#                         TP / ( TP + FP)
                          
# => 둘 다 높은 값 : f1_score()

# ==> 하지만 둘 다 높긴 힘들고, 더 선호하는 특성이 있다.
#     예) 암진단 : 정밀도보단 재현율이 더 낫다

# ==> 이제 r2, mae, rmse에 이 수치도 보아야한다

In [None]:
# fin.