# 펭귄 분류 ML

## 0. 필요 의존성

In [1]:
# Data 연산
import pandas as pd
import numpy as np

# 시각화
import seaborn as sns
import matplotlib.pyplot as plt

# 전처리 
from sklearn.impute import SimpleImputer # 결측치 처리
from sklearn.preprocessing import LabelEncoder # 라벨링
from sklearn.preprocessing import StandardScaler, MinMaxScaler # 표준화/정규화

from sklearn.model_selection import StratifiedKFold  # 편중 방지(cv 값 생성)
from sklearn.model_selection import train_test_split # 데이터 분할

# 모델
from sklearn.ensemble import RandomForestClassifier # 분류 모델
from sklearn.tree import DecisionTreeClassifier # 분류 모델
from sklearn.linear_model import LogisticRegression # 회귀 모델
from sklearn.metrics import accuracy_score # 모델 평가

# ETC
import warnings # Deprecate Warning 제거
warnings.filterwarnings(action='ignore')

## 1. 데이터 파악

In [2]:
df = pd.read_csv('../data/penguins_size.csv')
df_raw = df.copy() # 원본 데이터 혹시 모르니 keep

df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [4]:
df.describe()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
count,342.0,342.0,342.0,342.0
mean,43.92193,17.15117,200.915205,4201.754386
std,5.459584,1.974793,14.061714,801.954536
min,32.1,13.1,172.0,2700.0
25%,39.225,15.6,190.0,3550.0
50%,44.45,17.3,197.0,4050.0
75%,48.5,18.7,213.0,4750.0
max,59.6,21.5,231.0,6300.0


In [5]:
df['island'].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [6]:
# 불필요한 컬럼 제거(상상력으로 최대한 제거하지 않고 활용 요망)

# 이름과 고유번호 같은 값 아니면 제거하지 말자 - 내가 모르는 연관이 있을 수 있음
#df.drop(['island', 'sex'], axis = 1, inplace=True) # inplace=True : 원본을 변경(재할당)
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


## 2. 결측치 처리

In [7]:
df.isnull().sum()

df['culmen_length_mm'] = df['culmen_length_mm'].fillna(df['culmen_length_mm'].mean())
df['culmen_depth_mm'] = df['culmen_depth_mm'].fillna(df['culmen_depth_mm'].mean())
df['flipper_length_mm'] = df['flipper_length_mm'].fillna(df['flipper_length_mm'].mean())
df['body_mass_g'] = df['body_mass_g'].fillna(df['body_mass_g'].mean())

## 3. 이상치 제거

In [8]:
def remove_iqr_same(df, columns):
    
    df_clean = df.copy()

    ranges = []
    
    for col in columns:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
    
        lower = Q1 - (1.5 * IQR)
        upper = Q3 + (1.5 * IQR)
    
        ranges.append((lower, upper))
    
    for index, col in enumerate(columns):
        lower, upper = ranges[index]
        df_clean = df_clean[(df_clean[col] >= lower) & (df_clean[col] <= upper)]

    return df_clean

In [9]:
df = remove_iqr_same(df, ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g'])

## 4. 범주형 인코딩

In [10]:
labeling_columns_simple(df, ['species', 'island', 'sex'])

NameError: name 'labeling_columns_simple' is not defined

## 5. 스케일링

In [None]:
# 스케일링 
std_scaler = StandardScaler() # 표준화(머신러닝에서 가장 일반적)
std_scaler.fit_transform(df)

array([[-1.0298023 ,  1.84407623, -0.88708123, ..., -1.42248782,
        -0.56578921,  0.8170105 ],
       [-1.0298023 ,  1.84407623, -0.81349399, ..., -1.06535169,
        -0.50316788, -0.97312716],
       [-1.0298023 ,  1.84407623, -0.66631952, ..., -0.42250666,
        -1.19200251, -0.97312716],
       ...,
       [ 1.21229891, -0.91402039,  1.1917582 , ...,  1.50602843,
         1.93906399,  0.8170105 ],
       [ 1.21229891, -0.91402039,  0.23512413, ...,  0.79175618,
         1.25022936, -0.97312716],
       [ 1.21229891, -0.91402039,  1.09977416, ...,  0.8631834 ,
         1.50071468,  0.8170105 ]], shape=(344, 7))

## 6. 데이터 분할

In [None]:
# 데이터 분리

X = df.drop('species', axis=1)
y = df['species']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=7)

## 7. ML 학습 및 모델 예측

In [None]:
# 분류 모델

dt_model = DecisionTreeClassifier()
rf_model = RandomForestClassifier()
lr_model = LogisticRegression()

In [None]:
dt_model.fit(X_train, y_train) # (테스트 문제, 테스트 답) - 지도 학습
result = dt_model.predict(X_test) # (테스트 문제) => 예측한 답(result) -> 실제 답(y_test)과 비교
accuracy_score(result, y_test)

0.9420289855072463

In [None]:
rf_model.fit(X_train, y_train) 
result = rf_model.predict(X_test) 
accuracy_score(result, y_test)

0.9855072463768116

In [None]:
lr_model.fit(X_train, y_train) 
result = lr_model.predict(X_test) 
accuracy_score(result, y_test)

1.0