In [None]:
# Pipeline smoke test: confirm that the execution environment can run a cell.
print("hello world")

hello world


In [None]:
# ===============================================
# Step 1: import the required libraries and load the dataset
# ===============================================
from sklearn.datasets import load_iris          # dataset loader
from sklearn.tree import DecisionTreeClassifier # decision-tree classification model
from sklearn.model_selection import train_test_split # train/test splitting helper

# Load the Iris dataset bundled with scikit-learn.
# - 150 samples, 4 features (sepal length/width, petal length/width)
# - 3 classes (Setosa, Versicolor, Virginica)
# load_iris() returns a dict-like Bunch rather than a bare array; the NumPy
# arrays are stored under its fields (e.g. "data", "target").
data = load_iris()

# Show only the field names. Echoing the whole Bunch prints every sample and
# all attached metadata, which buries the rest of the notebook.
data.keys()

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [None]:
# ===============================================
# Step 2: extract the feature matrix (X)
# ===============================================
# ML convention: the feature matrix is written as an uppercase X.
# The "data" field holds the input features, shape (150, 4).
X = data["data"]

In [None]:
# Check the dimensions of the feature matrix as (rows, columns).
# shape: (number of samples, number of features) = (150, 4)
X.shape

(150, 4)

In [None]:
# ===============================================
# Step 3: extract the target vector (y)
# ===============================================
# ML convention: the target is written as a lowercase y.
# Class labels: 0 = Setosa, 1 = Versicolor, 2 = Virginica.
y = data["target"]
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [None]:
# ===============================================
# Step 4: build a DataFrame for data exploration
# ===============================================
import pandas as pd
import numpy as np

# A pandas DataFrame is convenient for preprocessing and visualisation.
# copy=True gives the frame its own buffer: depending on the pandas version,
# a frame built from a 2-D ndarray may otherwise share memory with
# `data.data` (the same array used as X), so an in-place edit made while
# exploring `df` could silently change the training data.
df = pd.DataFrame(data=data.data, columns=data.feature_names, copy=True)
df['label'] = data.target  # append the target as an extra column
df.head()  # preview the first 5 rows

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [None]:
# ===============================================
# Step 5: split into training and test sets
# ===============================================
# train_test_split partitions the samples into a training subset and a test subset.
# - test_size=0.2   : hold out 20% of the data for testing (120 train / 30 test)
# - random_state=18 : fixed seed so the same split is reproduced on every run
# No `stratify` argument is passed, so the class mix of each subset is whatever
# the seeded shuffle produces.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=18,
)

In [None]:
# Confirm the shapes of the four arrays produced by the split.
# Expected: X_train (120, 4), X_test (30, 4), y_train (120,), y_test (30,)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(120, 4) (30, 4) (120,) (30,)


In [None]:
# ===============================================
# Step 6: create and train the model
# ===============================================
# Decision-tree classifier.
# - random_state=11: seed that makes tree construction reproducible
# Note: results can still change if the data or the preprocessing changes.
# fit() returns the estimator itself, so construction and training are chained.
model = DecisionTreeClassifier(random_state=11).fit(X_train, y_train)

# Display the fitted estimator as the cell output.
model

0,1,2
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`.",'gini'
,"splitter  splitter: {""best"", ""random""}, default=""best"" The strategy used to choose the split at each node. Supported strategies are ""best"" to choose the best split and ""random"" to choose the best random split.",'best'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: int, float or {""sqrt"", ""log2""}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at  each split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. note::  The search for a split does not stop until at least one  valid partition of the node samples is found, even if it requires to  effectively inspect more than ``max_features`` features.",
,"random_state  random_state: int, RandomState instance or None, default=None Controls the randomness of the estimator. The features are always randomly permuted at each split, even if ``splitter`` is set to ``""best""``. When ``max_features < n_features``, the algorithm will select ``max_features`` at random at each split before finding the best split among them. But the best found split may vary across different runs, even if ``max_features=n_features``. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed to an integer. See :term:`Glossary ` for details.",11
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0


In [None]:
# ===============================================
# Step 7: predict on the test data
# ===============================================
# Use the trained model to predict the class of each test sample.
pred = model.predict(X_test)
pred

array([1, 1, 1, 0, 0, 0, 1, 0, 2, 1, 2, 1, 0, 2, 0, 1, 0, 2, 0, 0, 1, 2,
       2, 1, 2, 0, 0, 0, 2, 2])

In [None]:
# Show the true labels so they can be compared against the predictions.
y_test

array([1, 1, 1, 0, 0, 0, 2, 0, 2, 1, 2, 1, 0, 2, 0, 1, 0, 2, 0, 0, 1, 2,
       2, 1, 2, 0, 0, 0, 2, 2])

In [None]:
# ===============================================
# Step 8: evaluate model performance
# ===============================================
# accuracy_score measures how often the predictions match the true labels:
# (number of correctly predicted samples) / (total number of samples).
from sklearn.metrics import accuracy_score

# The result lies between 0.0 and 1.0, where 1.0 means every prediction is correct.
accuracy = accuracy_score(y_true=y_test, y_pred=pred)
accuracy

0.9666666666666667