In [5]:
import pandas as pd

wine = pd.read_csv('/home/lws/kulws2025/kubig2025/ml_dl_python/data/wine.csv')

data = wine[['alcohol', 'sugar', 'pH']]
target = wine['class']

wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [10]:
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(data, target, test_size=0.2, random_state=42)
print(train_input.shape, test_input.shape)


(5197, 3) (1300, 3)


In [11]:
sub_input, val_input, sub_target, val_target = train_test_split(train_input, train_target, test_size=0.2, random_state=42)

print(sub_input.shape, val_input.shape)

(4157, 3) (1040, 3)


In [12]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)
print(dt.score(sub_input, sub_target))
print(dt.score(val_input, val_target))

0.9971133028626413
0.864423076923077


In [13]:
from sklearn.model_selection import cross_validate
scores = cross_validate(dt, train_input, train_target)
print(scores)

{'fit_time': array([0.01113296, 0.00855756, 0.0120647 , 0.01189518, 0.00818658]), 'score_time': array([0.00535202, 0.00233746, 0.00301099, 0.00220394, 0.00230193]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [14]:
import numpy as np

print(np.mean(scores['test_score']))

0.855300214703487


In [None]:
# 위의 코드와 동일 기본적으로 회귀, 분류에 따라 자동으로 분류기 적용
from sklearn.model_selection import StratifiedKFold

scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())
print(np.mean(scores['test_score']))

0.855300214703487


In [None]:
# 분류기 랜덤
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores['test_score']))

0.8574181117533719


In [None]:
# 파라미터 범위에서 가장 좋은 파라미터 추출 cv=분류기(shuffle=true)하면 교차검증 랜덤
from sklearn.model_selection import GridSearchCV
params = {'min_impurity_decrease' : [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)

dt = gs.best_estimator_
print(dt.score(train_input, train_target))

0.9615162593804117


In [18]:
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [19]:
print(gs.cv_results_['mean_test_score'])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [20]:
print(gs.cv_results_['params'][gs.best_index_])

{'min_impurity_decrease': 0.0001}


In [23]:
params = {'min_impurity_decrease' : np.arange(0.0001, 0.001, 0.0001), 'max_depth':range(5, 20, 1), 'min_samples_split':range(2, 100, 10)}
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)

print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': np.float64(0.0004), 'min_samples_split': 12}


In [24]:
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


In [26]:
# 파라미터 범위 랜덤 서치
from scipy.stats import uniform, randint  # 균등분포 샘폴링
rgen = randint(0, 10)
rgen.rvs(10)

array([1, 1, 3, 4, 6, 0, 6, 8, 8, 6])

In [27]:
np.unique(rgen.rvs(1000), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 98,  96, 103, 106,  97, 102,  85, 111, 109,  93]))

In [28]:
ugen = uniform(0, 1)
ugen.rvs(10)

array([0.57960441, 0.2250933 , 0.53528459, 0.63795995, 0.73660822,
       0.02012556, 0.05264624, 0.64584768, 0.02419487, 0.59661254])

In [37]:
params = {'min_impurity_decrease' : uniform(0.0001, 0.001), 'max_depth': randint(20, 50), 'min_samples_split': randint(2, 25), 'min_samples_leaf': randint(1, 25)}

from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params, n_iter=100, n_jobs=-1, random_state=42)
rs.fit(train_input, train_target)

print(rs.best_estimator_)

DecisionTreeClassifier(max_depth=39,
                       min_impurity_decrease=np.float64(0.00034102546602601173),
                       min_samples_leaf=7, min_samples_split=13,
                       random_state=42)


In [33]:
print(rs.cv_results_['mean_test_score'])
print(np.max(rs.cv_results_['mean_test_score']))

[0.86511513 0.86261235 0.86838528 0.86588547 0.86376731 0.86434497
 0.86280503 0.86280484 0.86357592 0.86357555 0.86280503 0.8626142
 0.86472977 0.86954283 0.86203543 0.86761827 0.86222884 0.86473033
 0.86877082 0.86184423 0.86126657 0.86511494 0.8626142  0.86203543
 0.86511476 0.86607722 0.86222773 0.86684682 0.86261309 0.86338436
 0.8629977  0.86242171 0.86184478 0.86165211 0.86049808 0.86530706
 0.86280521 0.86684775 0.86203524 0.86318983 0.86780947 0.86761624
 0.86126694 0.86934867 0.86857889 0.86530743 0.86434497 0.86415303
 0.86838602 0.86530688 0.86145813 0.86684626 0.8618446  0.86145961
 0.86338454 0.86569131 0.86242152 0.86376805 0.86203543 0.86376916
 0.86511457 0.86184275 0.86338454 0.86242004 0.86107481 0.86203654
 0.86184478 0.86434552 0.86184478 0.86338473 0.86299993 0.8641534
 0.86338269 0.85972662 0.86415303 0.86665433 0.86261253 0.86222884
 0.86858111 0.86472903 0.86242097 0.86261457 0.86742448 0.86434497
 0.86684682 0.86184423 0.86107481 0.86877193 0.86338362 0.862421

In [35]:
dt = rs.best_estimator_
print(dt.score(train_input, train_target))
print(dt.score(test_input, test_target))

0.8928227823744468
0.86


In [None]:
rans = RandomizedSearchCV(DecisionTreeClassifier(splitter= 'random', random_state=42), params, n_iter=100, n_jobs=-1, random_state=42)
rans.fit(train_input, train_target)

print(rans.best_estimator_)
print(np.max(rans.cv_results_['mean_test_score']))  # 교차 검증 데이터 결과의 평균에서 제일 큰 값

DecisionTreeClassifier(max_depth=43,
                       min_impurity_decrease=np.float64(0.00011407982271508446),
                       min_samples_leaf=19, min_samples_split=18,
                       random_state=42, splitter='random')
0.8458726956392981


In [None]:
dt = rans.best_estimator_
print(dt.score(train_input, train_target))   # 훈련 데이터에 대한 성능평가
print(dt.score(test_input, test_target))

0.8043101789493938
0.786923076923077
