In [6]:
# 분류 & 결정 트리
# 문제1

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the digits dataset and separate the feature matrix from the labels.
digits = load_digits()
X = digits.data
y = digits.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline decision tree with default hyperparameters (fit() returns the estimator).
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Share of test samples whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8416666666666667


In [7]:
# 문제2

# Train both trees first: one capped at depth 3, one with max_depth left at its default.
model_limited_depth = DecisionTreeClassifier(max_depth=3, random_state=42).fit(
    X_train, y_train
)
model_unlimited_depth = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict on the same held-out set so the two scores are directly comparable.
y_pred_limited = model_limited_depth.predict(X_test)
y_pred_unlimited = model_unlimited_depth.predict(X_test)

accuracy_limited = accuracy_score(y_test, y_pred_limited)
accuracy_unlimited = accuracy_score(y_test, y_pred_unlimited)

print("Accuracy with max_depth=3:", accuracy_limited)
print("Accuracy without max_depth limit:", accuracy_unlimited)

Accuracy with max_depth=3: 0.4722222222222222
Accuracy without max_depth limit: 0.8416666666666667


In [8]:
# 문제3

# Importance scores of the fitted tree stored in `model_limited_depth`.
feature_importances = model_limited_depth.feature_importances_

# Print one line per feature index, formatted to four decimal places.
print("\nFeature Importances:")
for i in range(len(feature_importances)):
    importance = feature_importances[i]
    print(f"Feature {i}: {importance:.4f}")


Feature Importances:
Feature 0: 0.0000
Feature 1: 0.0000
Feature 2: 0.0000
Feature 3: 0.0000
Feature 4: 0.0000
Feature 5: 0.0000
Feature 6: 0.0000
Feature 7: 0.0000
Feature 8: 0.0000
Feature 9: 0.0000
Feature 10: 0.0000
Feature 11: 0.0000
Feature 12: 0.0000
Feature 13: 0.0000
Feature 14: 0.0000
Feature 15: 0.0000
Feature 16: 0.0000
Feature 17: 0.0000
Feature 18: 0.0000
Feature 19: 0.0000
Feature 20: 0.0000
Feature 21: 0.2492
Feature 22: 0.0000
Feature 23: 0.0000
Feature 24: 0.0000
Feature 25: 0.0000
Feature 26: 0.0000
Feature 27: 0.0000
Feature 28: 0.0000
Feature 29: 0.0000
Feature 30: 0.0000
Feature 31: 0.0000
Feature 32: 0.0000
Feature 33: 0.0000
Feature 34: 0.0000
Feature 35: 0.0000
Feature 36: 0.2256
Feature 37: 0.0000
Feature 38: 0.0000
Feature 39: 0.0000
Feature 40: 0.0000
Feature 41: 0.0000
Feature 42: 0.3434
Feature 43: 0.0000
Feature 44: 0.0000
Feature 45: 0.0000
Feature 46: 0.0000
Feature 47: 0.0000
Feature 48: 0.0000
Feature 49: 0.0000
Feature 50: 0.0000
Feature 51: 0.0000


In [9]:
# 문제4

# Split the same data 70:30 and train a fresh tree on that split.
X_train_70, X_test_30, y_train_70, y_test_30 = train_test_split(
    X, y, test_size=0.3, random_state=42
)
model_70_30 = DecisionTreeClassifier(random_state=42).fit(X_train_70, y_train_70)
y_pred_70_30 = model_70_30.predict(X_test_30)
accuracy_70_30 = accuracy_score(y_test_30, y_pred_70_30)

# NOTE(review): `model`, `X_test` and `y_test` are not defined in this cell;
# presumably they are still the 80:20 objects created by an earlier cell —
# confirm the notebook is executed top to bottom.
y_pred_80_20 = model.predict(X_test)
accuracy_80_20 = accuracy_score(y_test, y_pred_80_20)

print("\nAccuracy with 70:30 split:", accuracy_70_30)
print("Accuracy with 80:20 split:", accuracy_80_20)


Accuracy with 70:30 split: 0.8425925925925926
Accuracy with 80:20 split: 0.8416666666666667


In [10]:
# 문제5

from sklearn.model_selection import GridSearchCV

# Candidate values: 4 depths x 3 split thresholds = 12 combinations.
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
}

# Score every combination with 5-fold cross-validation on the training data.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
).fit(X_train, y_train)

best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("\nBest Hyperparameters:", best_params)
print("Best Cross-Validation Accuracy:", best_score)


Best Hyperparameters: {'max_depth': None, 'min_samples_split': 2}
Best Cross-Validation Accuracy: 0.8510646535036779


In [16]:
# 앙상블
# 문제1

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

# Load the wine dataset; from here on X and y refer to it.
wine = load_wine()
X = wine.data
y = wine.target

# 75% train / 25% test, seeded so the ensemble cells below see the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=11
)

In [17]:
# 문제2

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Fit three classifiers on the same training split.
# probability=True makes the SVC produce class-probability estimates.
svc_clf = SVC(probability=True, random_state=11).fit(X_train, y_train)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=11).fit(X_train, y_train)
gb_clf = GradientBoostingClassifier(random_state=11).fit(X_train, y_train)

# Score each model on the held-out test split.
svc_pred = svc_clf.predict(X_test)
rf_pred = rf_clf.predict(X_test)
gb_pred = gb_clf.predict(X_test)

svc_acc = accuracy_score(y_test, svc_pred)
rf_acc = accuracy_score(y_test, rf_pred)
gb_acc = accuracy_score(y_test, gb_pred)

print(f"SVC Accuracy: {svc_acc:.4f}")
print(f"RandomForest Accuracy: {rf_acc:.4f}")
print(f"GradientBoosting Accuracy: {gb_acc:.4f}")

SVC Accuracy: 0.8000
RandomForest Accuracy: 0.9778
GradientBoosting Accuracy: 0.9556


In [18]:
# 문제3

from sklearn.ensemble import VotingClassifier

# Ensemble members as (name, estimator) pairs.
hard_voting_members = [
    ('svc', svc_clf),
    ('rf', rf_clf),
    ('gb', gb_clf),
]

# Hard voting: the ensemble is built with voting='hard' and fit on the training split.
hard_voting_clf = VotingClassifier(estimators=hard_voting_members, voting='hard')
hard_voting_clf.fit(X_train, y_train)

hard_voting_pred = hard_voting_clf.predict(X_test)
hard_voting_acc = accuracy_score(y_test, hard_voting_pred)
print(f"Hard Voting Classifier Accuracy: {hard_voting_acc:.4f}")

Hard Voting Classifier Accuracy: 0.9556


In [19]:
# 문제4

# Ensemble members as (name, estimator) pairs.
soft_voting_members = [
    ('svc', svc_clf),
    ('rf', rf_clf),
    ('gb', gb_clf),
]

# Soft voting: the ensemble is built with voting='soft' and fit on the training split.
soft_voting_clf = VotingClassifier(estimators=soft_voting_members, voting='soft')
soft_voting_clf.fit(X_train, y_train)

soft_voting_pred = soft_voting_clf.predict(X_test)
soft_voting_acc = accuracy_score(y_test, soft_voting_pred)
print(f"Soft Voting Classifier Accuracy: {soft_voting_acc:.4f}")

Soft Voting Classifier Accuracy: 0.9556


In [21]:
# 문제5

from sklearn.ensemble import BaggingClassifier

# Bag 50 gradient-boosting models, each trained on a bootstrap sample.
# oob_score=True asks for an out-of-bag score, read back below as `oob_score_`.
bagging_clf = BaggingClassifier(
    estimator=GradientBoostingClassifier(),
    n_estimators=50,
    bootstrap=True,
    oob_score=True,
    random_state=11,
).fit(X_train, y_train)

print(f"OOB Score: {bagging_clf.oob_score_:.4f}")

OOB Score: 0.9474


In [22]:
# 랜덤포레스트
# 문제1

import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Reload the wine data and make a fresh 80:20 split for this section.
wine = load_wine()
X = wine.data
y = wine.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest of 100 trees, seeded for repeatability.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.4f}")

Random Forest Accuracy: 1.0000


In [23]:
# 문제2

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Same dataset and the same seeded 80:20 split as the random-forest cell.
wine = load_wine()
X = wine.data
y = wine.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Search space: 3 learning rates x 3 ensemble sizes x 3 depths = 27 combinations.
params = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
}

# 5-fold cross-validated grid search; n_jobs=-1 runs the fits in parallel.
gb_clf = GradientBoostingClassifier(random_state=42)
grid_cv = GridSearchCV(gb_clf, param_grid=params, cv=5, n_jobs=-1).fit(X_train, y_train)

print("최적 하이퍼파라미터:", grid_cv.best_params_)

# Score the search object's predictions on the held-out test split.
y_pred = grid_cv.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Gradient Boosting Accuracy with Best Params: {accuracy:.4f}")

최적 하이퍼파라미터: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100}
Gradient Boosting Accuracy with Best Params: 0.9444


In [24]:
# 서포트 벡터 머신
# 문제1

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.datasets import make_moons

# Generate the noisy two-moons toy dataset (200 samples).
# random_state pins the sampled points: without it, make_moons draws new noise
# on every run, so the fitted model is not reproducible under Restart & Run All.
# 42 matches the seed used by other cells in this notebook.
X, y = make_moons(n_samples=200, noise=0.25, random_state=42)

# Pipeline: standardise the features, then fit an RBF-kernel SVM with
# gamma=1 and C=0.001.
rbf_kernel_svm_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('svm_clf', SVC(kernel='rbf', gamma=1, C=0.001))
])

# Kept as a bare expression so the notebook still displays the fitted pipeline.
rbf_kernel_svm_clf.fit(X, y)