In [6]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, RandomizedSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.metrics import accuracy_score
import numpy as np

In [7]:
# Step 1: Load the wine dataset
wine = load_wine()
X = wine.data
y = wine.target

In [8]:
# Step 2: Split the dataset into train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:

# Step 3: Hyperparameter tuning using RandomizedSearchCV for Decision Tree
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

tree = DecisionTreeClassifier()
random_search = RandomizedSearchCV(tree, param_distributions=param_dist, n_iter=100, random_state=42)
random_search.fit(X_train, y_train)

best_tree = random_search.best_estimator_

print("Best parameters for Decision Tree:", random_search.best_params_)

Best parameters for Decision Tree: {'criterion': 'gini', 'max_depth': 3, 'max_features': 7, 'min_samples_leaf': 2}


In [11]:

# Evaluate the accuracy of the best Decision Tree
y_pred_tree = best_tree.predict(X_test)
accuracy_tree = accuracy_score(y_test, y_pred_tree)
print("Accuracy of Decision Tree:", accuracy_tree)


Accuracy of Decision Tree: 0.8888888888888888


In [12]:
# Step 4: Grow a random forest
# Create 10 subsets of the training dataset using ShuffleSplit
ss = ShuffleSplit(n_splits=10, train_size=0.8, random_state=42)

In [13]:
# Train 10 decision trees on each subset
forest = []
for train_index, _ in ss.split(X_train):
    tree = DecisionTreeClassifier(**random_search.best_params_)
    tree.fit(X_train[train_index], y_train[train_index])
    forest.append(tree)

# Evaluate all the trees on the test dataset
accuracy_forest = []
for tree in forest:
    y_pred_tree = tree.predict(X_test)
    accuracy_forest.append(accuracy_score(y_test, y_pred_tree))

print("Accuracy of Random Forest:", np.mean(accuracy_forest))

Accuracy of Random Forest: 0.9222222222222222
