In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
import matplotlib.pyplot as plt
import math

# Define the dataset

In [2]:
# Toy training set. Each record is [age, income, student, credit_rating, buys_computer];
# the last entry is the class label ('yes'/'no') that the tree is trained to predict.
# NOTE(review): age only takes the values 30/31/40, which looks like a numeric coding of
# age brackets rather than real ages — confirm against the data source.
data = [
    [30, 'high', 'no', 'fair', 'no'],
    [30, 'high', 'no', 'excellent', 'no'],
    [31, 'medium', 'no', 'fair', 'yes'],
    [40, 'low', 'no', 'fair', 'yes'],
    [40, 'low', 'yes', 'fair', 'yes'],
    [40, 'low', 'yes', 'excellent', 'no'],
    [31, 'medium', 'yes', 'excellent', 'yes'],
    [30, 'high', 'no', 'fair', 'no'],
    [30, 'medium', 'yes', 'fair', 'yes'],
    [31, 'medium', 'yes', 'excellent', 'yes'],
    [31, 'high', 'no', 'excellent', 'yes'],
    [40, 'medium', 'no', 'fair', 'yes'],
    [40, 'high', 'yes', 'fair', 'yes'],
    [31, 'medium', 'no', 'excellent', 'no']
]

# Convert the dataset into a DataFrame

In [3]:
# Label the five positional fields of every record; the final column is the class label.
df = pd.DataFrame(
    data=data,
    columns=[
        'age',
        'income',
        'student',
        'credit_rating',
        'buys_computer',
    ],
)

# Define the features and target variable

In [4]:
# Target vector: the label column. Feature matrix: every remaining column.
y = df['buys_computer']
X = df.drop('buys_computer', axis=1)

# Define a class for the decision tree node

In [5]:
class DecisionTreeNode:
    """One node of a binary decision tree.

    An internal node carries a split test (``feature`` compared against ``value``)
    and two children; a leaf carries only ``target_class``. ``predict`` treats any
    node whose ``target_class`` is not ``None`` as a leaf.
    """

    def __init__(self, feature=None, value=None, left=None, right=None, target_class=None):
        # Split test. build_decision_tree stores the column label in `feature` and
        # the threshold the sample is compared against (with <=) in `value`.
        self.feature, self.value = feature, value
        # Children: `left` is followed when the test `<= value` holds, `right` otherwise.
        self.left, self.right = left, right
        # Predicted label for a leaf; None marks an internal node.
        self.target_class = target_class

# Define a function to calculate entropy

In [6]:
def calculate_entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    Args:
        y: A pandas Series of class labels (anything exposing ``value_counts()``
            and ``len()``).

    Returns:
        ``-sum(p * log2(p))`` over the observed classes, where ``p`` is the class
        count divided by ``len(y)``. An empty ``y`` yields ``0``.
    """
    n_samples = len(y)
    # Relative frequency of every class that actually occurs; classes with zero
    # count never appear in value_counts(), so log2 is never evaluated at 0.
    proportions = [n_in_class / n_samples for n_in_class in y.value_counts()]

    bits = 0
    for p in proportions:
        bits -= p * math.log2(p)
    return bits

# Define a function to calculate the information gain of splitting on a particular feature at a given split value


In [7]:
def calculate_information_gain(X, y, feature, split_value):
    """Return the entropy reduction from splitting on ``feature`` at ``split_value``.

    Rows with ``X[feature] <= split_value`` form the left partition and rows with
    ``X[feature] > split_value`` form the right one. The same ordering comparison
    is applied to every column, so string-valued columns are split by their
    lexicographic order.

    Args:
        X: Feature DataFrame.
        y: Label Series aligned with ``X``.
        feature: Column of ``X`` to test.
        split_value: Threshold the column is compared against.

    Returns:
        Parent entropy minus the size-weighted entropy of the two partitions.
    """
    parent_entropy = calculate_entropy(y)

    # Boolean row masks for the two sides of the split.
    column = X[feature]
    goes_left = column <= split_value
    goes_right = column > split_value

    entropy_left = calculate_entropy(y[goes_left])
    entropy_right = calculate_entropy(y[goes_right])

    # Each side contributes in proportion to the share of rows it receives.
    n_total = len(y)
    share_left = sum(goes_left) / n_total
    share_right = sum(goes_right) / n_total

    return parent_entropy - (share_left * entropy_left + share_right * entropy_right)

# Define a function to build the decision tree recursively

In [8]:
def build_decision_tree(X, y):
    """Recursively grow a binary decision tree by greedy information-gain splitting.

    At each node every (feature, observed value) pair is scored with
    ``calculate_information_gain`` and the pair with the highest strictly
    positive gain is used to partition the rows into ``<= value`` (left) and
    ``> value`` (right). Recursion stops with a leaf when the labels are pure,
    when no feature columns remain, or when no split improves on the parent.

    Args:
        X: Feature DataFrame.
        y: Label Series aligned with ``X``; expected to be non-empty.

    Returns:
        The root ``DecisionTreeNode`` of the (sub)tree for these rows.
    """
    if len(set(y)) == 1:  # All samples share one class: pure leaf
        return DecisionTreeNode(target_class=y.iloc[0])

    if len(X.columns) == 0:  # Nothing left to split on: majority-class leaf
        return DecisionTreeNode(target_class=y.mode()[0])

    best_information_gain = 0
    best_feature = None
    best_split_value = None

    # Exhaustively score every observed value of every feature as a threshold.
    for feature in X.columns:
        for value in X[feature].unique():
            information_gain = calculate_information_gain(X, y, feature, value)
            if information_gain > best_information_gain:
                best_information_gain = information_gain
                best_feature = feature
                best_split_value = value

    # No candidate split reduces entropy (e.g. rows with identical features but
    # different labels). Without this guard the code below would index X[None]
    # and raise KeyError; fall back to a majority-class leaf instead.
    if best_feature is None:
        return DecisionTreeNode(target_class=y.mode()[0])

    # A strictly positive gain guarantees both partitions are non-empty, so the
    # recursion always works on fewer rows and terminates.
    left_indices = X[best_feature] <= best_split_value
    right_indices = X[best_feature] > best_split_value
    left_subtree = build_decision_tree(X[left_indices], y[left_indices])
    right_subtree = build_decision_tree(X[right_indices], y[right_indices])

    return DecisionTreeNode(feature=best_feature, value=best_split_value, left=left_subtree, right=right_subtree)


# Define a function to make predictions using the decision tree

In [9]:
def predict(tree, sample):
    """Classify one sample by walking the decision tree from ``tree`` to a leaf.

    Args:
        tree: Root node; any object with ``target_class``, ``feature``, ``value``,
            ``left`` and ``right`` attributes.
        sample: Either a container keyed by feature name (``dict`` or a
            label-indexed pandas Series), or a positional sequence whose order
            matches the columns of the module-level training frame ``X``.

    Returns:
        The ``target_class`` of the leaf the sample ends up in.
    """
    # Keyed samples are resolved by feature name, which avoids the dependency on
    # the global `X` and on positional indexing into a label-indexed Series.
    keyed = hasattr(sample, "keys")

    node = tree
    while node.target_class is None:
        if keyed and node.feature in sample.keys():
            observed = sample[node.feature]
        else:
            # Positional fallback: translate the feature name to its column
            # position in the training frame.
            observed = sample[X.columns.get_loc(node.feature)]
        # Same test used when the tree was built: <= goes left, otherwise right.
        node = node.left if observed <= node.value else node.right
    return node.target_class

# Build the decision tree

In [10]:
# Fit on every row of X/y; no rows are held out, so the predictions made from
# this tree are not validated against unseen labelled data.
decision_tree = build_decision_tree(X, y)

# Make predictions for new samples

In [11]:
# Unlabelled samples laid out in the same column order as X:
# [age, income, student, credit_rating].
data2 = [
    [20, 'low', 'yes', 'excellent'],
    [30, 'high', 'no', 'fair'],
    [40, 'medium', 'yes', 'fair'],
    [50, 'low', 'no', 'fair'],
    [25, 'high', 'yes', 'excellent'],
    [35, 'medium', 'no', 'fair'],
    [45, 'low', 'yes', 'excellent'],
    [55, 'high', 'no', 'fair'],
]

# Each sample is a plain list, so predict() reads its features by position;
# the order above must therefore match the training columns.
for sample in data2:
    prediction = predict(decision_tree, sample)
    print(f"For sample {sample}, predicted class: {prediction}")

For sample [20, 'low', 'yes', 'excellent'], predicted class: yes
For sample [30, 'high', 'no', 'fair'], predicted class: no
For sample [40, 'medium', 'yes', 'fair'], predicted class: yes
For sample [50, 'low', 'no', 'fair'], predicted class: yes
For sample [25, 'high', 'yes', 'excellent'], predicted class: no
For sample [35, 'medium', 'no', 'fair'], predicted class: yes
For sample [45, 'low', 'yes', 'excellent'], predicted class: no
For sample [55, 'high', 'no', 'fair'], predicted class: yes
