In [1]:
import pandas as pd
import numpy as np
import math

# Load the training table and discard the "Day" column in one step.
data = pd.read_csv("data_dt.csv").drop(columns="Day")

def entropy(target_column):
    """Return the Shannon entropy, in bits, of the values in ``target_column``.

    Each distinct value contributes ``-p * log2(p)``, where ``p`` is its
    relative frequency. An empty input yields ``0``.
    """
    total = len(target_column)
    _, class_counts = np.unique(target_column, return_counts=True)
    result = 0
    for class_count in class_counts:
        share = class_count / total
        result -= share * math.log2(share)
    return result

def information_gain(data, feature, target):
    """Return how much splitting ``data`` on ``feature`` reduces the entropy of ``target``.

    Computed as the entropy of the full ``target`` column minus the
    size-weighted entropy of ``target`` within each distinct ``feature`` value.
    """
    baseline = entropy(data[target])
    n_rows = len(data)
    feature_values, value_counts = np.unique(data[feature], return_counts=True)

    conditional = 0
    for feature_value, n_matching in zip(feature_values, value_counts):
        matching_rows = data[data[feature] == feature_value]
        conditional += (n_matching / n_rows) * entropy(matching_rows[target])

    return baseline - conditional

def best_split(data, features, target):
    """Return the entry of ``features`` with the highest information gain on ``target``.

    Ties go to the earliest feature, because ``np.argmax`` returns the first
    maximum.
    """
    gains = [information_gain(data, candidate, target) for candidate in features]
    winner = np.argmax(gains)
    return features[winner]

def _majority_class(target_values):
    """Return the most frequent value (ties go to the first value in sorted order)."""
    classes, counts = np.unique(target_values, return_counts=True)
    return classes[np.argmax(counts)]


def id3(data, original_data, features, target_attribute_name, parent_node_class=None):
    """Recursively build an ID3 decision tree.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows that reach the current node.
    original_data : pandas.DataFrame
        Rows of the parent node; its majority class is used when ``data`` is empty.
    features : sequence of str
        Column names still available for splitting.
    target_attribute_name : str
        Name of the label column.
    parent_node_class : optional
        Majority class of the parent node. Only used as a last-resort
        fallback when both ``data`` and ``original_data`` are empty.

    Returns
    -------
    A class label for a leaf, or a nested dict
    ``{feature: {feature_value: subtree_or_label}}`` for an internal node.
    """
    # The empty check must come first: indexing np.unique(...)[0] on an empty
    # column raises IndexError.
    if len(data) == 0:
        if len(original_data) == 0:
            return parent_node_class
        return _majority_class(original_data[target_attribute_name])

    target_values = data[target_attribute_name]
    classes = np.unique(target_values)

    # If all target values are the same, return this value.
    if len(classes) == 1:
        return classes[0]

    # No features left to split on: return the majority class of the rows at
    # this node (not the parent's, which is None at the root).
    if len(features) == 0:
        return _majority_class(target_values)

    # Otherwise, grow the tree.
    node_class = _majority_class(target_values)
    best_feature = best_split(data, features, target_attribute_name)
    tree = {best_feature: {}}
    remaining_features = [name for name in features if name != best_feature]

    for value in np.unique(data[best_feature]):
        # Boolean indexing keeps dtypes intact and does not drop rows that
        # merely have missing values in unrelated columns.
        sub_data = data[data[best_feature] == value]
        tree[best_feature][value] = id3(
            sub_data, data, remaining_features, target_attribute_name, node_class
        )

    return tree

def classify(sample, tree, default=None):
    """Predict the label of ``sample`` by walking a nested-dict decision tree.

    Parameters
    ----------
    sample : mapping
        Feature name -> feature value for the instance to classify.
    tree : dict or label
        Either ``{feature: {feature_value: subtree_or_label}}`` or a bare
        label when the whole tree is a single leaf.
    default : optional
        Returned when the sample has a feature value with no matching branch,
        at any depth of the tree.
    """
    # A tree can collapse to a single leaf (e.g. all training labels equal).
    if not isinstance(tree, dict):
        return tree

    attribute = next(iter(tree))
    branches = tree[attribute]

    if sample[attribute] not in branches:
        return default

    result = branches[sample[attribute]]
    if isinstance(result, dict):
        # Pass ``default`` down so unseen values below the root also fall back to it.
        return classify(sample, result, default)
    return result

# Grow an ID3 tree using every column except the label as a candidate feature
target_attribute_name = "Decision"
features = data.columns.difference([target_attribute_name])
tree = id3(
    data,
    original_data=data,
    features=features,
    target_attribute_name=target_attribute_name,
)

# Classify one new sample, with "No" as the default prediction
new_sample = {
    "Outlook": "Sunny",
    "Temperature": "Low",
    "Humidity": "Medium",
    "Wind": "Weak",
}
result = classify(new_sample, tree, default="No")

print(f"Prediction for the new sample: {result}")


Prediction for the new sample: Yes


In [2]:
import pandas as pd
import numpy as np
import math

# Reload the training table for this cell, without the "Day" column.
data = pd.read_csv("data_dt.csv").drop(columns="Day")

def entropy(target_column):
    """Compute the Shannon entropy (base 2) of the label values in ``target_column``.

    Sums ``-p * log2(p)`` over the relative frequency ``p`` of every distinct
    value; returns ``0`` when there are no values.
    """
    _, occurrences = np.unique(target_column, return_counts=True)
    bits = 0
    for fraction in occurrences / len(target_column):
        bits -= fraction * math.log2(fraction)
    return bits

def information_gain(data, feature, target):
    """Entropy of ``target`` before the split minus its weighted entropy after it.

    ``data`` is partitioned by the distinct values of ``feature``; each
    partition's entropy is weighted by its share of the rows.
    """
    entropy_before = entropy(data[target])
    row_total = len(data)
    distinct_values, sizes = np.unique(data[feature], return_counts=True)

    entropy_after = 0
    for level, size in zip(distinct_values, sizes):
        partition = data[data[feature] == level]
        partition_entropy = entropy(partition[target])
        entropy_after += (size / row_total) * partition_entropy

    return entropy_before - entropy_after

def gain_ratio(data, feature, target):
    """Return the C4.5 gain ratio of ``feature``: information gain / split information.

    Split information is the entropy of the ``feature`` column itself. When it
    is zero (the feature has at most one distinct value in ``data``) the split
    cannot separate anything, so ``0.0`` is returned instead of dividing by
    zero.
    """
    _, counts = np.unique(data[feature], return_counts=True)
    split_info = 0
    for count in counts:
        probability = count / len(data)
        split_info -= probability * math.log2(probability)

    # Guard the division: 0/0 would give NaN, and np.argmax in best_split
    # treats NaN as the maximum, so a constant feature would be picked as
    # the "best" split.
    if split_info == 0:
        return 0.0

    return information_gain(data, feature, target) / split_info

def best_split(data, features, target):
    """Pick the entry of ``features`` whose gain ratio on ``target`` is largest.

    ``np.argmax`` returns the first maximum, so ties go to the earliest feature.
    """
    ratios = [gain_ratio(data, candidate, target) for candidate in features]
    top_index = np.argmax(ratios)
    return features[top_index]

def _majority_class(target_values):
    """Return the most frequent value (ties go to the first value in sorted order)."""
    classes, counts = np.unique(target_values, return_counts=True)
    return classes[np.argmax(counts)]


def c45(data, original_data, features, target_attribute_name, parent_node_class=None):
    """Recursively build a decision tree, choosing each split with ``best_split``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows that reach the current node.
    original_data : pandas.DataFrame
        Rows of the parent node; its majority class is used when ``data`` is empty.
    features : sequence of str
        Column names still available for splitting.
    target_attribute_name : str
        Name of the label column.
    parent_node_class : optional
        Majority class of the parent node. Only used as a last-resort
        fallback when both ``data`` and ``original_data`` are empty.

    Returns
    -------
    A class label for a leaf, or a nested dict
    ``{feature: {feature_value: subtree_or_label}}`` for an internal node.
    """
    # The empty check must come first: indexing np.unique(...)[0] on an empty
    # column raises IndexError.
    if len(data) == 0:
        if len(original_data) == 0:
            return parent_node_class
        return _majority_class(original_data[target_attribute_name])

    target_values = data[target_attribute_name]
    classes = np.unique(target_values)

    # If all target values are the same, return this value.
    if len(classes) == 1:
        return classes[0]

    # No features left to split on: return the majority class of the rows at
    # this node (not the parent's, which is None at the root).
    if len(features) == 0:
        return _majority_class(target_values)

    # Otherwise, grow the tree.
    node_class = _majority_class(target_values)
    best_feature = best_split(data, features, target_attribute_name)
    tree = {best_feature: {}}
    remaining_features = [name for name in features if name != best_feature]

    for value in np.unique(data[best_feature]):
        # Boolean indexing keeps dtypes intact and does not drop rows that
        # merely have missing values in unrelated columns.
        sub_data = data[data[best_feature] == value]
        tree[best_feature][value] = c45(
            sub_data, data, remaining_features, target_attribute_name, node_class
        )

    return tree

# Function to classify a new sample
def classify(sample, tree, default=None):
    """Predict the label of ``sample`` by walking a nested-dict decision tree.

    Parameters
    ----------
    sample : mapping
        Feature name -> feature value for the instance to classify.
    tree : dict or label
        Either ``{feature: {feature_value: subtree_or_label}}`` or a bare
        label when the whole tree is a single leaf.
    default : optional
        Returned when the sample has a feature value with no matching branch,
        at any depth of the tree.
    """
    # A tree can collapse to a single leaf (e.g. all training labels equal).
    if not isinstance(tree, dict):
        return tree

    attribute = next(iter(tree))
    branches = tree[attribute]

    if sample[attribute] not in branches:
        return default

    result = branches[sample[attribute]]
    if isinstance(result, dict):
        # Pass ``default`` down so unseen values below the root also fall back to it.
        return classify(sample, result, default)
    return result

# Grow a C4.5 tree using every column except the label as a candidate feature
target_attribute_name = "Decision"
features = data.columns.difference([target_attribute_name])
tree = c45(
    data,
    original_data=data,
    features=features,
    target_attribute_name=target_attribute_name,
)

# Classify one new sample, with "No" as the default prediction
new_sample = {
    "Outlook": "Sunny",
    "Temperature": "High",
    "Humidity": "Medium",
    "Wind": "Weak",
}
result = classify(new_sample, tree, default="No")

print(f"Prediction for the new sample: {result}")

Prediction for the new sample: Yes


In [4]:
import pandas as pd
import numpy as np

# Load the dataset for the CART example from CSV. The code below reads its
# "Temperature", "Humidity" and "Decision" columns.
data = pd.read_csv("data_dt_CART.csv")

# Define the DecisionTree class
class DecisionTree:
    """Minimal CART-style classifier using ``feature <= threshold`` splits.

    The fitted tree is stored in ``self.tree`` as nested tuples
    ``(feature_index, threshold, left_subtree, right_subtree)``; a leaf is a
    bare class label. Labels must be non-negative integers because leaves are
    chosen with ``np.bincount``.
    """

    def __init__(self, max_depth=None):
        """Create an unfitted tree.

        ``max_depth`` caps the number of split levels; ``None`` grows the tree
        until every node is pure or cannot be split further.
        """
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Build the tree from a 2-D feature array ``X`` and integer labels ``y``.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        # Coerce so that positional indexing works for lists and pandas objects too.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        """Return the subtree (tuple) or leaf label for the samples ``X``, ``y``."""
        # Leaf: depth limit reached or node is pure -> majority label
        # (ties resolve to the smallest label).
        if depth == self.max_depth or len(np.unique(y)) == 1:
            return np.argmax(np.bincount(y))

        num_samples, num_features = X.shape
        best_gini = 1.0
        best_feature = None
        best_split = None

        for feature in range(num_features):
            column = X[:, feature]
            for value in np.unique(column):
                left_indices = np.where(column <= value)[0]
                right_indices = np.where(column > value)[0]

                # A split that leaves one side empty separates nothing and
                # would recurse on an empty node (np.bincount([]) has no
                # argmax), so it is never a candidate.
                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                left_gini = self._calculate_gini(y[left_indices])
                right_gini = self._calculate_gini(y[right_indices])
                gini = (len(left_indices) / num_samples) * left_gini + (len(right_indices) / num_samples) * right_gini

                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_split = value

        # No usable split (e.g. identical feature rows with mixed labels).
        if best_feature is None:
            return np.argmax(np.bincount(y))

        left_indices = np.where(X[:, best_feature] <= best_split)[0]
        right_indices = np.where(X[:, best_feature] > best_split)[0]

        left_tree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return (best_feature, best_split, left_tree, right_tree)

    def _calculate_gini(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` over all classes in ``y``.

        An empty ``y`` has impurity ``0``.
        """
        if len(y) == 0:
            return 0
        _, counts = np.unique(y, return_counts=True)
        impurity = 1
        # Subtract class by class in sorted label order, so 0/1 labels are
        # evaluated as 1 - p0**2 - p1**2.
        for count in counts:
            impurity -= (count / len(y)) ** 2
        return impurity

    def predict(self, X):
        """Return an array with the predicted label for each row of ``X``."""
        predictions = []
        for x in np.asarray(X):
            node = self.tree
            # Internal nodes are tuples; anything else is a leaf label.
            while isinstance(node, tuple):
                feature, split, left, right = node
                if x[feature] <= split:
                    node = left
                else:
                    node = right
            predictions.append(node)
        return np.array(predictions)

# Prepare data using only "Temperature" and "Humidity"
X = data[["Temperature", "Humidity"]].values
# Encode the label as 1 for "Yes" and 0 for anything else
y = (data["Decision"] == "Yes").astype(int).values

# Build the CART decision tree
tree = DecisionTree(max_depth=4)
tree.fit(X, y)

# (Temperature, Humidity) pairs to classify
samples = [
    (75, 80),  # Temperature: 75, Humidity: 80
    (70, 70),  # Temperature: 70, Humidity: 70
    (83, 78),  # Temperature: 83, Humidity: 78
    (68, 80),  # Temperature: 68, Humidity: 80
    (81, 75),  # Temperature: 81, Humidity: 75
]

# Predict all samples in one call; each row of the array is one sample.
predictions = tree.predict(np.array(samples))
for sample, prediction in zip(samples, predictions):
    result = "Yes" if prediction == 1 else "No"
    print(f"Prediction for the sample {sample}: {result}")

Prediction for the sample (75, 80): Yes
Prediction for the sample (70, 70): No
Prediction for the sample (83, 78): Yes
Prediction for the sample (68, 80): Yes
Prediction for the sample (81, 75): Yes
