# In [1]:
import numpy as np
import pandas as pd

# Define a class for Decision Tree Node
class Node:
    """A single node of a binary decision tree.

    Internal nodes carry a split (``attribute``, ``threshold``) and two
    children; leaf nodes carry only ``value``.
    """

    def __init__(self, attribute=None, threshold=None, left=None, right=None, value=None):
        # Split description: index of the attribute to test and the
        # numerical threshold it is compared against.
        self.attribute, self.threshold = attribute, threshold
        # Left and right children.
        self.left, self.right = left, right
        # Value stored when the node is a leaf.
        self.value = value

# Function to calculate entropy
def entropy(y):
    """Return the base-2 entropy of the values in ``y``.

    Each distinct value contributes ``-p * log2(p)``, where ``p`` is its
    count divided by ``len(y)``.
    """
    _, label_counts = np.unique(y, return_counts=True)
    p = label_counts / len(y)
    return -np.sum(p * np.log2(p))

# Function to calculate information gain
def information_gain(X, y, attribute, threshold):
    """Return the entropy reduction from splitting on one attribute.

    Rows whose ``attribute`` column is ``<= threshold`` form the left
    partition and all other rows the right one. The result is the entropy
    of ``y`` minus the size-weighted entropies of the two partitions.
    """
    goes_left = X[:, attribute] <= threshold
    partitions = (y[goes_left], y[~goes_left])
    n_total = len(y)
    # Each partition's entropy is weighted by its share of the samples.
    weighted = [len(part) / n_total * entropy(part) for part in partitions]
    return entropy(y) - (weighted[0] + weighted[1])

# Function to find best attribute and threshold to split on
def find_best_split(X, y):
    """Find the (attribute, threshold) pair with the highest information gain.

    Every distinct value of every column of ``X`` is tried as a threshold
    for the test ``X[:, attribute] <= threshold``. Candidates that would put
    all rows on one side are skipped: they separate nothing, and a caller
    that recursed on them would receive an empty partition.

    Parameters:
        X: 2-D array of samples (rows) by attributes (columns).
        y: 1-D array of labels, one per row of ``X``.

    Returns:
        ``(best_attribute, best_threshold)``, or ``(None, None)`` when no
        candidate splits the rows into two non-empty groups.
    """
    best_info_gain = -np.inf
    best_attribute = None
    best_threshold = None
    _, n_features = X.shape
    for attribute in range(n_features):
        column = X[:, attribute]
        for threshold in np.unique(column):
            goes_left = column <= threshold
            # Skip degenerate candidates (e.g. the column maximum, which
            # sends every row left and leaves the right side empty).
            if goes_left.all() or not goes_left.any():
                continue
            info_gain = information_gain(X, y, attribute, threshold)
            # Strict '>' keeps the first candidate seen among equal gains.
            if info_gain > best_info_gain:
                best_info_gain = info_gain
                best_attribute = attribute
                best_threshold = threshold
    return best_attribute, best_threshold

# Function to build the decision tree
def build_tree(X, y, depth=0, max_depth=None):
    """Recursively build a decision tree and return its root ``Node``.

    Parameters:
        X: 2-D numpy array of samples (rows) by attributes (columns).
        y: 1-D numpy array of labels, one per row of ``X``.
        depth: depth of the node being built (0 for the root).
        max_depth: depth at which a leaf is forced; ``None`` never matches
            ``depth``, so the tree is then limited only by the data.

    Returns:
        A ``Node``: a leaf holding the majority label when the labels are
        pure, the depth limit is reached, or no usable split exists;
        otherwise an internal node whose children are built recursively.

    Raises:
        ValueError: if ``y`` is empty.
    """
    if len(y) == 0:
        raise ValueError("cannot build a decision tree from an empty label array")

    if len(np.unique(y)) == 1 or depth == max_depth:
        return Node(value=_majority_label(y))

    best_attribute, best_threshold = find_best_split(X, y)
    if best_attribute is None:
        return Node(value=_majority_label(y))

    left_indices = X[:, best_attribute] <= best_threshold
    # Guard against a split that leaves one side empty: recursing on it
    # would hand an empty partition to one child and the unchanged data to
    # the other, which never terminates when max_depth is None.
    if left_indices.all() or not left_indices.any():
        return Node(value=_majority_label(y))

    right_indices = ~left_indices
    left_child = build_tree(X[left_indices], y[left_indices], depth + 1, max_depth)
    right_child = build_tree(X[right_indices], y[right_indices], depth + 1, max_depth)
    return Node(attribute=best_attribute, threshold=best_threshold,
                left=left_child, right=right_child)


def _majority_label(y):
    """Return the most frequent label in ``y``; the smallest label wins ties."""
    labels, counts = np.unique(y, return_counts=True)
    # np.unique returns sorted labels and argmax returns the first maximum,
    # so ties are broken deterministically.
    return labels[np.argmax(counts)]

# Function to predict using the decision tree
def predict(tree, X):
    """Return the leaf value reached by a single sample ``X``.

    Starting at ``tree``, descend left when ``X[node.attribute]`` is
    ``<= node.threshold`` and right otherwise, until a node whose ``value``
    is not ``None`` is reached.
    """
    node = tree
    while node.value is None:
        if X[node.attribute] <= node.threshold:
            node = node.left
        else:
            node = node.right
    return node.value

# Load dataset
dataset = pd.read_csv('Social_Network_Ads.csv')

# Every column but the last is a feature; the last column is the target.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

# Split the dataset into training and testing sets
def train_test_split(X, y, test_size=0.25, random_state=None):
    """Shuffle the samples and split them into training and test subsets.

    Parameters:
        X: array of samples, indexed along its first axis.
        y: array of labels aligned with ``X``.
        test_size: fraction of the samples placed in the test subset; the
            count is ``int(len(X) * test_size)``.
        random_state: optional seed. When given, the shuffle is reproducible.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    # A private RandomState produces the same permutation that
    # np.random.seed(random_state) followed by np.random.shuffle would,
    # without reseeding the process-wide generator as a side effect.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    indices = np.arange(len(X))
    rng.shuffle(indices)
    n_test = int(len(X) * test_size)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    return X_train, X_test, y_train, y_test

# Hold out a quarter of the samples for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Standardize features
def standardize(X_train, X_test):
    """Scale both sets using the per-column mean and std of ``X_train``.

    Parameters:
        X_train: 2-D array whose column statistics define the scaling.
        X_test: 2-D array scaled with the same statistics.

    Returns:
        ``(X_train, X_test)`` after subtracting the training mean and
        dividing by the training standard deviation, column by column.
    """
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    # A constant training column has std 0; dividing by it would yield
    # NaN/inf. Divide by 1 instead so such a column is only centred.
    std = np.where(std == 0, 1.0, std)
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    return X_train, X_test

X_train, X_test = standardize(X_train, X_test)

# Fit a tree of depth at most 3 on the training data
tree = build_tree(X_train, y_train, max_depth=3)

# Classify every test sample, one row at a time
y_pred = [predict(tree, sample) for sample in X_test]

# Evaluate
def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both arguments may be any array-like of equal length. They are converted
    to arrays first so the comparison is element-wise even when both are
    plain Python lists (for which ``==`` would compare the lists as a whole).
    """
    return np.mean(np.asarray(y_true) == np.asarray(y_pred))

print("Accuracy:", accuracy(y_true=y_test, y_pred=y_pred))


# Output: Accuracy: 0.94
