In [8]:
from collections import Counter

import numpy as np
import pandas as pd

In [9]:
import numpy as np

def entropy(y):
    """Calculate the Shannon entropy (in bits) of a collection of class labels.

    Labels are counted with ``np.unique``, so any label type that NumPy can
    sort is accepted (non-negative ints, negative ints, strings, booleans...),
    not only the non-negative integers that ``np.bincount`` supports.

    Parameters
    ----------
    y : array-like
        Class labels, one entry per sample.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed classes. An empty input has
        no uncertainty and yields ``0.0``.
    """
    y = np.asarray(y)
    if y.size == 0:
        return 0.0
    # np.unique reports only classes that actually occur, so every count is
    # positive and no zero-probability filtering is needed before the log.
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / y.size
    return -np.sum(probabilities * np.log2(probabilities))

def conditional_entropy(x, y):
    """Return H(y | x): the entropy of ``y`` left after grouping by ``x``.

    Every distinct value of ``x`` selects a group of samples; the group's
    label entropy (via ``entropy``) is weighted by the fraction of samples
    that fall in the group, and the weighted terms are summed.

    Parameters
    ----------
    x : np.ndarray
        Categorical feature values, aligned element-wise with ``y``.
    y : np.ndarray
        Class labels.

    Returns
    -------
    float
        The weighted sum of per-group entropies (``0`` when ``x`` has no
        values at all).
    """
    # One label subset per category of x, visited in sorted category order.
    groups = (y[x == category] for category in np.unique(x))
    # weight = share of samples in the group; term = weight * group entropy.
    return sum(len(group) / len(y) * entropy(group) for group in groups)

def information_gain(X, y, feature_index):
    """Return the information gain obtained by grouping on one feature.

    The gain is the entropy of ``y`` minus the conditional entropy of ``y``
    given column ``feature_index`` of ``X``.

    Parameters
    ----------
    X : np.ndarray
        2-D feature matrix; the feature is read as ``X[:, feature_index]``.
    y : np.ndarray
        Class labels, one per row of ``X``.
    feature_index : int
        Column of ``X`` to evaluate.

    Returns
    -------
    float
        ``entropy(y) - conditional_entropy(X[:, feature_index], y)``.
    """
    # Uncertainty before the split minus the uncertainty that remains after it.
    return entropy(y) - conditional_entropy(X[:, feature_index], y)

def find_root_node(X, y):
    """Return the index of the feature with the largest information gain.

    Parameters
    ----------
    X : np.ndarray
        2-D feature matrix; one candidate feature per column.
    y : np.ndarray
        Class labels, one per row of ``X``.

    Returns
    -------
    The ``np.argmax`` of the per-feature gains, i.e. the first column index
    that attains the maximum gain.
    """
    n_features = X.shape[1]
    gains = []
    # Score every column in order so positions in `gains` match column indices.
    for column in range(n_features):
        gains.append(information_gain(X, y, column))
    return np.argmax(gains)


In [10]:
import numpy as np

def bin_continuous_values(x, n_bins=3, strategy="width"):
    """Bin continuous values into ``n_bins`` integer categories.

    Parameters
    ----------
    x : array-like
        Continuous values to discretise.
    n_bins : int, optional
        Number of bins to produce (default 3). Must be at least 1.
    strategy : {"width", "frequency"}, optional
        ``"width"`` uses equally spaced edges between ``min(x)`` and
        ``max(x)``; ``"frequency"`` uses quantile edges so that bins hold
        roughly the same number of samples.

    Returns
    -------
    np.ndarray
        Integer bin index for every element of ``x``, in ``0 .. n_bins - 1``.

    Raises
    ------
    ValueError
        If ``n_bins`` is smaller than 1 or ``strategy`` is not supported.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be at least 1")

    if strategy == "width":
        # Equal width binning: n_bins + 1 evenly spaced edges.
        edges = np.linspace(np.min(x), np.max(x), n_bins + 1)
    elif strategy == "frequency":
        # Equal frequency binning: edges at evenly spaced quantiles.
        edges = np.quantile(x, np.linspace(0, 1, n_bins + 1))
    else:
        # Raise an error if an unsupported strategy is provided
        raise ValueError("Unsupported binning strategy")

    # np.digitize treats each bin as half-open [edge_i, edge_{i+1}), so values
    # equal to the last edge (the maximum of x) would land in an extra bin
    # with index n_bins. Clip so the maximum belongs to the last real bin.
    return np.clip(np.digitize(x, edges) - 1, 0, n_bins - 1)



In [12]:
class Node:
    """A single node of a binary decision tree.

    An internal node carries a split rule (``feature_index`` and
    ``threshold``) together with its ``left`` and ``right`` children; a leaf
    carries only the ``value`` to return. Every field defaults to ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, value=None, left=None, right=None):
        # Split rule: which feature to test and the threshold to compare with.
        self.feature_index, self.threshold = feature_index, threshold
        # Value to return when this node is a leaf.
        self.value = value
        # Child nodes for the two outcomes of the binary split.
        self.left, self.right = left, right

class DecisionTree:
    """Binary decision-tree classifier grown with threshold splits.

    At every internal node the splitting feature is chosen with the
    module-level ``find_root_node`` helper, and the samples are partitioned by
    comparing that feature with a threshold (``<=`` goes left, ``>`` goes
    right). Leaves store the majority class, computed with ``np.bincount``,
    so labels must be non-negative integers.

    Parameters
    ----------
    max_depth : int or None, optional
        Depth at which growth stops. ``None`` (default) applies no depth
        limit; growth then ends only at pure nodes, at nodes with no
        information gain, or where the chosen feature cannot be split.

    Attributes
    ----------
    root : Node or None
        Root of the fitted tree; ``None`` until ``fit`` has been called.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.root = None

    def fit(self, X, y):
        """Grow the tree from training data and store it in ``self.root``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        y : array-like of shape (n_samples,)
            Non-negative integer class labels.

        Raises
        ------
        ValueError
            If ``y`` is empty (there is no majority class to predict).
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        self.root = self._build_tree(X, y, depth=0)

    @staticmethod
    def _majority_leaf(y):
        """Return a leaf ``Node`` predicting the most frequent label in ``y``."""
        return Node(value=np.bincount(y).argmax())

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)``.

        Returns a leaf when the depth limit is reached, the labels are pure,
        the best feature offers no information gain, or the chosen feature
        cannot separate the samples into two non-empty groups.
        """
        # Stopping criteria: depth limit reached or node already pure.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or len(np.unique(y)) == 1:
            return self._majority_leaf(y)

        # Find the best feature using information gain.
        best_feature_index = find_root_node(X, y)
        best_information_gain = information_gain(X, y, best_feature_index)

        # `<=` rather than `==` so that a gain that is slightly negative due
        # to floating-point rounding is also treated as "no gain".
        if best_information_gain <= 0:
            return self._majority_leaf(y)

        # Split the dataset at the median of the best feature.
        column = X[:, best_feature_index]
        best_threshold = np.median(column)
        left_indices = column <= best_threshold

        if left_indices.all():
            # The median equals the column maximum (typical for a 0/1 column
            # that is mostly 1), so `<= median` keeps every sample on the
            # left. Fall back to the largest value strictly below the median
            # so that both children receive samples.
            below_median = column[column < best_threshold]
            if below_median.size:
                best_threshold = below_median.max()
                left_indices = column <= best_threshold

        right_indices = ~left_indices

        # A split with an empty side cannot make progress: the non-empty side
        # would repeat this node forever and the empty side has no majority
        # class. Stop here instead.
        if not left_indices.any() or not right_indices.any():
            return self._majority_leaf(y)

        # Recursively build left and right subtrees.
        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        # Create and return the current node.
        return Node(
            feature_index=best_feature_index,
            threshold=best_threshold,
            left=left_subtree,
            right=right_subtree,
        )

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Parameters
        ----------
        X : iterable of samples
            Each sample is indexed with the ``feature_index`` stored in the
            tree nodes.

        Returns
        -------
        list
            One predicted label per sample, in input order.
        """
        predictions = []
        for sample in X:
            node = self.root
            # Internal nodes always have both children; a leaf has neither.
            while node.left is not None:
                if sample[node.feature_index] <= node.threshold:
                    node = node.left
                else:
                    node = node.right
            predictions.append(node.value)
        return predictions

# Read the dataset
data = pd.read_csv("/loan-train.csv")

# Preprocess the data

# Handling missing values: forward fill, then backward fill so that missing
# values in the leading rows (which a forward fill cannot reach) are covered.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
data = data.ffill().bfill()

# Convert categorical variables into numerical ones using one-hot encoding
data = pd.get_dummies(data)

# Split data into features and target variable.
# get_dummies expands the target into one indicator column per class
# (Loan_Status_Y plus its complement(s)). Every one of them must be removed
# from the features; leaving a complement column in X leaks the answer.
target_columns = [column for column in data.columns if column.startswith("Loan_Status_")]
# Explicit dtypes: the tree expects a numeric matrix and integer labels, while
# .values on a frame mixing bool and numeric columns can give an object array.
X = data.drop(columns=target_columns).to_numpy(dtype=float)
y = data["Loan_Status_Y"].to_numpy(dtype=int)

# Initialize and train the decision tree
tree = DecisionTree(max_depth=5)
tree.fit(X, y)


In [13]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): none of the cells visible here read from /content/drive (the
# CSV above is loaded from "/loan-train.csv") - confirm whether this mount is
# still needed, and if it is, run it before the data-loading cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
