In [1]:
#Calculate entropy

#Calculate information gain

#Find best attribute selection measure - find best splitting attribute

#Divide data set into subsets

#Splitting data set

#Evaluate best split

#S

In [1]:
#Importing pandas
import pandas as pd

#Importing the datasets
wp = pd.read_csv('website-phishing.csv')
bcp = pd.read_csv('bcp.csv')
ar = pd.read_csv('arrhythmia.csv')

#Checking for missing values
print("Missing values in W-P:", wp.isnull().sum().sum())
print("Missing values in BCP:", bcp.isnull().sum().sum())
print("Missing values in AR:", ar.isnull().sum().sum())

Missing values in W-P: 0
Missing values in BCP: 0
Missing values in AR: 0


In [3]:
import numpy as np

class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.n_classes_ = len(np.unique(y))
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)

    def predict(self, X):
        return [self._predict(inputs) for inputs in X]

    def _grow_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in range(self.n_classes_)]
        predicted_class = np.argmax(num_samples_per_class)
        node = Node(predicted_class=predicted_class)

        if depth < self.max_depth:
            feature, threshold = self._best_criteria(X, y)
            if feature is not None:
                indices_left = X[:, feature] < threshold
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = feature
                node.threshold = threshold
                node.left_child = self._grow_tree(X_left, y_left, depth + 1)
                node.right_child = self._grow_tree(X_right, y_right, depth + 1)
        return node

    def _best_criteria(self, X, y):
        best_gini = 1
        best_feature, best_threshold = None, None
        for feature_index in range(self.n_features_):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                left_indices = X[:, feature_index] < threshold
                gini = self._gini_impurity(y[left_indices], y[~left_indices])
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature_index
                    best_threshold = threshold
        return best_feature, best_threshold

    def _gini_impurity(self, left_y, right_y):
        p_left = len(left_y) / (len(left_y) + len(right_y))
        p_right = len(right_y) / (len(left_y) + len(right_y))
        gini = 1.0 - (p_left**2 + p_right**2)
        return gini

    def _predict(self, inputs):
        node = self.tree_
        while node.left_child:
            if inputs[node.feature_index] < node.threshold:
                node = node.left_child
            else:
                node = node.right_child
        return node.predicted_class

class Node:
    def __init__(self, predicted_class):
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left_child = None
        self.right_child = None

# Example usage:
# Assuming you have data X and labels y for each dataset wp, bcp, and ar.
# For website-phishing.csv
X_wp = wp.iloc[:, :-1].values  # Features are all columns except the last
y_wp = wp.iloc[:, -1].values   # Labels are the last column

# For bcp.csv
X_bcp = bcp.iloc[:, :-1].values  # Features are all columns except the last
y_bcp = bcp.iloc[:, -1].values   # Labels are the last column

# For arrhythmia.csv
X_ar = ar.iloc[:, :-1].values  # Features are all columns except the last
y_ar = ar.iloc[:, -1].values   # Labels are the last column


from sklearn.model_selection import train_test_split

# For website-phishing.csv
X_train_wp, X_test_wp, y_train_wp, y_test_wp = train_test_split(X_wp, y_wp, test_size=0.2, random_state=42)

# For bcp.csv
X_train_bcp, X_test_bcp, y_train_bcp, y_test_bcp = train_test_split(X_bcp, y_bcp, test_size=0.2, random_state=42)

# For arrhythmia.csv
X_train_ar, X_test_ar, y_train_ar, y_test_ar = train_test_split(X_ar, y_ar, test_size=0.2, random_state=42)


# Instantiate DecisionTree
tree = DecisionTree(max_depth=1)

# Fit the model on your data
tree.fit(X_wp, y_wp)  # For website-phishing.csv
tree.fit(X_bcp, y_bcp)  # For bcp.csv
tree.fit(X_ar, y_ar)  # For arrhythmia.csv

# Predict on test data (assuming you have it)
# y_pred = tree.predict(X_test)

from sklearn.metrics import accuracy_score

# Assuming you have a test set (X_test, y_test) for each dataset
# Replace X_test_wp, y_test_wp, X_test_bcp, y_test_bcp, X_test_ar, and y_test_ar with your test data

# For website-phishing.csv
y_pred_wp = tree.predict(X_test_wp)
accuracy_wp = accuracy_score(y_test_wp, y_pred_wp)
print("Accuracy for website-phishing dataset:", accuracy_wp)

# For bcp.csv
y_pred_bcp = tree.predict(X_test_bcp)
accuracy_bcp = accuracy_score(y_test_bcp, y_pred_bcp)
print("Accuracy for bcp dataset:", accuracy_bcp)

# For arrhythmia.csv
y_pred_ar = tree.predict(X_test_ar)
accuracy_ar = accuracy_score(y_test_ar, y_pred_ar)
print("Accuracy for arrhythmia dataset:", accuracy_ar)


Accuracy for website-phishing dataset: 0.3984622342831298
Accuracy for bcp dataset: 0.0
Accuracy for arrhythmia dataset: 0.4835164835164835
