In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score


# Load the data set. Missing measurements are written as "?" in the file, so
# that marker is declared to the parser: those cells become NaN at read time
# and the affected columns are parsed as numbers. (Replacing "?" only after
# loading leaves those columns as object dtype holding numeric *strings*.)
df = pd.read_csv('cancer.csv', na_values="?")


# Remove the two "time since diagnosis" columns before filtering rows, so that
# gaps in them do not cause rows to be discarded below.
# NOTE(review): presumably these columns are mostly missing - confirm.
df = df.drop(
    columns=[
        "STDs: Time since first diagnosis",
        "STDs: Time since last diagnosis",
    ]
)


# Keep only complete rows, then renumber them 0..n-1 so positional and
# label-based indexing agree afterwards.
df = df.dropna().reset_index(drop=True)


def calculate_accuracy(inputted_predictions, actual):
    """Return the fraction of predictions that equal the true labels.

    Both arguments are converted to NumPy arrays and compared position by
    position, so plain lists, arrays and pandas Series (regardless of their
    index) are all accepted.

    Parameters
    ----------
    inputted_predictions : array-like
        Predicted labels.
    actual : array-like
        True labels, same shape as ``inputted_predictions``.

    Returns
    -------
    float
        Number of matching positions divided by the number of labels.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    ZeroDivisionError
        If the inputs are empty.
    """
    predicted = np.asarray(inputted_predictions)
    expected = np.asarray(actual)
    # Refuse mismatched shapes explicitly; NumPy would otherwise broadcast
    # (or collapse the comparison to a single bool) and yield a bogus score.
    if predicted.shape != expected.shape:
        raise ValueError(
            f"shape mismatch: predictions {predicted.shape} vs actual {expected.shape}"
        )
    correct = int(np.count_nonzero(predicted == expected))
    total = expected.size
    return correct / total


# The 'Biopsy' column is the prediction target; every other column is a feature
y = df['Biopsy']
X = df.drop(columns=['Biopsy'])


# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

FileNotFoundError: [Errno 2] No such file or directory: 'cancer.csv'

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class Node:
    """One node of a binary decision tree.

    An internal node carries a split (``feature_index``, ``threshold``) and
    its two children (``left``, ``right``); a leaf carries only ``value``.
    Every field defaults to ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        # Split description (internal nodes)
        self.feature_index, self.threshold = feature_index, threshold
        # Child subtrees (internal nodes)
        self.left, self.right = left, right
        # Stored prediction; a non-None value marks the node as a leaf
        self.value = value


class DecisionTree:
    """Classification tree grown greedily on weighted Gini impurity.

    Every internal node tests ``x[feature_index] <= threshold``: samples that
    pass go to the left child, the others to the right child. A leaf stores
    the most frequent class label among the training samples that reached it
    (ties go to the smallest label).

    Parameters
    ----------
    max_depth : int or None, default None
        Maximum number of split levels. With ``None`` the tree grows until
        each leaf is pure or no threshold separates the remaining samples.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        """Grow the tree from training data.

        Parameters
        ----------
        X : array-like, 2-D (samples x features)
        y : array-like, 1-D class labels, one per row of ``X``

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        # Work on plain arrays so positional indexing is used throughout
        # (a pandas Series would be indexed by label instead).
        X = np.asarray(X)
        y = np.asarray(y)
        if y.shape[0] == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        self.n_classes = len(np.unique(y))
        self.n_features = X.shape[1]
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``X``/``y``."""
        n_features = X.shape[1]
        # Count the labels actually present instead of assuming 0..k-1.
        labels, counts = np.unique(y, return_counts=True)
        majority = labels[np.argmax(counts)]

        # Stopping criteria: depth budget exhausted, or the node is pure.
        if (self.max_depth is not None and depth >= self.max_depth) or len(labels) == 1:
            return Node(value=majority)

        # Find the best split
        best_gini = np.inf
        best_criteria = None
        best_sets = None
        for feature_index in range(n_features):
            column = X[:, feature_index]
            for threshold in np.unique(column):
                left_indices = np.where(column <= threshold)[0]
                right_indices = np.where(column > threshold)[0]
                # A split with an empty side separates nothing; recursing on
                # it would repeat this node forever when max_depth is None.
                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue
                gini = self._gini(y[left_indices], y[right_indices])
                if gini < best_gini:
                    best_gini = gini
                    best_criteria = (feature_index, threshold)
                    best_sets = (left_indices, right_indices)

        # No threshold separates the samples (e.g. all features constant).
        if best_sets is None:
            return Node(value=majority)

        # Create subtrees
        left = self._grow_tree(X[best_sets[0]], y[best_sets[0]], depth + 1)
        right = self._grow_tree(X[best_sets[1]], y[best_sets[1]], depth + 1)
        return Node(feature_index=best_criteria[0], threshold=best_criteria[1], left=left, right=right)

    def _gini(self, *groups):
        """Return the size-weighted Gini impurity of a candidate split.

        For each non-empty group the impurity is ``1 - sum(p_c ** 2)`` where
        ``p_c`` is the share of class ``c`` within that group; groups are then
        averaged with weights proportional to their sizes. Empty groups
        contribute nothing. Only the labels passed in are used, so no fitted
        state is required.
        """
        total_samples = sum(len(group) for group in groups)
        gini = 0.0
        for group in groups:
            size = len(group)
            if size == 0:
                continue
            _, counts = np.unique(group, return_counts=True)
            proportions = counts / size
            gini += (1.0 - np.sum(proportions ** 2)) * (size / total_samples)
        return float(gini)

    def _predict(self, x, tree):
        """Route one sample ``x`` down ``tree`` and return the leaf value."""
        if tree.value is not None:
            return tree.value
        feature_value = x[tree.feature_index]
        subtree = tree.right
        if feature_value <= tree.threshold:
            subtree = tree.left
        return self._predict(x, subtree)

    def predict(self, X):
        """Return a list with the predicted label for every row of ``X``."""
        return [self._predict(x, self.tree) for x in X]

def _weighted_gini(*groups):
    """Size-weighted Gini impurity of the label groups; empty groups are skipped."""
    total_samples = sum(len(group) for group in groups)
    impurity = 0.0
    for group in groups:
        size = len(group)
        if size == 0:
            continue
        _, counts = np.unique(group, return_counts=True)
        proportions = counts / size
        impurity += (1.0 - np.sum(proportions ** 2)) * (size / total_samples)
    return float(impurity)


def find_best_split(X, y):
    """Search every feature/threshold pair for the lowest weighted Gini split.

    Candidate thresholds are the distinct values of each feature column; rows
    with ``value <= threshold`` go left and rows with ``value > threshold`` go
    right. The impurity is computed from the labels alone, so no fitted
    ``DecisionTree`` instance is needed.

    Parameters
    ----------
    X : numpy.ndarray, 2-D (samples x features)
    y : numpy.ndarray, 1-D labels aligned with the rows of ``X``

    Returns
    -------
    (best_criteria, best_sets)
        ``best_criteria`` is ``(feature_index, threshold)`` and ``best_sets``
        is ``(left_indices, right_indices)`` as integer index arrays. Splits
        that leave the right side empty are returned only when no threshold
        separates the rows at all. Both items are ``None`` when ``X`` has no
        candidate thresholds (no columns or no rows).
    """
    n_features = X.shape[1]
    best_gini = np.inf
    best_criteria = None
    best_sets = None
    # First split seen that puts everything on one side; used only as a last
    # resort so a real split is never beaten by it on a tie.
    degenerate = None
    for feature_index in range(n_features):
        column = X[:, feature_index]
        for threshold in np.unique(column):
            left_indices = np.where(column <= threshold)[0]
            right_indices = np.where(column > threshold)[0]
            if len(left_indices) == 0 or len(right_indices) == 0:
                if degenerate is None:
                    degenerate = ((feature_index, threshold), (left_indices, right_indices))
                continue
            gini = _weighted_gini(y[left_indices], y[right_indices])
            if gini < best_gini:
                best_gini = gini
                best_criteria = (feature_index, threshold)
                best_sets = (left_indices, right_indices)
    if best_criteria is None and degenerate is not None:
        best_criteria, best_sets = degenerate
    return best_criteria, best_sets

def build_tree(X, y, depth=0, max_depth=None):
    """Recursively grow a decision tree and return its root ``Node``.

    Parameters
    ----------
    X : numpy.ndarray, 2-D (samples x features)
    y : numpy.ndarray, 1-D labels aligned with the rows of ``X``
    depth : int, default 0
        Depth of the node being built; callers normally leave it at 0.
    max_depth : int or None, default None
        Maximum number of split levels; ``None`` grows until every leaf is
        pure or no split separates the remaining samples.

    Returns
    -------
    Node
        A leaf holding the most frequent label (ties go to the smallest
        label), or an internal node with ``left``/``right`` subtrees.
    """
    n_samples = X.shape[0]
    if n_samples == 0:
        # Nothing to count; keep the historical result of a leaf predicting 0.
        return Node(value=0)

    # Count the labels actually present rather than assuming exactly {0, 1}.
    labels, counts = np.unique(y, return_counts=True)
    majority = labels[np.argmax(counts)]

    # Stopping criteria: depth budget exhausted, or the node is pure.
    if (max_depth is not None and depth >= max_depth) or len(labels) == 1:
        return Node(value=majority)

    # Find the best split
    best_criteria, best_sets = find_best_split(X, y)

    # No usable split (no features, or every candidate leaves one side
    # empty): recursing would rebuild this same node indefinitely.
    if best_sets is None or len(best_sets[0]) == 0 or len(best_sets[1]) == 0:
        return Node(value=majority)

    best_feature_index, best_threshold = best_criteria
    left_indices, right_indices = best_sets

    # Create subtrees
    left = build_tree(X[left_indices], y[left_indices], depth + 1, max_depth)
    right = build_tree(X[right_indices], y[right_indices], depth + 1, max_depth)
    return Node(feature_index=best_feature_index, threshold=best_threshold, left=left, right=right)

def predict_tree(x, tree):
    """Walk ``tree`` from the root for sample ``x`` and return the leaf value.

    At each internal node the sample goes left when
    ``x[feature_index] <= threshold`` and right otherwise; the first node
    whose ``value`` is not ``None`` ends the walk.
    """
    node = tree
    while node.value is None:
        goes_left = x[node.feature_index] <= node.threshold
        node = node.left if goes_left else node.right
    return node.value

def predict(X, tree):
    """Return a list with the prediction of ``tree`` for each sample in ``X``."""
    labels = []
    for sample in X:
        labels.append(predict_tree(sample, tree))
    return labels

# Load the dataset
df = pd.read_csv('cancer.csv')

# Keep only the columns used in this experiment and drop incomplete rows.
# dropna() is chained (not inplace) so it returns a fresh frame instead of
# mutating one that was derived from another frame by column selection.
df = df[['Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller', 'Citology', 'Biopsy']].dropna()

# Separate features and target variable
X = df.drop('Biopsy', axis=1)
y = df['Biopsy']

# Split the data into training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the decision tree
decision_tree = DecisionTree(max_depth=1)  # Start with depth 1
decision_tree.fit(X_train.values, y_train.values)

# Initial accuracy. accuracy_score takes (y_true, y_pred) in that order.
initial_predictions = decision_tree.predict(X_test.values)
initial_accuracy = accuracy_score(y_test.values, initial_predictions)
print("Initial Accuracy:", initial_accuracy)

# Iteratively build and test deeper trees (maximum depths 2, 3 and 4)
for depth in range(2, 5):
    print(f"Testing with maximum depth {depth}")
    new_tree = build_tree(X_train.values, y_train.values, max_depth=depth)
    new_predictions = predict(X_test.values, new_tree)
    new_accuracy = accuracy_score(y_test.values, new_predictions)
    print(f"Accuracy with max depth {depth}:", new_accuracy)


FileNotFoundError: [Errno 2] No such file or directory: 'cancer.csv'