In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score


df = pd.read_csv('cancer.csv')


df = df.drop("STDs: Time since first diagnosis", axis=1)
df = df.drop("STDs: Time since last diagnosis", axis=1)


# Replace "?" with NaN values and then drop rows containing NaN values

# If you want to reset the index after dropping rows
df.reset_index(drop=True, inplace=True)


def calculate_accuracy(inputted_predictions, actual):
    correct = sum(inputted_predictions == actual)
    total = len(actual)
    accuracy = correct / total
    return accuracy


# Separate features and target variable
X = df.drop('Biopsy', axis=1)
y = df['Biopsy']


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [2]:
class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value  # Value if the node is a leaf node


class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth


    def fit(self, X, y):
        self.n_classes = len(np.unique(y))
        self.n_features = X.shape[1]
        self.tree = self._grow_tree(X, y)


    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_samples_per_class = [np.sum(y == i) for i in range(self.n_classes)]
        # Stopping criteria
        if (self.max_depth is not None and depth >= self.max_depth) or np.max(n_samples_per_class) == n_samples:
            leaf_value = np.argmax(n_samples_per_class)
            return Node(value=leaf_value)


        # Find the best split
        best_gini = np.inf
        best_criteria = None
        best_sets = None
        for feature_index in range(n_features):
            feature_values = np.unique(X[:, feature_index])
            for threshold in feature_values:
                left_indices = np.where(X[:, feature_index] <= threshold)[0]
                right_indices = np.where(X[:, feature_index] > threshold)[0]
                gini = self._gini(y[left_indices], y[right_indices])
                if gini < best_gini:
                    best_gini = gini
                    best_criteria = (feature_index, threshold)
                    best_sets = (left_indices, right_indices)


        # Create subtrees
        left = self._grow_tree(X[best_sets[0]], y[best_sets[0]], depth + 1)
        right = self._grow_tree(X[best_sets[1]], y[best_sets[1]], depth + 1)
        return Node(feature_index=best_criteria[0], threshold=best_criteria[1], left=left, right=right)


    def _gini(self, *groups):
        total_samples = sum(len(group) for group in groups)
        gini = 0.0
        for group in groups:
            size = float(len(group))
            if size == 0:
                continue
            score = 0.0
            for class_val in range(self.n_classes):
                p = [np.sum(group == class_val) / size for group in groups]
                score += p[class_val] ** 2
            gini += (1.0 - score) * (size / total_samples)
        return gini


    def _predict(self, x, tree):
        if tree.value is not None:
            return tree.value
        feature_value = x[tree.feature_index]
        subtree = tree.right
        if feature_value <= tree.threshold:
            subtree = tree.left
        return self._predict(x, subtree)


    def predict(self, X):
        return [self._predict(x, self.tree) for x in X]


# Instantiate the DecisionTree classifier
decision_tree = DecisionTree(max_depth=2)


# Fit the model to the training data
decision_tree.fit(X_train.values, y_train.values)




# Calculate test accuracy
dtc_test_predictions = decision_tree.predict(X_test.values)
dtc_test_accuracy = accuracy_score(dtc_test_predictions, y_test.values)
print("Decision Tree Classification Test Accuracy:", dtc_test_accuracy)


# Calculate train accuracy
dtc_train_predictions = decision_tree.predict(X_train.values)
dtc_train_accuracy = accuracy_score(dtc_train_predictions, y_train.values)
print("Decision Tree Classification Train Accuracy:", dtc_train_accuracy)


dtc_test_accuracy = calculate_accuracy(dtc_test_predictions, y_test.values)
print("Decision Tree Classification Test Accuracy (Alternate Method):", dtc_test_accuracy)
dtc_train_accuracy = calculate_accuracy(dtc_train_predictions, y_train.values)
print("Decision Tree Classification Train Accuracy (Alternate Method):", dtc_train_accuracy)


Decision Tree Classification Test Accuracy: 0.936046511627907
Decision Tree Classification Train Accuracy: 0.9358600583090378
Decision Tree Classification Test Accuracy (Alternate Method): 0.936046511627907
Decision Tree Classification Train Accuracy (Alternate Method): 0.9358600583090378


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


df = pd.read_csv('cancer.csv')


df = df.drop(["STDs: Time since first diagnosis", "STDs: Time since last diagnosis"], axis=1)


# If you want to reset the index after dropping rows
df.reset_index(drop=True, inplace=True)


def calculate_accuracy(inputted_predictions, actual):
    correct = sum(inputted_predictions == actual)
    total = len(actual)
    accuracy = correct / total
    return accuracy


# Only select the columns for prediction
X = df[['Schiller', 'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Citology']]
y = df['Biopsy']


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value  # Value if the node is a leaf node


class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.n_classes = len(np.unique(y))
        self.n_features = X.shape[1]
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_samples_per_class = [np.sum(y == i) for i in range(self.n_classes)]
        # Stopping criteria
        if (self.max_depth is not None and depth >= self.max_depth) or np.max(n_samples_per_class) == n_samples:
            leaf_value = np.argmax(n_samples_per_class)
            return Node(value=leaf_value)

        # Find the best split
        best_gini = np.inf
        best_criteria = None
        best_sets = None
        for feature_index in range(n_features):
            feature_values = np.unique(X[:, feature_index])
            for threshold in feature_values:
                left_indices = np.where(X[:, feature_index] <= threshold)[0]
                right_indices = np.where(X[:, feature_index] > threshold)[0]
                gini = self._gini(y[left_indices], y[right_indices])
                if gini < best_gini:
                    best_gini = gini
                    best_criteria = (feature_index, threshold)
                    best_sets = (left_indices, right_indices)

        # Create subtrees
        left = self._grow_tree(X[best_sets[0]], y[best_sets[0]], depth + 1)
        right = self._grow_tree(X[best_sets[1]], y[best_sets[1]], depth + 1)
        return Node(feature_index=best_criteria[0], threshold=best_criteria[1], left=left, right=right)

    def _gini(self, *groups):
        total_samples = sum(len(group) for group in groups)
        gini = 0.0
        for group in groups:
            size = float(len(group))
            if size == 0:
                continue
            score = 0.0
            for class_val in range(self.n_classes):
                p = [np.sum(group == class_val) / size for group in groups]
                score += p[class_val] ** 2
            gini += (1.0 - score) * (size / total_samples)
        return gini

    def _predict(self, x, tree):
        if tree.value is not None:
            return tree.value
        feature_value = x[tree.feature_index]
        subtree = tree.right
        if feature_value <= tree.threshold:
            subtree = tree.left
        return self._predict(x, subtree)

    def predict(self, X):
        return [self._predict(x, self.tree) for x in X]


# Instantiate the DecisionTree classifier
decision_tree = DecisionTree(max_depth=2)

# Fit the model to the training data
decision_tree.fit(X_train.values, y_train.values)

# Calculate test accuracy
dtc_test_predictions = decision_tree.predict(X_test.values)
dtc_test_accuracy = accuracy_score(dtc_test_predictions, y_test.values)
print("Decision Tree Classification Test Accuracy:", dtc_test_accuracy)

# Calculate train accuracy
dtc_train_predictions = decision_tree.predict(X_train.values)
dtc_train_accuracy = accuracy_score(dtc_train_predictions, y_train.values)
print("Decision Tree Classification Train Accuracy:", dtc_train_accuracy)

dtc_test_accuracy = calculate_accuracy(dtc_test_predictions, y_test.values)
print("Decision Tree Classification Test Accuracy (Alternate Method):", dtc_test_accuracy)
dtc_train_accuracy = calculate_accuracy(dtc_train_predictions, y_train.values)
print("Decision Tree Classification Train Accuracy (Alternate Method):", dtc_train_accuracy)


Decision Tree Classification Test Accuracy: 0.936046511627907
Decision Tree Classification Train Accuracy: 0.9373177842565598
Decision Tree Classification Test Accuracy (Alternate Method): 0.936046511627907
Decision Tree Classification Train Accuracy (Alternate Method): 0.9373177842565598


In [None]:
import pandas as pd
import numpy as np

class DecisionTree:
    def __init__(self):
        self.tree = None
    
    def gini_impurity(self, labels):
        if len(labels) == 0:
            return 0
        p = (labels == 1).sum() / len(labels)
        return 1 - p**2 - (1 - p)**2
    
    def find_best_split(self, data, labels):
        best_gini = float('inf')
        best_feature = None
        best_value = None
        
        for feature in data.columns:
            for value in data[feature].unique():
                left_labels = labels[data[feature] <= value]
                right_labels = labels[data[feature] > value]
                gini = (len(left_labels) * self.gini_impurity(left_labels) + 
                        len(right_labels) * self.gini_impurity(right_labels)) / len(labels)
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_value = value
        
        return best_feature, best_value, best_gini
    
    def fit(self, data, labels):
        self.tree = self._fit(data, labels)
    
    def _fit(self, data, labels):
        if len(labels.unique()) == 1:
            return {'prediction': labels.iloc[0]}
        
        if len(data.columns) == 0:
            return {'prediction': labels.mode()[0]}
        
        best_feature, best_value, best_gini = self.find_best_split(data, labels)
        
        left_data = data[data[best_feature] <= best_value]
        left_labels = labels[data[best_feature] <= best_value]
        right_data = data[data[best_feature] > best_value]
        right_labels = labels[data[best_feature] > best_value]
        
        return {'feature': best_feature,
                'value': best_value,
                'gini': best_gini,
                'left': self._fit(left_data, left_labels),
                'right': self._fit(right_data, right_labels)}
    
    def predict(self, data):
        predictions = []
        for _, row in data.iterrows():
            predictions.append(self._predict_row(row, self.tree))
        return pd.Series(predictions)
    
    def _predict_row(self, row, tree):
        if 'prediction' in tree:
            return tree['prediction']
        if row[tree['feature']] <= tree['value']:
            return self._predict_row(row, tree['left'])
        else:
            return self._predict_row(row, tree['right'])

# Example usage
data = pd.read_csv("cancer.csv")  # Load your data here
test_data = data[['Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller', 'Citology']]
labels = data['Biopsy']

model = DecisionTree()
model.fit(test_data, labels)

# Make predictions
test_predictions = model.predict(test_data)
print(test_predictions)
