In [119]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import math
from collections import counter#this is to count majority class

In [169]:
class Node:
    """A single node of the decision tree.

    The constructor stores the statistics handed in by the caller (impurity,
    sample count, per-class counts and the class to predict) and initialises
    the split description to "no split yet": feature index 0, threshold 0 and
    no children.
    """

    def __init__(self, gini, num_samples, num_samples_per_class, predicted_class):
        # Statistics of the samples that reached this node.
        self.gini, self.num_samples = gini, num_samples
        self.num_samples_per_class = num_samples_per_class
        self.predicted_class = predicted_class
        # Split rule; overwritten by the tree builder when the node is split.
        self.feature_index = self.threshold = 0
        # Child nodes; both stay None for a leaf.
        self.left = self.right = None

In [198]:


def gini_impurity(y):
    """Return the Gini impurity of a collection of class labels.

    The impurity is ``1 - sum(p_c ** 2)`` where ``p_c`` is the fraction of
    labels equal to class ``c``. It is 0 for a pure set and grows as the
    classes become more evenly mixed.

    Parameters
    ----------
    y : array-like
        Class labels (anything ``np.unique`` accepts and ``len`` works on).

    Returns
    -------
    float
        The Gini impurity; 0.0 for an empty collection.
    """
    m = len(y)
    if m == 0:
        # No samples means nothing is mixed; without this guard the formula
        # below would report the maximum value 1.0 for an empty set.
        return 0.0
    # One pass over the labels gives every class count at once.
    _, counts = np.unique(y, return_counts=True)
    return 1.0 - sum((count / m) ** 2 for count in counts)
# Gini impurity = 1 - (sum of the squared class probabilities); a split is scored by the weighted sum of its children's impurities.

def split_dataset(X, y, index, threshold):
    """Partition the rows of ``X`` (and the matching entries of ``y``) on one feature.

    Rows whose value in column ``index`` is strictly below ``threshold`` go to
    the left part; rows whose value is greater than or equal to it go to the
    right part.

    Returns
    -------
    tuple
        ``(X_left, X_right, y_left, y_right)``.
    """
    column = X[:, index]
    goes_left = column < threshold
    goes_right = column >= threshold
    X_parts = (X[goes_left], X[goes_right])
    y_parts = (y[goes_left], y[goes_right])
    return (*X_parts, *y_parts)

def grow_tree(X, y, depth=0, max_depth=None):
    """Recursively build a decision tree and return its root ``Node``.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix, one row per sample.
    y : np.ndarray
        Class label of each row of ``X``.
    depth : int
        Depth of the node being built (0 for the root).
    max_depth : int or None
        Maximum depth of the tree. ``None`` (the default) means no limit:
        growth stops only when ``best_split`` finds no split.

    Returns
    -------
    Node
        The node for this subset, with ``left``/``right`` populated when the
        subset was split.

    Raises
    ------
    ValueError
        If ``y`` is empty, because no class can be predicted.
    """
    if len(y) == 0:
        raise ValueError("cannot grow a tree from an empty dataset")

    # Count the samples of every class; the most frequent class is the prediction.
    num_samples_per_class = {c: np.sum(y == c) for c in np.unique(y)}
    predicted_class = max(num_samples_per_class, key=num_samples_per_class.get)
    node = Node(
        gini=gini_impurity(y),
        num_samples=len(y),
        num_samples_per_class=num_samples_per_class,
        predicted_class=predicted_class,
    )

    # Comparing an int with None raises TypeError, so the "no limit" default
    # has to be tested explicitly before the depth comparison.
    if max_depth is not None and depth >= max_depth:
        return node

    idx, thr = best_split(X, y)
    if idx is None:
        # No split found: this node stays a leaf.
        return node

    X_left, X_right, y_left, y_right = split_dataset(X, y, idx, thr)
    node.feature_index = idx
    node.threshold = thr
    node.left = grow_tree(X_left, y_left, depth + 1, max_depth)
    node.right = grow_tree(X_right, y_right, depth + 1, max_depth)
    return node
# Find the best split: returns the feature index and the threshold (idx, thr), or (None, None) if there is none.
def best_split(X, y):
    """Find the feature/threshold pair that minimises the weighted Gini impurity.

    Every feature is scanned in sorted order; each midpoint between two
    consecutive distinct values is a candidate threshold. A candidate is kept
    only if its weighted child impurity is strictly lower than the impurity
    of the unsplit node.

    Parameters
    ----------
    X : np.ndarray
        2-D feature matrix, one row per sample.
    y : np.ndarray
        Class label of each row of ``X``.

    Returns
    -------
    tuple
        ``(feature_index, threshold)`` of the best split, or ``(None, None)``
        when there is at most one sample or no split lowers the impurity.
    """
    m_sample, n_features = X.shape
    if m_sample <= 1:
        # A single sample cannot be split.
        # (author's note: requiring 2 or 3 samples here gave lower accuracy)
        return None, None

    # Samples per class at this node, e.g. labels [1, 2, 2, 3, 3] give
    # {1: 1, 2: 2, 3: 2}. A dict lets the scan below update counts by label.
    class_labels = np.unique(y)
    num_parent = {c: np.sum(y == c) for c in class_labels}
    # Debug trace of the class counts at the node being examined.
    print(num_parent)
    # Impurity of the unsplit node: a split must beat this to be accepted.
    best_gini = 1.0 - sum((num / m_sample) ** 2 for num in num_parent.values())
    best_idx, best_thr = None, None

    for idx in range(n_features):
        # Sort the samples by this feature's value, carrying the labels along.
        thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
        num_left = {c: 0 for c in class_labels}
        num_right = num_parent.copy()
        # Position i puts the first i sorted samples on the left.
        for i in range(1, m_sample):
            c = classes[i - 1]
            num_left[c] += 1
            num_right[c] -= 1
            if thresholds[i] == thresholds[i - 1]:
                # Equal values cannot be separated by a threshold.
                continue
            gini_left = 1.0 - sum((num_left[x] / i) ** 2 for x in num_left)
            gini_right = 1.0 - sum(
                (num_right[x] / (m_sample - i)) ** 2 for x in num_right
            )
            # Children's impurities weighted by their sample counts.
            gini = (i * gini_left + (m_sample - i) * gini_right) / m_sample
            if gini < best_gini:
                best_gini = gini
                best_idx = idx
                # Midpoint between the two neighbouring values.
                best_thr = (thresholds[i] + thresholds[i - 1]) / 2

    return best_idx, best_thr
# Walk down the tree to find the leaf node that the sample falls into.
def predict_sample(node, sample):
    """Return the class predicted for one sample by the tree rooted at ``node``.

    Starting at ``node``, go left while the sample's value for the node's
    feature is strictly below the node's threshold and right otherwise, until
    a node without children is reached; that node's ``predicted_class`` is
    the result.
    """
    current = node
    # A node is a leaf only when both children are missing.
    while current.left is not None or current.right is not None:
        if sample[current.feature_index] < current.threshold:
            current = current.left
        else:
            current = current.right
    return current.predicted_class

def predict(tree, X):
    """Predict a class for every sample in ``X`` using the given tree.

    Returns a plain Python list with one prediction per sample, in the order
    the samples appear in ``X``.
    """
    predictions = []
    for sample in X:
        predictions.append(predict_sample(tree, sample))
    return predictions

Predicted classes: [1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]
Actual classes:    [1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]
the accaracy of the model is 100.0%


In [202]:
# Read the Iris data set.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
data = pd.read_csv("/Users/vishaal/Downloads/archive/Iris.csv")

# The tree code does not handle missing values, so fail loudly if any show up
# instead of only assuming the data is clean.
assert not data.isna().any().any(), "Iris data unexpectedly contains NaN values"

# Features: every column except the row id and the label.
X = data.drop(columns=['Species', 'Id'])
# Labels: encode the three species names as the integers 0, 1 and 2.
Y = data['Species'].map({"Iris-setosa": 0, "Iris-versicolor": 1, "Iris-virginica": 2})

# Convert the data into NumPy arrays.
X = X.values
Y = Y.values

# Split the data; the labels live in `Y` (capital). A fixed random_state makes
# the split, and therefore the reported accuracy, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
# Grow the decision tree on the training part.
tree = grow_tree(X_train, y_train, max_depth=3)
# Predict the classes of the held-out samples.
y_pred = predict(tree, X_test)
def accuracy(list1, list2):
    """Return the fraction of positions at which the two sequences agree.

    Raises
    ------
    ValueError
        If the two sequences differ in length.
    """
    total = len(list1)
    if total != len(list2):
        raise ValueError("Lists must have the same length")

    # Tally the positions where both sequences hold equal elements.
    hits = 0
    for left, right in zip(list1, list2):
        if left == right:
            hits += 1

    return hits / total
# Score the predictions against the held-out labels; y_test is a NumPy array,
# so it is turned into a plain list to match the list returned by predict().
a = accuracy(list1=y_test.tolist(), list2=y_pred)
print(f"the accaracy of the model is {a*100}%")
# Last expression of the cell: display the shape of the feature matrix.
X.shape

{0: 35, 1: 37, 2: 28}
{0: 35}
{1: 37, 2: 28}
{1: 32}
{1: 5, 2: 28}
the accaracy of the model is 90.0%


(150, 4)