In [None]:
from sklearn import decomposition, tree
from sklearn.metrics import accuracy_score
import numpy as np

# Load the training array from disk (path is relative to the notebook's
# working directory).
train_data = np.load('MLProject2024/fashion_train.npy')

print("Shape of the train data:", train_data.shape)
# Every column except the last is used as the feature matrix; the last column
# is used as the label vector.
train_images = train_data[:, :-1] 
train_labels = train_data[:, -1]  

# The test array is loaded and sliced exactly like the training array.
test_data = np.load('MLProject2024/fashion_test.npy')

print("Shape of the test data:", test_data.shape)

test_images = test_data[:, :-1] 
test_labels = test_data[:, -1]

# Reduce the features to 40 principal components. The projection is fitted on
# the training features only and then re-used unchanged for the test features.
pca = decomposition.PCA(40)
pca_train = pca.fit_transform(train_images)
pca_test = pca.transform(test_images)
# Baseline: scikit-learn decision tree with default hyper-parameters, trained
# on the PCA-reduced training features.
# NOTE(review): no random_state is passed to PCA or DecisionTreeClassifier, so
# results may differ between runs - confirm whether a fixed seed is wanted.
# NOTE(review): pca_test / test_labels and the imported accuracy_score are not
# used in this cell; the fitted classifier is never evaluated here.
clf = tree.DecisionTreeClassifier()
clf = clf.fit(pca_train, train_labels)



In [1]:
def gini_group(dataset):
    '''Calculate the Gini impurity of one group of samples.

    Parameters
    ----------
    dataset : array-like
        Rows of samples whose LAST column holds the class label (the layout
        produced by split()). A 1-D input is treated as a plain vector of
        labels.

    Returns
    -------
    float or int
        1 - sum(p_k ** 2) over the classes present in the group, where p_k is
        the fraction of rows carrying label k. Returns 0 for an empty group.
    '''
    dataset = np.asarray(dataset)
    if dataset.size == 0: #if the group is empty, gini impurity is 0
        return 0

    # Read the labels from the argument itself (last column) instead of
    # relying on a global variable.
    labels = dataset[:, -1] if dataset.ndim > 1 else dataset

    # np.unique counts whatever labels are actually present, so the function
    # works for any number of classes and for labels stored as floats
    # (np.bincount only accepts non-negative integers).
    _, counts = np.unique(labels, return_counts=True)
    proportions = counts / counts.sum()
    return float(1 - np.sum(proportions ** 2))

def gini_split(left, right):
    '''Calculate the Gini impurity of a two-way split.

    The result is the average of the two groups' impurities, each weighted by
    the share of rows that fell into that group.

    Parameters
    ----------
    left, right : sequence
        The two groups produced by a split; only len() is used here, the
        groups themselves are handed to gini_group().

    Returns
    -------
    float or int
        Weighted Gini impurity; 0 when both groups are empty.
    '''
    n_left = len(left)
    n_right = len(right)
    n_total = n_left + n_right
    if n_total == 0:
        return 0

    # No progress print here: this function runs once per candidate
    # threshold, so printing would flood the notebook output.
    weighted = 0.0
    for group, size in ((left, n_left), (right, n_right)):
        # An empty group has weight 0, so skip it rather than depend on how
        # gini_group() treats empty input.
        if size:
            weighted += size / n_total * gini_group(group)
    return weighted
    
def split(dataset, column, value):
    '''Split a dataset into two groups on a threshold.

    Parameters
    ----------
    dataset : array-like, 2-D
        Rows to partition.
    column : int
        Index of the column to compare.
    value : scalar
        Threshold; rows with row[column] < value go left, all others
        (including rows equal to the threshold) go right.

    Returns
    -------
    (left, right) : tuple of np.ndarray
        Row order is preserved within each group. An empty side keeps the
        column dimension (shape (0, n_columns)) so it can still be sliced
        by column.
    '''
    data = np.asarray(dataset)
    if data.size == 0:
        # Nothing to partition; a 1-D empty input cannot be indexed by column.
        return data.copy(), data.copy()

    # One vectorized comparison instead of a Python loop over every row.
    goes_left = data[:, column] < value
    return data[goes_left], data[~goes_left]
    

def best_split(data):
    '''Search every feature column and every distinct value in it for the
    split with the lowest Gini impurity.

    The last column of data is taken to be the label and is never used as a
    split feature. Returns the tuple (left, right, column, value, gini) of
    the winning split; if no candidate scores below 1 the tuple is
    (None, None, None, None, 1).
    '''
    print ("looking for best split")

    # Current winner, laid out exactly like the return value.
    winner = (None, None, None, None, 1)

    feature_count = data.shape[1] - 1  # Exclude label column
    for column in range(feature_count):
        # Each distinct value in the column is tried as a threshold.
        for value in np.unique(data[:, column]):
            left, right = split(data, column, value)
            score = gini_split(left, right)
            # Strict comparison: on ties the earliest candidate is kept.
            if score < winner[-1]:
                winner = (left, right, column, value, score)

    return winner




In [None]:
def build_tree(data, max_depth, depth=0):
    '''Build a decision tree recursively.

    Parameters
    ----------
    data : 2-D array
        Rows to build the (sub)tree from; passed unchanged to best_split()
        and majority_class().
    max_depth : int
        Depth at which recursion stops and a leaf is returned.
    depth : int, optional
        Depth of the current node; callers normally leave it at 0.

    Returns
    -------
    dict or leaf value
        An internal node is a dict with keys 'feature', 'value', 'left' and
        'right'; a leaf is whatever majority_class(data) returns.
        NOTE(review): majority_class is not defined in the visible cells -
        presumably it returns the most common label; confirm.
    '''
    # Check the depth limit before searching: best_split() is an exhaustive
    # scan, and at the depth limit its result would be thrown away.
    if depth == max_depth:
        return majority_class(data)

    left, right, best_column, best_value, _ = best_split(data)

    # Make a leaf when there is no usable split: either best_split() found no
    # candidate at all (None) or the best candidate leaves one side empty.
    if left is None or right is None or len(left) == 0 or len(right) == 0:
        return majority_class(data)

    return {
        'feature': best_column,
        'value': best_value,
        'left': build_tree(left, max_depth, depth + 1),   # recursive call
        'right': build_tree(right, max_depth, depth + 1),  # recursive call
    }