In [22]:
import numpy as np
from sklearn import preprocessing, decomposition

# Load the train/test arrays. Every column but the last is treated as image
# features; the last column is read as the integer class label.
train_data = np.load('MLProject2024/fashion_train.npy')
test_data = np.load('MLProject2024/fashion_test.npy')

train_images, train_labels = train_data[:, :-1], train_data[:, -1].astype(int)
test_images, test_labels = test_data[:, :-1], test_data[:, -1].astype(int)

# Fit a 16-component PCA on the training images only, then project both sets
# with that same fitted transform.
pca = decomposition.PCA(n_components=16)
pca_train = pca.fit_transform(train_images)
pca_test = pca.transform(test_images)

# Append the labels as a final column so the tree code receives one array.
train_data_pca = np.column_stack((pca_train, train_labels))

In [None]:
def gini_group(dataset):
    '''Return the Gini impurity of the labels stored in the last column of ``dataset``.

    Works for any number of classes. Labels are cast to int before counting.
    An empty group has impurity 0.
    '''
    class_counts = np.bincount(dataset[:, -1].astype(int))
    total = class_counts.sum()
    if total == 0:
        return 0
    proportions = class_counts / total
    return 1 - np.sum(proportions ** 2)

def gini_split(left, right):
    '''Return the size-weighted Gini impurity of the two halves of a split.

    Each half contributes its own impurity, weighted by its share of the
    rows. If both halves are empty the result is 0.
    '''
    size_left, size_right = len(left), len(right)
    size_all = size_left + size_right
    if size_all == 0:
        return 0
    weighted_left = (size_left / size_all) * gini_group(left)
    weighted_right = (size_right / size_all) * gini_group(right)
    return weighted_left + weighted_right

def split(dataset, column, value):
    '''Partition the rows of ``dataset`` on one feature column.

    Returns ``(left, right)``: rows whose ``column`` entry is strictly below
    ``value`` go left, rows whose entry is greater than or equal to it go right.
    '''
    feature = dataset[:, column]
    goes_left = feature < value
    # Kept as an explicit comparison (not ~goes_left) so rows that compare
    # False both ways, e.g. NaN, land in neither half, exactly as before.
    goes_right = feature >= value
    return dataset[goes_left], dataset[goes_right]

def best_split(data):
    '''Find the feature/threshold pair with the lowest weighted Gini impurity.

    ``data`` is a 2-D array whose last column holds the class labels
    (non-negative integers, possibly stored as floats) and whose other
    columns are features. Feature values are assumed to contain no NaN.

    For each feature the rows are sorted once and class counts are
    accumulated along the sorted order, so every candidate threshold is
    scored from those prefix counts instead of re-partitioning the whole
    dataset per candidate.

    Candidate thresholds are the midpoints between neighbouring sorted rows
    whose labels differ. Candidates that would leave one side empty (which
    happens when the two neighbours share the same feature value) are
    skipped, so a returned split always has two non-empty halves.

    Returns ``(left, right, column, value, gini)``. Rows with
    ``feature < value`` are in ``left`` and rows with ``feature >= value``
    are in ``right``. Returns ``(None, None, None, None, None)`` when no
    valid split exists. Ties are resolved in favour of the lowest column
    index, then the lowest candidate in sorted order.

    Raises ``ValueError`` if any label is negative.
    '''
    no_split = (None, None, None, None, None)

    n_rows = data.shape[0]
    n_features = data.shape[1] - 1  # Exclude label column
    if n_rows < 2 or n_features < 1:
        return no_split

    class_ids = data[:, -1].astype(int)
    if class_ids.min() < 0:
        raise ValueError('class labels must be non-negative integers')
    n_classes = class_ids.max() + 1

    best_gini = 1
    best_column = None
    best_value = None

    for column in range(n_features):
        # Sort once per feature; everything below works on this ordering.
        order = data[:, column].argsort()
        feature_values = data[order, column]
        labels = data[order, -1]

        # Only positions where the class label changes can be split points.
        boundaries = np.flatnonzero(labels[1:] != labels[:-1]) + 1
        if boundaries.size == 0:
            continue
        candidates = (feature_values[boundaries] + feature_values[boundaries - 1]) / 2

        # Number of rows strictly below each threshold (these go left).
        # searchsorted keeps this exact even when neighbouring values are tied.
        n_left = np.searchsorted(feature_values, candidates, side='left')
        n_right = n_rows - n_left
        usable = (n_left > 0) & (n_right > 0)
        if not usable.any():
            continue
        candidates = candidates[usable]
        n_left = n_left[usable]
        n_right = n_right[usable]

        # prefix[j, c] = number of class-c rows among the first j sorted rows.
        prefix = np.zeros((n_rows + 1, n_classes))
        prefix[np.arange(1, n_rows + 1), class_ids[order]] = 1
        prefix = prefix.cumsum(axis=0)

        left_counts = prefix[n_left]
        right_counts = prefix[-1] - left_counts
        gini_left = 1 - np.sum((left_counts / n_left[:, None]) ** 2, axis=1)
        gini_right = 1 - np.sum((right_counts / n_right[:, None]) ** 2, axis=1)
        ginis = (n_left / n_rows) * gini_left + (n_right / n_rows) * gini_right

        # argmin returns the first minimum, matching a strict '<' scan.
        winner = ginis.argmin()
        if ginis[winner] < best_gini:
            best_gini = ginis[winner]
            best_column = column
            best_value = candidates[winner]

    # Check if a valid split was found
    if best_column is None:
        return no_split

    # Materialise the two halves once, using the same '<' / '>=' convention
    # as split() and predict().
    column_values = data[:, best_column]
    best_left = data[column_values < best_value]
    best_right = data[column_values >= best_value]
    return best_left, best_right, best_column, best_value, best_gini

def majority_class(data):
    '''Return the most frequent class label in the last column of ``data``.

    Labels are cast to int before counting. When several classes are tied,
    the smallest label wins.
    '''
    votes = np.bincount(data[:, -1].astype(int))
    return np.argmax(votes)

def build_tree(data, max_depth, depth=0):
    '''Recursively builds the decision tree.

    ``data`` is a 2-D array whose last column holds the class labels.
    ``max_depth`` is the depth at which recursion stops; ``depth`` is the
    depth of the current node (callers normally leave it at 0).

    Returns either a leaf (the majority class label of ``data``) or a dict
    with keys ``'feature'``, ``'value'``, ``'left'`` and ``'right'``, where
    ``'left'`` and ``'right'`` are themselves subtrees or leaves.

    Raises ``ValueError`` if ``data`` has no rows.
    '''
    if data.shape[0] == 0:
        raise ValueError('cannot build a decision tree from an empty dataset')

    # Decide whether this node is a leaf BEFORE searching for a split:
    # best_split scans every feature, so running it for a node that is at
    # the depth limit (or already pure) is wasted work.
    labels = data[:, -1]
    if depth == max_depth or np.all(labels == labels[0]):
        return majority_class(data)

    left, right, best_column, best_value, _ = best_split(data)

    # No usable split (nothing found, or one side came back empty): make a leaf.
    if (best_column is None or
        left is None or right is None or
        left.size == 0 or right.size == 0):
        return majority_class(data)

    subtree = {
        'feature': best_column,
        'value': best_value,
        'left': build_tree(left, max_depth, depth + 1),
        'right': build_tree(right, max_depth, depth + 1)
    }
    return subtree

# The tree itself is built in the next cell, using the functions defined above.



In [26]:
# Fit the decision tree on the PCA-reduced training data, with a depth limit of 10.
tree = build_tree(train_data_pca, max_depth=10)
tree

{'feature': 2,
 'value': 366.4975298185517,
 'left': {'feature': 1,
  'value': -41.02902304704946,
  'left': {'feature': 0,
   'value': -1142.2567507467038,
   'left': {'feature': 1,
    'value': -904.892252026264,
    'left': {'feature': 7,
     'value': 42.75650389677854,
     'left': {'feature': 0,
      'value': -1480.989879497356,
      'left': {'feature': 3,
       'value': 66.77262487101096,
       'left': {'feature': 7,
        'value': -184.82784566655846,
        'left': {'feature': 0,
         'value': -1508.0340642784772,
         'left': 0,
         'right': 2},
        'right': {'feature': 4,
         'value': -108.60286227486341,
         'left': {'feature': 10,
          'value': 69.78769969247205,
          'left': 2,
          'right': 0},
         'right': {'feature': 14,
          'value': 17.970959127987467,
          'left': 4,
          'right': 3}}},
       'right': {'feature': 2,
        'value': 109.64427643029492,
        'left': {'feature': 1,
         'valu

In [46]:
def predict(tree, features):
    '''Return the leaf reached by walking ``features`` down ``tree``.

    ``tree`` is either a leaf value or a dict with keys ``'feature'``,
    ``'value'``, ``'left'`` and ``'right'``. At each dict node the walk goes
    left when ``features[node['feature']] < node['value']`` and right
    otherwise. Anything that is not a dict is treated as a leaf and returned.
    '''
    node = tree
    while isinstance(node, dict):
        branch = 'left' if features[node['feature']] < node['value'] else 'right'
        node = node[branch]
    return node


In [49]:
from sklearn.metrics import accuracy_score

# Classify every PCA-projected test row with the fitted tree, then score the
# predictions against the true test labels.
y_pred = [predict(tree, sample) for sample in pca_test]
accuracy = accuracy_score(test_labels, y_pred)
accuracy

0.7694