# Programming Assignment 2 by Zach Hatzenbeller

## Library Imports

In [1]:
# Library Imports are at the top
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Problem 1 - Feature Ranking Methods (FLDR, Decision Tree Classification)

### Part 1

In [5]:
# Read the MNIST table from disk, then split it column-wise:
# every column after the first becomes the feature matrix X,
# and the first column becomes the target vector y.
mnist = pd.read_csv("mnist.csv")
X, y = mnist.values[:, 1:], mnist.values[:, 0]

In [6]:
# Function to calculate the Fisher score used to rank features
def fisher_score(X, y):
    """Compute a per-feature Fisher score (between-class / within-class scatter).

    For every feature (column of ``X``) independently:

        S_B = sum over classes c of  n_c * (mean_c - mean_overall) ** 2
        S_W = sum over classes c of  sum over samples in c of (x - mean_c) ** 2
        score = S_B / S_W

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Anything ``np.asarray`` understands is accepted;
        non-numeric dtypes (e.g. object arrays) are converted to float.
    y : array-like of shape (n_samples,) or (n_samples, 1)
        Class label of each sample. Flattened to 1-D before use.

    Returns
    -------
    numpy.ndarray of shape (n_features,)
        Score of each feature. A feature whose S_B and S_W are both zero
        (0/0) scores 0. A feature with S_W == 0 but S_B > 0 scores ``inf``.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, or ``X`` and ``y`` disagree on the sample count.
    """
    # Accept lists, DataFrames and column-vector labels, not only ndarrays.
    X = np.asarray(X)
    y = np.asarray(y).ravel()

    if X.ndim != 2:
        raise ValueError(
            "X must be 2-D (n_samples, n_features), got shape {}".format(X.shape)
        )
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            "X and y disagree on the number of samples: {} vs {}".format(
                X.shape[0], y.shape[0]
            )
        )

    # Numeric arrays are used untouched (no copy, same results as before).
    # Other dtypes (e.g. object) must become float, otherwise the in-place
    # accumulation into the float S_B / S_W buffers below cannot be cast.
    if X.dtype.kind not in "biuf":
        X = X.astype(float)

    # Find unique classes and number of features
    classes = np.unique(y)
    n_features = X.shape[1]

    # Overall mean of each feature
    mean_overall = np.mean(X, axis=0)

    # Initialize S_B (between-class scatter) and S_W (within-class scatter)
    S_B = np.zeros(n_features)
    S_W = np.zeros(n_features)

    # Accumulate S_B and S_W for each feature, one class at a time
    for c in classes:
        X_c = X[y == c]  # Data points for class c
        mean_c = np.mean(X_c, axis=0)  # Mean of each feature for class c
        n_c = X_c.shape[0]  # Number of data points in class c

        S_B += n_c * (mean_c - mean_overall) ** 2
        S_W += np.sum((X_c - mean_c) ** 2, axis=0)

    # Compute Fisher Score: S_B / S_W. Division warnings are silenced because
    # zero denominators are expected for features that are constant per class.
    with np.errstate(divide='ignore', invalid='ignore'):
        scores = S_B / S_W
        # Only the 0/0 case (NaN) is zeroed; x/0 with x > 0 is left as inf.
        scores[np.isnan(scores)] = 0

    return scores

# How many of the best-scoring features to keep; used for both the
# selection and the printed heading so the two cannot drift apart.
N_TOP_FEATURES = 20

# Compute Fisher scores (one score per feature column of X)
scores = fisher_score(X, y)

# Rank features: argsort is ascending, so reverse it for descending order
# and keep the indices of the N_TOP_FEATURES highest scores.
ranked_features = np.argsort(scores)[::-1][:N_TOP_FEATURES]
print("Top {} Features Selected:".format(N_TOP_FEATURES))
print(ranked_features)


Top 20 Features Selected:
[350 378 461 406 434 409 542 386 514 377 489 433 568 462 428 358 543 515
 596 400]
