# Notebook 2: Preprocessing Experiments
## CSE 546 Final Project - Flower Classification

**Date**: November 2024  
**Experiments**: 002-004 (Normalization, PCA, Feature Selection)  
**Goal**: Systematic comparison of preprocessing techniques

**Baseline to beat**: 87.16% CV accuracy (KNN k=5, no preprocessing)


## 1. Imports and Setup


In [None]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
# Silence every warning category from here on so cell output stays readable.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported below -- confirm that is intended for these experiments.
warnings.filterwarnings('ignore')

# Add src to path
# This appends the *parent* directory to the module search path, which is what
# lets the `src.*` imports further down resolve. The relative path depends on
# the current working directory -- presumably the notebook's own folder;
# verify when running from elsewhere.
sys.path.append('..')

# Sklearn imports
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Local imports
# These must stay below the sys.path.append call above.
from src.preprocessing import load_data
from src.evaluation import evaluate_model, save_figure
from src.utils import RANDOM_STATE, save_results, log_experiment

# Global plotting look: matplotlib style sheet first, then the seaborn palette.
sns.set_palette("husl")
plt.style.use('seaborn-v0_8-darkgrid')

# Seed NumPy's global RNG with the shared project seed.
np.random.seed(RANDOM_STATE)

# Setup banner: one print call, one line per message.
print(
    "All libraries imported successfully!",
    f"Random State: {RANDOM_STATE}",
    sep="\n",
)


In [None]:
# Load data and setup CV
# load_data is unpacked into five values here; only X_train is used in this
# cell (its .shape is printed below).
X_train, y_train, filenames, label_mapping, class_names = load_data('../data/')

# Shuffled, stratified splitter seeded with RANDOM_STATE so the fold
# assignment is reproducible from run to run.
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE)

print(f"Data loaded: {X_train.shape}")
# Read the fold count back from the splitter instead of repeating the literal,
# so this message cannot drift from the n_splits value configured above.
print(f"CV strategy: {cv.get_n_splits()}-fold Stratified")
# Plain string: there is nothing to interpolate, so no f-prefix.
print("\nBaseline to beat: 87.16% (KNN k=5, no preprocessing)")
