In [15]:
import numpy as np 
import pandas as pd
from PIL import Image, ImageOps, ImageFilter 
import matplotlib.pyplot as plt 
import os 
from tqdm import tqdm 

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix

In [2]:
# Label encoding: turn breed names into integer class ids.

labels = pd.read_csv("data/labels.csv")
breeds = labels['breed'].unique()

# Each breed is numbered by its position in `breeds`. The model works on
# these integer ids; the same mapping is used to translate ids back to breeds.
classes = dict(zip(breeds, range(len(breeds))))

In [3]:
# Image preprocessing: turn every training image into a flat, normalized
# grayscale feature vector and record its integer class label.
IMG_SIZE = (96, 96)    # target (width, height) passed to resize
BLUR_RADIUS = 0.5      # radius of the Gaussian blur applied after equalization
PIXEL_MAX = 255        # 8-bit grayscale maximum, used to scale pixels to 0->1

samples = []
targets = []

# for every image (with a progress bar)
for i, row in tqdm(labels.iterrows(), total=len(labels)):
    # Single quotes inside the f-string: reusing the enclosing double quote
    # is only valid syntax on Python 3.12+ and is a SyntaxError before that.
    path = os.path.join("data/train/", f"{row['id']}.jpg")  # locate the image

    # Open inside a context manager so the underlying file handle is released
    # right away instead of waiting for garbage collection. resize() returns
    # a new image that does not depend on the file staying open.
    with Image.open(path) as raw_img:
        img = raw_img.resize(IMG_SIZE)

    img = img.convert('L')  # grayscale
    img = ImageOps.equalize(img)  # equalize the histogram to boost contrast
    img = img.filter(ImageFilter.GaussianBlur(radius=BLUR_RADIUS))  # smooth edges

    sample = np.array(img)  # convert to numerical data
    sample = sample / PIXEL_MAX  # normalize pixel values from 0->255 to 0->1
    sample = sample.flatten()  # flatten into a 1-D array

    # store image data and the matching class label
    samples.append(sample)
    targets.append(classes[row['breed']])

100%|██████████| 10222/10222 [00:25<00:00, 394.83it/s]


In [4]:
# Standardize every feature (pixel position): subtract its mean and divide by
# its standard deviation, so each column ends up with mean ~0 and std 1.
# NOTE(review): the scaler is fitted on all samples, before the
# train/validation split performed later in this notebook, so validation
# statistics influence the transform - confirm this is acceptable.

scaler = StandardScaler(with_mean=True, with_std=True).fit(samples)
scaled_samples = scaler.transform(samples)


In [None]:
# PCA: project the standardized pixels onto fewer components so the
# downstream steps are cheaper to compute.
N_COMPONENTS = 500   # number of principal components to keep
PCA_SEED = 42        # fixes the randomized SVD solver so results are repeatable

pca = PCA(
    n_components=N_COMPONENTS,
    svd_solver='randomized',
    random_state=PCA_SEED,
)
samples_pca = pca.fit_transform(scaled_samples)

In [None]:
# Persist the PCA-reduced features: one row per image, ';'-separated,
# written without an index column or a header row.
sample_df = pd.DataFrame(samples_pca)
sample_df.to_csv(
    "data/processed_samples.csv",
    sep=';',
    header=False,
    index=False,
)

In [13]:
# Persist the integer class labels: one label per row, ';'-separated,
# written without an index column or a header row.
target_df = pd.DataFrame(targets)
target_df.to_csv(
    "data/targets.csv",
    sep=';',
    header=False,
    index=False,
)

In [38]:
# Build the feature matrix and label vector, then hold out 20% for validation.
X = np.array(samples_pca)
y = np.array(targets)

# stratify=y keeps the class proportions the same in both partitions.
# random_state pins the shuffle so the split (and every metric computed from
# it) is reproducible across kernel restarts; 42 matches the seed used for PCA.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)