In [23]:
import numpy as np
import pandas as pd
from PIL import Image 
import os 
from tqdm import tqdm 

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the label table (one row per training image: its id and its breed).
labels = pd.read_csv("data/labels.csv")

# Map each distinct breed name to an integer class index; these integers are
# the targets we fit to. unique() keeps order of first appearance in the CSV.
breeds = labels['breed'].unique()
targets = {b: i for i, b in enumerate(breeds)}

print(targets)

X = []
y = []

# Build the feature matrix: one flattened 64x64 RGB image per labelled row.
# Zipping the two needed columns avoids building a Series per row (as
# iterrows() does) and drops the unused row-index variable.
for image_id, breed in tqdm(zip(labels['id'], labels['breed']), total=len(labels)):
    path = os.path.join("data/train/", f"{image_id}.jpg")
    # The context manager closes the underlying file as soon as the pixels
    # have been copied into a numpy array.
    with Image.open(path) as img:
        pixels = np.array(img.convert('RGB').resize((64, 64)))
    X.append(pixels.flatten())
    y.append(targets[breed])

X = np.array(X)
y = np.array(y)


{'boston_bull': 0, 'dingo': 1, 'pekinese': 2, 'bluetick': 3, 'golden_retriever': 4, 'bedlington_terrier': 5, 'borzoi': 6, 'basenji': 7, 'scottish_deerhound': 8, 'shetland_sheepdog': 9, 'walker_hound': 10, 'maltese_dog': 11, 'norfolk_terrier': 12, 'african_hunting_dog': 13, 'wire-haired_fox_terrier': 14, 'redbone': 15, 'lakeland_terrier': 16, 'boxer': 17, 'doberman': 18, 'otterhound': 19, 'standard_schnauzer': 20, 'irish_water_spaniel': 21, 'black-and-tan_coonhound': 22, 'cairn': 23, 'affenpinscher': 24, 'labrador_retriever': 25, 'ibizan_hound': 26, 'english_setter': 27, 'weimaraner': 28, 'giant_schnauzer': 29, 'groenendael': 30, 'dhole': 31, 'toy_poodle': 32, 'border_terrier': 33, 'tibetan_terrier': 34, 'norwegian_elkhound': 35, 'shih-tzu': 36, 'irish_terrier': 37, 'kuvasz': 38, 'german_shepherd': 39, 'greater_swiss_mountain_dog': 40, 'basset': 41, 'australian_terrier': 42, 'schipperke': 43, 'rhodesian_ridgeback': 44, 'irish_setter': 45, 'appenzeller': 46, 'bloodhound': 47, 'samoyed': 

100%|██████████| 10222/10222 [03:17<00:00, 51.86it/s]


In [None]:
# Standardize every pixel feature, then project onto 500 principal components.
# NOTE(review): both transforms are fit on the full matrix X, i.e. before the
# train/validation split performed further down, so validation images
# influence the learned scaling and projection. Consider fitting on the
# training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# random_state pins the randomized SVD solver that PCA may select for inputs
# of this size, so X_pca is identical across kernel restarts.
pca = PCA(n_components=500, random_state=42)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Inspect the PCA-projected feature matrix (numpy abbreviates large arrays with "...").
print(X_pca)

[[ 8.97265234e-01 -7.84292541e+00  1.24742835e-01 ...  3.02899040e+00
  -7.69965730e-01 -2.44273716e+00]
 [-7.63689584e+01  1.97533426e+01  1.05392611e+01 ... -1.23321324e+00
  -9.57673786e-01 -1.01327405e+00]
 [-8.75908884e+00  4.57308947e+01 -4.15992490e+00 ... -3.98514000e-02
   1.51510468e-01  3.54928150e-01]
 ...
 [-1.94521933e+01  1.21426919e+02 -6.14200995e+01 ... -1.85157984e-01
   8.50089889e-01  6.58489514e-01]
 [ 1.84845696e+01 -8.12605621e+00 -1.52712119e+01 ...  4.69363993e-01
  -1.05581769e+00  2.50118490e+00]
 [ 2.53402475e+01  5.72339608e+01 -4.06666098e+01 ... -3.09957873e-01
  -1.01090848e+00  4.51841772e-01]]


In [26]:
# Hold out 20% of the samples for validation, stratified on the class labels.
# A fixed random_state makes the split, and therefore the reported accuracy,
# reproducible on Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(
    X_pca, y, test_size=0.2, stratify=y, random_state=42
)

In [28]:
# Fit a logistic-regression classifier on the training split; max_iter caps
# the number of L-BFGS solver iterations. fit() returns the estimator itself.
model = LogisticRegression(solver='lbfgs', max_iter=1500).fit(X_train, y_train)

# Mean accuracy on the held-out validation split.
val_accuracy = model.score(X_val, y_val)
print("Accuracy: ", val_accuracy)

Accuracy:  0.02689486552567237
