In [None]:
import os
import pickle
import random
import numpy as np
import matplotlib.pyplot as plt
from utils import unpickle
from sklearn.decomposition import PCA

# Load the CIFAR-10 dataset

## About the dataset

- The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000 images per class. 
- There are 50000 training images and 10000 test images.
- The dataset is divided into five training batches and one test batch, each with 10000 images.
- The test batch contains exactly 1000 randomly-selected images from each class.
- The training batches contain the remaining images in random order, but some training batches may contain more images from one class than another.

## About the classes

The 10 classes are:

1. airplane
2. automobile
3. bird
4. cat
5. deer
6. dog
7. frog
8. horse
9. ship
10. truck

## About the images

- Each row of a batch's `data` array stores one 32x32 colour image as 3072 values.
- The first 1024 entries contain the red channel values, the next 1024 the green, and the final 1024 the blue.
- Each channel is stored in row-major order, so the first 32 entries of a row are the red channel values of the first row of the image.

In [None]:
# Directory that holds the extracted CIFAR-10 batch files.
# The trailing slash matters: later cells build file paths with `path + name`.
path = "../data/cifar-10-batches-py/"
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is identical on every run and platform.
sorted(os.listdir(path))

In [None]:
# Locate and read the metadata file that ships next to the batches, then
# display the entry stored under the b'label_names' key.
meta_file = f"{path}batches.meta"
meta = unpickle(meta_file)
meta[b'label_names']

In [None]:
# Collect the batch file names found in the dataset directory.
# - Use the `in` operator rather than calling the `__contains__` dunder directly.
# - Sort the matches: os.listdir yields entries in arbitrary order, so without
#   sorting the order of `train_files` (and which match `test_file` picks if
#   several names contain "test_batch") could change between runs.
train_files = sorted(name for name in os.listdir(path) if "data_batch" in name)
# Raises IndexError if the directory contains no test batch file.
test_file = sorted(name for name in os.listdir(path) if "test_batch" in name)[0]

In [None]:
# Disabled on purpose: this would unpickle every training batch file listed in
# `train_files`. The training images are obtained further down via
# `dataset.get_train()` instead.
# train = [unpickle(path + file) for file in train_files]

In [None]:
# Read the test batch file and display the keys of the object that
# `unpickle` returns for it.
test = unpickle(f"{path}{test_file}")
test.keys()

In [None]:
# How many labels the test batch carries.
len(test[b'labels'])

In [None]:
# Dimensions of the test batch's pixel array (rows x values per row).
test[b'data'].shape

In [None]:
# Cut every flattened image row into its three colour planes. The column
# ranges follow the storage layout: red first, then green, then blue,
# 1024 values each.
red_channel, green_channel, blue_channel = (
    test[b'data'][:, cols]
    for cols in (slice(None, 1024), slice(1024, 2048), slice(2048, None))
)
tuple(plane.shape for plane in (red_channel, green_channel, blue_channel))

In [None]:
# Fold each flat 1024-value channel row back into a 32x32 plane.
# -1 lets NumPy infer the number of images from the data, so this cell keeps
# working if the batch does not contain exactly 10000 images (the previous
# hard-coded 10000 would raise a ValueError in that case).
image_red = red_channel.reshape(-1, 32, 32)
image_green = green_channel.reshape(-1, 32, 32)
image_blue = blue_channel.reshape(-1, 32, 32)
image_red.shape, image_green.shape, image_blue.shape

In [None]:
# Combine the three (N, 32, 32) planes along a new trailing axis, giving a
# channels-last array of shape (N, 32, 32, 3) that can be indexed per image.
image = np.stack((image_red, image_green, image_blue), axis=-1)
image.shape

In [None]:
# Render the image at index 1 of the reassembled array as a visual sanity
# check that the channel split / reshape / stack steps were done correctly.
plt.imshow(image[1])
plt.show()

## A class for loading and processing the CIFAR-10 dataset

In [None]:
from utils import Cifar10

In [None]:
# Use a dedicated name for the dataset directory instead of reassigning `path`.
# The exploratory cells earlier in the notebook build file names with
# `path + name` and rely on `path` ending in a slash; overwriting it with a
# value that has no trailing slash would silently break those cells when they
# are re-run.
cifar_dir = "../data/cifar-10-batches-py"
dataset = Cifar10(cifar_dir)
# Load the training split through the helper class and report array shapes.
train_images, train_labels = dataset.get_train()
print(train_images.shape, train_labels.shape)

In [None]:
# Show one randomly chosen training image together with its class name.
# random.randint includes BOTH endpoints, so randint(0, 50000) could return
# 50000 - one past the last valid index of a 50000-image array - and raise
# IndexError. randrange excludes the upper bound, and taking it from
# len(train_images) keeps the index valid whatever the dataset size is.
idx = random.randrange(len(train_images))
plt.imshow(train_images[idx])
plt.title(f'An image of {dataset.label_names[train_labels[idx]]}')
plt.show()

## Reducing the dimensionality of the images

In [None]:
# Request both splits again, this time with flatten=True, to use them as
# input for the dimensionality reduction below; then display all four shapes.
X_train, y_train = dataset.get_train(flatten=True)
X_test, y_test = dataset.get_test(flatten=True)
tuple(arr.shape for arr in (X_train, y_train, X_test, y_test))

In [None]:
# Fit a 512-component PCA on the flattened training images.
# A fixed random_state makes the fit reproducible across kernel restarts:
# for an input this large scikit-learn's default solver selection is expected
# to use the randomized SVD, whose result otherwise varies from run to run
# (random_state is ignored by the deterministic solvers, so it is harmless).
pca = PCA(n_components=512, random_state=0)
pca.fit(X_train)
# Fraction of the total variance retained by the kept components.
pca.explained_variance_ratio_.sum()

In [None]:
# Project both splits onto the components learned from the training data
# (train first, then test) and display the resulting shapes.
X_train_pca, X_test_pca = map(pca.transform, (X_train, X_test))
X_train_pca.shape, X_test_pca.shape