# Digits recognition with clustering

`sklearn` ships with some built-in datasets. One of them consists of small images of digits, each paired with the digit it represents (the label).

## Loading and visualizing the dataset

In [None]:
# Look at the documentation of the function load_digits.
# Use it to get the data (both X and y).
from sklearn.datasets import load_digits

# In IPython/Jupyter, `?load_digits` displays the documentation.
# It shows that `load_digits()` returns a dictionary-like object
# whose entries hold the different parts of the dataset. The two
# needed here are `data` (the inputs, our X) and `target` (the
# labels, our y).
# `load_digits(return_X_y=True)` would return just those two
# fields, but the full object is loaded instead because the
# images are needed for the next question.
data = load_digits()
X, y = data["data"], data["target"]

# How many samples are in this dataset?
# X has one row per sample, so its first dimension is the answer.
n_samples = X.shape[0]
print("There are {} samples in the digits dataset.".format(n_samples))

# How many samples of the class '7' are in this dataset?
# `y == 7` is a boolean numpy array that is True wherever the
# label is 7. Summing it counts the True values (True counts as
# 1 and False as 0), which is the number of labels equal to 7.
n_sevens = (y == 7).sum()
print("There are {} samples of the class 7 in the dataset.".format(n_sevens))

# How many features are in each input vector?
# Each row of X is one input vector, so the second dimension of
# X is the number of features.
n_features = X.shape[1]
print("There are {} features in each input vector.".format(n_features))

# Plot the frequency of each class as a bar chart.
import numpy as np
import matplotlib.pyplot as plt

# The labels are small non-negative integers, so np.bincount()
# counts them directly: occurrences[k] is the number of samples
# whose label is k. This is preferred over np.histogram() with an
# integer `bins`, which builds equal-width bins over
# [min(y), max(y)] whose edges do not fall on the integer labels.
# `minlength` guarantees one slot per class name even if the
# highest classes have no sample.
occurrences = np.bincount(y, minlength=len(data["target_names"]))
plt.bar(data["target_names"], occurrences)

# The differences between bars are easier to see when the y-axis
# does not start at 0, so start it just below the smallest count
# (only the lower limit is set; the upper one is left unchanged).
plt.ylim(bottom=occurrences.min() - 2)

# One x tick per class, and one y tick per distinct count so each
# bar height can be read off the axis.
plt.xticks(data["target_names"])
plt.yticks(sorted(set(occurrences)))
plt.title("Frequency of each class in the digits dataset")
plt.show()

In [None]:
# Use the imshow() function of the plt library to display
# as images:
#   - 3 different samples of the class "9"
#   - 3 different samples of the class "6"
#   - 3 different samples of the class "5"

# Print all the images into the same plot (use subplots()
# with a size of 3x3 so that the first row contains "9", the
# second row contains "6" and the third row contains "5")

## Predicting digits with K-means

In [None]:
# Split the dataset into a train set and a test set with
# random_state = 0.

In [None]:
# With the KMeans class, create a model that clusters the
# training data. Use the right number of clusters.

In [None]:
# The model learns 2 things:
#   - the position of each centroid (model.cluster_centers_)
#   - the id of the cluster to which each point belongs (model.labels_)
#
# Use the above information to build, for each centroid,
# the list of points (and their labels) belonging to that
# centroid.
# Then build a mapping between a cluster id and the class it
# represents (use a dictionary for this).

In [None]:
# Look at the documentation of the .predict() method of your
# model. Use it to find the class predicted by your model
# for each point of your test set.

In [None]:
# What is the accuracy of your KMeans model on the test set ?

## Combining PCA and Kmeans for prediction

In [None]:
# Use the PCA algorithm to transform the digits data
# (all of them i.e. both train and test) into a 3-dimensional
# dataset.

In [None]:
# Print the reduced dataset on a 3D figure.

In [None]:
# Train another KMeans on the 3D dataset. Compare its accuracy
# against the previous KMeans model. Do you get better results ?