## We will use anomaly detection with a Gaussian Mixture Model in a reduced-dimension space to try to remove bad training instances from our dataset.

### Note that this example is done in 2D for illustration purposes, but can also be done in higher dimensions.

In [None]:
from umap import UMAP
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.mixture import GaussianMixture

In [None]:
# Location of the saved NumPy array holding the drawings for one class.
path = "doodle_data_npy/bear.npy"
# Later cells reshape each row of X to 28x28, i.e. one flattened image per row.
X = np.load(file=path)

In [None]:
%%time

x_tfm = UMAP(n_components=2).fit_transform(X)

In [None]:
# Shapes before and after the dimensionality reduction, one per line.
print(X.shape, x_tfm.shape, sep="\n")

## Visualization

In [None]:
# Show a strip of consecutive drawings starting at a random row of X.
num_images = 10
# randint is inclusive on both ends, so the last possible start still leaves
# room for num_images rows.
random_start = random.randint(0, X.shape[0] - num_images)
strip = X[random_start:random_start + num_images]
for slot, flat_image in enumerate(strip, start=1):
    plt.subplot(1, num_images, slot)
    # Each row is a flattened image; restore the 28x28 grid before drawing.
    plt.imshow(flat_image.reshape(28, 28), cmap="binary")
    plt.axis("off")

### Notice how some images look very different. These will end up in different locations in the reduced-dimension space. We will try to target these and remove them.

In [None]:
# Scatter of the 2D embedding; the very low alpha makes dense regions stand out.
plt.figure(figsize=(6, 6))
emb_x, emb_y = x_tfm[:, 0], x_tfm[:, 1]
plt.scatter(emb_x, emb_y, alpha=0.002)

### Let's sample some images from the cluster on the top left; it probably holds "bad" images.

In [None]:
# Rows whose embedding falls in the region with first coordinate < 4 and
# second coordinate > 4 (the suspicious cluster picked out on the scatter plot).
in_suspect_region = (x_tfm[:, 0] < 4) & (x_tfm[:, 1] > 4)
x_bad = X[in_suspect_region]

In [None]:
# Draw `num_images` randomly chosen drawings from the suspicious cluster.
num_images = 10
for slot in range(num_images):
    # Subplot positions are 1-based, so slot 0 fills the first panel
    # (starting the loop at 1 would leave that panel empty).
    plt.subplot(1, num_images, slot + 1)
    # randrange excludes its upper bound, so the index always stays inside
    # x_bad; randint is inclusive and could return x_bad.shape[0] itself.
    x = x_bad[random.randrange(x_bad.shape[0])].reshape(28, 28)
    plt.imshow(x, cmap="binary")

    plt.axis("off")

### Looks like images in this area are mostly not circles! If we remove them, the data should get much better!

# STRATEGY

#### Use a Gaussian Mixture Model to capture the main cluster, and then set a density threshold to remove all elements that lie in low-density regions, far away from the center of the cluster. This will remove the anomalies.

In [None]:
# A single Gaussian component models the main cluster of the 2D embedding;
# n_init=10 requests ten initializations of the fit.
gm = GaussianMixture(n_init=10, n_components=1)
gm.fit(X=x_tfm)

In [None]:
# Any instance located in a low-density region is considered to be an anomaly.
# score_samples returns the log-likelihood of each sample under the fitted
# model, i.e. the log of the estimated density at that location.
densities = gm.score_samples(x_tfm)
# Say 10% are anomalies (see https://koaning.io/til/moar-bad-labels/).
density_threshold = np.percentile(densities, 10)
# Build the mask once and use its complement for the anomalies, so the two
# groups always form an exact partition: a sample sitting exactly on the
# threshold is counted as an anomaly instead of vanishing from both groups.
is_non_anomaly = densities > density_threshold
non_anomalies = x_tfm[is_non_anomaly]
found_anomalies = x_tfm[~is_non_anomaly]
# Row indices of the kept samples, used to filter the original data.
non_anomalies_idxs = np.flatnonzero(is_non_anomaly)

In [None]:
# Embedding points that passed the density threshold.
plt.figure(figsize=(6, 6))
plt.title("Not Anomalies")
kept_x, kept_y = non_anomalies[:, 0], non_anomalies[:, 1]
plt.scatter(kept_x, kept_y, alpha=0.002)

In [None]:
# Embedding points flagged as anomalies by the density threshold.
plt.figure(figsize=(6, 6))
plt.title("Anomalies")
flagged_x, flagged_y = found_anomalies[:, 0], found_anomalies[:, 1]
plt.scatter(flagged_x, flagged_y, alpha=0.002)

In [None]:
# Keep only the original rows whose embedding was not flagged as an anomaly.
X_new = np.take(X, non_anomalies_idxs, axis=0)

In [None]:
# Re-embed the filtered data from scratch to see how its 2D layout changes.
new_reducer = UMAP(n_components=2)
x_new_tfm = new_reducer.fit_transform(X_new)

In [None]:
# Scatter of the embedding computed on the filtered data.
plt.figure(figsize=(6, 6))
new_x, new_y = x_new_tfm[:, 0], x_new_tfm[:, 1]
plt.scatter(new_x, new_y, alpha=0.002)