# Anomaly Detection

## Filenames

Identify images and movies whose filenames do not follow the usual naming patterns. Flagging them helps keep filenames consistent and can reveal files that come from an unusual image source.

In [3]:
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [210]:
# Folder whose filenames are checked; os.path.join avoids hand-built separators.
photos_dir = os.path.join(os.path.expanduser('~'), 'Downloads', 'photos')

# os.listdir returns entries in arbitrary order and also lists sub-directories.
# Keep regular files only and sort them so every run produces the same listing.
filenames = sorted(
    name for name in os.listdir(photos_dir)
    if os.path.isfile(os.path.join(photos_dir, name))
)
if not filenames:
    # Fail early with a clear message instead of CountVectorizer's
    # "empty vocabulary" ValueError.
    raise ValueError('no files found in ' + photos_dir)

# Vectorize each filename as a bag of characters. Case is preserved
# (lowercase=False) and only the 8 most frequent characters across all
# filenames are kept as features (max_features=8).
vectorizer = CountVectorizer(analyzer='char', lowercase=False, max_features=8)
vectors = vectorizer.fit_transform(filenames).toarray()

# Mean character-count vector over all filenames
mean_vector = np.mean(vectors, axis=0)

# Euclidean distance between each filename vector and the mean vector
euclidean_distances = np.linalg.norm(vectors - mean_vector, axis=1)

# A filename is an anomaly when its distance lies more than `z_score` standard
# deviations above the mean distance.
z_score = 4
distance_threshold = np.mean(euclidean_distances) + z_score * np.std(euclidean_distances)
anomaly_indices = np.where(euclidean_distances > distance_threshold)[0]

for index in anomaly_indices:
    print(filenames[index])

30B10CA3-E96D-408B-9F82-BD86CABEFF61.JPG
FullSizeRender.heic
1D2C26F3-CA07-40F4-84D9-8CC08EF54C48.JPG
FullSizeRender-2.MOV
FullSizeRender.MOV
FullSizeRender-1.MOV
FullSizeRender-2.heic
AC6AB640-0BD8-43C3-BC47-D0856FBA6422.jpg
9F8D4DD2-B460-4213-BEA0-461BF2DB3E1C.jpg
FullSizeRender-1.heic
6427165E-9C83-476E-BB8C-B2D27E63CDB2.JPG
23600585-C713-45C4-AD8D-22E08871F18A.jpg
