# In [40]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.autograd import Variable
import sys
import os
from shutil import copyfile
# sys.path.append("..")  # Adds higher directory to python modules path.
from PIL import Image
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Init

# Reference for the available models and the preprocessing they expect:
# http://pytorch.org/docs/master/torchvision/models.html
cuda = False

# Length of the feature vector captured from the 'avgpool' layer below.
image_vector_length = 512

# ResNet-18 with pretrained weights, switched to inference mode.
model = models.resnet18(pretrained=True)
if cuda:
    model.cuda()
model.eval()

# Layer whose output is captured (via a forward hook) as the image embedding.
output_layer = model._modules.get('avgpool')

# Preprocessing steps: resize -> tensor -> per-channel normalization.
to_tensor = transforms.ToTensor()
scaler = transforms.Resize((224, 224))
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Done with init

def get_image_vector(img):
    """Return the feature vector of ``output_layer`` for a single image.

    The image is resized, converted to a tensor and normalized with the
    module-level transforms, then pushed through ``model``. A temporary
    forward hook on ``output_layer`` captures that layer's output.

    Args:
        img: A PIL image. It must have 3 channels (RGB); callers are
            expected to convert other modes beforehand.

    Returns:
        A 1-D numpy array of length ``image_vector_length``.
    """
    # Add a leading batch dimension of size one.
    image = normalize(to_tensor(scaler(img))).unsqueeze(0)
    if cuda:
        image = image.cuda()

    embedding = torch.zeros(image_vector_length)

    def copy_data(m, i, o):
        # The layer output carries extra singleton dimensions (batch and
        # spatial); flatten it so it matches the 1-D buffer. Copying the
        # unflattened tensor is rejected by current PyTorch versions.
        embedding.copy_(o.detach().reshape(-1))

    h = output_layer.register_forward_hook(copy_data)
    try:
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            model(image)
    finally:
        # Always detach the hook, even if the forward pass raised, so hooks
        # do not pile up on the shared model.
        h.remove()

    return embedding.numpy()

def cluster_images(pca=False, input_path='./images/unclustered',
                   output_path='./images', num_clusters=6):
    """Cluster the .jpg images of a directory and copy them into per-cluster folders.

    Every ``.jpg`` file in ``input_path`` is embedded with
    ``get_image_vector``; the embeddings are optionally reduced with PCA and
    grouped with k-means. Each image is then copied to
    ``<output_path>/<cluster label>/<filename>``.

    Args:
        pca: If true, reduce the embeddings with PCA (at most 10 components)
            before clustering.
        input_path: Directory containing the images to cluster.
        output_path: Directory under which one sub-directory per cluster
            label is created.
        num_clusters: Requested number of k-means clusters. The default of 6
            reflects that the images were sampled from 6 subreddits. It is
            capped at the number of images found.

    Returns:
        None. The results are the copied files on disk.
    """
    # Filter up front so that skipped entries (other extensions,
    # sub-directories) neither leave all-zero rows in the feature matrix nor
    # get copied into a cluster folder afterwards.
    files = [name for name in os.listdir(input_path) if name.endswith(".jpg")]
    samples = len(files)
    if samples == 0:
        print('No .jpg images found in ' + input_path)
        return

    image_vector_holder = np.zeros((samples, image_vector_length))
    # Random processing order; every image is still used exactly once.
    sample_indices = np.random.choice(range(0, samples), size=samples, replace=False)

    print('Begin the clustering')
    for row, i in enumerate(sample_indices):
        with Image.open(os.path.join(input_path, files[i])) as img:
            # The model only works with images with 3 channels, i.e. RGB
            # images. If the image isn't RGB, convert it.
            rgb = img if img.mode == 'RGB' else img.convert('RGB')
            image_vector_holder[row, :] = get_image_vector(rgb)

    if pca:
        print('Apply the mighty PCA to reduce the dimensions of the data')
        # 10 components is an arbitrary choice; PCA cannot produce more
        # components than there are samples or features, so cap it.
        n_components = min(10, samples, image_vector_length)
        reduced_data = PCA(n_components=n_components).fit_transform(image_vector_holder)
    else:
        reduced_data = image_vector_holder

    # k-means cannot form more clusters than there are samples.
    n_clusters = min(num_clusters, samples)
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    kmeans.fit(reduced_data)

    print('Process reduced data')
    preds = kmeans.predict(reduced_data)

    # Create directories for each cluster that actually received an image.
    for label in set(preds):
        os.makedirs(os.path.join(output_path, str(label)), exist_ok=True)

    print('Copy images')
    for row, i in enumerate(sample_indices):
        name = files[i]
        copyfile(os.path.join(input_path, name),
                 os.path.join(output_path, str(preds[row]), name))

    
# Run the clustering once, without the PCA step.
print("Starting")
cluster_images(pca=False)
print("Fin")



# Captured notebook output:
# Starting
# Begin the clustering
#
#
#
#
# Apply the mighty PCA to reduce the dimensions of the data
# Process reduced data
# Copy images
# Fin
