In [None]:
#This is a notebook to work with some galaxy images and apply the KMeans clustering algorithm.

#Author: Viviana Acquaviva

#License: BSD but really should be TBD - just be nice.

In [None]:
import numpy as np
import pandas as pd
import os
import random
import matplotlib.pyplot as plt
%matplotlib inline

import skimage
from skimage.transform import resize, rescale
from skimage import io
from skimage.feature import blob_dog, blob_log, blob_doh
from skimage.color import rgb2gray

This data set is composed of 200 images randomly selected from the Kaggle Galaxy Zoo challenge:

https://www.kaggle.com/c/galaxy-zoo-the-galaxy-challenge

The code below visualizes the first 25 objects in your data set. You can run it to get a view of the first 25 galaxies. Note: you might get an error message; in that case, see here:

https://stackoverflow.com/questions/43288550/iopub-data-rate-exceeded-in-jupyter-notebook-when-viewing-image

In [None]:
fig, axes = plt.subplots(ncols= 5, nrows = 5,figsize=(50,50))

# One panel per galaxy (Image_0.png ... Image_24.png), drawn without axis ticks.
for i, panel in enumerate(axes.ravel()):
    img = skimage.io.imread('Image_'+str(i)+'.png')
    panel.imshow(img, cmap='gray')
    panel.set_xticks([])
    panel.set_yticks([])

    

Let's now get rid of some multiple sources.

In [None]:
#This shows how multiple sources can be identified and masked.
#Top row: the original image with every detected blob circled in yellow.
#Bottom row: the same image after blanking every blob that is not at the image center.

n_ob = 5

fig, ax = plt.subplots(2, n_ob, figsize=(50, 20))

for i in range(n_ob):

    img = skimage.io.imread('Image_'+str(i)+'.png')

    image_gray = rgb2gray(img)

    blobs_log = blob_log(image_gray, max_sigma=30, num_sigma=10, threshold=.1)

    # blob_log returns one (row, col, sigma) triple per blob;
    # convert sigma to an approximate radius in the 3rd column.
    blobs_log[:, 2] = blobs_log[:, 2] * np.sqrt(2)

    # Sort the blobs from largest to smallest radius.
    blobs_log = blobs_log[blobs_log[:,2].argsort()[::-1]]

    ax[0,i].imshow(img, interpolation='nearest')

    # X holds row indices and Y holds column indices (open grid, broadcastable).
    X, Y = np.ogrid[:img.shape[0], :img.shape[1]]

    center = np.array([img.shape[0]/2, img.shape[1]/2]) #center, as (row, col)

    for blob in blobs_log:
        y, x, r = blob  # row, column, radius
        c = plt.Circle((x, y), r, color = 'yellow', linewidth=2, fill=False)
        ax[0,i].add_patch(c)

        # Compare (row, col) with the (row, col) center so the test is also
        # correct for non-square images.
        if (np.linalg.norm(np.array([y,x])-center)) > 10: #If not in center

            mask = (X - y)**2 + (Y - x)**2 < r**2
            img[mask] = 0

    ax[1,i].imshow(img, interpolation='nearest')

    print('I found', len(blobs_log), 'sources.')

    # Guard against images with fewer than two detections before comparing
    # the two largest blobs.
    if len(blobs_log) > 1 and blobs_log[1,2] > 0.5*blobs_log[0,2]: #second source bigger than half first
        print('Multiple large sources detected in image', str(i))


In [None]:
# `images` is only created in the "Start here!" section further down, so guard
# the lookup: on a fresh kernel this cell shows nothing instead of raising NameError.
images.shape if 'images' in globals() else None

In [None]:
#Takes a LONG time when the masked images have to be generated.
#Images whose 'NoSources_Image_<i>.png' output already exists are skipped, so the
#cell is cheap to re-run; set OVERWRITE = True to regenerate every file.

OVERWRITE = False

for i in range(200):

    out_name = 'NoSources_Image_'+str(i)+'.png'

    if os.path.exists(out_name) and not OVERWRITE:
        continue

    img = skimage.io.imread('Image_'+str(i)+'.png')

    image_gray = rgb2gray(img)

    blobs_log = blob_log(image_gray, max_sigma=30, num_sigma=10, threshold=.1)

    # blob_log returns one (row, col, sigma) triple per blob;
    # convert sigma to an approximate radius in the 3rd column.
    blobs_log[:, 2] = blobs_log[:, 2] * np.sqrt(2)

    # Sort the blobs from largest to smallest radius.
    blobs_log = blobs_log[blobs_log[:,2].argsort()[::-1]]

    # X holds row indices and Y holds column indices (open grid, broadcastable).
    X, Y = np.ogrid[:img.shape[0], :img.shape[1]]

    center = np.array([img.shape[0]/2, img.shape[1]/2]) #center, as (row, col)

    for blob in blobs_log:
        y, x, r = blob  # row, column, radius

        # Compare (row, col) with the (row, col) center so the test is also
        # correct for non-square images.
        if (np.linalg.norm(np.array([y,x])-center)) > 10: #If not in center

            mask = (X - y)**2 + (Y - x)**2 < r**2
            img[mask] = 0

    skimage.io.imsave(out_name,img)

    if np.mod(i, 10) == 0:
        print('Processing image', i)

## Start here!

In [None]:
#Takes < 1 minute

#Read each masked image, shrink it to 100x100 so it is more manageable,
#and flatten it into a single feature row.

flattened = []

for idx in range(200):
    small = resize(skimage.io.imread('NoSources_Image_'+str(idx)+'.png'), (100,100))
    flattened.append(small.reshape(-1))

# Stack the rows into one 2-D array: one row per image.
images = np.vstack(flattened)

In [None]:
# Shape of the feature matrix built above: one row per image, one column per flattened pixel value.
images.shape

In [None]:
from sklearn.cluster import KMeans

# KMeans starts from randomly chosen centroids; fixing random_state makes the
# cluster labels (and therefore the galaxy grids shown below) reproducible.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(images)

# Cluster label (0 or 1) for every image, in the same order as the rows of `images`.
y_kmeans = kmeans.predict(images)

In this case, the predictions (the cluster to which each image belongs) can only take the values 0 and 1. Here is a quick way to show which galaxies are predicted to belong to each cluster.

In [None]:
# Number of images assigned to cluster 0.
print(np.flatnonzero(y_kmeans == 0).size)

In [None]:
# Number of images assigned to cluster 1.
print(np.flatnonzero(y_kmeans == 1).size)

We can use the code below to take a look at 25 galaxies that were placed in the first cluster and see if they look somehow alike.

In [None]:
fig, axes = plt.subplots(ncols= 5, nrows = 5,figsize=(50,50))

#Image indices of the galaxies assigned to cluster 0; at most the first 25 are shown.
members = np.flatnonzero(y_kmeans == 0)

for panel, idx in zip(axes.ravel(), members[:25]):
    img = skimage.io.imread('NoSources_Image_'+str(idx)+'.png')
    panel.imshow(img, cmap='gray')
    panel.set_xticks([])
    panel.set_yticks([])

We can do the same thing for the second cluster.

In [None]:
fig, axes = plt.subplots(ncols= 5, nrows = 5,figsize=(50,50))

#Image indices of the galaxies assigned to cluster 1; at most the first 25 are shown.
members = np.flatnonzero(y_kmeans == 1)

for panel, idx in zip(axes.ravel(), members[:25]):
    img = skimage.io.imread('NoSources_Image_'+str(idx)+'.png')
    panel.imshow(img, cmap='gray')
    panel.set_xticks([])
    panel.set_yticks([])

Let's now do the same thing but with three clusters, and slightly smarter initial conditions.

In [None]:
# Fixing random_state keeps the cluster numbering stable between runs, so the
# cells below that inspect a specific label keep pointing at the same cluster.
# NOTE: 'k-means++' is already scikit-learn's default `init`; it is spelled out
# here only to make the choice explicit.
kmeans = KMeans(n_clusters=3, init= 'k-means++', random_state=0)
kmeans.fit(images)

# Cluster label (0, 1 or 2) for every image, in the same order as the rows of `images`.
y_kmeans = kmeans.predict(images)

In [None]:
#Let's see how big the clusters are.

for label in range(3):
    members = np.flatnonzero(y_kmeans == label)
    print(members.size)

We can investigate the small one.

In [None]:
fig, axes = plt.subplots(ncols= 5, nrows = 5,figsize=(50,50))

#Image indices of the galaxies assigned to cluster 2 (change the label here as
#necessary); computed once instead of on every loop iteration.
members = np.flatnonzero(y_kmeans == 2)

#Show at most the first 25 members; unused panels stay empty.
for panel, idx in zip(axes.ravel(), members[:25]):
    img = skimage.io.imread('NoSources_Image_'+str(idx)+'.png')
    panel.imshow(img, cmap='gray')
    panel.set_xticks([])
    panel.set_yticks([])

We can now take a look at the other two clusters.

In [None]:
fig, axes = plt.subplots(ncols= 5, nrows = 5, figsize=(50,50))

#Image indices of the galaxies assigned to cluster 0; computed once instead of
#on every loop iteration.
members = np.flatnonzero(y_kmeans == 0)

#Show at most the first 25 members; unused panels stay empty.
for panel, idx in zip(axes.ravel(), members[:25]):
    img = skimage.io.imread('NoSources_Image_'+str(idx)+'.png')
    panel.imshow(img, cmap='gray')
    panel.set_xticks([])
    panel.set_yticks([])

In [None]:
fig, axes = plt.subplots(ncols= 5, nrows = 5, figsize=(50,50))

#Image indices of the galaxies assigned to cluster 1; computed once instead of
#on every loop iteration.
members = np.flatnonzero(y_kmeans == 1)

#Show at most the first 25 members; unused panels stay empty.
for panel, idx in zip(axes.ravel(), members[:25]):
    img = skimage.io.imread('NoSources_Image_'+str(idx)+'.png')
    panel.imshow(img, cmap='gray')
    panel.set_xticks([])
    panel.set_yticks([])

### Q: Basically, KMeans is classifying galaxies according to size.

Ideas to fix?

<br> 0. Any metric should be rotationally invariant (or the data set should be pre-processed to correct for orientation).

<br> 1. Cut out central image and normalize size.

<br> 2. Improve pre-processing of multiple sources.

<br> 3. Modify the evaluation metric to give higher weight to features like color.

<br> 4. Use many clusters until you get a good degree of similarity among members of the same cluster, then manually re-group.


In [None]:
#This splits one of the clusters from the 3-cluster run (the one where y_kmeans == 1) in two.
#Note: `kmeans` is rebound to the new 2-cluster model; `y_kmeans` keeps the 3-cluster labels.

#Rows of `images` that belong to the cluster being split.
subset = images[y_kmeans == 1]

#Fixing random_state keeps the two sub-cluster labels stable between runs.
kmeans = KMeans(n_clusters=2, init= 'k-means++', random_state=0)

kmeans.fit(subset)

#Sub-cluster label (0 or 1) for each row of the subset, in subset order.
y_kmeans_l = kmeans.predict(subset)

Let's look at the two new clusters.

In [None]:
fig, axes = plt.subplots(ncols= 5, nrows = 5, figsize=(50,50))

#Map the sub-cluster labels back to original image indices: first the images
#with y_kmeans == 1, then those among them with y_kmeans_l == 0.
parent_idx = np.flatnonzero(y_kmeans == 1)
members = parent_idx[y_kmeans_l == 0]

for panel, idx in zip(axes.ravel(), members[:25]):
    img = skimage.io.imread('NoSources_Image_'+str(idx)+'.png')
    panel.imshow(img, cmap='gray')
    panel.set_xticks([])
    panel.set_yticks([])

In [None]:
fig, axes = plt.subplots(ncols= 5, nrows = 5,figsize=(50,50))

#Map the sub-cluster labels back to original image indices: first the images
#with y_kmeans == 1, then those among them with y_kmeans_l == 1.
parent_idx = np.flatnonzero(y_kmeans == 1)
members = parent_idx[y_kmeans_l == 1]

for panel, idx in zip(axes.ravel(), members[:25]):
    img = skimage.io.imread('NoSources_Image_'+str(idx)+'.png')
    panel.imshow(img, cmap='gray')
    panel.set_xticks([])
    panel.set_yticks([])

### Conclusions

IMHO, clustering algorithms are powerful when they are semi-supervised.

Pre-processing seems to be quite important; defining a proper distance metric can also help.