## Import 

In [None]:
import os
import cv2
import glob
import random
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from PIL import Image
from tqdm.auto import tqdm 

from sklearn.manifold import TSNE
from matplotlib.pyplot import imshow
from sklearn.decomposition import PCA
from sklearn.feature_extraction import image

warnings.filterwarnings("ignore")

## Class wise distribution

In [None]:
# Each sub-directory of the dataset root holds the images of one deity class.
# os.path.join keeps the path portable instead of hard-coding Windows separators.
data_dir = os.path.join('data', 'deities')

# Keep directories only, so stray files in the root (e.g. OS metadata files)
# are not mistaken for classes and do not break the per-class listing below.
cls = [
    name for name in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, name))
]
num_cls = len(cls)

print(f"Number of classes: {num_cls}")
print(f"Classes: {cls}")

# Bar chart of the number of files per class. It must stay the last expression
# of the cell so that the notebook renders the returned figure.
px.bar(
    data_frame=pd.DataFrame({
        'deities': cls,
        'num_imgs': [
            len(os.listdir(os.path.join(data_dir, cl))) for cl in cls
        ]
    }),
    x="deities",
    y="num_imgs",
    color="deities"
)

There is class imbalance in our dataset. We have to come up with techniques to handle it; otherwise, the model may develop a bias towards the majority classes and perform poorly on the minority classes.

## Visualize images

Let us visualize a few images from each of the classes.

In [None]:
def display_images(deity, n=3):
    """Show ``n`` randomly chosen JPEG images of a deity in a 3-column grid.

    Args:
        deity: Class name; must be one of the entries of the module-level
            ``cls`` list.
        n: Number of images to display; must be a multiple of 3 so the grid
            has complete rows.

    Raises:
        AssertionError: If ``n`` is not a multiple of 3 or ``deity`` is not
            in ``cls``.
        ValueError: If the class folder contains fewer than ``n`` ``.jpg``
            files.
    """

    assert n % 3 == 0, f"{n} is not a multiple of 3."
    assert deity in cls, f"{deity} isn't a class."

    # Build the pattern with os.path.join so it also matches on non-Windows
    # systems, where a literal backslash is not a path separator.
    pattern = os.path.join('data', 'deities', deity, '*.jpg')
    candidates = glob.glob(pattern)

    # random.sample would raise a ValueError here anyway; raise the same
    # exception type with a message that says what is actually missing.
    if len(candidates) < n:
        raise ValueError(
            f"Requested {n} images but only {len(candidates)} match {pattern!r}."
        )

    chosen = random.sample(candidates, n)
    _, axes = plt.subplots(n // 3, 3, figsize=(10, 10))

    for ax, path in zip(axes.flatten(), chosen):
        ax.imshow(mpimg.imread(path))

    plt.show()


In [None]:
# One cell per class: each call shows a random sample of images using the
# default n=3 of display_images.
display_images(deity="murugan")

In [None]:
display_images(deity="buddha")

In [None]:
display_images(deity="ganesha")

In [None]:
display_images("saraswati")

In [None]:
display_images("kaali")

In [None]:
display_images("krishna")

In [None]:
display_images("sai-baba")

In [None]:
display_images("shiva")

In [None]:
display_images("lingam")

In [None]:
display_images(deity="nataraja")

In [14]:
display_images("theerthankaras")

## Distribution of image sizes

Analysing the dimensions of the images in our dataset helps us get a sense of the preprocessing steps that might be required.

In [None]:
# Collect the width (x) and height (y) of every file found one level below
# each class directory.
x, y = [], []

# os.path.join keeps the paths portable; the trailing '' preserves the
# trailing separator that folder_path has always carried.
folder_path = os.path.join('data', 'deities', '')
pattern = os.path.join(folder_path, '*', '*')

for pth in tqdm(glob.glob(pattern)):
    # Close each file as soon as its size has been read instead of leaving
    # the handle open until garbage collection.
    with Image.open(pth) as img:
        x_val, y_val = img.size
    x.append(x_val)
    y.append(y_val)

# Scatter plot of width against height. It must stay the last expression of
# the cell so that the notebook renders the returned figure.
px.scatter(
    data_frame=pd.DataFrame(
        {
            'dim_x': x,
            'dim_y': y
        }
    ),
    x='dim_x',
    y='dim_y'
)

The dimensions are all over the place. There are some outliers as well.

## TSNE plots

Let us plot the image data with t-SNE. Doing so lets us observe how similar or dissimilar the images are to each other, and it can help identify outliers or anomalies in the data. These might be images that are mislabeled, or images that do not fit well within any of the clusters.

In [None]:
def plot_tsne(deity):
    """Draw a t-SNE mosaic of all readable images of a deity.

    Every image that OpenCV can read is converted to grayscale, resized to
    45x45 and flattened. The vectors are compressed with PCA and embedded in
    2-D with t-SNE. A thumbnail (longest side at most 100 px) of each image is
    then pasted onto a 4000x3000 canvas at its embedded position, and the
    canvas is shown with matplotlib.

    Args:
        deity: Class name; must be one of the entries of the module-level
            ``cls`` list.

    Raises:
        AssertionError: If ``deity`` is not in ``cls``.
        ValueError: If fewer than two images in the class folder can be read.
    """

    assert deity in cls, f"{deity} isn't a class"

    # os.path.join instead of hard-coded backslashes keeps this portable.
    folder = os.path.join('data', 'deities', deity)

    features, images = [], []
    for filename in tqdm(os.listdir(folder)):
        path = os.path.join(folder, filename)
        pixels = cv2.imread(path)
        if pixels is None:
            # Not readable as an image by OpenCV: leave it out of the plot.
            continue
        pixels = cv2.cvtColor(pixels, cv2.COLOR_BGR2GRAY)
        pixels = cv2.resize(pixels, (45, 45))
        features.append(pixels.flatten())
        images.append(path)

    if len(images) < 2:
        raise ValueError(
            f"Need at least 2 readable images in {folder!r} for t-SNE, "
            f"found {len(images)}."
        )

    features = np.array(features)
    n_samples, n_features = features.shape

    # PCA cannot extract more components than min(n_samples, n_features), so
    # cap the target of 200 for classes with few images.
    pca = PCA(n_components=min(200, n_samples, n_features))
    X = pca.fit_transform(features)

    # t-SNE requires perplexity < n_samples; keep 30 whenever possible.
    tsne = TSNE(
        n_components=2,
        learning_rate=350,
        perplexity=min(30, n_samples - 1),
        angle=0.2,
        verbose=2,
    ).fit_transform(X)

    # Rescale both embedding axes to [0, 1] so they map onto canvas pixels.
    tx, ty = tsne[:, 0], tsne[:, 1]
    tx = (tx - np.min(tx)) / (np.max(tx) - np.min(tx))
    ty = (ty - np.min(ty)) / (np.max(ty) - np.min(ty))

    width = 4000
    height = 3000
    max_dim = 100

    full_image = Image.new('RGBA', (width, height))
    for path, x, y in zip(images, tx, ty):
        # Close the source file once the thumbnail has been produced.
        with Image.open(path) as source:
            # Shrink factor that brings the longest side down to max_dim;
            # never below 1, so small images are not enlarged.
            rs = max(1, source.width / max_dim, source.height / max_dim)
            size = (
                max(1, int(source.width / rs)),
                max(1, int(source.height / rs)),
            )
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
            # filter under its current name.
            tile = source.resize(size, Image.LANCZOS)
        full_image.paste(
            tile,
            (int((width - max_dim) * x), int((height - max_dim) * y)),
            mask=tile.convert('RGBA'),
        )

    plt.figure(figsize=(16, 12))
    plt.imshow(full_image)

In [None]:
# t-SNE image mosaics, one class per cell.
plot_tsne(deity="krishna")

In [None]:
plot_tsne(deity="shiva")

In [None]:
plot_tsne(deity="ayyappa")

Here, we can observe that there are two clusters. The cluster at the bottom right contains images with a white background.

In [None]:
# t-SNE image mosaics for two more classes.
plot_tsne(deity="ganesha")

In [None]:
plot_tsne(deity="lingam")

The cluster at the top right contains images with a plain background.

In [None]:
# t-SNE image mosaic for the hanuman class.
plot_tsne(deity="hanuman")