In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%env name=samples1_512_equiv_fixed

env: name=samples1_512_equiv_fixed


In [3]:
# copy samples file to directory
!cp ./drive/Shareddrives/Memoria/samples/$name.zip .

!unzip -o -q $name.zip -d ./samples

# copy tools file
!cp ./drive/Shareddrives/Memoria/code/tools.py .

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import cv2
from sklearn.cluster import KMeans
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing import image
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import array_to_img, img_to_array, load_img
from natsort import natsort_keygen

from sklearn.metrics import accuracy_score, jaccard_score,\
                            f1_score, precision_recall_curve,\
                            confusion_matrix, ConfusionMatrixDisplay,\
                            silhouette_score, mutual_info_score,\
                            adjusted_mutual_info_score, normalized_mutual_info_score,\
                            adjusted_rand_score


In [5]:
np.random.seed(42)

In [6]:
# Constants

# dirs
inp_img_dir = "./samples/"
out_dir = ""

# init
start_num = 1

height, width = 32, 32

# placeholder
r = "rule_"

# Classes
class1 = [0, 255,
        8, 64, 239, 253,
        32, 251,
        40, 96, 235, 249,
        128, 254,
        136, 192, 238, 252,
        160, 250,
        168, 224, 234, 248]

class2 = [1, 127,
        2, 16, 191, 247,
        3, 17, 63, 119,
        4, 223,
        5, 95,
        6, 20, 159, 215,
        7, 21, 31, 87,
        9, 65, 111, 125,
        10, 80, 175, 245,
        11, 47, 81, 117,
        12, 68, 207, 221,
        13, 69, 79, 93,
        14, 84, 143, 213,
        15, 85,
        19, 55,
        23,
        24, 66, 189, 231,
        25, 61, 67, 103,
        26, 82, 167, 181,
        27, 39, 53, 83,
        28, 70, 157, 199,
        29, 71,
        33, 123,
        34, 48, 187, 243,
        35, 49, 59, 115,
        36, 219,
        37, 91,
        38, 52, 155, 211,
        42, 112, 171, 241,
        43, 113,
        44, 100, 203, 217,
        46, 116, 139, 209,
        50, 179,
        51,
        56, 98, 185, 227,
        57, 99,
        58, 114, 163, 177,
        62, 118, 131, 145,
        72, 237,
        73, 109,
        74, 88, 173, 229,
        76, 205,
        77,
        78, 92, 141, 197,
        94, 133,
        104, 233,
        108, 201,
        130, 144, 190, 246,
        132, 222,
        134, 148, 158, 214,
        138, 174, 208, #224,
        140, 196, 206, 220,
        142, 212,
        152, 188, 194, 230,
        154, 166, 180, 210,
        156, 198,
        162, 176, 186, 242,
        164, 218,
        170, 240,
        172, 202, 216, 228,
        178,
        184, 226,
        200, 236,
        204,
        232]
class3 = [18, 183,
        22, 151,
        30, 86, 135, 149,
        45, 75, 89, 101,
        60, 102, 153, 195,
        90, 165,
        105,
        122, 161,
        126, 129,
        146, 182,
        150]
class4 = [41, 97, 107, 121,
        54, 147,
        106, 120, 169, 225,
        110, 124, 137, 193]

In [7]:
def getPics(class_rules, samples_dir, n_samples_per_rule=None):
    class_imgs = []

    for r in class_rules:
        imgs_dir = samples_dir + f'rule_{str(r)}/'
        imgs_temp = [imgs_dir + f for f in os.listdir(imgs_dir)]
        class_imgs += imgs_temp[:n_samples_per_rule] # class balancing
    return class_imgs

c1_img_list = getPics(class1, inp_img_dir, 52)
c2_img_list = getPics(class2, inp_img_dir, 6)
c3_img_list = getPics(class3, inp_img_dir, 48)
c4_img_list = getPics(class4, inp_img_dir, 89)

files_target = [np.full(len(c1_img_list), 1), 
         np.full(len(c2_img_list), 2), 
         np.full(len(c3_img_list), 3), 
         np.full(len(c4_img_list), 4), 
         ]

# Data selection

c1_x = c1_img_list.copy()
c2_x = c2_img_list.copy()
c3_x = c3_img_list.copy()
c4_x = c4_img_list.copy()

c1_y = files_target[0].copy()
c2_y = files_target[1].copy()
c3_y = files_target[2].copy()
c4_y = files_target[3].copy()

c_x = np.array(c1_x+c2_x+c3_x+c4_x)
c_y = np.concatenate((c1_y,c2_y,c3_y,c4_y))

dataset = pd.DataFrame({'img':c_x, 'class':c_y})
dataset = dataset.sample(frac=1)
print(dataset.shape)
dataset.head(3)


(4888, 2)


Unnamed: 0,img,class
144,./samples/rule_8/8_000_0152_1074.png,1
79,./samples/rule_255/255_000_0124_7285.png,1
2098,./samples/rule_108/108_000_0021_8879.png,2


In [8]:
def getImages(files):
    img_list = []

    for img in files:
        img1 = load_img(out_dir + img)
        x = img_to_array(img1)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)   
        
        img_list.append(x)

    return np.array(img_list)

In [9]:
# Images Preview
def imagePreview(imgsArr):
    imgs_show = [imgsArr[0], imgsArr[50], imgsArr[1600], imgsArr[-4]]
    plt.figure(figsize=(16,8))
    columns = 4
    
    for i, image in enumerate(imgs_show):
        plt.subplot(len(imgs_show) // columns + 1, columns, i + 1)
        
        plt.imshow(array_to_img(image))

    plt.figure()
    plt.show()

In [10]:
X_train, X_test, y_train, y_test = train_test_split(dataset['img'].values,
                                                    dataset['class'].values,
                                                    test_size=0.33,
                                                    random_state=42)

In [11]:
X_train_imgs = getImages(X_train)
# imagePreview(X_train)

In [12]:
X_test_imgs = getImages(X_test)
# imagePreview(X_test)

In [13]:
# Extract features from the images
img_features = []

In [14]:
# Load the pre-trained VGG16 model
input_shape = (height, width, 3)
model = VGG16(weights='imagenet', input_shape=input_shape, include_top=False)

In [15]:
# do feature extraction
for img in X_train_imgs:
    features = model.predict(img, verbose=0)
    img_features.append(features.flatten())

# Classification 5 clusters

In [16]:
# Define the number of clusters
num_clusters = 5

# Normalize the features
img_features = np.array(img_features)
img_features_norm = (img_features - img_features.mean()) / img_features.std()

# Perform clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(img_features_norm)



In [17]:
# Visualize the clusters
img_list = list(X_train)

len(img_features), len(img_list), len(X_train)

(3274, 3274, 3274)

In [18]:
def parse_rule(w):
  return w.split('/')[2]

def sort_by_column(df, col):
    df2 = df.sort_values(by=col, key=natsort_keygen())
    return df2

def group_by_two_columns(df, col1, col2, col3):
    df2 = df.groupby([col1, col2]).size().reset_index(name=col3)
    return df2

def get_classification(info, rules):
  rules_ids = []
  for n in rules:
    temp = info.loc[info['imgs'] == n]
    ix = temp['sum'].idxmax()
    rules_ids.append(ix)
  return rules_ids

r = class1 + class2 + class3 + class4 
r = [f'rule_{i}' for i in r]

In [19]:
results_kmeans = pd.DataFrame({'imgs': X_train, 'assigned_labels': np.array(kmeans.labels_)})

results_kmeans['imgs'] = results_kmeans['imgs'].apply(parse_rule)

X = group_by_two_columns(results_kmeans,
                         'imgs',
                         'assigned_labels',
                         'sum')

X_ids = get_classification(X, r)
X = sort_by_column(X, 'imgs')

X = X[X.index.isin(X_ids)][['imgs','assigned_labels']]

In [20]:
X

Unnamed: 0,imgs,assigned_labels
0,rule_0,0
1,rule_1,4
145,rule_2,4
226,rule_3,4
241,rule_4,0
...,...,...
215,rule_251,0
216,rule_252,0
217,rule_253,0
218,rule_254,0


In [21]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [22]:
dataset['img'] = dataset['img'].apply(parse_rule)
original_labels = sort_by_column(dataset.groupby(by=['img','class'])\
                                 .count()\
                                 .reset_index(),
                                 'img')

In [23]:
print('normalized_mutual_info_score',
      normalized_mutual_info_score(original_labels['class'].values,
                                   X['assigned_labels'].values))
print('adjusted_rand_score',
      adjusted_rand_score(original_labels['class'].values,
                          X['assigned_labels'].values))

normalized_mutual_info_score 0.44956107029027026
adjusted_rand_score 0.4278357839309806


In [24]:
fig = make_subplots(rows=2,
                    cols=1,
                    shared_xaxes=True)
fig.update_yaxes(tickmode='linear')


fig.add_trace(go.Scatter(
    x=X['imgs'],
    y=X['assigned_labels'],
    mode="markers",
    name="VGG & Kmeans 5 clusters",
), row=1, col=1)

fig.add_trace(go.Scatter(
      x=original_labels['img'],
      y=original_labels['class'],
      mode="markers",
      name='original labels',
  ), row=2, col=1)

fig.update_layout(
    title="Clustering Results: Equivalent rules not included",
    height=900,
    width=2000,
    legend_title="Experiment",
    yaxis=dict(title='Classes'),
    )
fig.show()

# Classification 4 clusters

In [25]:
# Define the number of clusters
num_clusters = 4

# Normalize the features
img_features = np.array(img_features)
img_features_norm = (img_features - img_features.mean()) / img_features.std()

# Perform clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(img_features_norm)

results_kmeans = pd.DataFrame({'imgs': X_train, 'assigned_labels': np.array(kmeans.labels_)})

results_kmeans['imgs'] = results_kmeans['imgs'].apply(parse_rule)

X = group_by_two_columns(results_kmeans,
                         'imgs',
                         'assigned_labels',
                         'sum')

X_ids = get_classification(X, r)
X = sort_by_column(X, 'imgs')

X = X[X.index.isin(X_ids)][['imgs','assigned_labels']]
X





Unnamed: 0,imgs,assigned_labels
0,rule_0,1
1,rule_1,0
143,rule_2,0
225,rule_3,3
240,rule_4,0
...,...,...
214,rule_251,1
215,rule_252,1
216,rule_253,1
217,rule_254,1


In [26]:
print('normalized_mutual_info_score',
      normalized_mutual_info_score(original_labels['class'].values,
                                   X['assigned_labels'].values))
print('adjusted_rand_score',
      adjusted_rand_score(original_labels['class'].values,
                          X['assigned_labels'].values))

normalized_mutual_info_score 0.47392727782992183
adjusted_rand_score 0.49982565976559734


In [27]:
fig = make_subplots(rows=2,
                    cols=1,
                    shared_xaxes=True)
fig.update_yaxes(tickmode='linear')


fig.add_trace(go.Scatter(
    x=X['imgs'],
    y=X['assigned_labels'],
    mode="markers",
    name="VGG & Kmeans 4 clusters",
), row=1, col=1)

fig.add_trace(go.Scatter(
      x=original_labels['img'],
      y=original_labels['class'],
      mode="markers",
      name='original labels',
  ), row=2, col=1)

fig.update_layout(
    title="Clustering Results: Equivalent rules not included",
    height=900,
    width=2000,
    legend_title="Experiment",
    yaxis=dict(title='Classes'),
    )
fig.show()