In [9]:
from tensorflow.keras.models import Model
from tensorflow.keras.applications import VGG16
from sklearn.cluster import KMeans
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os
import numpy as np
import warnings
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score, adjusted_rand_score
from PIL import Image


dataset_dir = "dataset"
warnings.filterwarnings("ignore")

Preprocessing/Resizing the images

In [10]:
def normalize_image(image):
  image_array = np.array(image)
  normalized_array = image_array / 255.0
  return Image.fromarray((normalized_array * 255).astype(np.uint8))

def denoise_image(image):
  # Apply median filtering (adjust kernel size as needed)
  image = cv2.medianBlur(image, ksize=5)

  # Normalize the image
  return normalize_image(image)

def preprocess_image(image):
  return normalize_image(image)
  # no need for denoising now
  # denoise_image(image)

Preprocessing images by resizing and normalizing them.

Saving the new images in new train and test directories.

In [11]:
train_dataset_folder = f"{dataset_dir}/train"
train_output_folder = f"{dataset_dir}/resized_train"

test_dataset_folder = f"{dataset_dir}/test"
test_output_folder = f"{dataset_dir}/resized_test"

In [12]:
def resize_images(dataset_folder, output_folder, target_size, force_update=False):
    if os.path.isdir(output_folder) and len(os.listdir(output_folder)) != 0 and not force_update:
        return
    # Loop through all images in the dataset folder
    for filename in os.listdir(dataset_folder):
        # Skiping non-image files in case any
        if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
            continue

        image_path = os.path.join(dataset_folder, filename)
        image = Image.open(image_path)

        resized_image = image.resize(target_size, Image.LANCZOS)

        if image.mode != 'RGB':
            resized_image = resized_image.convert('RGB')

        resized_image = preprocess_image(resized_image)

        # Saving the resized image
        output_path = os.path.join(output_folder, filename)
        resized_image.save(output_path)

os.makedirs(train_output_folder, exist_ok=True)
os.makedirs(test_output_folder, exist_ok=True)

resize_images(train_dataset_folder, train_output_folder, target_size=(224, 224))
resize_images(test_dataset_folder, test_output_folder, target_size=(224, 224))

In [13]:
def build_image_features_extraction_model():
    base_model = VGG16(weights='imagenet', include_top=True)
    # Retrieve the last dense layer name
    last_dense_layer_name = base_model.layers[-2].name
    # return Model(inputs=base_model.input, outputs=base_model.get_layer('block5_pool').output)
    # Create a new model for feature extraction from the last dense layer
    return Model(inputs=base_model.input, outputs=base_model.get_layer(last_dense_layer_name).output)

In [14]:
def extract_image_features_as_list(feature_extractor, dir_path, max_iterations=-1):
    #train_output_folder
    features_list = []
    count = 0
    for filename in os.listdir(dir_path):
        if filename.endswith(('.jpg', '.jpeg', '.png')):
            # Load and preprocess the image
            img_path = os.path.join(dir_path, filename)
            img = Image.open(img_path)
            img_array = np.array(img)
            # add batch dimensions
            img_array = np.expand_dims(img_array, axis=0)
            img_array = img_array / 255.0

            # Extract features from the image
            features = feature_extractor.predict(img_array)

            # Append the features to the list
            features_list.append(features)

            count += 1
            if max_iterations > 0 and count >= max_iterations:
                break

    return features_list

In [15]:
def extract_images_features_into_csv(feature_extractor, image_dir, csv_file, max_files):
    # Iterate over the images in the directory
    features_list = extract_image_features_as_list(feature_extractor, image_dir, max_iterations=max_files)

    # features_array = np.vstack(features_list)
    features_list_flattened = [features.reshape(features.shape[0], -1) for features in features_list]

    # Convert the numpy array to a pandas DataFrame
    features_df = pd.DataFrame(np.concatenate(features_list_flattened, axis=0))

    # Save the DataFrame to a CSV file
    features_df.to_csv(csv_file, index=False)
    
    return features_df

In [16]:
feature_extractor = build_image_features_extraction_model()
features_df = extract_images_features_into_csv(feature_extractor, train_output_folder, 'extracted_features.csv', 1000)
features_df.head()



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0.163345,0.0,2.148929,0.077048,1.247398,0.40291,0.0,0.0,0.541019,0.0,...,0.0,1.382223,0.739724,0.179842,0.037214,0.152463,0.400144,0.0,0.0,1.194828
1,0.0,0.0,1.395392,0.141179,0.515802,0.34394,0.0,0.0,0.684522,0.0,...,0.0,1.192664,1.058604,0.132403,0.497913,0.145713,0.105466,0.0,0.0,0.594404
2,0.0,0.0,1.648631,0.171979,0.723046,0.313903,0.0,0.0,0.748148,0.0,...,0.0,1.328776,0.85931,0.307568,0.504338,0.0,0.336458,0.0,0.0,1.062166
3,0.0,0.0,1.710148,0.503269,1.069473,0.039042,0.0,0.0,0.581309,0.0,...,0.0,1.231393,0.544217,0.311978,0.627141,0.0,0.339312,0.0,0.0,1.007366
4,0.0,0.0,1.553678,0.468565,0.857742,0.276672,0.0,0.0,0.760021,0.0,...,0.0,1.124502,1.083881,0.037733,0.503879,0.0,0.428747,0.0,0.0,1.004979


In [17]:
test_features_df = extract_images_features_into_csv(feature_extractor, train_output_folder, 'extracted_test_features.csv', 1000)
test_features_df.head()



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0.163345,0.0,2.148929,0.077048,1.247398,0.40291,0.0,0.0,0.541019,0.0,...,0.0,1.382223,0.739724,0.179842,0.037214,0.152463,0.400144,0.0,0.0,1.194828
1,0.0,0.0,1.395392,0.141179,0.515802,0.34394,0.0,0.0,0.684522,0.0,...,0.0,1.192664,1.058604,0.132403,0.497913,0.145713,0.105466,0.0,0.0,0.594404
2,0.0,0.0,1.648631,0.171979,0.723046,0.313903,0.0,0.0,0.748148,0.0,...,0.0,1.328776,0.85931,0.307568,0.504338,0.0,0.336458,0.0,0.0,1.062166
3,0.0,0.0,1.710148,0.503269,1.069473,0.039042,0.0,0.0,0.581309,0.0,...,0.0,1.231393,0.544217,0.311978,0.627141,0.0,0.339312,0.0,0.0,1.007366
4,0.0,0.0,1.553678,0.468565,0.857742,0.276672,0.0,0.0,0.760021,0.0,...,0.0,1.124502,1.083881,0.037733,0.503879,0.0,0.428747,0.0,0.0,1.004979


In [18]:
features_csv_file = "extracted_features.csv"
# Load extracted features from CSV
features_df = pd.read_csv(features_csv_file)

In [19]:
# Define a mapping from cluster labels to meaningful categories
label_mapping = {
    0: 'resident',
    1: 'visitor',
    2: 'frequent visitor',
    3: 'first-time visitor',
    4: 'delivery',
    5: 'other'
}

In [20]:
num_clusters = 6  # Number of clusters (e.g., residents, frequent visitors, first-time visitors, delivery, etc.)
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(features_df)

# Assign cluster labels to the data
cluster_labels = kmeans.labels_

# Print cluster labels
print(cluster_labels)

[0 5 2 3 5 0 1 2 3 4 1 2 2 3 3 3 3 0 5 5 3 5 2 0 2 4 3 4 1 4 3 3 3 5 2 0 1
 4 2 1 3 1 1 1 3 1 1 4 3 2 4 2 5 5 4 1 4 2 1 4 5 4 1 5 1 5 2 2 5 2 0 3 4 5
 4 4 5 2 1 2 4 3 4 1 3 2 2 3 3 5 2 4 2 3 1 1 4 0 5 1 1 5 2 2 3 3 4 5 0 3 0
 2 0 2 0 1 2 3 3 3 2 5 2 5 5 2 5 1 0 0 2 2 1 1 0 2 1 4 2 1 3 4 2 1 3 0 2 1
 2 4 3 5 4 2 4 4 4 4 0 2 1 1 3 3 5 0 2 0 0 1 2 1 3 1 1 1 3 5 5 1 4 2 3 1 5
 5 1 2 3 4 5 1 0 4 5 2 5 2 0 3 2 1 2 1 1 2 2 3 1 4 0 0 1 4 0 4 2 1 4 3 1 3
 5 2 4 4 2 4 1 1 3 4 4 5 1 4 3 3 1 2 2 1 1 2 1 4 5 0 2 2 1 3 1 5 5 3 2 1 2
 1 3 0 3 4 3 4 2 1 1 4 2 1 4 2 2 1 0 1 5 2 3 3 5 0 2 2 5 3 4 2 4 3 2 4 4 3
 5 0 3 5 2 4 1 0 5 0 2 0 3 4 4 5 3 0 3 4 3 1 1 4 3 5 0 1 3 0 2 1 5 1 3 4 4
 4 2 3 5 0 1 1 5 0 3 5 3 0 5 5 0 2 2 5 5 4 3 4 2 3 4 0 4 3 4 1 3 2 4 5 1 5
 4 2 3 4 1 2 3 4 3 1 4 4 4 4 4 2 3 1 2 4 5 3 3 2 1 1 1 2 5 1 4 4 1 4 0 2 1
 1 4 2 4 0 1 0 2 2 1 5 1 3 1 4 0 3 1 2 2 3 2 2 1 3 1 1 5 3 0 1 0 4 3 1 5 3
 3 2 2 5 5 5 1 1 0 2 2 4 0 5 2 4 2 0 3 0 4 1 2 5 3 3 1 1 1 2 2 1 1 1 1 1 3
 1 1 4 1 4 5 4 3 1 3 5 5 

In [21]:
# Calculate silhouette score
silhouette = silhouette_score(features_df, cluster_labels)

# Calculate Calinski-Harabasz index
calinski_harabasz = calinski_harabasz_score(features_df, cluster_labels)

# Calculate Davies-Bouldin index
davies_bouldin = davies_bouldin_score(features_df, cluster_labels)

# Calculate Adjusted Rand Index (ARI)
ari = adjusted_rand_score(cluster_labels, cluster_labels)

print(f"Silhouette Score: {silhouette}")
print(f"Calinski-Harabasz Index: {calinski_harabasz}")
print(f"Davies-Bouldin Index: {davies_bouldin}")
print(f"Adjusted Rand Index (ARI): {ari}")

Silhouette Score: 0.060011604383856054
Calinski-Harabasz Index: 62.323362195759756
Davies-Bouldin Index: 3.0555263446504135
Adjusted Rand Index (ARI): 1.0


In [22]:
labeled_categories = [label_mapping[label] for label in cluster_labels]
print(labeled_categories)

['resident', 'other', 'frequent visitor', 'first-time visitor', 'other', 'resident', 'visitor', 'frequent visitor', 'first-time visitor', 'delivery', 'visitor', 'frequent visitor', 'frequent visitor', 'first-time visitor', 'first-time visitor', 'first-time visitor', 'first-time visitor', 'resident', 'other', 'other', 'first-time visitor', 'other', 'frequent visitor', 'resident', 'frequent visitor', 'delivery', 'first-time visitor', 'delivery', 'visitor', 'delivery', 'first-time visitor', 'first-time visitor', 'first-time visitor', 'other', 'frequent visitor', 'resident', 'visitor', 'delivery', 'frequent visitor', 'visitor', 'first-time visitor', 'visitor', 'visitor', 'visitor', 'first-time visitor', 'visitor', 'visitor', 'delivery', 'first-time visitor', 'frequent visitor', 'delivery', 'frequent visitor', 'other', 'other', 'delivery', 'visitor', 'delivery', 'frequent visitor', 'visitor', 'delivery', 'other', 'delivery', 'visitor', 'other', 'visitor', 'other', 'frequent visitor', 'frequ

In [23]:
test_features_csv_file = "extracted_test_features.csv"

test_features_df = pd.read_csv(test_features_csv_file)

# Trying to predict the cars in the test directory
test_cluster_labels = kmeans.predict(test_features_df)

# Mapping the output to the text labels
predicted_categories = [label_mapping[label] for label in test_cluster_labels]

print(predicted_categories)

['resident', 'other', 'frequent visitor', 'first-time visitor', 'other', 'resident', 'visitor', 'frequent visitor', 'first-time visitor', 'delivery', 'visitor', 'frequent visitor', 'frequent visitor', 'first-time visitor', 'first-time visitor', 'first-time visitor', 'first-time visitor', 'resident', 'other', 'other', 'first-time visitor', 'other', 'frequent visitor', 'resident', 'frequent visitor', 'delivery', 'first-time visitor', 'delivery', 'visitor', 'delivery', 'first-time visitor', 'first-time visitor', 'first-time visitor', 'other', 'frequent visitor', 'resident', 'visitor', 'delivery', 'frequent visitor', 'visitor', 'first-time visitor', 'visitor', 'visitor', 'visitor', 'first-time visitor', 'visitor', 'visitor', 'delivery', 'first-time visitor', 'frequent visitor', 'delivery', 'frequent visitor', 'other', 'other', 'delivery', 'visitor', 'delivery', 'frequent visitor', 'visitor', 'delivery', 'other', 'delivery', 'visitor', 'other', 'visitor', 'other', 'frequent visitor', 'frequ