<a href="https://colab.research.google.com/github/swamy-surla/Data-mining-1/blob/main/Data_Mining_Programming_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import cv2
import os
import numpy as np
import warnings
from skimage.color import rgb2gray
from skimage import io, exposure, filters
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, SpectralClustering, BisectingKMeans
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.metrics import fowlkes_mallows_score, silhouette_score

In [3]:
warnings.filterwarnings('ignore')

# q(1)

In [4]:
def compute_gradient_angle(dx, dy):
    """Calculate the angles between horizontal and vertical operators."""
    return np.mod(np.arctan2(dy, dx), np.pi)

In [5]:
data_path = "/content/drive/MyDrive/data_mining/Processed_images"

labels_list = ['Dandie_Dinmont', 'Otterhound', 'Tibetan_terrier', 'French_bulldog']

In [6]:
dataframe = pd.DataFrame(columns = list(range(0,36))+['class'])
folders_list = os.listdir(data_path)
for folder in folders_list:
  class_path = os.path.join(data_path, folder)
  for i, label in enumerate(labels_list):
    if label.lower() == folder.split("-")[-1].lower():
      class_num = i
  for filename in os.listdir(class_path):
    img = io.imread(os.path.join(class_path,filename))
    gray_image = rgb2gray(img)
    angle_sobel = compute_gradient_angle(filters.sobel_h(gray_image),
                    filters.sobel_v(gray_image))
    hist,bins = exposure.histogram(angle_sobel,nbins=36)
    dataframe.loc[len(dataframe)] = list(hist)+[class_num]

In [7]:
scaler = StandardScaler()
scaler.fit(dataframe[dataframe.columns[:-1]])

data = dataframe[dataframe.columns[:-1]]

original_classes = np.array(dataframe[dataframe.columns[-1]])

scaled_data = scaler.transform(data)

# q (2)

In [8]:
pca = PCA(n_components=2)
transformed_data = pca.fit_transform(scaled_data)

# q (3)

In [9]:
# K-means clustering with init='random'
kmeans_random = KMeans(n_clusters=4, init='random', random_state=42)
kmeans_random.fit(transformed_data)
kmeans_random_labels = kmeans_random.labels_

# K-means clustering with init='k-means++'
kmeans_kmeans_pp = KMeans(n_clusters=4, init='k-means++', random_state=42)
kmeans_kmeans_pp.fit(transformed_data)
kmeans_kmeans_pp_labels = kmeans_kmeans_pp.labels_

# Bisecting K-means clustering with init='random'
bisecting_kmeans_random = BisectingKMeans(n_clusters=4, init='random', random_state=42)
bisecting_kmeans_random.fit(transformed_data)
bisecting_kmeans_random_labels = bisecting_kmeans_random.labels_

# Spectral clustering with default parameters
spectral_clustering = SpectralClustering(n_clusters=4, random_state=42)
spectral_clustering.fit(transformed_data)
spectral_clustering_labels = spectral_clustering.labels_

# DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=2)
dbscan.fit(data)
dbscan_labels = dbscan.labels_

# Agglomerative clustering with different linkage methods
agglomerative_single = AgglomerativeClustering(n_clusters=4, linkage='single')
agglomerative_single.fit(data)
agglomerative_single_labels = agglomerative_single.labels_

agglomerative_complete = AgglomerativeClustering(n_clusters=4, linkage='complete')
agglomerative_complete.fit(data)
agglomerative_complete_labels = agglomerative_complete.labels_

agglomerative_average = AgglomerativeClustering(n_clusters=4, linkage='average')
agglomerative_average.fit(data)
agglomerative_average_labels = agglomerative_average.labels_

agglomerative_ward = AgglomerativeClustering(n_clusters=4, linkage='ward')
agglomerative_ward.fit(data)
agglomerative_ward_labels = agglomerative_ward.labels_

# q (4)

In [11]:
# Calculate Fowlkes-Mallows index
fowlkes_mallows_scores = {
    'K-means (Random)': fowlkes_mallows_score(original_classes, kmeans_random_labels),
    'K-means (k-means++)': fowlkes_mallows_score(original_classes, kmeans_kmeans_pp_labels),
    'Bisecting K-means': fowlkes_mallows_score(original_classes, bisecting_kmeans_random_labels),
    'Spectral Clustering': fowlkes_mallows_score(original_classes, spectral_clustering_labels),
    'DBSCAN': fowlkes_mallows_score(original_classes, dbscan_labels),
    'Agglomerative (Single link)': fowlkes_mallows_score(original_classes, agglomerative_single_labels),
    'Agglomerative (Complete link)': fowlkes_mallows_score(original_classes, agglomerative_complete_labels),
    'Agglomerative (Group Average)': fowlkes_mallows_score(original_classes, agglomerative_average_labels),
    'Agglomerative (Ward)': fowlkes_mallows_score(original_classes, agglomerative_ward_labels)
}


In [12]:
# Calculate Silhouette Coefficient
silhouette_scores = {
    'K-means (Random)': silhouette_score(transformed_data, kmeans_random_labels),
    'K-means (k-means++)': silhouette_score(transformed_data, kmeans_kmeans_pp_labels),
    'Bisecting K-means': silhouette_score(transformed_data, bisecting_kmeans_random_labels),
    'Spectral Clustering': silhouette_score(transformed_data, spectral_clustering_labels),
    'DBSCAN': silhouette_score(transformed_data, dbscan_labels),
    'Agglomerative (Single link)': silhouette_score(transformed_data, agglomerative_single_labels),
    'Agglomerative (Complete link)': silhouette_score(transformed_data, agglomerative_complete_labels),
    'Agglomerative (Group Average)': silhouette_score(transformed_data, agglomerative_average_labels),
    'Agglomerative (Ward)': silhouette_score(transformed_data, agglomerative_ward_labels)
}


In [13]:
# Rank methods based on Fowlkes-Mallows index
ranked_methods_fm = sorted(fowlkes_mallows_scores.items(), key=lambda x: x[1], reverse=True)
print("Ranking based on Fowlkes-Mallows index:")
for method, score in ranked_methods_fm:
    print(f"{method}: {score}")



Ranking based on Fowlkes-Mallows index:
DBSCAN: 0.4998403538257103
Agglomerative (Single link): 0.4987513310029087
Agglomerative (Complete link): 0.35669420807427843
Agglomerative (Group Average): 0.35669420807427843
Agglomerative (Ward): 0.3563513141283777
Bisecting K-means: 0.30555307205695637
Spectral Clustering: 0.28944658714078164
K-means (Random): 0.27729414209440567
K-means (k-means++): 0.27729414209440567


In [14]:
# Rank methods based on Silhouette Coefficient
ranked_methods_silhouette = sorted(silhouette_scores.items(), key=lambda x: x[1], reverse=True)
print("\nRanking based on Silhouette Coefficient:")
for method, score in ranked_methods_silhouette:
    print(f"{method}: {score}")


Ranking based on Silhouette Coefficient:
DBSCAN: 0.48279570686308526
K-means (Random): 0.457896412238061
K-means (k-means++): 0.457896412238061
Spectral Clustering: 0.4528878629166303
Bisecting K-means: 0.4023991803706941
Agglomerative (Complete link): 0.37323039512380946
Agglomerative (Group Average): 0.37323039512380946
Agglomerative (Ward): 0.3433855878055791
Agglomerative (Single link): -0.1587832586140878
