Import necessary libraries

In [None]:
import os

import openai
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import fowlkes_mallows_score
from sklearn.metrics.cluster import adjusted_rand_score

from functions import create_embeddings, calculate_similarity_matrix, hierarchical_clustering, calculate_silhouette_scores, find_optimal_cluster, cluster_texts
from gpt_generated_test_sets import word_list_one, gold_clusters_one,word_list_two,gold_clusters_two


Select the test set and define the embedding engine and API key

In [None]:
# Test set to cluster and the gold labels every clustering is scored against.
# Swap in word_list_one / gold_clusters_one to run the other test set.
word_list = word_list_two
gold_cluster = gold_clusters_two

# Embedding model name handed to create_embeddings.
engine = "text-embedding-ada-002"

# Read the API key from the environment so no credential is stored in the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")

Create word embeddings

In [None]:
# Embed every entry of word_list with the configured engine.
# create_embeddings comes from the local `functions` module; presumably it calls the
# OpenAI embeddings API using openai.api_key — confirm in functions.py.
# The result is used below as the feature matrix for K-Means and as the input
# to calculate_similarity_matrix.
embeddings = create_embeddings(word_list, engine)

Calculate silhouette scores to find the optimal number of clusters

In [None]:
# Silhouette results for the embeddings; the cells below read a 'k' column from this table.
cluster_results_km = calculate_silhouette_scores(embeddings)
# Cluster count chosen from those results (selection rule lives in functions.find_optimal_cluster).
num_cluster = find_optimal_cluster(cluster_results_km)


Print the optimal number of clusters and the corresponding results

In [None]:
# Report the chosen k, followed by the silhouette-table row(s) recorded for that k.
is_optimal_k = cluster_results_km['k'] == num_cluster
print("Optimal number of clusters:", num_cluster)
print(cluster_results_km.loc[is_optimal_k])

Plot silhouette scores vs cluster number

In [None]:
# Index by k on a separately named frame instead of rebinding cluster_results_km:
# overwriting it drops the 'k' column, so re-running this cell (or the cell that
# filters on cluster_results_km['k']) would raise a KeyError.
silhouette_by_k = cluster_results_km.set_index('k')
# Trailing semicolon suppresses the Axes repr in the cell output.
silhouette_by_k.plot(title='Silhouette scores vs cluster number', grid=True, figsize=(15, 5));


Grid search K-Means clustering w.r.t. adjusted_rand_score and fowlkes_mallows_score

In [None]:
# Grid-search K-Means over the number of clusters and the centroid initialisation,
# scoring every run against the gold labels with ARI and Fowlkes-Mallows.
n_clusters_range = range(1, len(embeddings))  # candidate cluster counts: 1 .. len(embeddings) - 1
init_methods = ['k-means++', 'random']  # Initialization methods

# Collect one record per run and build the DataFrame once afterwards.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
kmeans_records = []
for n_clusters in n_clusters_range:
    for init_method in init_methods:
        # Fixed random_state keeps every run of the search reproducible.
        kmeans = KMeans(n_clusters=n_clusters, init=init_method, random_state=0)
        kmeans_clusters = kmeans.fit_predict(embeddings)

        # Compare the predicted labels with the gold labels.
        kmeans_ari = adjusted_rand_score(gold_cluster, kmeans_clusters)
        kmeans_fowlkes_mallows = fowlkes_mallows_score(gold_cluster, kmeans_clusters)

        kmeans_records.append({'n_clusters': n_clusters, 'init': init_method,
                               'ARI': kmeans_ari, 'Fowlkes-Mallows': kmeans_fowlkes_mallows})

# columns= pins the column order and keeps the columns present even with no records.
results_df = pd.DataFrame(kmeans_records, columns=['n_clusters', 'init', 'ARI', 'Fowlkes-Mallows'])

# idxmax returns the first row that reaches the maximum, so ties go to the
# earliest configuration tried.
max_ari_row_k = results_df.loc[results_df['ARI'].idxmax()]
max_fowlkes_mallows_row_k = results_df.loc[results_df['Fowlkes-Mallows'].idxmax()]

# Print the best configuration under each metric.
for heading, best_row in (("Maximum ARI:", max_ari_row_k),
                          ("\nMaximum Fowlkes-Mallows:", max_fowlkes_mallows_row_k)):
    print(heading)
    print("Number of Clusters:", best_row['n_clusters'])
    print("Initialization Method:", best_row['init'])
    print("ARI:", best_row['ARI'])
    print("Fowlkes-Mallows:", best_row['Fowlkes-Mallows'])


Final Parameters for K-Means

In [None]:
# Hyperparameters of the grid-search run with the highest ARI.
# int() turns the stored cluster count into a plain Python int for KMeans.
final_ari_init = max_ari_row_k.loc['init']
final_ari_n_clusters = int(max_ari_row_k.loc['n_clusters'])

# Hyperparameters of the grid-search run with the highest Fowlkes-Mallows score.
final_fowlkes_mallows_init = max_fowlkes_mallows_row_k.loc['init']
final_fowlkes_mallows_n_clusters = int(max_fowlkes_mallows_row_k.loc['n_clusters'])

Apply K-Means clustering

In [None]:
# Refit K-Means with the ARI-optimal hyperparameters.
# random_state must match the grid search (0): the configuration was selected from
# runs seeded with 0, and a different seed can yield a different clustering whose
# scores no longer match the reported maximum.
km_model = KMeans(n_clusters=final_ari_n_clusters, init=final_ari_init, random_state=0)
kmeans_clusters = km_model.fit_predict(embeddings)

Clustered groups using K-Means

In [None]:

# Group the words by their final K-Means label (helper from the local `functions` module).
# As the last expression in the cell, whatever cluster_texts returns is shown as the cell output.
cluster_texts(word_list, kmeans_clusters)

Calculate similarity matrix for hierarchical clustering


In [None]:
# Pairwise similarity between the embeddings, used as the input to hierarchical_clustering below.
# NOTE(review): the similarity measure (presumably cosine) is defined in functions.py — confirm.
similarity_matrix = calculate_similarity_matrix(embeddings)

Find the optimal linkage method and threshold (and hence the number of clusters k) w.r.t. adjusted_rand_score and fowlkes_mallows_score

In [None]:
# Grid-search hierarchical clustering over linkage methods and thresholds,
# scoring every run against the gold labels with ARI and Fowlkes-Mallows.
# NOTE(review): if hierarchical_clustering delegates to scipy.cluster.hierarchy.linkage,
# 'ward', 'centroid' and 'median' are only well-defined for Euclidean distances;
# confirm how the similarity matrix is converted before trusting those rows.
linkage_methods = ['ward', 'centroid', 'weighted', 'single', 'median']
threshold_range = [i/100 for i in range(0, 51, 5)]  # Thresholds from 0.0 to 0.5 in steps of 0.05

# Collect one record per run and build the DataFrame once afterwards.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
hierarchical_records = []
for linkage_method in linkage_methods:
    for threshold in threshold_range:
        # Cluster labels for this linkage/threshold combination.
        hierarchical_clusters = hierarchical_clustering(similarity_matrix, linkage_method, threshold)

        # Number of distinct labels the threshold produced.
        num_clusters = len(set(hierarchical_clusters))

        # Compare the predicted labels with the gold labels.
        hierarchical_ari = adjusted_rand_score(gold_cluster, hierarchical_clusters)
        hierarchical_fowlkes_mallows = fowlkes_mallows_score(gold_cluster, hierarchical_clusters)

        hierarchical_records.append({'Linkage Method': linkage_method, 'Threshold': threshold,
                                     'Cluster Numbers': num_clusters, 'ARI': hierarchical_ari,
                                     'Fowlkes-Mallows': hierarchical_fowlkes_mallows})

# columns= pins the column order and keeps the columns present even with no records.
results_df = pd.DataFrame(
    hierarchical_records,
    columns=['Linkage Method', 'Threshold', 'Cluster Numbers', 'ARI', 'Fowlkes-Mallows'],
)

# idxmax returns the first row that reaches the maximum, so ties go to the
# earliest configuration tried.
max_ari_row = results_df.loc[results_df['ARI'].idxmax()]
max_fowlkes_mallows_row = results_df.loc[results_df['Fowlkes-Mallows'].idxmax()]

# Print the rows with the maximum scores
print("Row with Maximum ARI:")
print(max_ari_row)

print("\nRow with Maximum Fowlkes-Mallows:")
print(max_fowlkes_mallows_row)


Assign the threshold and linkage method according to the outcome

In [None]:
# Linkage method and threshold of the run with the highest ARI.
max_ari_threshold = max_ari_row.loc['Threshold']
max_ari_linkage = max_ari_row.loc['Linkage Method']

# Linkage method and threshold of the run with the highest Fowlkes-Mallows score.
max_fowlkes_mallows_threshold = max_fowlkes_mallows_row.loc['Threshold']
max_fowlkes_mallows_linkage = max_fowlkes_mallows_row.loc['Linkage Method']


Perform hierarchical clustering

In [None]:
# Re-run hierarchical clustering with the ARI-optimal linkage method and threshold.
# This overwrites hierarchical_clusters, which the grid search left holding the labels
# of its last combination.
hierarchical_clusters = hierarchical_clustering(similarity_matrix, max_ari_linkage, max_ari_threshold)

Cluster words using hierarchical clustering

In [None]:
# Group the words by their final hierarchical-clustering label (helper from the local `functions` module).
# As the last expression in the cell, whatever cluster_texts returns is shown as the cell output.
cluster_texts(word_list, hierarchical_clusters)

Evaluate final clustering methods using Adjusted Rand Score and Fowlkes-Mallows Score

In [None]:
# Score the final K-Means labels against the gold clusters.
kmeans_ari = adjusted_rand_score(gold_cluster, kmeans_clusters)
kmeans_fowlkes_mallows = fowlkes_mallows_score(gold_cluster, kmeans_clusters)

# Score the final hierarchical labels against the gold clusters.
hierarchical_ari = adjusted_rand_score(gold_cluster, hierarchical_clusters)
hierarchical_fowlkes_mallows = fowlkes_mallows_score(gold_cluster, hierarchical_clusters)

# Report both methods metric by metric.
final_scores = [
    ("K-Means Adjusted Rand Score:", kmeans_ari),
    ("Hierarchical Adjusted Rand Score:", hierarchical_ari),
    ("K-Means Fowlkes-Mallows Score:", kmeans_fowlkes_mallows),
    ("Hierarchical Fowlkes-Mallows Score:", hierarchical_fowlkes_mallows),
]
for score_label, score_value in final_scores:
    print(score_label, score_value)

ARI and FMI rely on the comparison of pairs of data points in terms of their clustering assignment: each pair is checked for whether it is placed together or apart in both the true labels (gold clusters) and the predicted clusters. Because only pair agreement is counted, neither metric requires the two labelings to contain the same number of clusters, and both are unaffected by how the cluster labels are numbered. A mismatch in the number of clusters does not make the metrics inapplicable; it simply results in a score below 1.

Adjusted Mutual Information between two clusterings.

Adjusted Mutual Information (AMI) is an adjustment of the Mutual Information (MI) score to account for chance. It accounts for the fact that the MI is generally higher for two clusterings with a larger number of clusters, regardless of whether there is actually more information shared. For two clusterings $U$ and $V$, the AMI is given as:

$$\mathrm{AMI}(U, V) = \frac{\mathrm{MI}(U, V) - E[\mathrm{MI}(U, V)]}{\operatorname{avg}(H(U), H(V)) - E[\mathrm{MI}(U, V)]}$$

This metric is independent of the absolute values of the labels: a permutation of the class or cluster label values won’t change the score value in any way.

In [None]:
# adjusted_mutual_info_score is imported with the other dependencies in the first cell,
# so the notebook's requirements are visible in one place.
# Chance-adjusted mutual information between the gold labels and each final clustering.
kmeans_ami = adjusted_mutual_info_score(gold_cluster, kmeans_clusters)
hierarchical_ami = adjusted_mutual_info_score(gold_cluster, hierarchical_clusters)

print("K-Means Adjusted Mutual Information Score:", kmeans_ami)
print("Hierarchical Adjusted Mutual Information Score:", hierarchical_ami)