In [1]:
# Prerequisite: `centroids` and `human_readable_embeddings` must already be
# defined (e.g. by an earlier cell); otherwise this code fails with a NameError.

# Exclude the first row from both centroids and human readable topics (assuming they are outliers).
# NOTE: `.iloc[1:]` keeps every remaining row; it does not cap the selection at 500 rows.
# `.copy()` gives independent frames, so the column assignment below does not
# write into a slice of the original frame (avoids pandas' SettingWithCopyWarning).
centroids_limited = centroids.iloc[1:].copy()
human_readable_embeddings_limited = human_readable_embeddings.iloc[1:].copy()

# Add numbering to the y-axis labels (Human Readable Topics)
human_readable_embeddings_limited['Human_Readable_Topic'] = [
    f"{i+1}. {label}"
    for i, label in enumerate(human_readable_embeddings_limited['Human_Readable_Topic'])
]

# Initialize matrices to store the pairwise Cosine Similarities and Euclidean Distances
# (rows: human readable topics, columns: topic centroids)
similarity_matrix = np.zeros((len(human_readable_embeddings_limited), len(centroids_limited)))
distance_matrix = np.zeros((len(human_readable_embeddings_limited), len(centroids_limited)))

# Calculate the Cosine similarity and Euclidean distance for every pair
# (Human Readable Topic vs. Topic Centroid) with one vectorised call per metric
# instead of one scikit-learn call per pair.
# The guard keeps the zero-filled (empty) matrices when either side has no rows,
# because np.vstack cannot stack an empty list.
# Assumes every cell of the two embedding columns holds a numeric vector of the
# same length -- TODO confirm against the code that builds these frames.
if similarity_matrix.size:
    hr_embedding_matrix = np.vstack(
        human_readable_embeddings_limited['Human_Readable_Topic_embedding'].tolist()
    )
    centroid_matrix = np.vstack(centroids_limited['centroid'].tolist())
    similarity_matrix[:] = cosine_similarity(hr_embedding_matrix, centroid_matrix)
    distance_matrix[:] = euclidean_distances(hr_embedding_matrix, centroid_matrix)

# Convert the matrices to DataFrames for better readability
similarity_df_limited = pd.DataFrame(similarity_matrix,
                                     index=human_readable_embeddings_limited['Human_Readable_Topic'],
                                     columns=centroids_limited['Topic Label'])

distance_df_limited = pd.DataFrame(distance_matrix,
                                   index=human_readable_embeddings_limited['Human_Readable_Topic'],
                                   columns=centroids_limited['Topic Label'])

# Save the DataFrames to CSV files (the numbered topic labels are written as the index column)
similarity_df_limited.to_csv('first_step_pairwise_cosine_similarity_heatmap_500.csv', index=True)
distance_df_limited.to_csv('first_step_pairwise_euclidean_distance_heatmap_500.csv', index=True)

# Create heatmaps for both similarity and distance with a blue-to-yellow color scale
def create_heatmap(df, title, color_label, filename_html, filename_png,
                   *, width=3000, height=3000, scale=3):
    """Render a DataFrame as a blue-to-yellow heatmap, save it, and display it.

    The figure is written to ``filename_html`` as an interactive HTML file and
    to ``filename_png`` as a static PNG, then shown via ``fig.show()``.

    Args:
        df: DataFrame whose values are drawn as cells; ``df.columns`` label
            the x-axis and ``df.index`` labels the y-axis.
        title: Figure title.
        color_label: Label for the colour dimension (passed to ``px.imshow``
            through ``labels``).
        filename_html: Output path for the interactive HTML file.
        filename_png: Output path for the static PNG image.
        width: Figure width, used for both the layout and the PNG export.
        height: Figure height, used for both the layout and the PNG export.
        scale: Scale factor forwarded to ``pio.write_image``.

    Returns:
        None.
    """
    fig = px.imshow(df,
                    labels=dict(x="Topic Centroids", y="Human Readable Topics", color=color_label),
                    x=df.columns,
                    y=df.index,
                    color_continuous_scale=['blue', 'yellow'],  # Custom gradient from blue to yellow
                    text_auto=True)  # Print each cell's value inside the cell

    # Adjust layout; the large default canvas leaves room for many topics
    fig.update_layout(title=title,
                      xaxis_title='Topic Centroids',
                      yaxis_title='Human Readable Topics',
                      width=width,
                      height=height,
                      font=dict(size=8))  # Adjust font size here if needed

    # Update y-axis layout to ensure all labels are visible
    fig.update_yaxes(automargin=True,
                     tickangle=0,  # Keep y-axis labels horizontal
                     tickfont=dict(size=3))  # Very small tick font so many labels fit

    # Save the figure as an interactive HTML file (do not open a browser tab)
    pio.write_html(fig, file=filename_html, auto_open=False)

    # Save the figure as a static image (PNG); the export reuses the layout size
    pio.write_image(fig, file=filename_png, format='png', width=width, height=height, scale=scale)

    # Display the heatmap
    fig.show()

# Render, save (HTML + PNG) and display the cosine-similarity heatmap
create_heatmap(
    df=similarity_df_limited,
    title='Pairwise Cosine Similarity Matrix Heatmap (First 500 Labels, excluding outliers)',
    color_label='Cosine Similarity',
    filename_html='first_step_pairwise_cosine_similarity_heatmap_500.html',
    filename_png='first_step_pairwise_cosine_similarity_heatmap_500.png',
)

# Render, save (HTML + PNG) and display the Euclidean-distance heatmap
create_heatmap(
    df=distance_df_limited,
    title='Pairwise Euclidean Distance Matrix Heatmap (First 500 Labels, excluding outliers)',
    color_label='Euclidean Distance',
    filename_html='first_step_pairwise_euclidean_distance_heatmap_500.html',
    filename_png='first_step_pairwise_euclidean_distance_heatmap_500.png',
)


NameError: name 'centroids' is not defined