# This notebook performs topic diversity and topic coherence calculations for a clustering result dataset.

## Steps:
1. **Preprocessing the text data**:
   - Loads the dataset `Second_step_clustering_BERTopic_results.csv` and takes the documents from its `text` column.
   - Lowercases and tokenizes each document with NLTK, then removes English stopwords and non-alphabetic tokens.

2. **Topic diversity calculation**:
   - Groups the data by 'Topic Label' and, for each topic, takes the keywords from the first row of the 'Second_Step_Topic_Keywords' column, splitting them on ", ".
   - Calculates topic diversity by computing the proportion of unique words to total words across all topics.

3. **Topic coherence calculation**:
   - A Gensim dictionary is created from the tokenized documents.
   - Gensim's `CoherenceModel` is then used to compute the c_v coherence score for each topic separately.

4. **Results output**:
   - The results, including topic keywords and c_v coherence scores, are stored in a DataFrame and saved as `First_Level_Second_Step_coherence.csv`.
   - The average coherence score across all topics is calculated and printed for analysis.


In [4]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK tokenizer model and the stopword lists
# (nltk.download reports "already up-to-date" when they are present locally)
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Read the clustering results into a DataFrame
file_path = 'Second_step_clustering_BERTopic_results.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# The documents to analyse are taken from the 'text' column of the dataset
documents = data['text']

# English stopwords, held in a set for fast membership checks in preprocess()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Tokenize one document and keep only meaningful alphabetic tokens.

    The text is lowercased, tokenized with NLTK's ``word_tokenize``, and
    then filtered so that only purely alphabetic tokens that are not in
    the module-level ``stop_words`` set remain.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example the ``NaN``
        or ``None`` that pandas produces for a missing cell) is treated
        as an empty document.

    Returns
    -------
    list of str
        The filtered, lowercased tokens; an empty list for non-string
        input.
    """
    # Missing cells arrive as float NaN / None, which have no .lower();
    # returning [] keeps one token list per input row instead of crashing.
    if not isinstance(text, str):
        return []

    lowered = text.lower()
    return [
        token
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in stop_words
    ]

# Tokenize and filter every document, one token list per entry of `documents`
processed_docs = list(map(preprocess, documents))

# Build one keyword list per topic label: take the keyword string from the
# first row of each 'Topic Label' group and split it on ', '
topic_representations = (
    data
    .groupby('Topic Label')['Second_Step_Topic_Keywords']
    .apply(lambda keyword_column: keyword_column.iloc[0].split(', '))
)
topics = topic_representations.tolist()

# Calculate Topic Diversity
def calculate_topic_diversity(topics):
    """Return the proportion of unique keywords among all topic keywords.

    Parameters
    ----------
    topics : iterable of sequences of str
        One keyword sequence per topic.

    Returns
    -------
    float
        ``len(unique keywords) / total keyword count`` across all topics.
        Returns ``0.0`` when there are no keywords at all (no topics, or
        only empty topics) instead of raising ``ZeroDivisionError``.
    """
    unique_words = set()
    total_words = 0

    # Single pass so that `topics` may be any iterable, including a generator.
    for topic in topics:
        unique_words.update(topic)
        total_words += len(topic)

    # Nothing to measure: avoid dividing by zero.
    if total_words == 0:
        return 0.0

    return len(unique_words) / total_words

# Score the keyword lists extracted above
topic_diversity = calculate_topic_diversity(topics=topics)

# Report the unique-to-total keyword ratio
print(f"\nTopic Diversity: {topic_diversity}")


[nltk_data] Downloading package punkt to /home/yc656703/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yc656703/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Topic Diversity: 0.429073482428115


Topic Coherence

In [5]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
import pandas as pd

# Build the Gensim vocabulary from the tokenized documents
# (`processed_docs` is the list of token lists produced in the preprocessing cell)
dictionary = Dictionary(processed_docs)

# c_v coherence scores, appended in the same order as `topics`
per_topic_coherence_cv = []

for topic in topics:
    # CoherenceModel takes a list of topics, so wrap the single keyword list
    current_topic = [topic]
    print(current_topic)

    # A separate model per topic, scored against the full tokenized corpus
    coherence_model_cv = CoherenceModel(
        topics=current_topic,
        texts=processed_docs,
        dictionary=dictionary,
        coherence='c_v',
    )

    # Score the topic, echo it for progress, and record it
    coherence_cv = coherence_model_cv.get_coherence()
    print(coherence_cv)
    per_topic_coherence_cv.append(coherence_cv)
    
    

# Collect the per-topic results in a DataFrame.
# The topic labels come from the index of `topic_representations` (the
# 'Topic Label' groupby keys). `topics` and therefore `per_topic_coherence_cv`
# were built from that same Series in order, so all three columns line up.
# (The previous `topic_ids` name was never defined in this notebook and
# raised a NameError on a fresh kernel.)
results = pd.DataFrame({
    "Topic": topic_representations.index.tolist(),
    "Keywords": topic_representations.values,
    "Coherence c_v": per_topic_coherence_cv,
})


# Display the per-topic keywords and coherence scores
print(results)


[['hypergraph', 'subgraph', 'graphs', 'hypergraphs', 'graph', 'adjacency', 'networks', 'nodes', 'vertices', 'embeddings']]
0.57424560033956
[['federated', 'privacy', 'learning', 'datasets', 'collaborative', 'distributed', 'trained', 'sharing', 'fedllm', 'collaboratively']]
0.5736654091434796
[['graphs', 'subgraph', 'nodes', 'networks', 'graph', 'subgraphs', 'supervised', 'node', 'learning', 'labeled']]
0.6255834867442949
[['captioning', 'multimodal', 'embeddings', 'recognition', 'captions', 'encoder', 'embedding', 'retrieval', 'visual', 'text']]
0.6520743704244423
[['learns', 'reinforcement', 'robotic', 'robotics', 'robot', 'learning', 'robots', 'exploration', 'learned', 'imitation']]
0.6901123007785468
[['mri', 'imaging', 'neuroimaging', 'tomography', 'fmri', 'segmentation', 'dataset', 'deep', 'images', 'brain']]
0.5866121210747726
[['attention', 'rnns', 'memory', 'nlp', 'learn', 'recurrent', 'examples', 'context', 'language', 'transformers']]
0.4572251851231385
[['audio', 'microphone

In [6]:
# Persist the per-topic coherence table (the DataFrame index is written as the first column)
results.to_csv(path_or_buf='First_Level_Second_Step_coherence.csv')

In [8]:
# Mean c_v coherence over all topics; the bare name on the last line shows it as the cell output
coherence_column = results['Coherence c_v']
average_coherence_cv = coherence_column.mean()
average_coherence_cv

0.5132173039549905