# This notebook calculates topic diversity and topic coherence using tokenized documents
# and topic representations from a topic modeling pipeline.

## Steps:
1. **Preprocessing the text data**:
   - Loads the dataset and filters out outliers (topics labeled as -1).
   - Preprocesses the text documents by removing stopwords and non-alphabetic tokens.
   
2. **Topic diversity calculation**:
   - Groups the data by 'Topic Label' and extracts keywords from the 'First_Step_Topic_Keywords' column.
   - Calculates topic diversity by computing the ratio of unique words to total words across all topics.

3. **Topic coherence calculation**:
   - Uses the Gensim library to compute the c_v coherence score for each topic.
   - Tokenized documents are used to create a Gensim dictionary.
   - The c_v coherence score is calculated for each topic, and results are stored in a CSV file.

4. **Results output**:
   - The results of the topic diversity and coherence calculations are displayed and saved to CSV files for further analysis.


In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Load your dataset
file_path = 'First_Step_BERTopic_result.csv'  # Replace with your file path
data = pd.read_csv(file_path)

data_filtered = data[data['Topic Label'] != -1]

# Example: Assuming you have a separate list of documents
documents = data['text']

# Preprocess documents
stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    return tokens

processed_docs = [preprocess(doc) for doc in documents]

# Extract topics after filtering
topic_representations = data.groupby('Topic Label')['First_Step_Topic_Keywords'].apply(lambda x: x.iloc[0].split(', '))
topics = topic_representations.tolist()

# Calculate Topic Diversity
def calculate_topic_diversity(topics):
    unique_words = set()
    total_words = 0

    for topic in topics:
        unique_words.update(topic)  # Add words to the unique set
        total_words += len(topic)   # Count total words in all topics

    # Topic diversity is the proportion of unique words to total words
    topic_diversity = len(unique_words) / total_words
    return topic_diversity

# Calculate topic diversity
topic_diversity = calculate_topic_diversity(topics)

# Display the topic diversity score
print(f"\nTopic Diversity: {topic_diversity}")


[nltk_data] Downloading package punkt to /home/yc656703/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yc656703/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  data = pd.read_csv(file_path)



Topic Diversity: 0.5280534351145039


In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Load your dataset
file_path = 'First_Step_BERTopic_result.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Filter out outliers (-1 topic label)
data_filtered = data[data['Topic Label'] != -1]

# Preprocess documents
stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    return tokens

# Preprocess the documents (optional, used if needed elsewhere)
documents = data['text']
processed_docs = [preprocess(doc) for doc in documents]

# Preprocess the topic keywords
def preprocess_keywords(keywords):
    return preprocess(' '.join(keywords.split(', ')))

# Extract and preprocess topic keywords
topic_representations = data_filtered.groupby('Topic Label')['First_Step_Topic_Keywords'].apply(lambda x: preprocess_keywords(x.iloc[0]))
topics = topic_representations.tolist()

# Adjusted Dieng's Topic Diversity calculation
def calculate_topic_diversity_dieng(topics):
    word_counts = {}
    total_words = 0

    # Count the occurrences of each word across all topics
    for topic in topics:
        for word in topic:
            word_counts[word] = word_counts.get(word, 0) + 1
        total_words += len(topic)  # Count total words in all topics

    # Handle case where no words are found
    if total_words == 0:
        return 0.0

    # Calculate number of unique words (words that occur exactly once)
    unique_words_count = sum(1 for count in word_counts.values() if count == 1)

    # Topic diversity is the proportion of unique words to total words
    topic_diversity = unique_words_count / total_words
    return topic_diversity

# Calculate topic diversity using Dieng's metric
topic_diversity = calculate_topic_diversity_dieng(topics)

# Display the topic diversity score
print(f"\nTopic Diversity (Dieng's Metric): {topic_diversity}")


[nltk_data] Downloading package punkt to /home/yc656703/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yc656703/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  data = pd.read_csv(file_path)



Topic Diversity (Dieng's Metric): 0.3770364623739333


Topic Coherence

In [2]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
import pandas as pd

# Assuming 'processed_docs' is a list of tokenized documents
# Create a Gensim dictionary from the processed documents
dictionary = Dictionary(processed_docs)

# Initialize a list to store c_v coherence scores
per_topic_coherence_cv = []

for topic in topics:
    # Create a list containing just the current topic
    current_topic = [topic]
    
    print(current_topic)
    # print(processed_docs)
    # print(dictionary)
    
    # Initialize the CoherenceModel for the current topic using 'c_v'
    coherence_model_cv = CoherenceModel(topics=current_topic, texts=processed_docs, dictionary=dictionary, coherence='c_v')
    
    # Compute the c_v coherence score
    coherence_cv = coherence_model_cv.get_coherence()
    print(coherence_cv)
    # Append the c_v score to the list
    per_topic_coherence_cv.append(coherence_cv)
    
    

# Create a DataFrame to display the results
results = pd.DataFrame({
    "Topic": topic_ids,
    "Keywords": topic_representations.values,
    "Coherence c_v": per_topic_coherence_cv,
})


# Display overall and per-topic coherence scores
print(results)
results.to_csv('First_Level_First_Step_coherence.csv')


[['features', 'classification', 'learning', 'datasets', 'models', 'ai', 'networks', 'neural', 'trained', 'dataset']]
0.4223467978785219
[['medical', 'clinical', 'patients', 'medicine', 'med', 'clinician', 'physician', 'doctor', 'annotated', 'physicians']]
0.6112921288535738
[['recommender', 'recommenders', 'personalized', 'conversational', 'recommendation', 'recommendations', 'factorization', 'embeddings', 'attention', 'ranking']]
0.6114976209565052
[['translations', 'translators', 'multilingual', 'lingual', 'translating', 'bilingual', 'translation', 'monolingual', 'translated', 'languages']]
0.7344624704245789
[['robotic', 'robot', 'robotics', 'robots', 'grasping', 'manipulator', 'teleoperation', 'embodied', 'grasp', 'humanoid']]
0.7726833424143702
[['cnn', 'segmentation', 'cnns', 'segmentations', 'segmenting', 'supervised', 'deep', 'convolutional', 'attention', 'ultrasound']]
0.616960762040347
[['quantum', 'qubits', 'qubit', 'qcnns', 'qcnn', 'grover', 'entanglement', 'qnns', 'qiskit'

NameError: name 'topic_ids' is not defined

In [26]:
# Create a DataFrame to display the results
results = pd.DataFrame({
    "Keywords": topic_representations.values,
    "Coherence c_v": per_topic_coherence_cv,
})


# Display overall and per-topic coherence scores
print(results)
results.to_csv('First_Level_First_Step_coherence.csv')

                                              Keywords  Coherence c_v
0    [features, classification, learning, datasets,...       0.422347
1    [medical, clinical, patients, medicine, med, c...       0.611292
2    [recommender, recommenders, personalized, conv...       0.611498
3    [translations, translators, multilingual, ling...       0.734462
4    [robotic, robot, robotics, robots, grasping, m...       0.772683
..                                                 ...            ...
519  [logs, parsers, log, openlogparser, parsing, p...       0.625958
520  [rumors, tweets, rumor, news, blogging, credib...       0.408653
521  [inceptionv3, dnns, dnn, deepfault, tensorflow...       0.412244
522  [shap, shapley, ranking, feature, generalising...       0.295639
523  [adversarial, exploitability, attacker, malici...       0.625160

[524 rows x 2 columns]


In [10]:
# Create a DataFrame from the list
df = pd.DataFrame(per_topic_coherence_cv, columns=['Topic Coherence Score'])

# Convert to CSV
df.to_csv('per_topic_coherence_cv.csv', index='Index')


In [16]:
file_path = 'per_topic_coherence_cv.csv'  # Replace with your file path
data = pd.read_csv(file_path)
mean_score = data['Topic Coherence Score'][1:].mean()
data['Topic Coherence Score'].mean()
mean_score


0.5422169300954853