In [68]:
# Imports

import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import glob
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sentence_transformers import SentenceTransformer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import seaborn as sns
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
import re
from hdbscan import HDBSCAN
#from octis.evaluation_metrics.coherence_metrics import Coherence
from umap import UMAP
import json
import pickle
from collections import Counter

In [69]:
# Format text files into dataframe
def convert_text_files(directories, label):
    """Read every ``*.txt`` file in ``directories`` into a labelled DataFrame.

    Parameters
    ----------
    directories : iterable of str
        Directories to scan (non-recursively) for ``*.txt`` files.
    label : str
        Value written to the ``Labels`` column for every loaded text.

    Returns
    -------
    pandas.DataFrame
        Columns ``Texts`` (stripped file content), ``Labels``, ``Years`` and
        ``Era``. Empty files are skipped. Files that cannot be read, or whose
        year cannot be parsed, are reported on stdout and skipped as a whole.

    Notes
    -----
    The year is the text after the last ``_`` in the file name (minus
    ``.txt``), except for the two Grimm directories, which are always 1857.
    The era thresholds only test upper bounds, so any year in 1858-1937 is
    labelled ``'Lovecraft (1917-1937)'``.
    """
    import os  # local import: `os` is not part of the notebook's import cell

    # Directories whose files carry no year in their name.
    fixed_year_dirs = {
        os.path.normpath('./GrimmFairyTale/'): 1857,
        os.path.normpath('./GrimmFairyTaleAgain/'): 1857,
    }

    texts = []
    labels = []
    years = []
    for directory in directories:
        # Normalised comparison so a missing/extra trailing slash still matches.
        fixed_year = fixed_year_dirs.get(os.path.normpath(directory))

        # glob order is filesystem-dependent; sort so row order is reproducible.
        for file_path in sorted(glob.glob(f"{directory}/*.txt")):
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read().strip()
                if not text:  # Only add non-empty texts
                    continue

                if fixed_year is not None:
                    year = fixed_year
                else:
                    # Parse from the file name only, so underscores in parent
                    # directory names cannot interfere.
                    file_name = os.path.basename(file_path)
                    year = int(file_name.split('_')[-1].replace('.txt', ''))
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
                continue

            # Append all three together so the columns always stay aligned,
            # even when a file is rejected above.
            texts.append(text)
            labels.append(label)
            years.append(year)

    dataframe = pd.DataFrame({'Texts': texts, 'Labels': labels, 'Years': years})

    # Add Eras column
    dataframe['Era'] = dataframe['Years'].apply(lambda x:
    'Fairytale (1857)' if x <= 1857 else
    'Lovecraft (1917-1937)' if x <= 1937 else
    'Post-Lovecraft (1938-1970)' if x <= 1970 else
    'Modern (1971-2000)' if x <= 2000 else
    'Contemporary (2001+)')

    return dataframe

# Helper function to save token list and topic words to pickle file and json file
def save_token_list_and_topics(token_list, topic_words, token_file='token_list.pkl', topics_file='topic_words.json'):
    """Persist a tokenised corpus and topic word lists under ``PickleFiles/``.

    Parameters
    ----------
    token_list : object
        Pickled as-is to ``PickleFiles/Tokens/<token_file>``.
    topic_words : object
        JSON-serialisable value written (indented) to
        ``PickleFiles/Topics/<topics_file>``.
    token_file, topics_file : str
        File names relative to the two target directories.

    The target directories are created if they do not exist yet, so the
    helper also works on a fresh checkout.
    """
    import os  # local import: `os` is not part of the notebook's import cell

    token_path = 'PickleFiles/Tokens/' + token_file
    topics_path = 'PickleFiles/Topics/' + topics_file
    for path in (token_path, topics_path):
        os.makedirs(os.path.dirname(path), exist_ok=True)

    # Save token list as pickle, this helps with large lists
    with open(token_path, 'wb') as f:
        pickle.dump(token_list, f)

    # Save topic words as JSON to make it more readable
    with open(topics_path, 'w') as f:
        json.dump(topic_words, f, indent=2)

# Helper function to load token list from pickle file and load topic words from json file
def load_token_list_and_topics(token_file='token_list.pkl', topics_file='topic_words.json'):
    """Load a token list (pickle) and topic words (JSON) from disk.

    Parameters
    ----------
    token_file : str
        Path to the pickle file. If nothing exists at that path, the name is
        also looked up under ``PickleFiles/Tokens/``, where
        ``save_token_list_and_topics`` writes it.
    topics_file : str
        Path to the JSON file, with the same fallback to
        ``PickleFiles/Topics/``.

    Returns
    -------
    tuple
        ``(token_list, topic_words)``.

    Raises
    ------
    FileNotFoundError
        If a file exists neither at the given path nor in the fallback
        directory (the error names the path as given).
    """
    import os  # local import: `os` is not part of the notebook's import cell

    def _resolve(path, save_dir):
        # Prefer the path exactly as given so existing callers are unaffected.
        if os.path.exists(path):
            return path
        candidate = save_dir + path
        return candidate if os.path.exists(candidate) else path

    # Load token list from pickle
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # this notebook (or another trusted source) produced.
    with open(_resolve(token_file, 'PickleFiles/Tokens/'), 'rb') as f:
        token_list = pickle.load(f)

    # Load topic words from JSON
    with open(_resolve(topics_file, 'PickleFiles/Topics/'), 'r') as f:
        topic_words = json.load(f)

    return token_list, topic_words

In [70]:
# Helper function to calculate what proportion of words are unique across all topics (Higher is better = more diverse)
def calculate_topic_diversity(top_words_list):
    """Return the share of distinct words among all topic words.

    Parameters
    ----------
    top_words_list : list of list
        One inner list per topic. Each entry is either a plain word or a
        ``(word, score)`` pair; for pairs only the word is compared, so the
        same word with different scores in two topics counts once.

    Returns
    -------
    float
        ``unique words / total words`` (higher means more diverse), or
        ``0.0`` when there are no words at all.
    """
    # Flatten all words, dropping the score when entries are (word, score)
    all_words = [
        entry if isinstance(entry, str) else entry[0]
        for topic_words in top_words_list
        for entry in topic_words
    ]
    unique_words = set(all_words)

    # Guard the division: no topics / no words would otherwise raise.
    diversity = len(unique_words) / len(all_words) if all_words else 0.0

    print(f"Total words: {len(all_words)}")
    print(f"Unique words: {len(unique_words)}")

    return diversity

In [71]:
# Calculates average proportion of unique words per topic
def calculate_topic_uniqueness(top_words_list):
    """Score each topic by the fraction of its words found nowhere else.

    Parameters
    ----------
    top_words_list : list of list of (word, score)
        One inner list per topic; only the word of each pair is used.

    Returns
    -------
    tuple
        ``(avg_uniqueness, uniqueness_scores)``: the mean of the per-topic
        scores and the list of per-topic scores. A topic without words
        scores ``0``. The scores are also printed.
    """
    # Strip the scores once, keeping the per-topic grouping.
    vocab_per_topic = [[term for term, _ in topic] for topic in top_words_list]

    # Total occurrences of every word over all topics.
    occurrences = Counter(term for vocab in vocab_per_topic for term in vocab)

    uniqueness_scores = []
    for vocab in vocab_per_topic:
        if vocab:
            exclusive = len([term for term in vocab if occurrences[term] == 1])
            uniqueness_scores.append(exclusive / len(vocab))
        else:
            uniqueness_scores.append(0)

    avg_uniqueness = np.mean(uniqueness_scores)

    print(f"\nTopic Uniqueness Score: {avg_uniqueness:.4f}")

    # Per-topic breakdown
    for topic_index, topic_score in enumerate(uniqueness_scores):
        print(f"  Topic {topic_index}: {topic_score:.4f}")

    return avg_uniqueness, uniqueness_scores

In [72]:
### GLOBAL BERT TOPIC PARAMETERS ###
# Hyperparameters shared by every BERTopic run in this notebook.
bert_min_topic_size = 8   # CHANGED
bert_nr_topics = 'auto'   # CHANGED: was None
bert_top_n_words = 100    # CHANGED: was 10

# Sentence embedding model: chosen for better semantic accuracy at the cost of
# speed. Loaded once here and reused by every model.
embedding_model = SentenceTransformer('all-mpnet-base-v2')

# Additional topic representations, keyed by the name BERTopic reports them under.
rep_model = dict(
    KeyBERT=KeyBERTInspired(),
    MMR=MaximalMarginalRelevance(diversity=0.3),
)

In [73]:
# Helper function to determine which models to configure for BERTopic
def customize_BERTopic(stop_words=True, hdbscan=True, umap=True):
    """Build the optional BERTopic sub-models selected by the three flags.

    Parameters
    ----------
    stop_words : bool
        When true, return a ``CountVectorizer`` that removes scikit-learn's
        English stop words plus the corpus-specific list below.
    hdbscan : bool
        When true, return a configured ``HDBSCAN`` clusterer.
    umap : bool
        When true, return a configured ``UMAP`` reducer with a fixed seed.

    Returns
    -------
    tuple
        ``(vectorizer, hdbscan_model, umap_model)``; each element is ``None``
        when its flag is false.
    """
    # Start from "nothing customised" and fill in what was requested.
    vectorizer = None
    hdbscan_model = None
    umap_model = None

    if stop_words:
        # Corpus-specific filler words on top of the generic English list.
        extra_stop_words = [
            'said', 'went', 'came', 'saw', 'looked', 'seemed',
            'knew', 'told', 'asked', 'thought', 'felt', 'heard',
            'began', 'made', 'found', 'like', 'did', 'didn', 'just', 'head',
            'feet', 'pike', 'carter', 'paris', 'don', 've', 'know', 'right', 'away',
            'way', 'going', 'later', 'new', 'look', 'mr', 'client', 'left', 'money',
            'soon', 'little', 'peter', 'st', 'dr', 'shall', 'wasn', 'll', 'air', 'winter', 'let',
            'answered', 'say', 'got', 'peel',
        ]
        combined_stop_words = list(ENGLISH_STOP_WORDS.union(set(extra_stop_words)))

        vectorizer = CountVectorizer(
            stop_words=combined_stop_words,
            min_df=1,
            max_df=0.75,
            ngram_range=(1, 2),
            max_features=2000,
        )

    if hdbscan:
        # Clustering model
        hdbscan_model = HDBSCAN(
            min_cluster_size=8,
            min_samples=3,
            cluster_selection_epsilon=0.0,
            cluster_selection_method='leaf',
            prediction_data=True,
        )

    if umap:
        # Dimensionality reduction model
        umap_model = UMAP(
            n_neighbors=15,
            n_components=5,
            min_dist=0.0,
            metric='cosine',
            random_state=42,
        )

    return vectorizer, hdbscan_model, umap_model

In [74]:
# Helper to initialize BERTopic model with customized parameters
def init_BERTopic(custom_vectorizer, custom_hdbscan, custom_umap):
    """Create a BERTopic model from the notebook-wide settings.

    Parameters
    ----------
    custom_vectorizer, custom_hdbscan, custom_umap
        Sub-models as returned by ``customize_BERTopic``; each is passed
        through to ``BERTopic`` unchanged (including ``None``).

    Returns
    -------
    BERTopic
        An unfitted model using the module-level ``embedding_model``,
        ``rep_model``, ``bert_min_topic_size`` and ``bert_top_n_words``.
    """
    settings = {
        # Notebook-wide configuration
        'embedding_model': embedding_model,
        'representation_model': rep_model,
        'min_topic_size': bert_min_topic_size,  # minimum number of documents per topic
        'top_n_words': bert_top_n_words,
        # Per-run sub-models
        'vectorizer_model': custom_vectorizer,  # stop words
        'hdbscan_model': custom_hdbscan,
        'umap_model': custom_umap,
        # Also compute each document's probability for every topic
        'calculate_probabilities': True,
        'verbose': False,
    }
    return BERTopic(**settings)

In [75]:
### Using custom stop words, hdbscan, umap ###

# Load the fiction corpus (one row per non-empty text file)
df = convert_text_files(directories=['./fiction_text_files/'], label='fiction')

# Initialize BERTopic model using custom parameters (Use custom stop words, hdbscan, umap)
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() re-maps the assignments AND the probability matrix stored on
# the model. Re-read both: the values returned by fit_transform() above still
# describe the topics as they were before the reduction.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the outlier topic -1 is printed
# but excluded from the scored list)
fiction_text_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        fiction_text_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
fiction_text_diversity_score = calculate_topic_diversity(fiction_text_top_words)
print(f"Fiction Text Diversity Score: {fiction_text_diversity_score}")

# Compute Uniqueness Score
fiction_uniqueness, fiction_per_topic_uniqueness = calculate_topic_uniqueness(fiction_text_top_words)

# Tokenised corpus for the OCTIS score: lowercase, strip punctuation, split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Words (without scores) of every non-outlier topic
topic_words = [
    [word for word, _ in topic_model.get_topic(topic_id)]
    for topic_id in topic_model.get_topics()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'fiction_tokens_all_configs.pkl',
                           'fiction_topics_all_configs.json')


=== Topic Information ===
   Topic  Count                                  Name  \
0     -1     25    -1_rogers_whateley_armitage_wilbur   
1      0     36            0_akeley_ammi_nahum_gaunts   
2      1     20   1_curwen_willett_clarendon_georgina   
3      2     15  2_zamacona_danforth_jermyn_antarctic   
4      3      8          3_innsmouth_moore_mummy_obed   

                                      Representation  \
0  [rogers, whateley, armitage, wilbur, randolph,...   
1  [akeley, ammi, nahum, gaunts, night gaunts, de...   
2  [curwen, willett, clarendon, georgina, gilman,...   
3  [zamacona, danforth, jermyn, antarctic, yan, g...   
4  [innsmouth, moore, mummy, obed, ghatanothoa, a...   

                                             KeyBERT  \
0  [exham priory, priory, old whateley, wilbur wh...   
1  [waking world, nameless city, akeley, dreamlan...   
2  [antiquarian, hamlet, reanimation, black fever...   
3  [archaean, primordial, tribes, pyramids, amnes...   
4  [submarine

In [76]:
### Using custom stop words, hdbscan ###
# Load the fiction corpus (one row per non-empty text file)
df = convert_text_files(directories=['./fiction_text_files/'], label='fiction')

# Custom stop words and HDBSCAN; no custom UMAP model is supplied
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() re-maps the assignments AND the probability matrix stored on
# the model. Re-read both: the values returned by fit_transform() above still
# describe the topics as they were before the reduction.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the outlier topic -1 is printed
# but excluded from the scored list)
fiction_text_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        fiction_text_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
fiction_text_diversity_score = calculate_topic_diversity(fiction_text_top_words)
print(f"Fiction Text Diversity Score: {fiction_text_diversity_score}")

# Compute Uniqueness Score
fiction_uniqueness, fiction_per_topic_uniqueness = calculate_topic_uniqueness(fiction_text_top_words)

# Tokenised corpus for the OCTIS score: lowercase, strip punctuation, split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Words (without scores) of every non-outlier topic
topic_words = [
    [word for word, _ in topic_model.get_topic(topic_id)]
    for topic_id in topic_model.get_topics()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'fiction_tokens_stop_words_hdbscan.pkl',
                           'fiction_topics_stop_words_hdbscan.json')


=== Topic Information ===
   Topic  Count                                 Name  \
0     -1     37      -1_akeley_marsh_jones_innsmouth   
1      0     31  0_night gaunts_gaunts_kadath_marble   
2      1     24        1_ward_curwen_willett_charles   
3      2     12   2_zamacona_danforth_jermyn_crystal   

                                      Representation  \
0  [akeley, marsh, jones, innsmouth, rogers, ammi...   
1  [night gaunts, gaunts, kadath, marble, galley,...   
2  [ward, curwen, willett, charles, clarendon, gi...   
3  [zamacona, danforth, jermyn, crystal, antarcti...   

                                             KeyBERT  \
0  [whateley, wilbur, priory, mummy, akeley, coff...   
1  [earth gods, palaces, waking world, monastery,...   
2  [specimens, occult, salem, researches, rat, ph...   
3  [archaean, snakes, pyramids, antarctic, specim...   

                                                 MMR  \
0  [akeley, innsmouth, folks, wilbur, ben, bog, w...   
1  [night gaunts, 

In [77]:
### Using custom stop words, umap ###

# Load the fiction corpus (one row per non-empty text file)
df = convert_text_files(directories=['./fiction_text_files/'], label='fiction')

# Custom stop words and UMAP; no custom HDBSCAN model is supplied
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() re-maps the assignments AND the probability matrix stored on
# the model. Re-read both: the values returned by fit_transform() above still
# describe the topics as they were before the reduction.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the outlier topic -1 is printed
# but excluded from the scored list)
fiction_text_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        fiction_text_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
fiction_text_diversity_score = calculate_topic_diversity(fiction_text_top_words)
print(f"Fiction Text Diversity Score: {fiction_text_diversity_score}")

# Compute Uniqueness Score
fiction_uniqueness, fiction_per_topic_uniqueness = calculate_topic_uniqueness(fiction_text_top_words)

# Tokenised corpus for the OCTIS score: lowercase, strip punctuation, split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Words (without scores) of every non-outlier topic
topic_words = [
    [word for word, _ in topic_model.get_topic(topic_id)]
    for topic_id in topic_model.get_topics()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'fiction_tokens_stop_words_umap.pkl',
                           'fiction_topics_stop_words_umap.json')


=== Topic Information ===
   Topic  Count                                     Name  \
0     -1     40         -1_akeley_marsh_innsmouth_robert   
1      0     26             0_ward_curwen_willett_doctor   
2      1     24  1_gaunts_night gaunts_galley_great ones   
3      2     14        2_zamacona_danforth_antarctic_yan   

                                      Representation  \
0  [akeley, marsh, innsmouth, robert, ammi, nahum...   
1  [ward, curwen, willett, doctor, clarendon, jon...   
2  [gaunts, night gaunts, galley, great ones, ngr...   
3  [zamacona, danforth, antarctic, yan, yig, grea...   

                                             KeyBERT  \
0  [akeley, priory, innsmouth, farmhouse, old bug...   
1  [specimens, mummy, wilbur, mrs ward, rat, char...   
2  [earth gods, palaces, waking world, monastery,...   
3  [archaean, exploring, pyramids, antarctic, amn...   

                                                 MMR  \
0  [akeley, folks, priory, bog, specimens, herber...  

In [78]:
### Using custom stop words only ###

# Load the fiction corpus (one row per non-empty text file)
df = convert_text_files(directories=['./fiction_text_files/'], label='fiction')

# Custom stop words only; no custom HDBSCAN or UMAP model is supplied
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() re-maps the assignments AND the probability matrix stored on
# the model. Re-read both: the values returned by fit_transform() above still
# describe the topics as they were before the reduction.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the outlier topic -1 is printed
# but excluded from the scored list)
fiction_text_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        fiction_text_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
fiction_text_diversity_score = calculate_topic_diversity(fiction_text_top_words)
print(f"Fiction Text Diversity Score: {fiction_text_diversity_score}")

# Compute Uniqueness Score
fiction_uniqueness, fiction_per_topic_uniqueness = calculate_topic_uniqueness(fiction_text_top_words)

# Tokenised corpus for the OCTIS score: lowercase, strip punctuation, split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Words (without scores) of every non-outlier topic
topic_words = [
    [word for word, _ in topic_model.get_topic(topic_id)]
    for topic_id in topic_model.get_topics()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'fiction_tokens_stop_words.pkl',
                           'fiction_topics_stop_words.json')


=== Topic Information ===
   Topic  Count                                     Name  \
0     -1     46           -1_ward_curwen_willett_charles   
1      0     26  0_gaunts_night gaunts_galley_great ones   
2      1     22        1_clarendon_jones_gilman_georgina   
3      2     10            2_zamacona_yan_yig_great race   

                                      Representation  \
0  [ward, curwen, willett, charles, akeley, marsh...   
1  [gaunts, night gaunts, galley, great ones, mar...   
2  [clarendon, jones, gilman, georgina, rogers, w...   
3  [zamacona, yan, yig, great race, tsath, walker...   

                                             KeyBERT  \
0  [specimens, ruin, danforth, swamp, salem, expe...   
1  [waking world, dreamland, monastery, nameless ...   
2  [wilbur whateley, old whateley, mummy, wilbur,...   
3  [pyramids, snakes, sphinx, pyramid, amnesia, e...   

                                                 MMR  \
0  [willett, letters, specimens, folks, danforth,...  

In [79]:
### Using custom stop words, hdbscan, umap ###
# Load the fan corpus (one row per non-empty text file)
# NOTE(review): the fan corpus is tagged label='fiction', same as the
# fiction corpus - looks like a copy-paste leftover; confirm nothing relies on
# it before changing the label.
df = convert_text_files(directories=['./Fan_text_files/'], label='fiction')

# Custom stop words, HDBSCAN and UMAP
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() re-maps the assignments AND the probability matrix stored on
# the model. Re-read both: the values returned by fit_transform() above still
# describe the topics as they were before the reduction.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the outlier topic -1 is printed
# but excluded from the scored list)
fan_text_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        fan_text_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
fan_text_diversity_score = calculate_topic_diversity(fan_text_top_words)
print(f"Fan Text Diversity Score: {fan_text_diversity_score}")

# Compute Uniqueness Score
fan_uniqueness, fan_per_topic_uniqueness = calculate_topic_uniqueness(fan_text_top_words)

# Tokenised corpus for the OCTIS score: lowercase, strip punctuation, split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Words (without scores) of every non-outlier topic
topic_words = [
    [word for word, _ in topic_model.get_topic(topic_id)]
    for topic_id in topic_model.get_topics()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'fan_tokens_all_configs.pkl', 'fan_topics_all_configs.json')


=== Topic Information ===
   Topic  Count                              Name  \
0     -1     50  -1_emerson_hastur_hamilton_nadia   
1      0     23         0_mazlo_darby_oskar_billy   
2      1     23         1_hex_calum_cyprian_alejo   
3      2      8              2_sam_karen_mike_ray   

                                      Representation  \
0  [emerson, hastur, hamilton, nadia, hera, jobe,...   
1  [mazlo, darby, oskar, billy, lenora, anne, vic...   
2  [hex, calum, cyprian, alejo, wolf, noolan, ric...   
3  [sam, karen, mike, ray, moore, eddie, tcho, ca...   

                                             KeyBERT  \
0  [sergeant, soldiers, officer, military, missio...   
1  [morgue, outline, parents, mom, mum, watchers,...   
2  [cave, stalks, servants, harvest, island, cast...   
3  [patients, mused, drugs, mob, mike, drug, work...   

                                                 MMR  \
0  [wilmarth, welles, soldiers, jacqueline, conne...   
1  [sara, ken, morgue, dolores, g

In [80]:
### Using custom stop words, hdbscan ###
# Load the fan corpus (one row per non-empty text file)
# NOTE(review): the fan corpus is tagged label='fiction', same as the
# fiction corpus - looks like a copy-paste leftover; confirm nothing relies on
# it before changing the label.
df = convert_text_files(directories=['./Fan_text_files/'], label='fiction')

# Custom stop words and HDBSCAN; no custom UMAP model is supplied
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() re-maps the assignments AND the probability matrix stored on
# the model. Re-read both: the values returned by fit_transform() above still
# describe the topics as they were before the reduction.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the outlier topic -1 is printed
# but excluded from the scored list)
fan_text_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        fan_text_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
fan_text_diversity_score = calculate_topic_diversity(fan_text_top_words)
print(f"Fan Text Diversity Score: {fan_text_diversity_score}")

# Compute Uniqueness Score
fan_uniqueness, fan_per_topic_uniqueness = calculate_topic_uniqueness(fan_text_top_words)

# Tokenised corpus for the OCTIS score: lowercase, strip punctuation, split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Words (without scores) of every non-outlier topic
topic_words = [
    [word for word, _ in topic_model.get_topic(topic_id)]
    for topic_id in topic_model.get_topics()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'fan_tokens_stop_words_hdbscan.pkl', 'fan_topics_stop_words_hdbscan.json')


=== Topic Information ===
   Topic  Count                                Name  \
0     -1     16  -1_hamilton_garner_wilmarth_morton   
1      0     50            0_hastur_hera_dummy_kane   
2      1     25           1_hex_calum_cyprian_alejo   
3      2     13          2_emerson_nadia_jobe_louis   

                                      Representation  \
0  [hamilton, garner, wilmarth, morton, aga, warr...   
1  [hastur, hera, dummy, kane, mazlo, sam, karen,...   
2  [hex, calum, cyprian, alejo, wolf, noolan, ric...   
3  [emerson, nadia, jobe, louis, emery, myles, am...   

                                             KeyBERT  \
0  [1846, patients, coffin, crevasse, museum, gue...   
1  [ventriloquism, welles, ventriloquist, film, f...   
2  [island, castle, nightmarosaurus, stalks, harv...   
3  [soldiers, military, insurgents, kgb officer, ...   

                                                 MMR  \
0  [garner, roger, hidden ones, patients, 1846, m...   
1  [morrell, welles, gi

In [81]:
### Using custom stop words, umap ###
# Load the fan corpus (one row per non-empty text file)
# NOTE(review): the fan corpus is tagged label='fiction', same as the
# fiction corpus - looks like a copy-paste leftover; confirm nothing relies on
# it before changing the label.
df = convert_text_files(directories=['./Fan_text_files/'], label='fiction')

# Custom stop words and UMAP; no custom HDBSCAN model is supplied
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() re-maps the assignments AND the probability matrix stored on
# the model. Re-read both: the values returned by fit_transform() above still
# describe the topics as they were before the reduction.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the outlier topic -1 is printed
# but excluded from the scored list)
fan_text_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        fan_text_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
fan_text_diversity_score = calculate_topic_diversity(fan_text_top_words)
print(f"Fan Text Diversity Score: {fan_text_diversity_score}")

# Compute Uniqueness Score
fan_uniqueness, fan_per_topic_uniqueness = calculate_topic_uniqueness(fan_text_top_words)

# Tokenised corpus for the OCTIS score: lowercase, strip punctuation, split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Words (without scores) of every non-outlier topic
topic_words = [
    [word for word, _ in topic_model.get_topic(topic_id)]
    for topic_id in topic_model.get_topics()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'fan_tokens_stop_words_umap.pkl', 'fan_topics_stop_words_umap.json')


=== Topic Information ===
   Topic  Count                                Name  \
0     -1     55  -1_emerson_cyprian_hastur_hamilton   
1      0     25           0_dummy_mazlo_darby_billy   
2      1     24             1_calum_jobe_louis_wolf   

                                      Representation  \
0  [emerson, cyprian, hastur, hamilton, nadia, he...   
1  [dummy, mazlo, darby, billy, lenora, anne, sar...   
2  [calum, jobe, louis, wolf, ricou, dennis, kurt...   

                                             KeyBERT  \
0  [sergeant, police, armitage, crime, inspector,...   
1  [ventriloquism, greater ventriloquism, ventril...   
2  [harvest, wolf corn, cave, servants, musket, v...   

                                                 MMR  \
0  [wilmarth, welles, jacqueline, drood, connelly...   
1  [darby, billy, vox, rachel, dummies, greater v...   
2  [bogolyubov, hugo, legion, combines, dark god,...   

                                 Representative_Docs  
0  [Sergeant Emerson J

In [82]:
### Using custom stop words only ###
# Load the fan corpus (one row per non-empty text file)
# NOTE(review): the fan corpus is tagged label='fiction', same as the
# fiction corpus - looks like a copy-paste leftover; confirm nothing relies on
# it before changing the label.
df = convert_text_files(directories=['./Fan_text_files/'], label='fiction')

# Custom stop words only; no custom HDBSCAN or UMAP model is supplied
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() re-maps the assignments AND the probability matrix stored on
# the model. Re-read both: the values returned by fit_transform() above still
# describe the topics as they were before the reduction.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the outlier topic -1 is printed
# but excluded from the scored list)
fan_text_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        fan_text_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
fan_text_diversity_score = calculate_topic_diversity(fan_text_top_words)
print(f"Fan Text Diversity Score: {fan_text_diversity_score}")

# Compute Uniqueness Score
fan_uniqueness, fan_per_topic_uniqueness = calculate_topic_uniqueness(fan_text_top_words)

# Tokenised corpus for the OCTIS score: lowercase, strip punctuation, split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Words (without scores) of every non-outlier topic
topic_words = [
    [word for word, _ in topic_model.get_topic(topic_id)]
    for topic_id in topic_model.get_topics()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'fan_tokens_stop_words.pkl', 'fan_topics_stop_words.json')


=== Topic Information ===
   Topic  Count                              Name  \
0     -1     49  -1_hastur_cyprian_hamilton_nadia   
1      0     30        0_mazlo_darby_billy_lenora   
2      1     17         1_ricou_dennis_calum_lucy   
3      2      8        2_emerson_jobe_louis_myles   

                                      Representation  \
0  [hastur, cyprian, hamilton, nadia, hex, hera, ...   
1  [mazlo, darby, billy, lenora, anne, sara, vict...   
2  [ricou, dennis, calum, lucy, iggy, isaac, ayan...   
3  [emerson, jobe, louis, myles, amye, brooks, da...   

                                             KeyBERT  \
0  [welles, phantom, nadia, crime, armitage, hera...   
1  [houses, parents, outline, mom, dad, living ro...   
2  [haunted, throne, castle, dreamers, poet, serv...   
3  [soldiers, insurgents, military, kgb officer, ...   

                                                 MMR  \
0  [wilmarth, welles, steve, office, vox, caesar,...   
1  [darby, sara, ken, desk, caleb

In [83]:
### Using custom stop words, hdbscan, umap ###
# Grimm corpus: fit BERTopic, reduce to `bert_nr_topics` topics, score the
# topics and export the tokens / topic words used for the OCTIS evaluation.

df = convert_text_files(directories=['./GrimmFairyTale/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve the updated assignments after reduction. reduce_topics() also remaps
# the stored probabilities onto the merged topics, so `probs` has to be
# refreshed together with `topics`; the array returned by fit_transform() still
# describes the pre-reduction topics and would not match df['Topic'].
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the -1 outlier topic is printed
# but excluded from the list that feeds the metrics)
grimm_fairy_tale_text_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        grimm_fairy_tale_text_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
grimm_text_diversity_score = calculate_topic_diversity(grimm_fairy_tale_text_top_words)
print(f"Grimm Fairytale Text Diversity Score: {grimm_text_diversity_score}")

# Compute Uniqueness Score
grimm_uniqueness, grimm_per_topic_uniqueness = calculate_topic_uniqueness(grimm_fairy_tale_text_top_words)

# Save pickle file for OCTIS score:
# lower-case each document, strip punctuation and split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (drop the scores) of every non-outlier topic
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'grimm_tokens_all_configs.pkl', 'grimm_topics_all_configs.json')


=== Topic Information ===
   Topic  Count                                      Name  \
0     -1     50         -1_gretel_old woman_mountain_sick   
1      0     44                   0_fox_hedgehog_wolf_cat   
2      1     33  1_lustig_brother lustig_shudder_knapsack   
3      2     18                2_fox_soldier_spear_schulz   
4      3     18       3_snow white_glass_cinderella_elsie   
5      4     17          4_tailor_giant_thumbling_bailiff   
6      5     17     5_john_faithful john_old woman_maleen   
7      6      8         6_griffin_simpleton_frog_youngest   

                                      Representation  \
0  [gretel, old woman, mountain, sick, dragon, fl...   
1  [fox, hedgehog, wolf, cat, devil, frederick, c...   
2  [lustig, brother lustig, shudder, knapsack, je...   
3  [fox, soldier, spear, schulz, cap, giants, mas...   
4  [snow white, glass, cinderella, elsie, red cap...   
5  [tailor, giant, thumbling, bailiff, thief, sho...   
6  [john, faithful john, old wo

In [84]:
### Using custom stop words, hdbscan ###
# Grimm corpus: fit BERTopic, reduce to `bert_nr_topics` topics, score the
# topics and export the tokens / topic words used for the OCTIS evaluation.

df = convert_text_files(directories=['./GrimmFairyTale/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve the updated assignments after reduction. reduce_topics() also remaps
# the stored probabilities onto the merged topics, so `probs` has to be
# refreshed together with `topics`; the array returned by fit_transform() still
# describes the pre-reduction topics and would not match df['Topic'].
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the -1 outlier topic is printed
# but excluded from the list that feeds the metrics)
grimm_fairy_tale_text_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        grimm_fairy_tale_text_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
grimm_text_diversity_score = calculate_topic_diversity(grimm_fairy_tale_text_top_words)
print(f"Grimm Fairytale Text Diversity Score: {grimm_text_diversity_score}")

# Compute Uniqueness Score
grimm_uniqueness, grimm_per_topic_uniqueness = calculate_topic_uniqueness(grimm_fairy_tale_text_top_words)

# Save pickle file for OCTIS score:
# lower-case each document, strip punctuation and split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (drop the scores) of every non-outlier topic
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'grimm_tokens_stop_words_hdbscan.pkl', 'grimm_topics_stop_words_hdbscan.json')


=== Topic Information ===
   Topic  Count                                      Name  \
0     -1     45             -1_devil_dragon_flounder_lion   
1      0     38  0_lustig_brother lustig_shudder_knapsack   
2      1     23      1_snow_snow white_cinderella_red cap   
3      2     20               2_fox_wolf_musician_sparrow   
4      3     19        3_john_faithful john_church_maleen   
5      4     16          4_tailor_giant_thumbling_bailiff   
6      5     15                5_fox_soldier_schulz_spear   
7      6     13        6_frederick_catherine_hen_hedgehog   
8      7      8              7_sick_drummer_hansel_gretel   
9      8      8       8_griffin_simpleton_frog_water life   

                                      Representation  \
0  [devil, dragon, flounder, lion, gretel, host, ...   
1  [lustig, brother lustig, shudder, knapsack, le...   
2  [snow, snow white, cinderella, red cap, elsie,...   
3  [fox, wolf, musician, sparrow, goat, aik, meh,...   
4  [john, faithful jo

In [85]:
### Using custom stop words, umap ###
# Grimm corpus: fit BERTopic, reduce to `bert_nr_topics` topics, score the
# topics and export the tokens / topic words used for the OCTIS evaluation.

df = convert_text_files(directories=['./GrimmFairyTale/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve the updated assignments after reduction. reduce_topics() also remaps
# the stored probabilities onto the merged topics, so `probs` has to be
# refreshed together with `topics`; the array returned by fit_transform() still
# describes the pre-reduction topics and would not match df['Topic'].
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the -1 outlier topic is printed
# but excluded from the list that feeds the metrics)
grimm_fairy_tale_text_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        grimm_fairy_tale_text_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
grimm_text_diversity_score = calculate_topic_diversity(grimm_fairy_tale_text_top_words)
print(f"Grimm Fairytale Text Diversity Score: {grimm_text_diversity_score}")

# Compute Uniqueness Score
grimm_uniqueness, grimm_per_topic_uniqueness = calculate_topic_uniqueness(grimm_fairy_tale_text_top_words)

# Save pickle file for OCTIS score:
# lower-case each document, strip punctuation and split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (drop the scores) of every non-outlier topic
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'grimm_tokens_stop_words_umap.pkl', 'grimm_topics_stop_words_umap.json')


=== Topic Information ===
   Topic  Count                                      Name  \
0     -1     67         -1_gretel_tailor_thumbling_dragon   
1      0     41  0_brother lustig_lustig_knapsack_shudder   
2      1     38         1_sick_drummer_john_faithful john   
3      2     31       2_hedgehog_wolf_frederick_catherine   
4      3     15       3_tailor_griffin_dummling_simpleton   
5      4     13     4_spear_schulz_master schulz_wild man   

                                      Representation  \
0  [gretel, tailor, thumbling, dragon, flounder, ...   
1  [brother lustig, lustig, knapsack, shudder, je...   
2  [sick, drummer, john, faithful john, cinderell...   
3  [hedgehog, wolf, frederick, catherine, sparrow...   
4  [tailor, griffin, dummling, simpleton, shoemak...   
5  [spear, schulz, master schulz, wild man, golde...   

                                             KeyBERT  \
0  [hansel, hans gretel, thee, mother holle, gret...   
1  [forbidden door, beggar, brother lust

In [86]:
### Using custom stop words only ###
# Grimm corpus: fit BERTopic, reduce to `bert_nr_topics` topics, score the
# topics and export the tokens / topic words used for the OCTIS evaluation.

df = convert_text_files(directories=['./GrimmFairyTale/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve the updated assignments after reduction. reduce_topics() also remaps
# the stored probabilities onto the merged topics, so `probs` has to be
# refreshed together with `topics`; the array returned by fit_transform() still
# describes the pre-reduction topics and would not match df['Topic'].
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the -1 outlier topic is printed
# but excluded from the list that feeds the metrics)
grimm_fairy_tale_text_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        grimm_fairy_tale_text_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
grimm_text_diversity_score = calculate_topic_diversity(grimm_fairy_tale_text_top_words)
print(f"Grimm Fairytale Text Diversity Score: {grimm_text_diversity_score}")

# Compute Uniqueness Score
grimm_uniqueness, grimm_per_topic_uniqueness = calculate_topic_uniqueness(grimm_fairy_tale_text_top_words)

# Save pickle file for OCTIS score:
# lower-case each document, strip punctuation and split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (drop the scores) of every non-outlier topic
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'grimm_tokens_stop_words.pkl', 'grimm_topics_stop_words.json')


=== Topic Information ===
   Topic  Count                                     Name  \
0     -1     88           -1_tailor_gretel_soldier_giant   
1      0     30    0_brother lustig_lustig_shudder_devil   
2      1     26           1_fox_wolf_frederick_catherine   
3      2     18     2_snow white_sick_drummer_cinderella   
4      3     17  3_john_faithful john_maid maleen_maleen   
5      4     15          4_tailor_griffin_dummling_thief   
6      5     11      5_fox_paddock_wild man_golden horse   

                                      Representation  \
0  [tailor, gretel, soldier, giant, dragon, lion,...   
1  [brother lustig, lustig, shudder, devil, learn...   
2  [fox, wolf, frederick, catherine, sparrow, hen...   
3  [snow white, sick, drummer, cinderella, red ca...   
4  [john, faithful john, maid maleen, maleen, wai...   
5  [tailor, griffin, dummling, thief, simpleton, ...   
6  [fox, paddock, wild man, golden horse, golden ...   

                                           

In [87]:
### Using custom stop words, hdbscan, umap ###
# "Grimm again" corpus: fit BERTopic, reduce to `bert_nr_topics` topics, score
# the topics and export the tokens / topic words used for the OCTIS evaluation.

df = convert_text_files(directories=['./GrimmFairyTaleAgain/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve the updated assignments after reduction. reduce_topics() also remaps
# the stored probabilities onto the merged topics, so `probs` has to be
# refreshed together with `topics`; the array returned by fit_transform() still
# describes the pre-reduction topics and would not match df['Topic'].
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the -1 outlier topic is printed
# but excluded from the list that feeds the metrics)
grimm_again_fairy_tale_text_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        grimm_again_fairy_tale_text_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
grimm_again_text_diversity_score = calculate_topic_diversity(grimm_again_fairy_tale_text_top_words)
print(f"Grimm Fairytale Again Text Diversity Score: {grimm_again_text_diversity_score}")

# Compute Uniqueness Score
grimm_again_uniqueness, grimm_again_per_topic_uniqueness = calculate_topic_uniqueness(grimm_again_fairy_tale_text_top_words)

# Save pickle file for OCTIS score:
# lower-case each document, strip punctuation and split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (drop the scores) of every non-outlier topic
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'grimm_again_tokens_all_configs.pkl', 'grimm_again_topics_all_configs.json')


=== Topic Information ===
   Topic  Count                                      Name  \
0     -1     33              -1_gretel_goat_tailor_hansel   
1      0     34  0_tailor_lustig_brother lustig_thumbling   
2      1     18  1_john_faithful john_marlinchen_rapunzel   
3      2     10                    2_fox_lion_dragon_hare   
4      3      8    3_cinderella_elsie_red cap_grandmother   

                                      Representation  \
0  [gretel, goat, tailor, hansel, snow white, pea...   
1  [tailor, lustig, brother lustig, thumbling, he...   
2  [john, faithful john, marlinchen, rapunzel, ju...   
3  [fox, lion, dragon, hare, host, marshal, golde...   
4  [cinderella, elsie, red cap, grandmother, roeb...   

                                             KeyBERT  \
0  [hansel gretel, seven dwarfs, dwarfs, hansel, ...   
1  [peasant, tailor, dwarf, riddle, pig, brother ...   
2  [father ate, coffins, aged king, golden dwelli...   
3  [soaring lark, golden bird, gold pieces, o

In [88]:
### Using custom stop words, hdbscan ###
# "Grimm again" corpus: fit BERTopic, reduce to `bert_nr_topics` topics, score
# the topics and export the tokens / topic words used for the OCTIS evaluation.

df = convert_text_files(directories=['./GrimmFairyTaleAgain/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve the updated assignments after reduction. reduce_topics() also remaps
# the stored probabilities onto the merged topics, so `probs` has to be
# refreshed together with `topics`; the array returned by fit_transform() still
# describes the pre-reduction topics and would not match df['Topic'].
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the -1 outlier topic is printed
# but excluded from the list that feeds the metrics)
grimm_again_fairy_tale_text_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        grimm_again_fairy_tale_text_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
grimm_again_text_diversity_score = calculate_topic_diversity(grimm_again_fairy_tale_text_top_words)
print(f"Grimm Fairytale Again Text Diversity Score: {grimm_again_text_diversity_score}")

# Compute Uniqueness Score
grimm_again_uniqueness, grimm_again_per_topic_uniqueness = calculate_topic_uniqueness(grimm_again_fairy_tale_text_top_words)

# Save pickle file for OCTIS score:
# lower-case each document, strip punctuation and split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (drop the scores) of every non-outlier topic
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'grimm_again_tokens_stop_words_hdbscan.pkl', 'grimm_again_topics_stop_words_hdbscan.json')


=== Topic Information ===
   Topic  Count                                    Name  \
0     -1     44             -1_tailor_gretel_goat_giant   
1      0     16  0_john_faithful john_rapunzel_huntsmen   
2      1     13        1_snow_snow white_cinderella_cap   
3      2     11        2_soldier_shudder_devil_bearskin   
4      3     11               3_fox_lion_dragon_animals   
5      4      8     4_hedgehog_hans hedgehog_dwarf_cock   

                                      Representation  \
0  [tailor, gretel, goat, giant, brother lustig, ...   
1  [john, faithful john, rapunzel, huntsmen, padd...   
2  [snow, snow white, cinderella, cap, elsie, red...   
3  [soldier, shudder, devil, bearskin, knapsack, ...   
4  [fox, lion, dragon, animals, hare, host, marsh...   
5  [hedgehog, hans hedgehog, dwarf, cock, water l...   

                                             KeyBERT  \
0  [peasant, pig, hungry, tailor, shoemaker, lett...   
1  [aged king, golden dwelling, princess golden, ...  

In [89]:
### Using custom stop words, umap ###
# "Grimm again" corpus: fit BERTopic, reduce to `bert_nr_topics` topics, score
# the topics and export the tokens / topic words used for the OCTIS evaluation.

df = convert_text_files(directories=['./GrimmFairyTaleAgain/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve the updated assignments after reduction. reduce_topics() also remaps
# the stored probabilities onto the merged topics, so `probs` has to be
# refreshed together with `topics`; the array returned by fit_transform() still
# describes the pre-reduction topics and would not match df['Topic'].
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the -1 outlier topic is printed
# but excluded from the list that feeds the metrics)
grimm_again_fairy_tale_text_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        grimm_again_fairy_tale_text_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
grimm_again_text_diversity_score = calculate_topic_diversity(grimm_again_fairy_tale_text_top_words)
print(f"Grimm Fairytale Again Text Diversity Score: {grimm_again_text_diversity_score}")

# Compute Uniqueness Score
grimm_again_uniqueness, grimm_again_per_topic_uniqueness = calculate_topic_uniqueness(grimm_again_fairy_tale_text_top_words)

# Save pickle file for OCTIS score:
# lower-case each document, strip punctuation and split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (drop the scores) of every non-outlier topic
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'grimm_again_tokens_stop_words_umap.pkl', 'grimm_again_topics_stop_words_umap.json')


=== Topic Information ===
   Topic  Count                               Name  \
0     -1     51      -1_tailor_gretel_goat_peasant   
1      0     29  0_sister_faithful john_john_bride   
2      1     14     1_shudder_devil_bearskin_dwarf   
3      2      9             2_fox_dragon_lion_hare   

                                      Representation  \
0  [tailor, gretel, goat, peasant, brother lustig...   
1  [sister, faithful john, john, bride, cook, cin...   
2  [shudder, devil, bearskin, dwarf, seven years,...   
3  [fox, dragon, lion, hare, host, marshal, golde...   

                                             KeyBERT  \
0  [peasant, tailor, dwarfs, hungry, letter, shoe...   
1  [cinderella, true bride, sisters, sister, witc...   
2  [devil, hell, dwarf, riddle, clever, earn brea...   
3  [huntsmen, feathers, golden bird, bird heart, ...   

                                                 MMR  \
0  [tailor, peasant, snow white, hansel, letter, ...   
1  [faithful john, rapunzel,

In [90]:
### Using custom stop words only ###
# "Grimm again" corpus: fit BERTopic, reduce to `bert_nr_topics` topics, score
# the topics and export the tokens / topic words used for the OCTIS evaluation.

df = convert_text_files(directories=['./GrimmFairyTaleAgain/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve the updated assignments after reduction. reduce_topics() also remaps
# the stored probabilities onto the merged topics, so `probs` has to be
# refreshed together with `topics`; the array returned by fit_transform() still
# describes the pre-reduction topics and would not match df['Topic'].
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the -1 outlier topic is printed
# but excluded from the list that feeds the metrics)
grimm_again_fairy_tale_text_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        grimm_again_fairy_tale_text_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
grimm_again_text_diversity_score = calculate_topic_diversity(grimm_again_fairy_tale_text_top_words)
print(f"Grimm Fairytale Again Text Diversity Score: {grimm_again_text_diversity_score}")

# Compute Uniqueness Score
grimm_again_uniqueness, grimm_again_per_topic_uniqueness = calculate_topic_uniqueness(grimm_again_fairy_tale_text_top_words)

# Save pickle file for OCTIS score:
# lower-case each document, strip punctuation and split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (drop the scores) of every non-outlier topic
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'grimm_again_tokens_stop_words.pkl', 'grimm_again_topics_stop_words.json')


=== Topic Information ===
   Topic  Count                                     Name  \
0     -1     31    -1_peasant_thumbling_gretel_godfather   
1      0     40         0_step_goat_hansel_faithful john   
2      1     32  1_lustig_brother lustig_dragon_knapsack   

                                      Representation  \
0  [peasant, thumbling, gretel, godfather, hare, ...   
1  [step, goat, hansel, faithful john, john, cind...   
2  [lustig, brother lustig, dragon, knapsack, hed...   

                                             KeyBERT  \
0  [peasant, comes gretel, riddle, given gretel, ...   
1  [cinderella, seven dwarfs, wise woman, hansel ...   
2  [dear huntsman, innkeeper, pig, brother lustig...   

                                                 MMR  \
0  [hans gretel, talers, cabbages, tail, priest, ...   
1  [cinderella, juniper tree, enchantress, mary, ...   
2  [brother lustig, dragon, talers, king thrushbe...   

                                 Representative_Docs  
0 

In [91]:
### Using custom stop words, hdbscan, umap ###
# Combined corpus (fan, fiction and Grimm directories): fit BERTopic, reduce to
# `bert_nr_topics` topics, score the topics and export the tokens / topic words
# used for the OCTIS evaluation.

df = convert_text_files(directories=['./Fan_text_files/', './fiction_text_files/','./GrimmFairyTale/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve the updated assignments after reduction. reduce_topics() also remaps
# the stored probabilities onto the merged topics, so `probs` has to be
# refreshed together with `topics`; the array returned by fit_transform() still
# describes the pre-reduction topics and would not match df['Topic'].
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the -1 outlier topic is printed
# but excluded from the list that feeds the metrics)
all_corpus_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        all_corpus_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
all_corpus_diversity_score = calculate_topic_diversity(all_corpus_top_words)
print(f"All Corpus Text Diversity Score: {all_corpus_diversity_score}")

# Compute Uniqueness Score
all_corpus_uniqueness, all_corpus_per_topic_uniqueness = calculate_topic_uniqueness(all_corpus_top_words)

# Save pickle file for OCTIS score:
# lower-case each document, strip punctuation and split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (drop the scores) of every non-outlier topic
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'all_corpus_tokens_all_configs.pkl', 'all_corpus_topics_all_configs.json')


=== Topic Information ===
   Topic  Count                                      Name  \
0     -1    124              -1_emerson_cyprian_nadia_hex   
1      0    150  0_tailor_king daughter_huntsman_hedgehog   
2      1    139                1_ward_gods_despite_curwen   

                                      Representation  \
0  [emerson, cyprian, nadia, hex, dummy, kane, gi...   
1  [tailor, king daughter, huntsman, hedgehog, br...   
2  [ward, gods, despite, curwen, ahead, probably,...   

                                             KeyBERT  \
0  [museum, mummy, tales, sinister, welles, pyram...   
1  [huntsmen, jest, dwarfs, huntsman, cinderella,...   
2  [specimens, horrors, ghouls, grotesque, alien,...   

                                                 MMR  \
0  [wilmarth, welles, museum, steve, armitage, un...   
1  [tailor, hedgehog, huntsmen, catherine, sparro...   
2  [abyss, sinister, tales, library, ghouls, old ...   

                                 Representative_Docs 

In [92]:
### Using custom stop words, hdbscan ###
# Combined corpus (fan, fiction and Grimm directories): fit BERTopic, reduce to
# `bert_nr_topics` topics, score the topics and export the tokens / topic words
# used for the OCTIS evaluation.

df = convert_text_files(directories=['./Fan_text_files/', './fiction_text_files/','./GrimmFairyTale/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve the updated assignments after reduction. reduce_topics() also remaps
# the stored probabilities onto the merged topics, so `probs` has to be
# refreshed together with `topics`; the array returned by fit_transform() still
# describes the pre-reduction topics and would not match df['Topic'].
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Get representative words for each topic (the -1 outlier topic is printed
# but excluded from the list that feeds the metrics)
all_corpus_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        all_corpus_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
all_corpus_diversity_score = calculate_topic_diversity(all_corpus_top_words)
print(f"All Corpus Text Diversity Score: {all_corpus_diversity_score}")

# Compute Uniqueness Score
all_corpus_uniqueness, all_corpus_per_topic_uniqueness = calculate_topic_uniqueness(all_corpus_top_words)

# Save pickle file for OCTIS score:
# lower-case each document, strip punctuation and split on whitespace
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (drop the scores) of every non-outlier topic
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'all_corpus_tokens_stop_words_hdbscan.pkl', 'all_corpus_topics_stop_words_hdbscan.json')


=== Topic Information ===
   Topic  Count                            Name  \
0     -1    115   -1_cyprian_hamilton_calum_hex   
1      0    171     0_tailor_maiden_brother_fox   
2      1    107    1_ward_curwen_willett_wholly   
3      2     12  2_emerson_innsmouth_myles_amye   
4      3      8         3_karen_mike_eddie_tcho   

                                      Representation  \
0  [cyprian, hamilton, calum, hex, clarendon, moo...   
1  [tailor, maiden, brother, fox, huntsman, king ...   
2  [ward, curwen, willett, wholly, charles, hastu...   
3  [emerson, innsmouth, myles, amye, damien, broo...   
4  [karen, mike, eddie, tcho, caesar, pickman, da...   

                                             KeyBERT  \
0  [mummy, coffin, wilbur, welles, tales, patient...   
1  [huntsmen, geese, hen, dwarfs, misfortune, egg...   
2  [archaic, specimens, discovery, tales, antarct...   
3  [insurgents, lieutenant, officer, uniform, inv...   
4  [conversations, asylum, mike, recognition, soc

In [93]:
### Using custom stop words, umap ###

# Load every non-empty .txt file from the three corpus folders into one dataframe.
df = convert_text_files(directories=['./Fan_text_files/', './fiction_text_files/', './GrimmFairyTale/'], label='fiction')

# Build the model for this configuration (stop_words and umap switched on, hdbscan off).
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() updates both topics_ and probabilities_ on the model, so
# re-read both. The arrays returned by fit_transform() describe the topics as
# they were before the reduction and would not line up with the reduced ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe; Topic_Probability is the largest entry of each
# document's row in the probability matrix.
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Print the representative (word, score) pairs of every topic and collect them
# for the metrics below, leaving out topic -1.
all_corpus_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        all_corpus_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
all_corpus_diversity_score = calculate_topic_diversity(all_corpus_top_words)
print(f"All Corpus Text Diversity Score: {all_corpus_diversity_score}")

# Compute Uniqueness Score
all_corpus_uniqueness, all_corpus_per_topic_uniqueness = calculate_topic_uniqueness(all_corpus_top_words)

# Tokens for the OCTIS score: lower-case each text, drop every character that
# is neither a word character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Plain word lists (scores dropped) for every topic except -1.
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words, 'all_corpus_tokens_stop_words_umap.pkl', 'all_corpus_topics_stop_words_umap.json')


=== Topic Information ===
   Topic  Count                                    Name  \
0      0    207  0_tailor_king daughter_huntsman_gretel   
1      1    206              1_ward_gods_ahead_probably   

                                      Representation  \
0  [tailor, king daughter, huntsman, gretel, hedg...   
1  [ward, gods, ahead, probably, metal, wholly, c...   

                                             KeyBERT  \
0  [huntsmen, huntsman, dwarfs, hansel, cinderell...   
1  [horrors, museum, ghouls, grotesque, alien, co...   

                                                 MMR  \
0  [tailor, gretel, hansel, cinderella, hans hedg...   
1  [ward, ahead, arkham, library, various, centur...   

                                 Representative_Docs  
0  [There was once a man who understood all kinds...  
1  [“The essential Saltes of Animals may be so pr...  

Topic 0:
[('tailor', 0.6260641281628178), ('king daughter', 0.5194441631054035), ('huntsman', 0.481818579961294), ('gretel

In [94]:
### Using custom stop words only ###

# Load every non-empty .txt file from the three corpus folders into one dataframe.
df = convert_text_files(directories=['./Fan_text_files/', './fiction_text_files/', './GrimmFairyTale/'], label='fiction')

# Build the model for this configuration (only stop_words switched on).
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() updates both topics_ and probabilities_ on the model, so
# re-read both. The arrays returned by fit_transform() describe the topics as
# they were before the reduction and would not line up with the reduced ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe; Topic_Probability is the largest entry of each
# document's row in the probability matrix.
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Print the representative (word, score) pairs of every topic and collect them
# for the metrics below, leaving out topic -1.
all_corpus_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        all_corpus_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
all_corpus_diversity_score = calculate_topic_diversity(all_corpus_top_words)
print(f"All Corpus Text Diversity Score: {all_corpus_diversity_score}")

# Compute Uniqueness Score
all_corpus_uniqueness, all_corpus_per_topic_uniqueness = calculate_topic_uniqueness(all_corpus_top_words)

# Tokens for the OCTIS score: lower-case each text, drop every character that
# is neither a word character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Plain word lists (scores dropped) for every topic except -1.
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words, 'all_corpus_tokens_stop_words.pkl', 'all_corpus_topics_stop_words.json')


=== Topic Information ===
   Topic  Count                                    Name  \
0      0    208  0_tailor_king daughter_huntsman_gretel   
1      1    205              1_ward_gods_ahead_probably   

                                      Representation  \
0  [tailor, king daughter, huntsman, gretel, hedg...   
1  [ward, gods, ahead, probably, metal, wholly, c...   

                                             KeyBERT  \
0  [huntsmen, huntsman, dwarfs, hansel, cinderell...   
1  [horrors, grotesque, alien, museum, ghouls, ab...   

                                                 MMR  \
0  [tailor, gretel, hedgehog, hansel, cinderella,...   
1  [ward, emerson, arkham, library, papers, old o...   

                                 Representative_Docs  
0  [There was once a man who understood all kinds...  
1  [I.\n\nIt is only within the last few years th...  

Topic 0:
[('tailor', 0.624390195892266), ('king daughter', 0.5181109511743214), ('huntsman', 0.48060151488845737), ('grete

In [95]:
### Using custom stop words, hdbscan, umap ###

# Load every non-empty .txt file from the fan and fiction folders into one dataframe.
df = convert_text_files(directories=['./Fan_text_files/', './fiction_text_files/'], label='fiction')

# Build the model for this configuration (stop_words, hdbscan and umap all switched on).
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() updates both topics_ and probabilities_ on the model, so
# re-read both. The arrays returned by fit_transform() describe the topics as
# they were before the reduction and would not line up with the reduced ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe; Topic_Probability is the largest entry of each
# document's row in the probability matrix.
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Print the representative (word, score) pairs of every topic and collect them
# for the metrics below, leaving out topic -1.
fan_and_fiction_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        fan_and_fiction_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
fan_and_fiction_diversity_score = calculate_topic_diversity(fan_and_fiction_top_words)
print(f"Fan And Fiction Corpus Text Diversity Score: {fan_and_fiction_diversity_score}")

# Compute Uniqueness Score
fan_and_fiction_corpus_uniqueness, fan_and_fiction_corpus_per_topic_uniqueness = calculate_topic_uniqueness(fan_and_fiction_top_words)

# Tokens for the OCTIS score: lower-case each text, drop every character that
# is neither a word character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Plain word lists (scores dropped) for every topic except -1.
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words, 'fan_and_fiction_tokens_all_configs.pkl', 'fan_and_fiction_topics_all_configs.json')


=== Topic Information ===
   Topic  Count                            Name  \
0     -1     64     -1_cyprian_hastur_nadia_hex   
1      0     47       0_couldn_hera_maybe_dummy   
2      1     32   1_emerson_zamacona_jobe_louis   
3      2     20  2_ward_curwen_willett_hamilton   
4      3     14       3_akeley_ammi_marsh_nahum   
5      4     11  4_musides_slater_kalos_kuranes   
6      5     11    5_iranon_sarnath_aira_yalden   
7      6      9  6_gilman_jones_rogers_whateley   

                                      Representation  \
0  [cyprian, hastur, nadia, hex, kane, ghouls, al...   
1  [couldn, hera, maybe, dummy, mazlo, sam, karen...   
2  [emerson, zamacona, jobe, louis, garner, camp,...   
3  [ward, curwen, willett, hamilton, doctor, clar...   
4  [akeley, ammi, marsh, nahum, denis, marceline,...   
5  [musides, slater, kalos, kuranes, roman, grove...   
6  [iranon, sarnath, aira, yalden, barzai, atal, ...   
7  [gilman, jones, rogers, whateley, wilbur, armi...   

        

In [96]:
### Using custom stop words, hdbscan ###

# Load every non-empty .txt file from the fan and fiction folders into one dataframe.
df = convert_text_files(directories=['./Fan_text_files/', './fiction_text_files/'], label='fiction')

# Build the model for this configuration (stop_words and hdbscan switched on, umap off).
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() updates both topics_ and probabilities_ on the model, so
# re-read both. The arrays returned by fit_transform() describe the topics as
# they were before the reduction and would not line up with the reduced ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe; Topic_Probability is the largest entry of each
# document's row in the probability matrix.
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Print the representative (word, score) pairs of every topic and collect them
# for the metrics below, leaving out topic -1.
fan_and_fiction_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        fan_and_fiction_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
fan_and_fiction_diversity_score = calculate_topic_diversity(fan_and_fiction_top_words)
print(f"Fan And Fiction Corpus Text Diversity Score: {fan_and_fiction_diversity_score}")

# Compute Uniqueness Score
fan_and_fiction_corpus_uniqueness, fan_and_fiction_corpus_per_topic_uniqueness = calculate_topic_uniqueness(fan_and_fiction_top_words)

# Tokens for the OCTIS score: lower-case each text, drop every character that
# is neither a word character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Plain word lists (scores dropped) for every topic except -1.
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words, 'fan_and_fiction_tokens_stop_words_hdbscan.pkl', 'fan_and_fiction_topics_stop_words_hdbscan.json')


=== Topic Information ===
   Topic  Count                                  Name  \
0     -1     79         -1_hastur_nadia_hex_professor   
1      0     44                0_hera_dummy_mazlo_sam   
2      1     26           1_akeley_gilman_alejo_jones   
3      2     13          2_ward_curwen_willett_joseph   
4      3     10     3_ghouls_gaunts_night gaunts_onyx   
5      4      9      4_emerson_innsmouth_myles_brooks   
6      5      9  5_hamilton_clarendon_georgina_morton   
7      6      9             6_zamacona_jobe_louis_yan   
8      7      9          7_iranon_sarnath_aira_barzai   

                                      Representation  \
0  [hastur, nadia, hex, professor, kane, garner, ...   
1  [hera, dummy, mazlo, sam, karen, darby, oskar,...   
2  [akeley, gilman, alejo, jones, rogers, ammi, n...   
3  [ward, curwen, willett, joseph, joseph curwen,...   
4  [ghouls, gaunts, night gaunts, onyx, kadath, g...   
5  [emerson, innsmouth, myles, brooks, damien, am...   
6  [hamilt

In [97]:
### Using custom stop words, umap ###

# Load every non-empty .txt file from the fan and fiction folders into one dataframe.
df = convert_text_files(directories=['./Fan_text_files/', './fiction_text_files/'], label='fiction')

# Build the model for this configuration (stop_words and umap switched on, hdbscan off).
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() updates both topics_ and probabilities_ on the model, so
# re-read both. The arrays returned by fit_transform() describe the topics as
# they were before the reduction and would not line up with the reduced ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe; Topic_Probability is the largest entry of each
# document's row in the probability matrix.
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Print the representative (word, score) pairs of every topic and collect them
# for the metrics below, leaving out topic -1.
fan_and_fiction_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        fan_and_fiction_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
fan_and_fiction_diversity_score = calculate_topic_diversity(fan_and_fiction_top_words)
print(f"Fan And Fiction Corpus Text Diversity Score: {fan_and_fiction_diversity_score}")

# Compute Uniqueness Score
fan_and_fiction_corpus_uniqueness, fan_and_fiction_corpus_per_topic_uniqueness = calculate_topic_uniqueness(fan_and_fiction_top_words)

# Tokens for the OCTIS score: lower-case each text, drop every character that
# is neither a word character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Plain word lists (scores dropped) for every topic except -1.
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words, 'fan_and_fiction_tokens_stop_words_umap.pkl', 'fan_and_fiction_topics_stop_words_umap.json')


=== Topic Information ===
   Topic  Count                               Name  \
0     -1    102      -1_ward_curwen_willett_hastur   
1      0     39            0_hera_dummy_kane_mazlo   
2      1     31      1_emerson_zamacona_jobe_louis   
3      2     25  2_night gaunts_gaunts_onyx_galley   
4      3     11      3_gilman_rogers_carson_elwood   

                                      Representation  \
0  [ward, curwen, willett, hastur, cyprian, akele...   
1  [hera, dummy, kane, mazlo, karen, darby, billy...   
2  [emerson, zamacona, jobe, louis, garner, camp,...   
3  [night gaunts, gaunts, onyx, galley, great one...   
4  [gilman, rogers, carson, elwood, harington, or...   

                                             KeyBERT  \
0  [mummy, prison, wilbur, charles ward, gaston, ...   
1  [ventriloquism, greater ventriloquism, welles,...   
2  [insurgents, antarctic, archaean, planes, tent...   
3  [earth gods, pnakotic manuscripts, monastery, ...   
4  [torment, countenance, abnor

In [98]:
### Using custom stop words only ###

# Load every non-empty .txt file from the fan and fiction folders into one dataframe.
df = convert_text_files(directories=['./Fan_text_files/', './fiction_text_files/'], label='fiction')

# Build the model for this configuration (only stop_words switched on).
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() updates both topics_ and probabilities_ on the model, so
# re-read both. The arrays returned by fit_transform() describe the topics as
# they were before the reduction and would not line up with the reduced ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe; Topic_Probability is the largest entry of each
# document's row in the probability matrix.
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Print the representative (word, score) pairs of every topic and collect them
# for the metrics below, leaving out topic -1.
fan_and_fiction_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        fan_and_fiction_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
fan_and_fiction_diversity_score = calculate_topic_diversity(fan_and_fiction_top_words)
print(f"Fan And Fiction Corpus Text Diversity Score: {fan_and_fiction_diversity_score}")

# Compute Uniqueness Score
fan_and_fiction_corpus_uniqueness, fan_and_fiction_corpus_per_topic_uniqueness = calculate_topic_uniqueness(fan_and_fiction_top_words)

# Tokens for the OCTIS score: lower-case each text, drop every character that
# is neither a word character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Plain word lists (scores dropped) for every topic except -1.
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words, 'fan_and_fiction_tokens_stop_words.pkl', 'fan_and_fiction_topics_stop_words.json')


=== Topic Information ===
   Topic  Count                                 Name  \
0     -1    103       -1_ward_emerson_charles_curwen   
1      0     31  0_ghouls_night gaunts_gaunts_galley   
2      1     30             1_hera_dummy_mazlo_darby   
3      2     23         2_akeley_gilman_alejo_rogers   
4      3     21         3_zamacona_jobe_louis_garner   

                                      Representation  \
0  [ward, emerson, charles, curwen, willett, cypr...   
1  [ghouls, night gaunts, gaunts, galley, great o...   
2  [hera, dummy, mazlo, darby, billy, ava, lenora...   
3  [akeley, gilman, alejo, rogers, whateley, deni...   
4  [zamacona, jobe, louis, garner, danforth, kurt...   

                                             KeyBERT  \
0  [patients, mummy, sergeant, welles, charles wa...   
1  [earth gods, monastery, frescoes, kuranes, nam...   
2  [ventriloquism, greater ventriloquism, ventril...   
3  [old whateley, wilbur whateley, exham priory, ...   
4  [masonry, archae

In [99]:
### Using custom stop words, hdbscan, umap ###

# Load every non-empty .txt file from the Grimm-again and fiction folders into one dataframe.
df = convert_text_files(directories=['./GrimmFairyTaleAgain/', './fiction_text_files/'], label='fiction')

# Build the model for this configuration (stop_words, hdbscan and umap all switched on).
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() updates both topics_ and probabilities_ on the model, so
# re-read both. The arrays returned by fit_transform() describe the topics as
# they were before the reduction and would not line up with the reduced ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe; Topic_Probability is the largest entry of each
# document's row in the probability matrix.
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Print the representative (word, score) pairs of every topic and collect them
# for the metrics below, leaving out topic -1.
grimm_again_fairy_tale_and_fiction_corpus_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        grimm_again_fairy_tale_and_fiction_corpus_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
grimm_again_fairy_tale_and_fiction_diversity_score = calculate_topic_diversity(grimm_again_fairy_tale_and_fiction_corpus_top_words)
print(f"Grimm FairyTale Again and Fiction Text Diversity Score: {grimm_again_fairy_tale_and_fiction_diversity_score}")

# Compute Uniqueness Score
# NOTE(review): the per-topic result is bound to a name without "again"
# (grimm_fairy_tale_...), unlike every other variable in this cell. This
# overwrites any earlier variable of that name; confirm whether
# grimm_again_..._per_topic_uniqueness was intended before renaming.
grimm_again_fairy_tale_and_fiction_corpus_uniqueness, grimm_fairy_tale_and_fiction_corpus_per_topic_uniqueness = calculate_topic_uniqueness(grimm_again_fairy_tale_and_fiction_corpus_top_words)

# Tokens for the OCTIS score: lower-case each text, drop every character that
# is neither a word character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Plain word lists (scores dropped) for every topic except -1.
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words, 'grimm_again_and_fiction_tokens_all_configs.pkl', 'grimm_again_and_fiction_topics_all_configs.json')


=== Topic Information ===
   Topic  Count                                    Name  \
0     -1     73           -1_ancient_akeley_gods_tailor   
1      0     73  0_huntsman_king daughter_tailor_hansel   
2      1     61           1_ward_curwen_willett_ancient   

                                      Representation  \
0  [ancient, akeley, gods, tailor, ghouls, jones,...   
1  [huntsman, king daughter, tailor, hansel, thum...   
2  [ward, curwen, willett, ancient, charles, ye, ...   

                                             KeyBERT  \
0  [consciousness, tales, chaos, imagination, mys...   
1  [huntsmen, huntsman, fisherman, feathers, goos...   
2  [history, consciousness, specimens, ancient, m...   

                                                 MMR  \
0  [vast, tales, museum, nameless, arkham, sinist...   
1  [tailor, hansel, huntsmen, hedgehog, cinderell...   
2  [vast, library, specimens, consciousness, what...   

                                 Representative_Docs  
0  [Du

In [100]:
### Using custom stop words, hdbscan ###

# Load every non-empty .txt file from the Grimm-again and fiction folders into one dataframe.
df = convert_text_files(directories=['./GrimmFairyTaleAgain/', './fiction_text_files/'], label='fiction')

# Build the model for this configuration (stop_words and hdbscan switched on, umap off).
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() updates both topics_ and probabilities_ on the model, so
# re-read both. The arrays returned by fit_transform() describe the topics as
# they were before the reduction and would not line up with the reduced ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe; Topic_Probability is the largest entry of each
# document's row in the probability matrix.
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Print the representative (word, score) pairs of every topic and collect them
# for the metrics below, leaving out topic -1.
grimm_again_fairy_tale_and_fiction_corpus_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        grimm_again_fairy_tale_and_fiction_corpus_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
grimm_again_fairy_tale_and_fiction_diversity_score = calculate_topic_diversity(grimm_again_fairy_tale_and_fiction_corpus_top_words)
print(f"Grimm FairyTale Again and Fiction Text Diversity Score: {grimm_again_fairy_tale_and_fiction_diversity_score}")

# Compute Uniqueness Score
# NOTE(review): the per-topic result is bound to a name without "again"
# (grimm_fairy_tale_...), unlike every other variable in this cell. This
# overwrites any earlier variable of that name; confirm whether
# grimm_again_..._per_topic_uniqueness was intended before renaming.
grimm_again_fairy_tale_and_fiction_corpus_uniqueness, grimm_fairy_tale_and_fiction_corpus_per_topic_uniqueness = calculate_topic_uniqueness(grimm_again_fairy_tale_and_fiction_corpus_top_words)

# Tokens for the OCTIS score: lower-case each text, drop every character that
# is neither a word character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Plain word lists (scores dropped) for every topic except -1.
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words, 'grimm_again_and_fiction_tokens_stop_words_hdbscan.pkl', 'grimm_again_and_fiction_topics_stop_words_hdbscan.json')


=== Topic Information ===
   Topic  Count                                            Name  \
0     -1     83                  -1_ward_ancient_curwen_willett   
1      0     71  0_huntsman_king daughter_lustig_brother lustig   
2      1     53                      1_gods_vast_dreams_ancient   

                                      Representation  \
0  [ward, ancient, curwen, willett, charles, akel...   
1  [huntsman, king daughter, lustig, brother lust...   
2  [gods, vast, dreams, ancient, zamacona, ahead,...   

                                             KeyBERT  \
0  [consciousness, grotesque, horrors, morbid, si...   
1  [huntsmen, huntsman, fisherman, goose, dwarfs,...   
2  [history, ancient, masonry, antarctic, conscio...   

                                                 MMR  \
0  [ward, arkham, books, sinister, nameless, what...   
1  [snow white, huntsmen, hedgehog, tailor, cinde...   
2  [vast, abyss, mound, sinister, towers, carving...   

                             

In [101]:
### Using custom stop words, umap ###

# Load every non-empty .txt file from the Grimm-again and fiction folders into one dataframe.
df = convert_text_files(directories=['./GrimmFairyTaleAgain/', './fiction_text_files/'], label='fiction')

# Build the model for this configuration (stop_words and umap switched on, hdbscan off).
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() updates both topics_ and probabilities_ on the model, so
# re-read both. The arrays returned by fit_transform() describe the topics as
# they were before the reduction and would not line up with the reduced ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe; Topic_Probability is the largest entry of each
# document's row in the probability matrix.
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Print the representative (word, score) pairs of every topic and collect them
# for the metrics below, leaving out topic -1.
grimm_again_fairy_tale_and_fiction_corpus_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        grimm_again_fairy_tale_and_fiction_corpus_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
grimm_again_fairy_tale_and_fiction_diversity_score = calculate_topic_diversity(grimm_again_fairy_tale_and_fiction_corpus_top_words)
print(f"Grimm FairyTale Again and Fiction Text Diversity Score: {grimm_again_fairy_tale_and_fiction_diversity_score}")

# Compute Uniqueness Score
# NOTE(review): the per-topic result is bound to a name without "again"
# (grimm_fairy_tale_...), unlike every other variable in this cell. This
# overwrites any earlier variable of that name; confirm whether
# grimm_again_..._per_topic_uniqueness was intended before renaming.
grimm_again_fairy_tale_and_fiction_corpus_uniqueness, grimm_fairy_tale_and_fiction_corpus_per_topic_uniqueness = calculate_topic_uniqueness(grimm_again_fairy_tale_and_fiction_corpus_top_words)

# Tokens for the OCTIS score: lower-case each text, drop every character that
# is neither a word character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Plain word lists (scores dropped) for every topic except -1.
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words, 'grimm_again_and_fiction_tokens_stop_words_umap.pkl', 'grimm_again_and_fiction_topics_stop_words_umap.json')


=== Topic Information ===
   Topic  Count                                    Name  \
0      0    104  0_tailor_king daughter_huntsman_gretel   
1      1    103                 1_ward_vast_wholly_gods   

                                      Representation  \
0  [tailor, king daughter, huntsman, gretel, lust...   
1  [ward, vast, wholly, gods, despite, nameless, ...   

                                             KeyBERT  \
0  [huntsmen, huntsman, dwarfs, goose, hansel, ci...   
1  [consciousness, specimens, grotesque, imaginat...   

                                                 MMR  \
0  [tailor, gretel, hansel, hedgehog, cinderella,...   
1  [vast, arkham, sinister, tales, consciousness,...   

                                 Representative_Docs  
0  [There was once a man who understood all kinds...  
1  [“The essential Saltes of Animals may be so pr...  

Topic 0:
[('tailor', 0.49708149765176285), ('king daughter', 0.4141750644592999), ('huntsman', 0.4023191949282858), ('gret

In [102]:
### Using custom stop words only ###

# Load every non-empty .txt file from the Grimm-again and fiction folders into one dataframe.
df = convert_text_files(directories=['./GrimmFairyTaleAgain/', './fiction_text_files/'], label='fiction')

# Build the model for this configuration (only stop_words switched on).
topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# reduce_topics() updates both topics_ and probabilities_ on the model, so
# re-read both. The arrays returned by fit_transform() describe the topics as
# they were before the reduction and would not line up with the reduced ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire (post-reduction) probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe; Topic_Probability is the largest entry of each
# document's row in the probability matrix.
df['Topic'] = topics
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Print the representative (word, score) pairs of every topic and collect them
# for the metrics below, leaving out topic -1.
grimm_again_fairy_tale_and_fiction_corpus_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    if topic_num != -1:
        grimm_again_fairy_tale_and_fiction_corpus_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
grimm_again_fairy_tale_and_fiction_diversity_score = calculate_topic_diversity(grimm_again_fairy_tale_and_fiction_corpus_top_words)
print(f"Grimm FairyTale Again and Fiction Text Diversity Score: {grimm_again_fairy_tale_and_fiction_diversity_score}")

# Compute Uniqueness Score
# NOTE(review): the per-topic result is bound to a name without "again"
# (grimm_fairy_tale_...), unlike every other variable in this cell. This
# overwrites any earlier variable of that name; confirm whether
# grimm_again_..._per_topic_uniqueness was intended before renaming.
grimm_again_fairy_tale_and_fiction_corpus_uniqueness, grimm_fairy_tale_and_fiction_corpus_per_topic_uniqueness = calculate_topic_uniqueness(grimm_again_fairy_tale_and_fiction_corpus_top_words)

# Tokens for the OCTIS score: lower-case each text, drop every character that
# is neither a word character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Plain word lists (scores dropped) for every topic except -1.
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words, 'grimm_again_and_fiction_tokens_stop_words.pkl', 'grimm_again_and_fiction_topics_stop_words.json')


=== Topic Information ===
   Topic  Count                                    Name  \
0      0    104  0_tailor_king daughter_huntsman_gretel   
1      1    103                 1_ward_vast_wholly_gods   

                                      Representation  \
0  [tailor, king daughter, huntsman, gretel, lust...   
1  [ward, vast, wholly, gods, despite, nameless, ...   

                                             KeyBERT  \
0  [huntsmen, huntsman, dwarfs, goose, hansel, ci...   
1  [consciousness, specimens, grotesque, imaginat...   

                                                 MMR  \
0  [tailor, gretel, hansel, hedgehog, cinderella,...   
1  [vast, arkham, sinister, tales, consciousness,...   

                                 Representative_Docs  
0  [There was once a man who understood all kinds...  
1  [“The essential Saltes of Animals may be so pr...  

Topic 0:
[('tailor', 0.49708149765176285), ('king daughter', 0.4141750644592999), ('huntsman', 0.4023191949282858), ('gret

In [103]:
### Using custom stop words, hdbscan, umap ###
# Experiment: './GrimmFairyTaleAgain/' + './Fan_text_files/' corpora, with
# stop_words=True, hdbscan=True, umap=True passed to customize_BERTopic.
# NOTE: the `grimm_again_fairy_tale_and_fan_*` result names are reused by the other
# Grimm Again + fan configuration cells, so they hold whichever run executed last.

df = convert_text_files(directories=['./GrimmFairyTaleAgain/', './Fan_text_files/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve updated topics AND probabilities after reduction.
# reduce_topics() updates both topic_model.topics_ and topic_model.probabilities_;
# the `probs` array returned by fit_transform() predates the reduction, so it
# must not be paired with the reduced topic ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
# assumes probabilities_ is a 2-D (documents x topics) matrix, i.e. that
# init_BERTopic enables calculate_probabilities — TODO confirm
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Get representative words for each topic
grimm_again_fairy_tale_and_fan_corpus_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    # Topic -1 is printed but kept out of the list that gets scored below
    if topic_num != -1:
        grimm_again_fairy_tale_and_fan_corpus_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
grimm_again_fairy_tale_and_fan_diversity_score = calculate_topic_diversity(grimm_again_fairy_tale_and_fan_corpus_top_words)
print(f"Grimm Again FairyTale and Fan Text Diversity Score: {grimm_again_fairy_tale_and_fan_diversity_score}")

# Compute Uniqueness Score
(
    grimm_again_fairy_tale_and_fan_corpus_uniqueness,
    grimm_again_fairy_tale_and_fan_corpus_per_topic_uniqueness,
) = calculate_topic_uniqueness(grimm_again_fairy_tale_and_fan_corpus_top_words)

# Save pickle file for OCTIS score
# Tokenize each document: lowercase, drop every character that is neither a word
# character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (not their scores) of every topic except -1
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'grimm_again_and_fan_tokens_all_configs.pkl', 'grimm_again_and_fan_topics_all_configs.json')


=== Topic Information ===
   Topic  Count                                   Name  \
0     -1     62        -1_cyprian_hamilton_kane_garner   
1      0     49             0_hastur_couldn_hera_dummy   
2      1     24          1_tailor_hans_thumbling_giant   
3      2     17               2_hex_alejo_noolan_ricou   
4      3     16    3_faithful john_faithful_john_bride   
5      4     11    4_hansel_gretel_marlinchen_rapunzel   
6      5     10             5_emerson_jobe_louis_myles   
7      6      9  6_snow white_cinderella_elsie_red cap   
8      7      9             7_huntsman_fox_lion_dragon   

                                      Representation  \
0  [cyprian, hamilton, kane, garner, nadia, wolf,...   
1  [hastur, couldn, hera, dummy, maybe, mazlo, sa...   
2  [tailor, hans, thumbling, giant, hedgehog, shu...   
3  [hex, alejo, noolan, ricou, dennis, calum, har...   
4  [faithful john, faithful, john, bride, hare, p...   
5  [hansel, gretel, marlinchen, rapunzel, juniper...   


In [104]:
### Using custom stop words, hdbscan ###
# Experiment: './GrimmFairyTaleAgain/' + './Fan_text_files/' corpora, with
# stop_words=True, hdbscan=True, umap=False passed to customize_BERTopic.
# NOTE: the `grimm_again_fairy_tale_and_fan_*` result names are reused by the other
# Grimm Again + fan configuration cells, so they hold whichever run executed last.

df = convert_text_files(directories=['./GrimmFairyTaleAgain/', './Fan_text_files/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve updated topics AND probabilities after reduction.
# reduce_topics() updates both topic_model.topics_ and topic_model.probabilities_;
# the `probs` array returned by fit_transform() predates the reduction, so it
# must not be paired with the reduced topic ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
# assumes probabilities_ is a 2-D (documents x topics) matrix, i.e. that
# init_BERTopic enables calculate_probabilities — TODO confirm
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Get representative words for each topic
grimm_again_fairy_tale_and_fan_corpus_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    # Topic -1 is printed but kept out of the list that gets scored below
    if topic_num != -1:
        grimm_again_fairy_tale_and_fan_corpus_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
grimm_again_fairy_tale_and_fan_diversity_score = calculate_topic_diversity(grimm_again_fairy_tale_and_fan_corpus_top_words)
print(f"Grimm Again FairyTale and Fan Text Diversity Score: {grimm_again_fairy_tale_and_fan_diversity_score}")

# Compute Uniqueness Score
(
    grimm_again_fairy_tale_and_fan_corpus_uniqueness,
    grimm_again_fairy_tale_and_fan_corpus_per_topic_uniqueness,
) = calculate_topic_uniqueness(grimm_again_fairy_tale_and_fan_corpus_top_words)

# Save pickle file for OCTIS score
# Tokenize each document: lowercase, drop every character that is neither a word
# character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (not their scores) of every topic except -1
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'grimm_again_and_fan_tokens_stop_words_hdbscan.pkl', 'grimm_again_and_fan_topics_stop_words_hdbscan.json')


=== Topic Information ===
   Topic  Count                               Name  \
0     -1     75  -1_tailor_cyprian_hastur_hamilton   
1      0     64  0_daughter_maiden_huntsman_castle   
2      1     56            1_couldn_hex_hera_maybe   
3      2     12         2_emerson_jobe_louis_nadia   

                                      Representation  \
0  [tailor, cyprian, hastur, hamilton, dummy, kan...   
1  [daughter, maiden, huntsman, castle, king daug...   
2  [couldn, hex, hera, maybe, mazlo, sam, karen, ...   
3  [emerson, jobe, louis, nadia, emery, myles, am...   

                                             KeyBERT  \
0  [welles, grimoire, police, film, gaston, mask,...   
1  [riddle, huntsmen, arose, misfortune, beautifu...   
2  [shadows, victim, ahead, office, shadow, apart...   
3  [military, insurgents, sergeant, lieutenant, m...   

                                                 MMR  \
0  [tailor, wilmarth, gaston, gallowglass, welles...   
1  [king son, huntsmen, witc

In [105]:
### Using custom stop words, umap ###
# Experiment: './GrimmFairyTaleAgain/' + './Fan_text_files/' corpora, with
# stop_words=True, hdbscan=False, umap=True passed to customize_BERTopic.
# NOTE: the `grimm_again_fairy_tale_and_fan_*` result names are reused by the other
# Grimm Again + fan configuration cells, so they hold whichever run executed last.

df = convert_text_files(directories=['./GrimmFairyTaleAgain/', './Fan_text_files/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve updated topics AND probabilities after reduction.
# reduce_topics() updates both topic_model.topics_ and topic_model.probabilities_;
# the `probs` array returned by fit_transform() predates the reduction, so it
# must not be paired with the reduced topic ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
# assumes probabilities_ is a 2-D (documents x topics) matrix, i.e. that
# init_BERTopic enables calculate_probabilities — TODO confirm
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Get representative words for each topic
grimm_again_fairy_tale_and_fan_corpus_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    # Topic -1 is printed but kept out of the list that gets scored below
    if topic_num != -1:
        grimm_again_fairy_tale_and_fan_corpus_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
grimm_again_fairy_tale_and_fan_diversity_score = calculate_topic_diversity(grimm_again_fairy_tale_and_fan_corpus_top_words)
print(f"Grimm Again FairyTale and Fan Text Diversity Score: {grimm_again_fairy_tale_and_fan_diversity_score}")

# Compute Uniqueness Score
(
    grimm_again_fairy_tale_and_fan_corpus_uniqueness,
    grimm_again_fairy_tale_and_fan_corpus_per_topic_uniqueness,
) = calculate_topic_uniqueness(grimm_again_fairy_tale_and_fan_corpus_top_words)

# Save pickle file for OCTIS score
# Tokenize each document: lowercase, drop every character that is neither a word
# character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (not their scores) of every topic except -1
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'grimm_again_and_fan_tokens_stop_words_umap.pkl', 'grimm_again_and_fan_topics_stop_words_umap.json')


=== Topic Information ===
   Topic  Count                                      Name  \
0      0    104               0_emerson_maybe_hadn_hastur   
1      1    103  1_tailor_king daughter_huntsman_king son   

                                      Representation  \
0  [emerson, maybe, hadn, hastur, cyprian, wouldn...   
1  [tailor, king daughter, huntsman, king son, gr...   

                                             KeyBERT  \
0  [shadows, shadow, history, police, mask, ahead...   
1  [huntsmen, seven dwarfs, dwarfs, hansel, hunts...   

                                                 MMR  \
0  [emerson, wouldn, jobe, simon, wilmarth, profe...   
1  [tailor, hansel, bridegroom, merry, cinderella...   

                                 Representative_Docs  
0  [Gaston Morrell was drowning, The waters of th...  
1  [There was once a man who understood all kinds...  

Topic 0:
[('emerson', 0.02361732358268409), ('maybe', 0.023534205389951268), ('hadn', 0.020035057159009433), ('hastu

In [106]:
### Using custom stop words only ###
# Experiment: './GrimmFairyTaleAgain/' + './Fan_text_files/' corpora, with
# stop_words=True, hdbscan=False, umap=False passed to customize_BERTopic.
# NOTE: the `grimm_again_fairy_tale_and_fan_*` result names are reused by the other
# Grimm Again + fan configuration cells, so they hold whichever run executed last.

df = convert_text_files(directories=['./GrimmFairyTaleAgain/', './Fan_text_files/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve updated topics AND probabilities after reduction.
# reduce_topics() updates both topic_model.topics_ and topic_model.probabilities_;
# the `probs` array returned by fit_transform() predates the reduction, so it
# must not be paired with the reduced topic ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
# assumes probabilities_ is a 2-D (documents x topics) matrix, i.e. that
# init_BERTopic enables calculate_probabilities — TODO confirm
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Get representative words for each topic
grimm_again_fairy_tale_and_fan_corpus_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    # Topic -1 is printed but kept out of the list that gets scored below
    if topic_num != -1:
        grimm_again_fairy_tale_and_fan_corpus_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
grimm_again_fairy_tale_and_fan_diversity_score = calculate_topic_diversity(grimm_again_fairy_tale_and_fan_corpus_top_words)
print(f"Grimm Again FairyTale and Fan Text Diversity Score: {grimm_again_fairy_tale_and_fan_diversity_score}")

# Compute Uniqueness Score
(
    grimm_again_fairy_tale_and_fan_corpus_uniqueness,
    grimm_again_fairy_tale_and_fan_corpus_per_topic_uniqueness,
) = calculate_topic_uniqueness(grimm_again_fairy_tale_and_fan_corpus_top_words)

# Save pickle file for OCTIS score
# Tokenize each document: lowercase, drop every character that is neither a word
# character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (not their scores) of every topic except -1
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'grimm_again_and_fan_tokens_stop_words.pkl', 'grimm_again_and_fan_topics_stop_words.json')


=== Topic Information ===
   Topic  Count                                      Name  \
0      0    104               0_emerson_maybe_hadn_hastur   
1      1    103  1_tailor_king daughter_huntsman_king son   

                                      Representation  \
0  [emerson, maybe, hadn, hastur, cyprian, wouldn...   
1  [tailor, king daughter, huntsman, king son, gr...   

                                             KeyBERT  \
0  [shadows, shadow, history, police, mask, ahead...   
1  [huntsmen, seven dwarfs, dwarfs, hansel, hunts...   

                                                 MMR  \
0  [emerson, wouldn, jobe, simon, wilmarth, profe...   
1  [tailor, hansel, bridegroom, merry, cinderella...   

                                 Representative_Docs  
0  [Gaston Morrell was drowning, The waters of th...  
1  [There was once a man who understood all kinds...  

Topic 0:
[('emerson', 0.02361732358268409), ('maybe', 0.023534205389951268), ('hadn', 0.020035057159009433), ('hastu

In [107]:
### Using custom stop words, hdbscan, umap ###
# Experiment: './Fan_text_files/' + './fiction_text_files/' + './GrimmFairyTaleAgain/'
# corpora, with stop_words=True, hdbscan=True, umap=True passed to customize_BERTopic.
# NOTE: the `all_corpus_again_*` result names are reused by the other all-corpus
# configuration cells, so they hold whichever run executed last.

df = convert_text_files(directories=['./Fan_text_files/', './fiction_text_files/','./GrimmFairyTaleAgain/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve updated topics AND probabilities after reduction.
# reduce_topics() updates both topic_model.topics_ and topic_model.probabilities_;
# the `probs` array returned by fit_transform() predates the reduction, so it
# must not be paired with the reduced topic ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
# assumes probabilities_ is a 2-D (documents x topics) matrix, i.e. that
# init_BERTopic enables calculate_probabilities — TODO confirm
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Get representative words for each topic
all_corpus_again_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    # Topic -1 is printed but kept out of the list that gets scored below
    if topic_num != -1:
        all_corpus_again_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
all_corpus_again_diversity_score = calculate_topic_diversity(all_corpus_again_top_words)
print(f"All Corpus with Grimm Again Diversity Score: {all_corpus_again_diversity_score}")

# Compute Uniqueness Score
all_corpus_again_uniqueness, all_corpus_again_per_topic_uniqueness = calculate_topic_uniqueness(all_corpus_again_top_words)

# Save pickle file for OCTIS score
# Tokenize each document: lowercase, drop every character that is neither a word
# character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (not their scores) of every topic except -1
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'all_corpus_again_tokens_all_configs.pkl', 'all_corpus_again_topics_all_configs.json')


=== Topic Information ===
   Topic  Count                            Name  \
0     -1    117   -1_ward_charles_curwen_doctor   
1      0    113    0_ancient_couldn_vast_hastur   
2      1     69  1_brother_maiden_huntsman_hans   
3      2     12  2_emerson_innsmouth_myles_amye   

                                      Representation  \
0  [ward, charles, curwen, doctor, willett, cypri...   
1  [ancient, couldn, vast, hastur, metal, hills, ...   
2  [brother, maiden, huntsman, hans, castle, brot...   
3  [emerson, innsmouth, myles, amye, damien, broo...   

                                             KeyBERT  \
0  [corpse, welles, ancient, coffin, ghouls, muse...   
1  [history, consciousness, tales, ancient, alien...   
2  [huntsmen, servants, feathers, egg, kingdom, k...   
3  [marine, sergeant, lieutenant, officer, missio...   

                                                 MMR  \
0  [ward, ghouls, wilmarth, library, welles, name...   
1  [vast, nameless, knowledge, tales, abyss

In [108]:
### Using custom stop words, hdbscan ###
# Experiment: './Fan_text_files/' + './fiction_text_files/' + './GrimmFairyTaleAgain/'
# corpora, with stop_words=True, hdbscan=True, umap=False passed to customize_BERTopic.
# NOTE: the `all_corpus_again_*` result names are reused by the other all-corpus
# configuration cells, so they hold whichever run executed last.

df = convert_text_files(directories=['./Fan_text_files/', './fiction_text_files/','./GrimmFairyTaleAgain/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=True, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve updated topics AND probabilities after reduction.
# reduce_topics() updates both topic_model.topics_ and topic_model.probabilities_;
# the `probs` array returned by fit_transform() predates the reduction, so it
# must not be paired with the reduced topic ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
# assumes probabilities_ is a 2-D (documents x topics) matrix, i.e. that
# init_BERTopic enables calculate_probabilities — TODO confirm
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Get representative words for each topic
all_corpus_again_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    # Topic -1 is printed but kept out of the list that gets scored below
    if topic_num != -1:
        all_corpus_again_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
all_corpus_again_diversity_score = calculate_topic_diversity(all_corpus_again_top_words)
print(f"All Corpus with Grimm Again Diversity Score: {all_corpus_again_diversity_score}")

# Compute Uniqueness Score
all_corpus_again_uniqueness, all_corpus_again_per_topic_uniqueness = calculate_topic_uniqueness(all_corpus_again_top_words)

# Save pickle file for OCTIS score
# Tokenize each document: lowercase, drop every character that is neither a word
# character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (not their scores) of every topic except -1
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'all_corpus_again_tokens_stop_words_hdbscan.pkl', 'all_corpus_again_topics_stop_words_hdbscan.json')


=== Topic Information ===
   Topic  Count                                    Name  \
0     -1    109           -1_cyprian_gods_ancient_nadia   
1      0    136           0_ancient_ward_couldn_emerson   
2      1     66  1_huntsman_king daughter_tailor_lustig   

                                      Representation  \
0  [cyprian, gods, ancient, nadia, hex, couldn, p...   
1  [ancient, ward, couldn, emerson, ye, curwen, f...   
2  [huntsman, king daughter, tailor, lustig, brot...   

                                             KeyBERT  \
0  [ancient, consciousness, museum, mummy, ghouls...   
1  [history, specimens, ancient, consciousness, p...   
2  [huntsmen, huntsman, cinderella, brother lusti...   

                                                 MMR  \
0  [ghouls, cult, night gaunts, museum, sara, kno...   
1  [vast, nameless, arkham, sinister, library, ta...   
2  [tailor, brother lustig, huntsmen, onwards, ci...   

                                 Representative_Docs  
0  [(F

In [109]:
### Using custom stop words, umap ###
# Experiment: './Fan_text_files/' + './fiction_text_files/' + './GrimmFairyTaleAgain/'
# corpora, with stop_words=True, hdbscan=False, umap=True passed to customize_BERTopic.
# NOTE: the `all_corpus_again_*` result names are reused by the other all-corpus
# configuration cells, so they hold whichever run executed last.

df = convert_text_files(directories=['./Fan_text_files/', './fiction_text_files/','./GrimmFairyTaleAgain/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=True))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve updated topics AND probabilities after reduction.
# reduce_topics() updates both topic_model.topics_ and topic_model.probabilities_;
# the `probs` array returned by fit_transform() predates the reduction, so it
# must not be paired with the reduced topic ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
# assumes probabilities_ is a 2-D (documents x topics) matrix, i.e. that
# init_BERTopic enables calculate_probabilities — TODO confirm
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Get representative words for each topic
all_corpus_again_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    # Topic -1 is printed but kept out of the list that gets scored below
    if topic_num != -1:
        all_corpus_again_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
all_corpus_again_diversity_score = calculate_topic_diversity(all_corpus_again_top_words)
print(f"All Corpus with Grimm Again Diversity Score: {all_corpus_again_diversity_score}")

# Compute Uniqueness Score
all_corpus_again_uniqueness, all_corpus_again_per_topic_uniqueness = calculate_topic_uniqueness(all_corpus_again_top_words)

# Save pickle file for OCTIS score
# Tokenize each document: lowercase, drop every character that is neither a word
# character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (not their scores) of every topic except -1
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'all_corpus_again_tokens_stop_words_umap.pkl', 'all_corpus_again_topics_stop_words_umap.json')


=== Topic Information ===
   Topic  Count                                    Name  \
0      0    207                0_vast_ward_gods_despite   
1      1    104  1_tailor_king daughter_huntsman_gretel   

                                      Representation  \
0  [vast, ward, gods, despite, ahead, probably, m...   
1  [tailor, king daughter, huntsman, gretel, brot...   

                                             KeyBERT  \
0  [history, consciousness, horrors, grotesque, a...   
1  [huntsmen, huntsman, hansel, cinderella, broth...   

                                                 MMR  \
0  [vast, arkham, knowledge, sinister, tales, lib...   
1  [tailor, king daughter, gretel, brother lustig...   

                                 Representative_Docs  
0  [EDITOR’S NOTE: Alonzo Hasbrouck Typer of King...  
1  [There was once a man who understood all kinds...  

Topic 0:
[('vast', 0.01673063144622864), ('ward', 0.015069727502192827), ('gods', 0.014675774318627774), ('despite', 0.014

In [110]:
### Using custom stop words only ###
# Experiment: './Fan_text_files/' + './fiction_text_files/' + './GrimmFairyTaleAgain/'
# corpora, with stop_words=True, hdbscan=False, umap=False passed to customize_BERTopic.
# NOTE: the `all_corpus_again_*` result names are reused by the other all-corpus
# configuration cells, so they hold whichever run executed last.

df = convert_text_files(directories=['./Fan_text_files/', './fiction_text_files/','./GrimmFairyTaleAgain/'], label='fiction')

topic_model = init_BERTopic(*customize_BERTopic(stop_words=True, hdbscan=False, umap=False))

# Fit the model on text data
topics, probs = topic_model.fit_transform(df['Texts'])

# Reduce to specified number of topics
topic_model.reduce_topics(df['Texts'], nr_topics=bert_nr_topics)

# Retrieve updated topics AND probabilities after reduction.
# reduce_topics() updates both topic_model.topics_ and topic_model.probabilities_;
# the `probs` array returned by fit_transform() predates the reduction, so it
# must not be paired with the reduced topic ids.
topics = topic_model.topics_
probs = topic_model.probabilities_

# Store entire probability matrix for potential future use
all_document_probabilities = probs

# Add topics back to dataframe
df['Topic'] = topics
# assumes probabilities_ is a 2-D (documents x topics) matrix, i.e. that
# init_BERTopic enables calculate_probabilities — TODO confirm
df['Topic_Probability'] = probs.max(axis=1)

# Get topic information
topic_info = topic_model.get_topic_info()
print("\n=== Topic Information ===")
print(topic_info)

# Display documents with their topics
#print("\n=== Documents with Assigned Topics ===")
# print(df[['Texts', 'Labels', 'Topic', 'Topic_Probability']])

# Get representative words for each topic
all_corpus_again_top_words = []
for topic_num in topic_info['Topic']:
    current_topic_top_words = topic_model.get_topic(topic_num)
    print(f"\nTopic {topic_num}:")
    print(current_topic_top_words)
    # Topic -1 is printed but kept out of the list that gets scored below
    if topic_num != -1:
        all_corpus_again_top_words.append(current_topic_top_words)

print()

# Compute Diversity Score
all_corpus_again_diversity_score = calculate_topic_diversity(all_corpus_again_top_words)
print(f"All Corpus with Grimm Again Diversity Score: {all_corpus_again_diversity_score}")

# Compute Uniqueness Score
all_corpus_again_uniqueness, all_corpus_again_per_topic_uniqueness = calculate_topic_uniqueness(all_corpus_again_top_words)

# Save pickle file for OCTIS score
# Tokenize each document: lowercase, drop every character that is neither a word
# character nor whitespace, then split on whitespace.
token_list = [re.sub(r"[^\w\s]", "", text.lower()).split() for text in df["Texts"]]

# Keep only the words (not their scores) of every topic except -1
topic_words = [
    [word for word, _ in word_scores]
    for topic_id, word_scores in topic_model.get_topics().items()
    if topic_id != -1
]

# Save token list and topic words to file
save_token_list_and_topics(token_list, topic_words,'all_corpus_again_tokens_stop_words.pkl', 'all_corpus_again_topics_stop_words.json')


=== Topic Information ===
   Topic  Count                                    Name  \
0      0    207                0_vast_ward_gods_despite   
1      1    104  1_tailor_king daughter_huntsman_gretel   

                                      Representation  \
0  [vast, ward, gods, despite, ahead, probably, m...   
1  [tailor, king daughter, huntsman, gretel, brot...   

                                             KeyBERT  \
0  [history, consciousness, horrors, grotesque, a...   
1  [huntsmen, huntsman, hansel, cinderella, broth...   

                                                 MMR  \
0  [vast, arkham, knowledge, sinister, tales, lib...   
1  [tailor, king daughter, gretel, brother lustig...   

                                 Representative_Docs  
0  [EDITOR’S NOTE: Alonzo Hasbrouck Typer of King...  
1  [There was once a man who understood all kinds...  

Topic 0:
[('vast', 0.01673063144622864), ('ward', 0.015069727502192827), ('gods', 0.014675774318627774), ('despite', 0.014