Preliminary

In [None]:
from bertopic import BERTopic
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 100)
import hdbscan
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import networkx as nx
from networkx.algorithms import bipartite
import os

Functions

In [None]:
def get_probs(model):
    '''
    Build the membership-probability table for a fitted topic model.
    Input:
        model: fitted topic model exposing `hdbscan_model` and `_map_probabilities`
    Output:
        a data frame holding the transposed mapped probabilities
        (the callers in this file treat rows as topics and columns as documents)
    '''
    # Soft-clustering membership vectors for every point seen by the clusterer.
    membership = hdbscan.all_points_membership_vectors(model.hdbscan_model)
    # NOTE(review): `_map_probabilities` is a private method of the model;
    # confirm it still exists when upgrading the topic-model library.
    mapped = model._map_probabilities(membership, original_topics=True)
    return pd.DataFrame(mapped).transpose()

In [None]:
# get documents for each topic from model
def extract_docs_per_topic(doc_df, prob_df, n, multi_text=True):
    '''
    Extract the first n documents corresponding to each topic as fitted by the model 
    according to the probability scores. 
    Input:
        doc_df: data frame whose first column holds the document text and, when
                multi_text is True, whose second column holds the text/book name
        prob_df: data frame of probabilities, one row per topic, one column per document
                 (column labels are used as row positions into doc_df -- assumes they
                 are 0..N-1; verify against caller)
        n: number of documents to keep per topic; capped at the number of documents
        multi_text: pass True when doc_df also carries a text/book name column
    Output:
        multi_text is True: (docs_per_topic_df, books_per_topic_df, joint_df)
        otherwise: docs_per_topic_df
        Every frame has one 'Topic i' column per row of prob_df.
    '''
    # Only the literal True enables the multi-text outputs (same test as before).
    with_books = multi_text is True
    # Never ask for more rows than there are documents; otherwise the column
    # assignment below fails with a length mismatch.
    n_rows = min(n, prob_df.shape[1])

    col_names = ['Topic '+str(i) for i in range(prob_df.shape[0])]
    docs_per_topic_df = pd.DataFrame(columns=col_names, index=range(n_rows))
    if with_books:
        books_per_topic_df = pd.DataFrame(columns=col_names, index=range(n_rows))

    for i, col_name in enumerate(col_names):
        # Order the documents (columns) by their probability for topic i, best first.
        ranked = prob_df.sort_values(by=i, axis=1, ascending=False)
        top_docs = list(ranked.columns[:n_rows])
        docs_per_topic_df[col_name] = [doc_df.iloc[j, 0] for j in top_docs]
        if with_books:
            books_per_topic_df[col_name] = [doc_df.iloc[j, 1] for j in top_docs]

    if not with_books:
        return docs_per_topic_df

    # Build the joined view once, after all topics are filled in (it used to be
    # rebuilt on every iteration and was undefined when there were no topics).
    joint_df = docs_per_topic_df.join(
        books_per_topic_df, lsuffix='_doc', rsuffix='_text_name').sort_index(axis=1)
    return docs_per_topic_df, books_per_topic_df, joint_df

In [None]:
def get_frequency(book_df):
    '''
    Count how often each value occurs in every column of book_df.
    Output:
        a data frame indexed by the distinct values, one column per input column;
        values absent from a column get a count of 0
    '''
    counts_per_column = book_df.apply(pd.Series.value_counts, axis=0)
    return counts_per_column.fillna(0)

In [None]:
def extract_unique(tale_df, doc_df, n):
    '''
    Pick, for every topic, the top n distinct tales/books and one document for each.
    Input: 
        tale_df: data frame of topics (columns) vs. tale names
        doc_df: data frame of topics (columns) vs. docs, row-aligned with tale_df
        n: maximum number of distinct tales/books to keep per topic
    Output: 
        a data frame, rows are topics, 6 columns as follows:
            tale_unique_prob: distinct tales/books ordered by their earliest row
            doc_unique_prob: the first document of each tale/book in tale_unique_prob
            doc_ind_prob: row indices of the documents in doc_unique_prob
            tale_unique_freq: distinct tales/books ordered by number of rows, most first
            doc_unique_freq: the first document of each tale/book in tale_unique_freq
            doc_ind_freq: row indices of the documents in doc_unique_freq
    '''
    topic_list = list(tale_df.columns)
    result_columns = ('tale_unique_prob', 'doc_unique_prob', 'doc_ind_prob',
                      'tale_unique_freq', 'doc_unique_freq', 'doc_ind_freq')
    collected = {name: [] for name in result_columns}

    for topic in topic_list:
        # tale/book name -> list of row labels where it appears in this topic column
        rows_by_tale = {tale: list(rows)
                        for tale, rows in tale_df.groupby(topic).groups.items()}
        rankings = {
            # earliest row first: comparing the row lists compares their first entries
            'prob': sorted(rows_by_tale.items(), key=lambda pair: pair[1])[:n],
            # most rows first; ties keep the grouping order
            'freq': sorted(rows_by_tale.items(), key=lambda pair: len(pair[1]),
                           reverse=True)[:n],
        }
        for suffix, top_tales in rankings.items():
            first_rows = [rows[0] for _, rows in top_tales]
            collected['tale_unique_' + suffix].append([tale for tale, _ in top_tales])
            collected['doc_unique_' + suffix].append(list(doc_df[topic].iloc[first_rows]))
            collected['doc_ind_' + suffix].append(first_rows)

    return pd.DataFrame(data=collected, index=topic_list)

In [None]:
def custom_labels(model, nr_words=5, topic_prefix=True, separator=', '):
    '''
    Generate topic labels from the model and store them on the model.
    Input: 
        model: fitted topic model
        nr_words: top n words per topic to use
        topic_prefix: whether to use the topic ID as a prefix
        separator: the string with which the words and topic prefix will be separated
    Output:
        None; the labels are set on the model via set_topic_labels
    '''
    label_options = {
        'nr_words': nr_words,
        'topic_prefix': topic_prefix,
        'separator': separator,
    }
    model.set_topic_labels(model.generate_topic_labels(**label_options))

In [None]:
def bipartite_graph(df):
    '''
    Turn a data frame into a weighted two-sided graph.
    Input:
        df: data frame; index labels become nodes with bipartite=0,
            column labels become nodes with bipartite=1
    Output:
        networkx Graph with an edge (weight = cell value) for every cell > 0.0
    '''
    graph = nx.Graph()
    for row_label in df.index:
        graph.add_node(row_label, bipartite=0)
        for col_label in df.columns:
            # Column nodes are (re-)added while walking each row, so they are
            # present even when none of their cells is positive.
            graph.add_node(col_label, bipartite=1)
            weight = df.loc[row_label, col_label]
            if weight > 0.0:
                graph.add_edge(row_label, col_label, weight=weight)
    return graph

Load models and datasets

In [None]:
# Danish
# Read every Danish corpus from CSV into its own data frame
# (the first CSV column is used as the index unless noted otherwise).

# travel datasets
sweden_danish_df = pd.read_csv('./small/sverrig.csv', index_col=0)
poets_danish_df = pd.read_csv('./small/bazar.csv', index_col=0)
spain_danish_df = pd.read_csv('./small/spanien.csv', index_col=0)
rambles_danish_df = pd.read_csv('./small/skygge.csv', index_col=0)
travel_dan_df = pd.read_csv('./large/travels_dan.csv', index_col=0)

# fairytale datasets
tale_danish_df = pd.read_csv("./large/tales_dan.csv", index_col=0)

# combined corpuses
travel_fairytales_df = pd.read_csv('./large/travels_tales_dan.csv', index_col=0)
# NOTE(review): group0 is the only frame read without index_col=0 and from
# ./small, and it is not added to df_list_dan in this notebook -- confirm
# whether that is intended.
group0 = pd.read_csv('./small/group_0.csv')
group1_skygge = pd.read_csv('./large/group_1.csv', index_col=0)
group2_bazar = pd.read_csv('./large/group_2.csv', index_col=0)
group3_sverrig = pd.read_csv('./large/group_3.csv', index_col=0)
group4_spanien = pd.read_csv('./large/group_4.csv', index_col=0)

In [None]:
# Data frames in a fixed order. The extraction loop further down pairs entry i
# with model_list_dan[i], prob_list[i] and keys[i], and treats the first four
# entries as single-text corpora, so keep the order in sync with those lists.
df_list_dan = [sweden_danish_df, poets_danish_df, spain_danish_df, rambles_danish_df, 
               travel_dan_df, tale_danish_df, travel_fairytales_df, group1_skygge, group2_bazar, 
               group3_sverrig, group4_spanien]

In [None]:
# Quick sanity check: show the column labels of every loaded data frame.
for df in df_list_dan:
    print(df.columns)

In [None]:
# Danish
# Load the saved topic model that belongs to each corpus.

# travelogue models
sverrig = BERTopic.load('./sverrig_topic_model')
bazar = BERTopic.load('./bazar_topic_model')
spanien = BERTopic.load('./spanien_topic_model')
skygge = BERTopic.load('./skygge_topic_model')
travel_dan = BERTopic.load('./travels_dan_topic_model')

# fairytale models
tales_danish = BERTopic.load("./tales_dan_topic_model")

# combined corpuses
full_travels_tales_dan = BERTopic.load("./travels_tales_dan_topic_model")
group1_skygge_model = BERTopic.load("./group_1_topic_model")
group2_bazar_model = BERTopic.load("./group_2_topic_model")
group3_sverrig_model = BERTopic.load("./group_3_topic_model")
group4_spanien_model = BERTopic.load("./group_4_topic_model")

In [None]:
# Models in the same order as df_list_dan; later cells address them by
# position (e.g. index 4 = full travelogue model, index 6 = combined model).
model_list_dan = [sverrig, bazar, spanien, skygge, travel_dan, tales_danish, full_travels_tales_dan, 
                  group1_skygge_model, group2_bazar_model, group3_sverrig_model, group4_spanien_model]

Get documents for all models

In [None]:
# One probability table per model, in model_list_dan order.
prob_list = [get_probs(model) for model in model_list_dan]

In [None]:
# Result containers, keyed by corpus name.
docs_single_text = {}
docs_multi_text = {}
books = {}
joints = {}
freqs = {}

keys = ['sverrig', 'bazar', 'spanien', 'skygge', 'travel_dan', 'tales_danish', 'travels_tales_dan', 'group1_skygge', 
        'group2_bazar', 'group3_sverrig', 'group4_spanien']

N = 50 #this number can be changed depending on application

# The first four corpora each hold a single text; every later corpus combines
# several texts and therefore also yields book tables and frequencies.
for i, (key, corpus_df, corpus_probs) in enumerate(zip(keys, df_list_dan, prob_list)):
    if i >= 4:
        doc, book, joint = extract_docs_per_topic(corpus_df, corpus_probs, n=N)
        docs_multi_text[key] = doc
        books[key] = book
        joints[key] = joint
        freqs[key] = get_frequency(book)
    else:
        docs_single_text[key] = extract_docs_per_topic(corpus_df, corpus_probs,
                                                       n=N, multi_text=False)

Tabular summary of travelogue models (Danish)

In [None]:
# NOTE: this rebinds the global `keys` used by the extraction cell above;
# re-run that cell's `keys` definition before re-running its loop.
keys = ['sverrig', 'bazar', 'spanien', 'skygge', 'travel_dan']
# Only the travelogue models named in `keys` are tabulated, so the table height
# is the largest topic count among exactly those models (it used to be taken
# over model_list_dan[:-1], which also included fairytale/combined models).
summary_models = model_list_dan[:len(keys)]
max_len = max(len(model.get_topics()) for model in summary_models)
travel_summary = pd.DataFrame(index=range(max_len))
col_dict = dict()
for key, model in zip(keys, summary_models):
    # Generate default labels on the model, then collect them per corpus.
    custom_labels(model)
    col_dict[key] = model.custom_labels_

In [None]:
for col_name, col_data in col_dict.items():
    # Pad a copy up to max_len. col_data is the model's own custom_labels_ list,
    # so extending it in place would append NaN entries to the model's labels.
    padding = [float('nan')] * (max_len - len(col_data))
    travel_summary[col_name] = list(col_data) + padding

In [None]:
# First three rows per topic, transposed so that each row is a topic.
# (A stray `books['travel_dan'].columns` expression whose value was discarded
# has been removed from between these two statements.)
top3textsdan = books['travel_dan'].iloc[:3, :].T.rename(columns={0:'Text_1', 1:'Text_2', 2:'Text_3'})
top3docsdan = docs_multi_text['travel_dan'].iloc[:3, :].T.rename(columns={0:'Doc_1', 1:'Doc_2', 2:'Doc_3'})

In [None]:
# Topic representations of the full travelogue model (model_list_dan[4]).
traveldan_topics_df = pd.DataFrame(model_list_dan[4].get_topics())
# Rename integer column labels 0..k-2 to 'Topic 0'..'Topic k-2' so they line up
# with the 'Topic i' labels used elsewhere. The mapping now covers every such
# column instead of stopping at a hard-coded 101 topics.
# NOTE(review): presumably the one column left unrenamed is the outlier topic
# (-1) -- confirm against the model.
n_named_topics = traveldan_topics_df.shape[1] - 1
traveldan_topics_df.rename(columns={i: 'Topic '+str(i) for i in range(n_named_topics)},
                           inplace=True)
# Keep only the first element of every cell (presumably the word of a
# (word, score) pair -- verify).
traveldan_topics_df = traveldan_topics_df.applymap(lambda x:x[0])
traveldan_topics_df

In [None]:
# First three rows of every column except the first one, transposed so that
# each row is a topic and columns 0..2 hold its entries.
top3wordsdan = traveldan_topics_df.iloc[:3, 1:].T

In [None]:
# Side-by-side table per topic: top words, top documents, and their texts.
word_cols = [0, 1, 2]
topic_doc_traveldan_df = pd.concat([top3wordsdan, top3docsdan, top3textsdan], axis=1)
# Collapse the three word columns into one comma-separated string column.
joined_words = topic_doc_traveldan_df[word_cols].astype(str).agg(', '.join, axis=1)
topic_doc_traveldan_df['top3wordsdan'] = joined_words
topic_doc_traveldan_df = topic_doc_traveldan_df.drop(columns=word_cols)
# uncomment following for inspection
#topic_doc_traveldan_df
#topic_doc_traveldan_df.columns

Further analysis of the full combined corpus (travelogues + fairytales)

In [None]:
# Text/book names of the first three rows per topic (combined corpus), one row per topic.
top3texts = books['travels_tales_dan'].iloc[:3, :].T.rename(columns={0:'Text_1', 1:'Text_2', 2:'Text_3'})

In [None]:
# Documents of the first three rows per topic (combined corpus), one row per topic.
top3docs = docs_multi_text['travels_tales_dan'].iloc[:3, :].T.rename(columns={0:'Doc_1', 1:'Doc_2', 2:'Doc_3'})

In [None]:
# Topic representations of the combined travel + fairytale model (model_list_dan[6]).
all_topics_df = pd.DataFrame(model_list_dan[6].get_topics())
# Rename integer column labels 0..k-2 to 'Topic 0'..'Topic k-2' so they line up
# with the 'Topic i' labels used elsewhere. The mapping now covers every such
# column instead of stopping at a hard-coded 101 topics.
# NOTE(review): presumably the one column left unrenamed is the outlier topic
# (-1) -- confirm against the model.
n_named_topics = all_topics_df.shape[1] - 1
all_topics_df.rename(columns={i: 'Topic '+str(i) for i in range(n_named_topics)},
                     inplace=True)
# Keep only the first element of every cell (presumably the word of a
# (word, score) pair -- verify).
all_topics_df = all_topics_df.applymap(lambda x:x[0])
all_topics_df

In [None]:
# First three rows of every column except the first one, transposed so that
# each row is a topic and columns 0..2 hold its entries.
top3words = all_topics_df.iloc[:3, 1:].T

In [None]:
# Spot check: display the fifth row (position 4) of top3words.
top3words.iloc[4]

In [None]:
# Side-by-side table per topic: top words, top documents, and their texts.
topic_doc_tale_df = pd.concat([top3words, top3docs, top3texts], axis=1)
# Collapse the three word columns into one comma-separated string column.
# astype(str) matches the travelogue table built earlier in this notebook and
# avoids the deprecated DataFrame.applymap.
topic_doc_tale_df['top3words'] = topic_doc_tale_df[[0, 1, 2]].astype(str).agg(', '.join, axis=1)
topic_doc_tale_df.drop([0,1,2], axis=1, inplace=True)
topic_doc_tale_df

In [None]:
# Top 3 distinct texts (and one document each) per topic of the combined model.
# NOTE: `df` is also used as a loop variable in an earlier cell; it is rebound here.
df = extract_unique(books['travels_tales_dan'], docs_multi_text['travels_tales_dan'], 3)
df

In [None]:
# Spread each list-valued column of `df` over three named columns, one row per topic.
# The three column names per frame match the n=3 passed to extract_unique above.
topic_list = list(books['travels_tales_dan'].columns)
df2 = pd.DataFrame(df['tale_unique_prob'].to_list(), columns=['tale_prob_1','tale_prob_2', 'tale_prob_3'], index=topic_list)
df3 = pd.DataFrame(df['doc_unique_prob'].to_list(), columns=['doc_prob_1','doc_prob_2', 'doc_prob_3'], index=topic_list)
df4 = pd.DataFrame(df['tale_unique_freq'].to_list(), columns=['tale_freq_1','tale_freq_2', 'tale_freq_3'], index=topic_list)
df5 = pd.DataFrame(df['doc_unique_freq'].to_list(), columns=['doc_freq_1','doc_freq_2', 'doc_freq_3'], index=topic_list)

In [None]:
# Single-column frames holding the selected document indices per topic.
ind_prob = pd.DataFrame(df['doc_ind_prob'])
ind_freq = pd.DataFrame(df['doc_ind_freq'])

In [None]:
# Single-column frame with the comma-joined top words per topic.
topic_words = pd.DataFrame(topic_doc_tale_df['top3words'])

In [None]:
# Spot check: display the fifth row (position 4) of topic_words.
topic_words.iloc[4]

In [None]:
# Top 3 texts and top 3 documents for each topic by probability
# (columns: top words, text names, documents, document indices; aligned on the topic index)
topic_doc_tale_prob = pd.concat([topic_words, df2, df3, ind_prob], axis=1)
topic_doc_tale_prob

In [None]:
# Top 3 texts and top 3 documents for each topic by document frequency
# (columns: top words, text names, documents, document indices; aligned on the topic index)
topic_doc_tale_freq = pd.concat([topic_words, df4, df5, ind_freq], axis=1)
topic_doc_tale_freq

# Visualizations for multi-text models

Stacked bar chart for full travelogue model (Danish)

In [None]:
# Bar chart of the transposed frequency table: topics on the x axis,
# one bar series per book.
fig2 = px.bar(freqs['travel_dan'].T).update_layout(
    xaxis_title_text='Topics',
    yaxis_title_text='Counts',
    legend_title='Book',
)
fig2.show()

Heatmaps for fairytale corpus and combined corpuses

In [None]:
# fairytale corpus: frequency table shown as a heatmap, colour scale fixed to 0..20
fig4 = px.imshow(
    freqs['tales_danish'],
    color_continuous_scale='ice_r',
    range_color=[0,20],
    width=750,
    height=750,
).update_layout(
    xaxis_title_text='Topics',
    yaxis_title_text='Fairytales',
    title_text='Heatmap for Fairytale Model',
)
fig4.show()

In [None]:
# combined corpuses: frequency table shown as a heatmap, colour scale fixed to 0..30
fig6 = px.imshow(
    freqs['travels_tales_dan'],
    color_continuous_scale='ice_r',
    range_color=[0,30],
    width=1000,
    height=1000,
).update_layout(
    xaxis_title_text='Topics',
    yaxis_title_text='Fairytales and Travelogues',
    title_text='Heatmap for Combined Corpuses Model (Danish)',
)
fig6.show()

In [None]:
# Bar chart restricted to the four travelogue rows of the combined-model
# frequency table, transposed so that topics are on the x axis.
travelogue_rows = ['bazar', 'skygge', 'spanien', 'sverrig']
fig100 = px.bar(freqs['travels_tales_dan'].loc[travelogue_rows].T).update_layout(
    xaxis_title_text='Topics',
    yaxis_title_text='Counts',
    legend_title='Book',
)
fig100.show()

In [None]:
# Per-title counts for a single topic column of the combined model.
fig101 = px.bar(freqs['travels_tales_dan']['Topic 54']).update_layout(
    xaxis_title_text='Title',
    yaxis_title_text='Counts',
    legend_title='Topic',
)
fig101.show()

In [None]:
# Topic ID for the bar chart (change this single value to plot another topic)
topic_id = 'Topic 5'

# Get the comma-joined top 3 words for the selected topic from 'topic_doc_tale_df'
top3_words_for_topic = topic_doc_tale_df.loc[topic_id, 'top3words']

# Create the bar chart from the selected topic's column; the column is picked
# via topic_id so the data and the legend title can never disagree
fig101 = px.bar(freqs['travels_tales_dan'][topic_id])

# Combine the topic number and its top 3 words for the legend title
# (top3_words_for_topic is already a single string, so no join is needed)
legend_title = f"{topic_id}: {top3_words_for_topic}"

# Update the layout of the bar chart
fig101.update_layout(
    xaxis_title_text='Title',
    yaxis_title_text='Counts',
    legend_title=legend_title  # Update the legend title
)

# Display the plot
fig101.show()

In [None]:
# freqs: DataFrame containing frequencies of documents associated with each topic
# (rows are text titles, columns are topics); displayed here for inspection
freqs['travels_tales_dan']

In [None]:
# topic_doc_tale_df: DataFrame containing the top 3 words for each topic
# List of topic IDs to plot
topic_ids_to_plot = ['Topic 5', 'Topic 10', 'Topic 15']

# Create empty list to store the dataframes for each topic
dataframes_for_each_topic = []
# Legend labels in plotting order; these are the actual values of the 'Topic'
# column, which is what category_orders has to list to have any effect
topic_labels_in_order = []

# Loop through each topic ID and create dataframe for that topic
for topic_id in topic_ids_to_plot:
    # Get the comma-joined top 3 words for the current topic (already one string)
    top3_words_for_topic = topic_doc_tale_df.loc[topic_id, 'top3words']
    topic_label = f"{topic_id}: {top3_words_for_topic}"
    topic_labels_in_order.append(topic_label)

    # Create new dataframe with the data for the current topic
    df_topic = freqs['travels_tales_dan'].copy()
    df_topic['Title'] = df_topic.index  # Use the index as the title value for each topic
    df_topic['Counts'] = df_topic[topic_id]
    df_topic['Topic'] = topic_label

    # Append dataframe to the list
    dataframes_for_each_topic.append(df_topic)

# Concatenate dataframes for all topics into a single dataframe
combined_df = pd.concat(dataframes_for_each_topic)

# Create a single bar chart with different colored bars for each topic.
# category_orders previously listed the bare topic ids, which never occur in
# the 'Topic' column ("Topic N: words"), so the requested order was ignored.
fig104 = px.bar(combined_df, x='Title', y='Counts', color='Topic',
                labels={'Title': 'Title', 'Counts': 'Counts'},
                category_orders={'Topic': topic_labels_in_order})

# Update the layout of the bar chart
fig104.update_layout(
    xaxis_title_text='Title',
    yaxis_title_text='Counts',
    barmode='group'  # Place the bars of different topics side by side
)

# Display the plot
fig104.show()

In [None]:
# fairytale group1 + Skygge (Danish): frequency heatmap, colour scale fixed to 0..30
fig7 = px.imshow(
    freqs['group1_skygge'],
    color_continuous_scale='ice_r',
    range_color=[0,30],
    width=750,
    height=750,
).update_layout(
    xaxis_title_text='Topics',
    yaxis_title_text='Text titles',
    title_text='Heatmap for Skyggebilleder and Same Period Fairytales Model (Danish)',
)
fig7.show()

In [None]:
# fairytale group2 + Bazar (Danish): frequency heatmap, colour scale fixed to 0..30
fig9 = px.imshow(
    freqs['group2_bazar'],
    color_continuous_scale='ice_r',
    range_color=[0,30],
    width=750,
    height=750,
).update_layout(
    xaxis_title_text='Topics',
    yaxis_title_text='Text titles',
    title_text='Heatmap for En Digters Bazar and Same Period Fairytales Model (Danish)',
)
fig9.show()

In [None]:
# fairytale group3 + Sverrig (Danish): frequency heatmap, colour scale fixed to 0..30
fig11 = px.imshow(
    freqs['group3_sverrig'],
    color_continuous_scale='ice_r',
    range_color=[0,30],
    width=750,
    height=750,
).update_layout(
    xaxis_title_text='Topics',
    yaxis_title_text='Text titles',
    title_text='Heatmap for I Sverrig and Same Period Fairytales Model (Danish)',
)
fig11.show()

In [None]:
# fairytale group4 + Spanien (Danish): frequency heatmap, colour scale fixed to 0..30
fig13 = px.imshow(
    freqs['group4_spanien'],
    color_continuous_scale='ice_r',
    range_color=[0,30],
    width=750,
    height=750,
).update_layout(
    xaxis_title_text='Topics',
    yaxis_title_text='Text titles',
    title_text='Heatmap for I Spanien and Et Besoeg i Portugal and Same Period Fairytales Model (Danish)',
)
fig13.show()

Line graph of document contributions per topic in combined corpuses model -- see figure 2 in article

In [None]:
# Probability table of the combined travel + fairytale model (position 6).
combined_probs = prob_list[6]
n_topics, n_documents = combined_probs.shape

fig14 = go.Figure()
for i in range(n_topics):
    # One line per topic: its document probabilities from highest to lowest.
    sorted_probs = combined_probs.iloc[i].sort_values(ascending=False)
    trace = go.Scatter(x=np.arange(n_documents), y=sorted_probs,
                       mode='lines', name="Topic "+str(i))
    fig14.add_trace(trace)
# Disabled reference line at probability 0.2 (kept from the original cell):
# fig.add_trace(go.Scatter(x=np.arange(full_prob_df.shape[1]), y=[0.2]*100, mode='lines'))
fig14.update_layout(
    legend_title="Topics",
    xaxis_title_text="Documents",
    yaxis_title_text="Probability",
    title_text="First 100 Document Contributions per Topic in Combined Corpuses Model",
)
# All documents are plotted; the initial view is limited to the first 100.
fig14.update_xaxes(range=[0,100])
fig14.show()

## Bipartite Networks

### All fairytales model (Danish)

In [None]:
# Text-vs-topic graph of the fairytale model; edge weights are the document counts.
B_fairytale_dan = bipartite_graph(freqs['tales_danish'])

In [None]:
plt.figure(figsize=(20, 20))
# The text titles (frequency-table index) form one side of the two-sided layout.
text_nodes = list(freqs['tales_danish'].index)
node_positions = nx.drawing.layout.bipartite_layout(B_fairytale_dan, text_nodes)
nx.draw_networkx(B_fairytale_dan, pos=node_positions, with_labels=False, node_size=100)
plt.show()

In [None]:
# Create the output folder first; writing fails if the directory is missing.
os.makedirs("./reduced_bigraphs", exist_ok=True)
nx.write_graphml_lxml(B_fairytale_dan, "./reduced_bigraphs/fairytales_danish_bigraph.graphml")

### Full model (travel + tales) (Danish)

In [None]:
# Text-vs-topic graph of the combined travel + fairytale model; edge weights are the document counts.
B_all_dan = bipartite_graph(freqs['travels_tales_dan'])

In [None]:
plt.figure(figsize=(20, 20))
# The text titles (frequency-table index) form one side of the two-sided layout.
text_nodes = list(freqs['travels_tales_dan'].index)
node_positions = nx.drawing.layout.bipartite_layout(B_all_dan, text_nodes)
nx.draw_networkx(B_all_dan, pos=node_positions, with_labels=False, node_size=100)
plt.show()

In [None]:
# Create the output folder first; writing fails if the directory is missing.
os.makedirs("./reduced_bigraphs", exist_ok=True)
nx.write_graphml_lxml(B_all_dan, "./reduced_bigraphs/danish_travel_fairytale_bigraph.graphml")

### Group 1: Skygge + fairytales (Danish)

In [None]:
# Text-vs-topic graph of the group 1 (Skygge + fairytales) model; edge weights are the document counts.
B_skygge = bipartite_graph(freqs['group1_skygge'])

In [None]:
plt.figure(figsize=(10, 10))
# The text titles (frequency-table index) form one side of the two-sided layout.
text_nodes = list(freqs['group1_skygge'].index)
node_positions = nx.drawing.layout.bipartite_layout(B_skygge, text_nodes)
nx.draw_networkx(B_skygge, pos=node_positions, with_labels=False, node_size=100)
plt.show()

In [None]:
# Create the output folder first; writing fails if the directory is missing.
os.makedirs("./reduced_bigraphs", exist_ok=True)
nx.write_graphml_lxml(B_skygge, "./reduced_bigraphs/group1_skygge_bigraph.graphml")

### Group 2: Bazar + fairytales (Danish)

In [None]:
# Text-vs-topic graph of the group 2 (Bazar + fairytales) model; edge weights are the document counts.
B_bazar = bipartite_graph(freqs['group2_bazar'])

In [None]:
plt.figure(figsize=(10, 10))
# The text titles (frequency-table index) form one side of the two-sided layout.
text_nodes = list(freqs['group2_bazar'].index)
node_positions = nx.drawing.layout.bipartite_layout(B_bazar, text_nodes)
nx.draw_networkx(B_bazar, pos=node_positions, with_labels=False, node_size=100)
plt.show()

In [None]:
# Create the output folder first; writing fails if the directory is missing.
os.makedirs("./reduced_bigraphs", exist_ok=True)
nx.write_graphml_lxml(B_bazar, "./reduced_bigraphs/group2_bazar_bigraph.graphml")

### Group 3: Sverrig + fairytales (Danish)

In [None]:
# Text-vs-topic graph of the group 3 (Sverrig + fairytales) model; edge weights are the document counts.
B_sverrig = bipartite_graph(freqs['group3_sverrig'])

In [None]:
plt.figure(figsize=(10, 10))
# The text titles (frequency-table index) form one side of the two-sided layout.
text_nodes = list(freqs['group3_sverrig'].index)
node_positions = nx.drawing.layout.bipartite_layout(B_sverrig, text_nodes)
nx.draw_networkx(B_sverrig, pos=node_positions, with_labels=False, node_size=100)
plt.show()

In [None]:
# Create the output folder first; writing fails if the directory is missing.
os.makedirs("./reduced_bigraphs", exist_ok=True)
nx.write_graphml_lxml(B_sverrig, "./reduced_bigraphs/group3_sverrig_bigraph.graphml")

### Group 4: Spanien + fairytales (Danish)

In [None]:
# Text-vs-topic graph of the group 4 (Spanien + fairytales) model; edge weights are the document counts.
B_spanien = bipartite_graph(freqs['group4_spanien'])

In [None]:
plt.figure(figsize=(10, 10))
# The text titles (frequency-table index) form one side of the two-sided layout.
text_nodes = list(freqs['group4_spanien'].index)
node_positions = nx.drawing.layout.bipartite_layout(B_spanien, text_nodes)
nx.draw_networkx(B_spanien, pos=node_positions, with_labels=False, node_size=100)
plt.show()

In [None]:
# Create the output folder first; writing fails if the directory is missing.
os.makedirs("./reduced_bigraphs", exist_ok=True)
nx.write_graphml_lxml(B_spanien, "./reduced_bigraphs/group4_spanien_bigraph.graphml")