In [1]:
import numpy as np
import json
import glob
import re

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from nltk.corpus import stopwords

import pyLDAvis
import pyLDAvis.gensim

<h2>Pre-processing</h2>

In [2]:
import spacy
import pandas as pd
import re

# spaCy pipeline used for every tokenization / lemmatization step in this notebook
nlp = spacy.load("de_core_news_sm")  # German model

# Extra stop words on top of the ones shipped with the spaCy German model
custom_stopwords = {
    "Seite", "xx", "h", "vorher", "davon", "oder", "z.", "B.", "u.", "a.", "of", "to", "and", "\uf0b7",
    "ggf.", "jeder", "'s", "or", "be", "via", "z.B.", "bzw.", "-Das", "the", "(=",
}

# Register each custom stop word both in the language defaults and on its vocab entry
for extra_stopword in custom_stopwords:
    nlp.Defaults.stop_words.add(extra_stopword)
    nlp.vocab[extra_stopword].is_stop = True

# Load the DataFrame from the saved pickle file
df = pd.read_pickle("../../data/processed/df/df_00.pkl")

# Collapse every run of whitespace in 'ExtractedText' to a single space
df['ExtractedText'] = df['ExtractedText'].apply(lambda raw: ' '.join(raw.split()))

# Tokenize each document with the German model and keep only the tokens that are
# NOT stop words, e-mail-like, URL-like, punctuation, number-like,
# or a single ASCII character.
cleaned_texts = []

for raw_text in df['ExtractedText']:
    kept_tokens = [
        tok.text
        for tok in nlp(raw_text)
        if not (
            tok.is_stop
            or tok.like_email
            or tok.like_url
            or tok.is_punct
            or tok.like_num
            or (len(tok.text) == 1 and ord(tok.text) < 128)
        )
    ]
    # One space-joined string per document
    cleaned_texts.append(' '.join(kept_tokens))

df['CleanedText'] = cleaned_texts

In [3]:
# Exact token texts that the filtering loop in this cell removes from 'CleanedText'.
# Entries are compared by equality against token.text; the list contains repeated
# entries, which does not change the result of the membership check.
token_list= ['AOG-0-PropAO', 'AOG-0-PropAO', 'AOG-1-AnPhysio', 'AOG-1-AnPhysio', 'AOG-1-AnPhysio', 'AOG-1-IngMa-1', 'AOG-1-IngMa-1', 'AOG-1-EP-IW-WI', 'AOG-1-EP-IW-WI', 'AOG-1-EP-IW-WI', 'AOG-1-PhysGL-1', 'AOG-1-PhysGL-1', 'AOG-1-PhysGL-1', 'AOG-1-SubjRefra-1', 'AOG-1-SubjRefra-1', 'AOG-1-TO-1', 'AOG-1-TO-1', 'AOG-2-IngMa-2', 'AOG-2-IngMa-2', 'AOG-2-KLAnp-1', 'AOG-2-KLAnp-1', 'AOG-2-PhysGL-2', 'AOG-2-PhysGL-2', 'AOG-2-PhysGL-2', 'AOG-2-SkiOph', 'AOG-2-SkiOph', 'AOG-2-SubjRefra-2', 'AOG-2-SubjRefra-2', 'AOG-2-TO-2', 'AOG-1-TO-1', 'AOG-2-TO-2', 'AOG-3-ETG-1', 'AOG-3-ETG-1', 'AOG-3-KLAnp-2', 'AOG-3-KLAnp-2', 'AOG-3-KuF', 'AOG-3-KuF', 'AOG-3-KuF', 'AOG-3-OG-1', 'AOG-3-OG-1', 'AOG-3-Path', 'AOG-3-Path', 'AOG-3-Path', 'AOG-3-SubjRefra-3', 'AOG-3-SubjRefra-3', 'AOG-4-ETG-2', 'AOG-4-ETG-2', 'AOG-4-KLAnp-3', 'AOG-4-KLAnp-3', 'AOG-4-MT', 'AOG-4-MT', 'AOG-4-MT', 'ISBN-13', '662-', 'AOG-4-OG-2', 'AOG-4-OG-2', 'AOG-4-OG-2', 'AOG-4-Opt', 'AOG-4-Opt', 'AOG-4-SubjRefra-4', 'AOG-4-SubjRefra-4', 'AOG-5-OTS', 'AOG-5-OTS', 'AOG-5-WFFO-1', 'AOG-5-WFFO-1', 'AOG-5-WP1-Werk', 'AOG-5-WP1-Werk', 'AOG-5-WP1-Progr', 'AOG-5-WP1-Progr', 'AOG-5-PP', 'AOG-5-PP', 'AOG-6-Alt', 'AOG-6-Alt', 'AOG-6-KLAnp-4', 'AOG-6-KLAnp-4', 'AOG-6-WFFO-2', 'AOG-6-WFFO-2', 'AOG-6-WP2-BWLHWK', 'AOG-6-WP2-BWLHWK', 'AOG-6-WP3', '1-', 'AOG-6-WP3', '1-', 'AOG-6-WP3', '2-EntwS', 'AOG-6-WP3', '2-EntwS', '3-', '13-', '3-', 'AOG-6-WP3', '2-EntwS', 'AOG-6-WP4-KlinP', 'AOG-6-WP4-KlinP', 'AOG-6-WP2-DST', 'AOG-6-WP2-DST', 'AOG-6-WP3', '1-SRT', 'AOG-6-WP3', '1-SRT', 'AOG-6-WP3', '2-Spek', 'AOG-6-WP3', '2-Spek', 'AOG-6-WP4', '1-DBV', 'AOG-6-WP4', '1-DBV', 'AOG-6-WP4', '2-DvBG', 'AOG-6-WP4', '2-DvBG', 'AOG-7-F&E', 'AOG-7-F&E', 'AOG-7-LaOph', 'AOG-7-LaOph', 'AOG-7-LaOph', 'AOG-7-WP5', '1-', 'AOG-7-WP5', '1-', 'AOG-7-WP5', '1-', 'AOG-7-WP5', '2-VPD', 'AOG-7-WP5', '2-VPD', 'AOG-7-WP5', '1-MoL', 'AOG-7-WP5', '1-MoL', 'AOG-7-WP5', '2-BWL', 'AOG-7-WP5', '2-BWL', '352-', '2-', '3-', '978-', '42169-', '978-', 'ISBN-10', 'ISBN-10', 
'ISBN-10', 'ISBN-10', '26194-X', '7923-', 'ISBN-10:0387773258', 'ISBN-10', '3-', 'ISBN-10', 'ISBN-10', 'WK1-L', 'WK2-L', '1-Vorlesung', '1:2011VU-gekerbten', 'FT1-L', '6-Achs-', '468-gliedrige', '4-gliedriger', '3D-Druck', '662e-book', 'ISBN-13', '26.-29.', '7-', '293-', '2-', '3-', 'ISBN-10', 'ISBN-10', 'ISBN-10', 'ISBN-10', 'ISO-9241', '03932-', '26194-X', '7923-', 'ISBN-10:0387773258', 'ISBN-10', '978-', '978-', '528-', '486-', 'ISBN-10', 'ISBN-10', 'ISBN-10', 'ISBN-10', 'ISBN-13', 'ISBN-13', 'ISBN-10', 'ISBN-10', '3-', 'ISBN-10', 'ISBN-10', '978-', 'ISBN-13', 'ISBN-13', '802.11-Netzen', '446-', '642-', '3-', 'Z2hoch-n', 'ISBN-13', 'B1-B2', 'ISBN-13', '27000-Reihe', '63497-', '26194-X', 'ISBN-13', '662e-book', 'ISBN-13', 'AI-DB1', 'AI-DB2', 'AI-DB3', 'AI-DB4', 'AI-DB5', 'AI-DB6', '7-Anwenderprogramms', 'S7-Projektierung', '7-Anwenderbausteinen', 'S7-GRAPH', 'TP700-Projektierung', 'RS485-Übertragungstechnik', 'SPS-PV1', 'SPS-PV2', 'Bd-3', '6-', 'CO2-Gehalt', 'EIA-232', 'EIA-485', 'KNX-1', 'KNX-2', 'KNX-3', 'KNX-4', 'KNX-5', '8051-Mikrocontrollers', '662e-book', 'ISBN-13', '7-Laborarbeitsplatz', '7-Operator', '7-Konfigurierung', '7-Konfigurierung', '7-Inbetriebnahme', '978-', '978-', '3-', 'Loop-2', '80-/20-Prinzipien', '3-', 'ISBN-10', 'ISBN-13', 'ISBN-10', 'ISBN-13', 'ISBN-10', 'ISBN-13', 'ISBN-13', '978-', 'ISBN-13', '63497-X.', '978-', 'Cortex-M3', 'ISBN-13', 'ISBN-10978', '1-', 'ISBN-10', 'ISBN-13', 'ISBN-10', 'ISBN-13', 'IT-Grundschutz1', 'IT-Grundschutz-Kompendium2', 'BSI-Standard-200', '4-Business-Continuity-Management', 'bsi-standard-200-', '978-', '0-', 'ISBN-13', '978-', 'ISBN-13', '662-', '3-', '978-', '3-', '658-', '21151-', 'Xv3-Zertifikate', 'ISBN-10', 'ISBN-13', 'Thttps://bmakewiki.th-brandenburg.de1', '80-', '/20-Prinzipien', 'customer-vendor-integration-s4-hana', 'inno-vations-with-sap-s4hana', 'networks-book.pdf1', 'contents-1.html1', '1986-', 'ISBN-13', 'ISBN-13', 'ISBN-13', 'ISBN-13', '978-', 'ISBN-13', '978-', 'ISBN-13', '978-', '55860-', 
'ISBN-13', 'ISBN-10978', '1-', 'ISBN-10', '978-', '0-', 'ISBN-13', '978-', 'ISBN-13', '63497-X.', '978-', '3-', '94013-X', '0304-', '800-', '0306-Ü', '3D-Druck', 'M-5', 'WPM-7b', '87155-', '2-', '3-', 'ISBN-10', 'ISBN-10', 'ISBN-10', 'ISBN-10', 'ISO-9241', '03932-', '26194-X', '7923-', 'ISBN-10:0387773258', 'ISBN-10', '978-', '978-', '528-', '486-', 'ISBN-10', 'ISBN-10', 'ISBN-10', 'ISBN-10', 'ISBN-13', 'ISBN-13', 'ISBN-10', 'ISBN-10', '3-', 'ISBN-10', 'ISBN-10', '1-', '978-', 'ISBN-13', 'ISBN-13', '802.11-Netzen', '446-', '642-', 'isbn-13:978', 'ISBN-10', 'ISBN-10', 'ISBN-10', 'ISBN-10', 'ISBN-10', 'ISBN-10', 'ISBN-10', 'ISBN-10', '22150-', 'ISBN-13', 'ISBN-13', 'ISBN-13', 'ISBN-10', 'ISBN-13', 'ISBN-10', 'ISBN-13', 'ISBN-10', 'ISBN-13', 'ISBN-13', 'ISBN-13', 'ISBN-13', 'ISBN-13', 'ISBN-13', 'ISBN-10', 'ISBN-13:9783621275910', 'doi.org/10.1007/978-3-319-44162-7.', 'T1-', 'T2-Bestimmung', 'V1-V6', 'V1-V6', 'Z2-hoch-n', '3-', '3-', '133-', '9241-xxx', '446-', '0-', '201-', '63497-X.', '978-', 'B1-B2', '27000-Reihe', 'B1-B2', 'B2-C1', 'IZO-803', 'IZO-804']

# Build the lookup once. `token_list` is a long list with many repeated entries,
# so testing `in token_list` for every token rescans the list each time; a
# frozenset gives the same membership answers with constant-time lookups.
tokens_to_drop = frozenset(token_list)

removed_hyphen_digits_texts = []

# Not read anywhere in this cell; kept defined so the notebook's global state
# stays the same as before.
count = 0

for text in df['CleanedText']:
    doc = nlp(text)  # Re-tokenize the cleaned text with the German model

    # Keep every token whose exact text is not one of the listed entries
    removed_hyphen_digits_tokens = [
        token.text for token in doc if token.text not in tokens_to_drop
    ]

    removed_hyphen_digits_texts.append(' '.join(removed_hyphen_digits_tokens))

# One filtered, space-joined string per document
df['ProcessedText'] = removed_hyphen_digits_texts

In [4]:
# For every document: re-tokenize 'ProcessedText' and drop the first character
# of each token that starts with a hyphen; all other tokens pass through unchanged.
updated_texts = []

for processed_text in df['ProcessedText']:
    tokens_without_leading_hyphen = [
        tok.text[1:] if tok.text.startswith("-") else tok.text
        for tok in nlp(processed_text)  # German model
    ]
    updated_texts.append(' '.join(tokens_without_leading_hyphen))

df['UpdatedText'] = updated_texts

In [5]:
# For every document: re-tokenize 'UpdatedText' and drop the last character
# of each token that ends with a hyphen; all other tokens pass through unchanged.
refined_texts = []

for updated_text in df['UpdatedText']:
    tokens_without_trailing_hyphen = [
        tok.text[:-1] if tok.text.endswith("-") else tok.text
        for tok in nlp(updated_text)  # German model
    ]
    refined_texts.append(' '.join(tokens_without_trailing_hyphen))

df['RefinedText'] = refined_texts

In [6]:
def lemmatize_text_german(text):
    """Return `text` with each spaCy token replaced by its lemma.

    The text is processed with the notebook-level `nlp` pipeline and the
    lemmas are joined with single spaces.
    """
    return ' '.join(tok.lemma_ for tok in nlp(text))

# Lemmatize the 'RefinedText' column and store the result as 'LemmatizedText'
df['LemmatizedText'] = df['RefinedText'].apply(lemmatize_text_german)

In [7]:
import pandas as pd

# Lemmatized document strings; this is the input passed to gen_words() in this cell.
data = df['LemmatizedText']

def gen_words(texts):
    """Split every string in `texts` on whitespace.

    Returns a list with one list of words per input string, in input order.
    """
    return [text.split() for text in texts]

# One list of whitespace-separated words per lemmatized document
data_words = gen_words(data)

# Keep the word lists on the frame under 'Words'
df['Words'] = data_words

In [8]:
# TF-IDF REMOVAL
# Drop from every bag-of-words the terms whose TF-IDF weight is below `low_value`
# and the terms that the TF-IDF model omits for that document altogether.
from gensim.models import TfidfModel

id2word = corpora.Dictionary(data_words)

texts = data_words

corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03
words = []  # log of every dropped term (one entry per document it was dropped from)
words_missing_in_tfidf = []
for i in range(len(corpus)):
    bow = corpus[i]
    weighted_bow = tfidf[bow]  # transform once instead of twice per document

    tfidf_ids = {term_id for term_id, _ in weighted_bow}
    low_value_words = [term_id for term_id, value in weighted_bow if value < low_value]
    # Terms present in the document but absent from the TF-IDF output
    # (the words with tf-idf score 0 will be missing).
    # This must be computed BEFORE building `drops`; previously `drops` used the
    # list left over from the previous document, so `words` logged the wrong terms.
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in tfidf_ids]

    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    # Set lookup instead of scanning two lists for every entry of the document
    drop_ids = set(drops)
    corpus[i] = [entry for entry in bow if entry[0] not in drop_ids]

<h2>LDA Modelling</h2>

In [12]:
# Settings for the LDA run, gathered in one mapping so they are easy to scan and tweak
lda_params = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=20,
    passes=10,
)
lda_model_03 = gensim.models.ldamodel.LdaModel(**lda_params)

# Interactive topic visualisation rendered inline in the notebook
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_03, corpus, id2word, mds="mmds", R=30)
pyLDAvis.display(vis)



In [None]:
# Persist the trained model. The model object in this notebook is `lda_model_03`;
# the previous name `lda_model` is never defined and raised NameError.
lda_model_03.save("lda_model_03")

In [None]:
# Number of top terms requested per topic
num_words = 30
# Topic listing from the trained model; later cells index this list and unpack
# each entry as a (topic number, formatted topic string) pair.
topics = lda_model_03.print_topics(num_words=num_words)

In [None]:
# Positions into `topics`, in the order the topics should be presented
custom_topic_order = [5,8,7,6,4,3,1,0,9,2]

# Pick the topics in that order
rearranged_topics = []
for topic_position in custom_topic_order:
    rearranged_topics.append(topics[topic_position])

# Show the reordered topics, one per line
for topic in rearranged_topics:
    print(topic)

<h2>Compute Matrix for Heatmap</h2>

In [17]:
import numpy as np

# Number of words per topic
num_words = 30

# Maps topic number -> list of that topic's terms, in the order of `rearranged_topics`
all_topics = {}

# Each topic string has the form  weight*"term" + weight*"term" + ...
# Split on the full ' + ' separator and only on the FIRST '*', then remove the
# one pair of surrounding quotes. Splitting on a bare '+' / '*' (as before)
# breaks on terms that themselves contain those characters, e.g. "C++".
for topic_num, topic in rearranged_topics:
    words = [entry.split('*', 1)[1][1:-1] for entry in topic.split(' + ')]
    all_topics[topic_num] = words

# Create a list of lists of terms
all_topics_list = list(all_topics.values())

# Convert the list of lists into a NumPy array (object dtype keeps plain Python strings)
topics_array = np.array(all_topics_list, dtype=object)

# Now, all_topics_list and topics_array are in the desired format 

In [18]:
# Copy of the frame ordered by the 'Subject' column; the matrix and heatmap cells read from it.
df_sorted = df.sort_values(by='Subject')

In [19]:
from collections import Counter

topic_matrices = []

# Define the categories
categories = ['FBI', 'FBT', 'FBW']

# Count every word once per category up front. The previous version re-filtered
# the DataFrame and re-scanned all documents for every (word, category) pair;
# the totals are identical, since a Counter over all word lists of a category
# equals the sum of per-document list.count(word).
category_word_counts = []
for category in categories:
    counts = Counter()
    for doc_words in df_sorted.loc[df_sorted['Subject'] == category, 'Words']:
        counts.update(doc_words)
    category_word_counts.append(counts)

for topic_words in topics_array:
    # One matrix per topic: rows = topic words, columns = categories
    topic_matrix = np.zeros((len(topic_words), len(categories)), dtype=int)

    for i, word in enumerate(topic_words):
        for j, counts in enumerate(category_word_counts):
            # Frequency of the topic word within the category (0 if it never occurs)
            topic_matrix[i][j] = counts[word]

    # Append the topic matrix to the list of matrices
    topic_matrices.append(topic_matrix)

# Display the resulting matrices
for i, matrix in enumerate(topic_matrices):
    print(f"Topic {i + 1} Matrix:")
    print(matrix)

In [20]:
import numpy as np

# One row per topic matrix, one column per category
sum_matrix = np.zeros((len(topic_matrices), len(categories)), dtype=int)

# Collapse each topic matrix over its word axis (column-wise totals)
for topic_index in range(len(topic_matrices)):
    sum_matrix[topic_index] = topic_matrices[topic_index].sum(axis=0)

# Display the resulting sum matrix
print("Sum Matrix:")
print(sum_matrix)

In [21]:
from collections import Counter

topic_matrices_title = []

# Define the categories: one per distinct 'Title'.
# NOTE: this rebinds the notebook-level `categories` (previously the three
# 'Subject' values); cells that need the Subject list must run before this one.
categories = df_sorted['Title'].unique()

# Count every word once per title up front instead of re-filtering the
# DataFrame for every (word, title) pair; the totals are identical.
title_word_counts = []
for category in categories:
    counts = Counter()
    for doc_words in df_sorted.loc[df_sorted['Title'] == category, 'Words']:
        counts.update(doc_words)
    title_word_counts.append(counts)

for topic_words in topics_array:
    # One matrix per topic: rows = topic words, columns = titles
    topic_matrix_title = np.zeros((len(topic_words), len(categories)), dtype=int)

    for i, word in enumerate(topic_words):
        for j, counts in enumerate(title_word_counts):
            # Frequency of the topic word within the title (0 if it never occurs)
            topic_matrix_title[i][j] = counts[word]

    # Append the topic matrix to the list of matrices
    topic_matrices_title.append(topic_matrix_title)

# Display the resulting matrices
# (this line was previously a bare sentence without '#', which is a SyntaxError)
for i, matrix in enumerate(topic_matrices_title):
    print(f"Topic {i + 1} Matrix:")
    print(matrix)

<h2>Heatmap Plotting</h2>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

from pathlib import Path

plt.rc('font', size=10)           # Controls default text size
plt.rc('axes', titlesize=16)      # Font size of the title
plt.rc('axes', labelsize=14)      # Font size of the x and y labels
plt.rc('xtick', labelsize=12)     # Font size of the x-axis tick labels
plt.rc('ytick', labelsize=12)     # Font size of the y-axis tick labels
plt.rc('legend', fontsize=12)

# Create a NumPy array from the values in 'all_topics'
topics_array = np.array(list(all_topics.values()))

# Define a custom colormap transitioning from blue to red
colors = [(0, 0, 1), (1, 0, 0)]  # Blue to Red
custom_cmap = LinearSegmentedColormap.from_list("custom_colormap", colors, N=256)

# Row labels of the transposed matrices; identical for every topic, so defined once
categories_for_ticks = ['FBI', 'FBT', 'FBW']  # Replace with actual categories

# np.save / savefig do not create missing directories, so make sure the target exists
Path('heatmap/final-result-lda').mkdir(parents=True, exist_ok=True)

# One heatmap per topic matrix: categories on the y-axis, topic words on the x-axis,
# rotated x tick labels and the cell value printed inside each cell
for i, matrix in enumerate(topic_matrices):
    transposed_matrix = matrix.T  # Transpose the matrix

    fig = plt.figure(figsize=(14, 12))

    # Use the custom colormap here
    plt.imshow(transposed_matrix, cmap=custom_cmap, interpolation='nearest')

    # Display values inside the cells
    for y in range(transposed_matrix.shape[0]):
        for x in range(transposed_matrix.shape[1]):
            plt.text(x, y, f'{transposed_matrix[y, x]}', ha='center', va='center', color='white')

    words_for_ticks = topics_array[i]

    plt.xticks(ticks=np.arange(len(words_for_ticks)), labels=words_for_ticks, rotation=60, ha='right')
    plt.yticks(ticks=np.arange(len(categories_for_ticks)), labels=categories_for_ticks)

    plt.xlabel('Words')
    plt.ylabel('Categories')
    plt.title(f'Heatmap of Word Frequencies across Departments for Topic #{i+1}')

    plt.tight_layout()

    # Raw matrix next to the image; the plotly grid cell loads these .npy files
    np.save(f'heatmap/final-result-lda/heatmap_topic_{i+1}.npy', transposed_matrix)

    # Save the heatmap as an image
    plt.savefig(f'heatmap/final-result-lda/heatmap_topic_{i+1}.png', bbox_inches='tight')

    plt.show()
    # Release the figure so the loop does not accumulate open figures
    plt.close(fig)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path

# Imported here as well so this cell does not depend on another plotting cell
# having been run first.
from matplotlib.colors import LinearSegmentedColormap

plt.rc('font', size=12)           # Controls default text size
plt.rc('axes', titlesize=18)      # Font size of the title
plt.rc('axes', labelsize=16)      # Font size of the x and y labels
plt.rc('xtick', labelsize=14)     # Font size of the x-axis tick labels
plt.rc('ytick', labelsize=14)     # Font size of the y-axis tick labels
plt.rc('legend', fontsize=14)

colors = [(0, 0, 1), (1, 0, 0)]  # Blue to Red
custom_cmap = LinearSegmentedColormap.from_list("custom_colormap", colors, N=256)

# Create a NumPy array from the values in 'all_topics'
topics_array = np.array(list(all_topics.values()))

# y-axis labels: the distinct values of the 'Title' column
categories_for_ticks = df_sorted['Title'].unique()

# np.save / savefig do not create missing directories, so make sure the target exists
Path('heatmap/final-result-lda').mkdir(parents=True, exist_ok=True)

# One heatmap per topic matrix: titles on the y-axis, topic words on the x-axis,
# rotated x tick labels and the cell value printed inside each cell
for i, matrix in enumerate(topic_matrices_title):
    transposed_matrix = matrix.T  # Transpose the matrix

    # Create a figure and axis for the plot
    fig, ax = plt.subplots(figsize=(20, 18))

    # Plot the heatmap using imshow from matplotlib
    cax = ax.imshow(transposed_matrix, cmap=custom_cmap, interpolation='nearest')

    # Words of the current topic label the x-axis
    words_for_ticks = topics_array[i]

    # Set the tick positions for the x-axis and y-axis
    ax.set_xticks(np.arange(len(words_for_ticks)))
    ax.set_yticks(np.arange(len(categories_for_ticks)))

    # Label the ticks with the respective list entries
    ax.set_xticklabels(words_for_ticks, rotation=45, ha="right", rotation_mode="anchor")
    ax.set_yticklabels(categories_for_ticks)

    # Loop over the data dimensions and create text annotations
    for y in range(transposed_matrix.shape[0]):
        for x in range(transposed_matrix.shape[1]):
            ax.text(x, y, f'{transposed_matrix[y, x]}', ha='center', va='center', color='white')

    # Axis labels and title
    ax.set_xlabel('Words')
    ax.set_ylabel('Module Handbooks')
    ax.set_title(f'Frequency Distribution Heatmap for Topic #{i+1} Across Module Handbooks')

    plt.tight_layout()  # Adjust layout to prevent overlapping

    np.save(f'heatmap/final-result-lda/heatmap_topic_mh_{i+1}.npy', transposed_matrix)

    # Save the heatmap as an image
    plt.savefig(f'heatmap/final-result-lda/heatmap_topic_mh_{i+1}.png', bbox_inches='tight')
    plt.show()
    # Release the figure so the loop does not accumulate open figures
    plt.close(fig)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

from pathlib import Path

# Transpose the sum_matrix so categories become rows and topics become columns
transposed_sum_matrix = sum_matrix.T

# Define a custom colormap transitioning from blue to red
colors = [(0, 0, 1), (1, 0, 0)]  # Blue to Red
custom_cmap = LinearSegmentedColormap.from_list("custom_colormap", colors, N=256)

# Plot the heatmap for the transposed_sum_matrix
plt.figure(figsize=(8,6))

# Use the custom colormap here
plt.imshow(transposed_sum_matrix, cmap=custom_cmap, interpolation='nearest')

# Display values inside the cells
for y in range(transposed_sum_matrix.shape[0]):
    for x in range(transposed_sum_matrix.shape[1]):
        plt.text(x, y, f'{transposed_sum_matrix[y, x]}', ha='center', va='center', color='white')

# One x label per column of the transposed matrix, i.e. per topic
topic_labels = [f"Topic {i+1}" for i in range(transposed_sum_matrix.shape[1])]
categories_for_ticks = ['FBI', 'FBT', 'FBW']  # Replace with actual categories

plt.xticks(ticks=np.arange(len(topic_labels)), labels=topic_labels, rotation=60, ha='right')
plt.yticks(ticks=np.arange(len(categories_for_ticks)), labels=categories_for_ticks)

plt.xlabel('Topics')
plt.ylabel('Categories')
plt.title('Heatmap of Cumulative Word Frequencies by Department for Each Topic')

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
Path('heatmap/final-result-lda').mkdir(parents=True, exist_ok=True)
plt.savefig('heatmap/final-result-lda/heatmap_faculties_topics.png', bbox_inches='tight')

# Show the heatmap
plt.show()

In [None]:
import numpy as np
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Load the heatmap data from the provided files
file_paths = [
    "heatmap/final-result-lda/heatmap_topic_1.npy",
    "heatmap/final-result-lda/heatmap_topic_2.npy",
    "heatmap/final-result-lda/heatmap_topic_3.npy",
    "heatmap/final-result-lda/heatmap_topic_4.npy",
    "heatmap/final-result-lda/heatmap_topic_5.npy",
    "heatmap/final-result-lda/heatmap_topic_6.npy",
    "heatmap/final-result-lda/heatmap_topic_7.npy",
    "heatmap/final-result-lda/heatmap_topic_8.npy",
    "heatmap/final-result-lda/heatmap_topic_9.npy",
    "heatmap/final-result-lda/heatmap_topic_10.npy",
]

# Load the heatmap data from the provided files and flip them vertically
flipped_heatmaps = [np.flipud(np.load(fp)) for fp in file_paths]

# Topics for each heatmap's x-axis
heatmap_x_axis_topics = topics_array  # Make sure this variable is defined earlier in your code

# Define the layout for the grid
rows, cols = 2, 5
fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f"Topic {i+1}" for i in range(len(flipped_heatmaps))])

y_axis_labels = ['FBW', 'FBT', 'FBI']

# Define a custom colorscale from blue to red
custom_colorscale = [
    [0, 'blue'],    # Blue for the lowest values
    [1, 'red'],     # Red for the highest values
]

# Load each heatmap and add it to the subplot grid
for i, heatmap_data in enumerate(flipped_heatmaps):
    row = (i // cols) + 1  # Determine the row for the subplot
    col = (i % cols) + 1   # Determine the column for the subplot

    fig.add_trace(
        go.Heatmap(
            z=heatmap_data,  # Use the flipped heatmap data
            x=heatmap_x_axis_topics[i],  # Omit x-axis labels by setting this to an empty list
            y=y_axis_labels,
            colorscale=custom_colorscale  # Use the custom colorscale
        ),
        row=row, col=col
    )

for trace in fig.data:
    trace.update(zmin=0, zmax=500)  # Assuming your data ranges from 0 to 500

# Update layout for better readability
fig.update_layout(
    height=900, width=1200,
    title_text= 'Consolidated Heatmap Grid of Word Frequency Distributions Across Departments for Topics 1-10',
    margin=dict(l=20, r=20, t=90, b=90),  # Adjust margins to make room for the title and labels
    xaxis_tickangle=-45,  # Rotate the x-axis labels
    coloraxis_colorbar=dict(
        title='Scale',  # Title for the color scale bar
        ticks='outside',  # Ticks are drawn outside the color bar
        tickvals=[0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500],  # Set specific tickvals
        ticktext=['0', '50', '100', '150', '200', '250', '300', '350', '400', '450', '500']  # Set ticktext to match tickvals
    )
)

for i in range(1, rows * cols + 1):
    fig.update_xaxes(tickangle=-45, tickfont=dict(size=10), row=i // cols + 1, col=i % cols + 1)

pio.write_html(fig,'heatmap/plotly/html/05/heatmap_grid_blue_red.html')  # Export the figure as interactive HTML
fig.write_image('heatmap/plotly/img/heatmap_grid_blue_red.png', width=1200, height=900, scale=2)

# Show the figure
fig.show()

In [None]:
# Spot check: how often does one word occur in each document's word list?
string = 'Lasermaterialbearbeitung'
total = 0
# NOTE(review): 23 is hard-coded and rows are looked up by index label 0..22 —
# presumably the number of rows in `df`; confirm.
for i in range(23):
    count = df.loc[i, 'Words'].count(string)
    title = df.loc[i, 'Title']
    total += count
    print(f"Row {i}: Title - {title}, Count of {string} - {count}")
print(f"Total: {total}")