In [88]:
# Mount Google Drive in the Colab runtime and move into the project folder;
# the relative paths used in later cells ("./data", "./bertopic", ...) are
# resolved from this working directory.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/ChiLit_Topic_Modeling

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/ChiLit_Topic_Modeling


In [None]:
!pip install git+https://github.com/tonazzog/OCTIS.git

In [None]:
!pip install bertopic
!pip install dotenv

In [54]:
import json
import os
import pickle
import textwrap
from getpass import getpass

import pandas as pd
from bertopic import BERTopic
from dotenv import load_dotenv
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from openai import OpenAI
from pydantic import BaseModel, Field

In [240]:
# Load variables from a .env file when one is available. The return value of
# load_dotenv() says nothing about OPENAI_API_KEY specifically, so the key itself
# is checked below.
load_dotenv()

# Prompt only when the key is still missing; getpass does not echo the secret,
# so it never ends up in the saved cell output.
if not os.getenv('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass("OpenAI API key: ")

In [4]:
# Folder with the Optuna run artifacts: the ProdLDA pickle is read from here and
# the ProdLDA label JSON is written here. Note the trailing slash.
optuna_folder = "./optuna_200/"

In [89]:
# Read the chunked corpus and replace missing values with empty strings.
df_chilit = pd.read_csv("./data/ChiLit_Chunks_200.csv").fillna("")

# The 'tokens' column as a plain Python list.
sentences = df_chilit["tokens"].to_list()

In [95]:
# Chat model client used by the labelling chain; the API key is read from the
# environment (None if it is not set).
model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.1,
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [96]:
# Schema the LLM answer is parsed into by PydanticOutputParser.
# NOTE: documented with comments rather than a docstring on purpose — a class
# docstring would be included in the schema sent to the model.
class Topic(BaseModel):
    # Main label chosen for the topic.
    primary_label : str
    # Short description of what the topic represents.
    explanation : str
    # Alternative label suggestions, kept as one string (not a list).
    alternative_labels : str

In [97]:
# Prompt body for labelling one topic. {words_str} and {docs_str} are filled in per
# topic; the parser's format instructions are appended separately by get_prompt.
base_prompt = """I have a topic from a topic model that I need to label. Below are the most important words associated with this topic and the 5 most representative documents.
    
    Topic Words (in order of importance):
    {words_str}
    
    Representative Documents:
    {docs_str}
    
    Based on this information, please provide:
    1. A concise, descriptive label for this topic (2-3 words)
    2. A brief explanation of what this topic represents
    3. Alternative label suggestions (2-3 options)
    
    Please focus on capturing the main theme that connects both the words and the document content.
    
    Response format:
    Primary Label: [Your main label]
    Explanation: [Brief explanation]
    Alternative Labels: [Alternative 1, Alternative 2, Alternative 3]"""

In [98]:
def get_prompt(base_prompt, parser):
    """Build the labelling prompt template.

    Args:
        base_prompt: Prompt text containing the ``{words_str}`` and ``{docs_str}``
            placeholders.
        parser: Output parser; its ``get_format_instructions()`` text is bound to
            the ``{format_instructions}`` placeholder appended after the prompt.

    Returns:
        A ``PromptTemplate`` that still expects ``words_str`` and ``docs_str``.
    """
    # The format instructions go on their own line after the base prompt
    # (no trailing characters after the placeholder).
    template = base_prompt + "\n" + "{format_instructions}"

    prompt = PromptTemplate(
        template=template,
        input_variables=["words_str", "docs_str"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    return prompt

In [99]:
# Assemble the labelling pipeline: prompt template -> chat model -> parser that
# turns the model's answer into a Topic instance.
parser = PydanticOutputParser(pydantic_object = Topic)
prompt = get_prompt(base_prompt, parser)
chain = prompt | model | parser

In [None]:
def label_topics(topic_words, top_docs):
    """Ask the LLM chain for a label for every topic.

    Args:
        topic_words: Mapping of topic id -> the topic's words; the value is passed
            to the prompt unchanged.
        top_docs: Mapping of topic id -> iterable of representative document texts.
            Must contain every key of ``topic_words``.

    Returns:
        Dict mapping topic id -> ``model_dump()`` of the parsed chain result.
    """
    labels = {}
    for topic_id, words_str in topic_words.items():
        # textwrap.shorten collapses whitespace and truncates each document to at
        # most 3000 characters, which bounds the size of the prompt.
        docs_str = "\n\n".join(
            textwrap.shorten(doc, width=3000, placeholder="...")
            for doc in top_docs[topic_id]
        )
        result = chain.invoke({"words_str": words_str, "docs_str": docs_str})
        labels[topic_id] = result.model_dump()

    return labels

### ProdLDA topic labels

In [119]:
# Load the ProdLDA output saved by the Optuna run. The context manager closes the
# file handle as soon as loading finishes.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open(optuna_folder + "Optuna_ProdLDA_output.pkl", "rb") as infile:
    final_model = pickle.load(infile)

In [33]:
# One column per topic ("Topic 0", "Topic 1", ...): the stored topic-document
# matrix is transposed, then the corpus columns are attached positionally.
# NOTE(review): assumes the matrix columns are in the same order as df_chilit rows — confirm.
n_topics = len(final_model["topics"])
df_topics = pd.DataFrame(
    final_model["topic-document-matrix"].T,
    columns=[f"Topic {i}" for i in range(n_topics)],
).assign(
    book_id=df_chilit["book_id"].to_list(),
    paragraph_text=df_chilit["paragraph_text"].to_list(),
    tokens=df_chilit["tokens"].to_list(),
)

In [36]:
# Topic index -> list of topic words.
topic_words = dict(enumerate(final_model["topics"]))

# Topic index -> paragraph text of the 5 rows with the highest value in that
# topic's column.
top_docs = {
    topic_id: df_topics.nlargest(5, f"Topic {topic_id}")["paragraph_text"].to_list()
    for topic_id in topic_words
}

In [246]:
# Label every ProdLDA topic; label_topics invokes the LLM chain once per topic.
ProdLDA_labels = label_topics(topic_words, top_docs)

In [46]:
# Persist the ProdLDA labels as pretty-printed JSON next to the Optuna artifacts.
labels_path = optuna_folder + "/ProdLDA_Topic_Labels.json"
with open(labels_path, "w") as outfile:
    json.dump(ProdLDA_labels, outfile, indent=4)

In [105]:
# Reload the saved labels for display. After the JSON round-trip the topic ids
# are string keys.
with open(optuna_folder + "/ProdLDA_Topic_Labels.json", "r") as file:
    ProdLDA_labels = json.loads(file.read())

In [115]:
# Human-readable report: one separator-delimited block per topic.
separator = "\n---------------------------------"
for topic_key, label in ProdLDA_labels.items():
    # Keys are strings in the reloaded JSON, the topic list is indexed by int.
    words = final_model['topics'][int(topic_key)]
    print(separator)
    print(f"Topic {topic_key} - words: {words}")
    print(f"Label: {label['primary_label']} \nExplanation: {label['explanation']}")


---------------------------------
Topic 0 - words: ['force', 'enemy', 'attack', 'troops', 'order', 'french', 'march', 'advance', 'army', 'number']
Label: Military Engagements 
Explanation: This topic represents the strategic and tactical aspects of military confrontations, focusing on troop movements, attacks, and the dynamics between opposing forces, particularly in historical contexts involving French and English armies.

---------------------------------
Topic 1 - words: ['face', 'arm', 'voice', 'heart', 'tear', 'eye', 'hand', 'lay', 'speak', 'bed']
Label: Loss and Grief 
Explanation: This topic represents the emotional experiences of loss, mourning, and the deep connections between family members, particularly in the context of children and their parents. The documents reflect themes of sorrow, the fragility of life, and the longing for connection amidst tragedy.

---------------------------------
Topic 2 - words: ['dragon', 'red', 'buy', 'engine', 'let', 'gold', 'birthday', 'tea'

In [118]:
# Tab-separated summary: topic id, topic words, primary label, explanation.
# The word string is built outside the f-string because reusing double quotes
# inside a replacement field is a syntax error before Python 3.12.
for key, value in ProdLDA_labels.items():
    words = " ".join(final_model['topics'][int(key)])
    print(f"{key}\t{words}\t{value['primary_label']}\t{value['explanation']}")

0	force enemy attack troops order french march advance army number	Military Engagements	This topic represents the strategic and tactical aspects of military confrontations, focusing on troop movements, attacks, and the dynamics between opposing forces, particularly in historical contexts involving French and English armies.
1	face arm voice heart tear eye hand lay speak bed	Loss and Grief	This topic represents the emotional experiences of loss, mourning, and the deep connections between family members, particularly in the context of children and their parents. The documents reflect themes of sorrow, the fragility of life, and the longing for connection amidst tragedy.
2	dragon red buy engine let gold birthday tea handkerchief bun	Dragon Adventures	This topic revolves around whimsical stories featuring dragons, their interactions with children, and elements of fantasy such as magical items and playful adventures. The documents highlight themes of friendship, imagination, and the fantast

### BERTopic topic labels

In [248]:
# Reload the saved BERTopic model from ./bertopic, passing the name of the
# sentence-transformers model to use for embeddings.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
topic_model = BERTopic.load("./bertopic", embedding_model=embedding_model)

{}

In [42]:
# Read the whole file, then split it into one document per line.
with open("./data/BERTopic_Documents.txt", "r") as file:
    text = file.read()

documents = text.split("\n")

In [30]:
# Sanity check: number of documents read from the text file.
len(documents)

5843

In [78]:
# No probabilities are available this way :(
#doc_topics = topic_model.get_document_info(documents)

In [249]:
# Per-document topic assignments read from CSV; the Document, Topic and
# Probability columns are used in the following cells.
doc_topics = pd.read_csv("./bertopic/doc_topics.csv", encoding="utf-8")

In [250]:
# Sanity check: number of rows in the document-topic table.
len(doc_topics)

5843

In [251]:
# Attach a positional row id plus the corpus metadata, then keep only the columns
# needed downstream, in a fixed order.
# NOTE(review): book_id / paragraph_text are attached by position, which assumes
# doc_topics rows are in the same order as df_chilit — confirm.
doc_topics = doc_topics.assign(
    row_number=range(len(doc_topics)),
    book_id=df_chilit['book_id'].to_list(),
    paragraph_text=df_chilit['paragraph_text'].to_list(),
)[['row_number', 'book_id', 'paragraph_text', 'Document', 'Topic', 'Probability']]

In [252]:
# Remember the original document order so it can be restored after pivoting.
doc_order = doc_topics["row_number"]

# Pivot to one row per document and one column per topic id, holding the
# probability of the assigned topic and 0 everywhere else.
doc_topic_matrix = (
    doc_topics.pivot_table(
        index="row_number",
        columns="Topic",
        values="Probability",
        fill_value=0
    )
    .reindex(doc_order)   # restore original order
    .reset_index()
)

# Add columns book_id, paragraph_text and Document
doc_topic_matrix = doc_topic_matrix.merge(
    doc_topics[['row_number', 'book_id', 'paragraph_text', 'Document']],
    on="row_number",
    how="inner",
)

# Drop topic -1 while the column labels are still the raw topic ids.
# errors="ignore" keeps the cell working when there is no -1 column.
doc_topic_matrix = doc_topic_matrix.drop(columns=[-1], errors="ignore")

# Rename the remaining topic-id columns to "Topic <id>"
rename_dict = {col: f"Topic {col}" for col in doc_topic_matrix.columns if str(col).isnumeric()}
doc_topic_matrix = doc_topic_matrix.rename(columns=rename_dict)

In [253]:
# Preview the first rows of the document-topic matrix.
doc_topic_matrix.head()

Unnamed: 0,row_number,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,book_id,paragraph_text,Document
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,alice,CHAPTER I. Down the Rabbit-Hole Alice was begi...,hole tired sister bank peep book sister read p...
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,alice,"Alice was not a bit hurt, and she jumped up on...",bit hurt jump foot moment dark long passage hu...
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,alice,‘Curiouser and curiouser!’ cried Alice (she wa...,curiouser curiouser cry surprise moment forget...
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,alice,‘How doth the little crocodile Impr...,crocodile improve shine tail pour golden scale...
4,4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,alice,CHAPTER III. A Caucus-Race and a Long Tale The...,race long tale queer party assemble bank bird ...


In [254]:
# Topic summary table from the loaded model; its Topic and Representation
# columns are used below.
topic_info = topic_model.get_topic_info()

In [255]:
# Exclude topic -1, whose column was also dropped from doc_topic_matrix.
topic_info = topic_info[topic_info['Topic'] != -1]

In [256]:
# Build the label_topics inputs for BERTopic:
#   topic_words: topic id -> the topic's 'Representation' entry
#   top_docs:    topic id -> paragraph text of the 5 rows with the highest value
#                in that topic's column
topic_words = {}
top_docs = {}

for _, info_row in topic_info.iterrows():
    topic_id = info_row['Topic']
    topic_words[topic_id] = info_row['Representation']
    ranked = doc_topic_matrix.nlargest(5, f'Topic {topic_id}')
    top_docs[topic_id] = ranked['paragraph_text'].to_list()

In [257]:
# Label every BERTopic topic; label_topics invokes the LLM chain once per topic.
BERTopic_labels = label_topics(topic_words, top_docs)

In [260]:
# Persist the BERTopic labels; indent=4 matches the formatting of the ProdLDA
# label file so both JSON files are equally readable.
with open("./bertopic/BERTopic_Topic_Labels.json", 'w') as outfile:
    json.dump(BERTopic_labels, outfile, indent=4)
