In [None]:
# Data processing
import pandas as pd
import numpy as np

# Dimension reduction
from umap import UMAP

# Clustering
from hdbscan import HDBSCAN

# Count vectorization
from sklearn.feature_extraction.text import CountVectorizer

# Sentence transformer
from sentence_transformers import SentenceTransformer

import openai

# Topic model
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer


import plotly.io as pio
# Make 'iframe' the default Plotly renderer for figures displayed by the cells below.
pio.renderers.default='iframe'


In [None]:
# builds model from steps 1 to 5

def build_base_model(n_neighbors=15, n_components=5, min_clust_size=10, random_seed=100,
                     embedding_model_name="paraphrase-MiniLM-L3-v2"):
    """Build the five sub-models (steps 1 to 5) used to assemble a BERTopic model.

    Args:
        n_neighbors: forwarded to UMAP as ``n_neighbors``.
        n_components: forwarded to UMAP as ``n_components``.
        min_clust_size: forwarded to HDBSCAN as ``min_cluster_size``.
        random_seed: forwarded to UMAP as ``random_state``.
        embedding_model_name: name of the SentenceTransformer model to load.
            The default is the model name that used to be hard-coded here.

    Returns:
        dict with the keys 'embedding', 'umap', 'hdbscan', 'vectorizer'
        and 'ctfidf', each holding the corresponding model object.
    """
    # Step 1 - Extract embeddings
    embedding_model = SentenceTransformer(embedding_model_name)

    # Step 2 - Reduce dimensionality
    umap_model = UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        min_dist=0.0,
        metric='cosine',
        random_state=random_seed,
    )

    # Step 3 - Cluster reduced embeddings
    hdbscan_model = HDBSCAN(
        min_cluster_size=min_clust_size,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
    )

    # Step 4 - Tokenize topics
    vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3), min_df=3)

    # Step 5 - Create topic representation
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

    return {
        'embedding': embedding_model,
        'umap': umap_model,
        'hdbscan': hdbscan_model,
        'vectorizer': vectorizer_model,
        'ctfidf': ctfidf_model,
    }


In [None]:
# adds MaximalMarginalRelevance as step 6 to the base model

def build_model_mmr(n_neighbors=15, n_components=5, min_clust_size=10, random_seed=100, diversity=0.5):
    """Assemble a BERTopic model from the base sub-models plus an MMR representation step.

    Args:
        n_neighbors: forwarded to ``build_base_model``.
        n_components: forwarded to ``build_base_model``.
        min_clust_size: forwarded to ``build_base_model``.
        random_seed: forwarded to ``build_base_model``.
        diversity: forwarded to ``MaximalMarginalRelevance``. The default is
            the value that used to be hard-coded here.

    Returns:
        The BERTopic instance constructed from those parts, with
        ``calculate_probabilities=True``. It is not fitted by this function.
    """
    base_model_dict = build_base_model(n_neighbors, n_components, min_clust_size, random_seed)

    # Step 6 - (Optional) Fine-tune topic representations
    representation_model = MaximalMarginalRelevance(diversity=diversity)

    # All steps together
    return BERTopic(
        embedding_model=base_model_dict['embedding'],     # Step 1 - Extract embeddings
        umap_model=base_model_dict['umap'],               # Step 2 - Reduce dimensionality
        hdbscan_model=base_model_dict['hdbscan'],         # Step 3 - Cluster reduced embeddings
        vectorizer_model=base_model_dict['vectorizer'],   # Step 4 - Tokenize topics
        ctfidf_model=base_model_dict['ctfidf'],           # Step 5 - Extract topic words
        representation_model=representation_model,        # Step 6 - Fine-tune topic representations
        calculate_probabilities=True,
    )


In [None]:
# create and run BERTopic model to identify topic representations

random_seed = 105
n_neighbors = 15
n_components = 5
min_clust_size = 10
topic_model_mmr = build_model_mmr(n_neighbors, n_components, min_clust_size, random_seed=random_seed)

# Run BERTopic model.
# fit_transform expects a list of document strings; a single placeholder string
# is rejected. The later cells pass df['message'] to visualize_documents and
# hierarchical_topics, which need the same corpus the model was fitted on,
# so fit on that column here.
# NOTE(review): assumes `df` is loaded before this cell; no loading cell is
# visible in this notebook - TODO confirm.
documents = df['message'].tolist()
topics, probs = topic_model_mmr.fit_transform(documents)

# Get the list of topics
pd.set_option('display.max_colwidth', None)  # show full topic text instead of truncating it
display(topic_model_mmr.get_topic_info())
display(topic_model_mmr.visualize_barchart(top_n_topics=100, n_words=15, height=300))

In [None]:
def get_completion(prompt, model="gpt-3.5-turbo-16k"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: text used as the "content" of the one user message.
        model: model name forwarded to ``openai.ChatCompletion.create``.

    Returns:
        The "content" entry of the first choice's message.
    """
    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,
    }
    completion = openai.ChatCompletion.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message["content"]

import os

# Read credentials from the environment instead of writing them into the
# notebook, so they are never saved or committed along with it.
# The organization is optional (None when the variable is unset); a missing
# API key raises KeyError here rather than failing later on the first request.
openai.organization = os.environ.get("OPENAI_ORGANIZATION")
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
# ChatGPT to summarize topics

import json
import time

# Pause between consecutive API calls (presumably for rate limiting - TODO confirm).
delay_in_seconds = 30

summarization_prompt = """
I have a topic that is described by the following keywords: [KEYWORDS]

In this topic, the following documents separated by triple tick marks are a small but representative subset of all documents in the topic:
[DOCUMENTS]

Based on the information above, please give a description of this topic in a JSON object in the following format:
{"topic": <descriptive topic label with at least 5 words>,
"description": <description, specifically with regards to attitude toward nephrology>
}

Reply with only the answer in JSON form or as a JSON-parsable string.
"""

chatgpt_responses = []
df_topics = topic_model_mmr.get_topic_info()
topic_ids = list(df_topics['Topic'])

for position, t in enumerate(topic_ids):
    # Fall back to an empty list if no representative documents are returned,
    # so the join below cannot fail on None.
    rep_docs = topic_model_mmr.get_representative_docs(topic=t) or []
    topic_rep = topic_model_mmr.get_topic(topic=t)
    top_words = [pair[0] for pair in topic_rep]

    # Fill in the keywords BEFORE the documents: the documents are free text,
    # and if one of them contained the literal "[KEYWORDS]" a later replace
    # would rewrite the document itself.
    topic_prompt = summarization_prompt.replace('[KEYWORDS]', ', '.join(top_words))
    topic_prompt = topic_prompt.replace('[DOCUMENTS]', '\n ``` '.join(rep_docs))

    response = get_completion(topic_prompt)
    print('\n', t)

    # The model is asked for JSON but may reply with something else. Keep the
    # raw text in that case so one bad reply does not discard the whole run.
    try:
        parsed = json.loads(response)
    except json.JSONDecodeError:
        parsed = None

    if isinstance(parsed, dict):
        response_json = parsed
    else:
        print('Response for topic', t, 'is not a JSON object; storing raw text')
        response_json = {'raw_response': response}

    response_json['topic_index'] = t
    print(response_json)

    chatgpt_responses.append(response_json)

    # No need to wait after the final request.
    if position < len(topic_ids) - 1:
        time.sleep(delay_in_seconds)


df_chatgpt = pd.DataFrame(chatgpt_responses) # contains topic summaries generated by ChatGPT


In [None]:
# compute sentence embeddings for the documents and plot them with their topics

embedding_model = SentenceTransformer("paraphrase-MiniLM-L3-v2")

# Convert the column to a plain list once: encode() and visualize_documents()
# are documented to take a list of strings, and a pandas Series whose index is
# not 0..n-1 is not guaranteed to behave like one inside those libraries.
message_docs = df['message'].tolist()

embeddings = embedding_model.encode(message_docs, show_progress_bar=True)

topic_model_mmr.visualize_documents(message_docs, embeddings=embeddings, custom_labels=False)

In [None]:
# view topics in hierarchical tree

# Use topic_model_mmr: it is the only model object this notebook creates;
# no variable named `topic_model` is defined in any visible cell, so the old
# name raised NameError on a fresh kernel.
hierarchical_topics = topic_model_mmr.hierarchical_topics(df['message'].tolist())
tree = topic_model_mmr.get_topic_tree(hierarchical_topics)
print(tree)

# topic_model_mmr.visualize_hierarchy(hierarchical_topics=hierarchical_topics, custom_labels=True)

In [None]:
# Visualize term rank decrease
# Uses topic_model_mmr, the only model object this notebook creates; the
# previously referenced `topic_model` is not defined in any visible cell.
topic_model_mmr.visualize_term_rank(custom_labels=True)

In [None]:
# Visualize intertopic distance
# NOTE(review): custom_labels=True is requested, but no call that sets custom
# topic labels is visible in this notebook - confirm they are set elsewhere.
topic_model_mmr.visualize_topics(custom_labels=True)

In [None]:
# Visualize topic similarity using a heatmap
# NOTE(review): custom_labels=True is requested, but no call that sets custom
# topic labels is visible in this notebook - confirm they are set elsewhere.
topic_model_mmr.visualize_heatmap(custom_labels=True)