In [1]:
import gensim
import nltk
import pandas as pd
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.llms import OpenAI
from bertopic import BERTopic
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
import gensim
import spacy
import gensim.corpora as corpora
import gensim.models as models
from textblob import TextBlob
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from ast import literal_eval
import string

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Make sure the NLTK "stopwords" corpus is available locally; stopwords.words()
# in the next cell reads from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yijin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# English stop-word list from NLTK; preprocess() drops every token found in it.
stop_words = stopwords.words("english")
# English Snowball stemmer; preprocess() only applies it when stem=True.
stemmer = SnowballStemmer("english")

In [4]:
# TEXT CLEANING
# Written as a raw string so that `\S` reaches the regex engine directly instead
# of relying on Python passing an unrecognised string escape through (which
# emits a warning on current Python versions). The pattern value is unchanged.
# Alternatives, in order: @mentions, http/https URLs, a leftover "htt:"/"http:"
# fragment, and any run of non-letter characters.
# NOTE(review): the third alternative `http?:\S` only consumes ONE character
# after the colon and looks like a typo; kept as-is so cleaning output does not
# change -- confirm the intent before touching it.
TEXT_CLEANING = r"@\S+|https?:\S+|http?:\S|[^A-Za-z]+"

def preprocess(text, stem=False):
    """
    Lower-cases and cleans a piece of text, then removes stop words.

    Parameters:
        text: The value to clean. It is converted with str() first, so
            non-string inputs are accepted.
        stem (bool): When True, each kept token is stemmed with the
            module-level `stemmer`. Defaults to False.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # Everything matched by TEXT_CLEANING becomes a space, so only runs of
    # ASCII letters survive as tokens.
    text = re.sub(TEXT_CLEANING, ' ', str(text).lower()).strip()

    kept = (token for token in text.split() if token not in stop_words)
    if stem:
        kept = (stemmer.stem(token) for token in kept)

    return " ".join(kept)

In [10]:
def get_topic_lists_from_corpus(file, num_topics, words_per_topic):
    """
    Extracts topics and their associated words from the 'conclusion' column of
    a CSV file using BERTopic.

    Parameters:
        file (str): The path to the CSV file for topic extraction. It must
            contain a 'conclusion' column; every non-missing row is treated as
            one document.
        num_topics (int): The number of topics to discover. Passed to BERTopic
            as `nr_topics`.
        words_per_topic (int): The maximum number of entries to keep per topic.

    Returns:
        list: One sublist per topic reported by the fitted model, each holding
        the first `words_per_topic` entries of that topic's representation as
        returned by `BERTopic.get_topics()`.

    Raises:
        ValueError: If the CSV file has no 'conclusion' column.
    """
    # Load the CSV file; malformed rows are skipped rather than raising.
    df = pd.read_csv(file, on_bad_lines='skip', encoding='latin1')

    # Check that the column we read documents from exists
    if 'conclusion' not in df.columns:
        raise ValueError("CSV file must contain a 'conclusion' column")

    # Each row is one document. Missing values are dropped first: preprocess()
    # calls str() on its input, which would otherwise turn every NaN into the
    # literal document "nan".
    processed_documents = df['conclusion'].dropna().apply(preprocess).tolist()

    # Honour the caller's requested topic count instead of a fixed value.
    bertmodel = BERTopic(nr_topics=num_topics, verbose=True)
    bertmodel.fit_transform(processed_documents)

    # NOTE(review): per the BERTopic docs, topic id -1 collects outlier
    # documents. It is passed through here exactly as before -- confirm whether
    # it should be filtered out before prompting the LLM.
    return [words[:words_per_topic] for words in bertmodel.get_topics().values()]

In [21]:
def topics_from_corpus(llm, file, num_topics, words_per_topic):
    """
    Asks an LLM to name and describe the topics extracted from a CSV file.

    This function takes the output of `get_topic_lists_from_corpus`, which
    consists of one list of topic-related entries per topic, renders each list
    on its own line, and sends them to the LLM with a prompt that requests a
    numbered, table-of-contents style answer.

    Parameters:
        llm (LLM): An instance of the Large Language Model (LLM) for generating
        responses.
        file (str): The path to the CSV file for extracting topic-related words.
        num_topics (int): The number of topics to consider.
        words_per_topic (int): The number of words per topic to include.

    Returns:
        str: A response generated by the language model based on the provided
        topic words.
    """

    # Extract topics and convert to string, one list per line
    list_of_topicwords = get_topic_lists_from_corpus(file, num_topics,
                                                  words_per_topic)
    string_bert = "".join(str(topic_words) + "\n"
                          for topic_words in list_of_topicwords)

    # Create the template
    # NOTE(review): the first two sentences of the prompt read as if they were
    # accidentally interleaved; the text is left untouched because changing it
    # changes what the model is asked -- confirm the intended wording.
    template_string = '''Describe the topic of each of the {num_topics} and give the topic a name.
        Double-quote delimited lists in a simple sentence and describe the topic.
        The lists are the result of an algorithm for topic discovery.
        Do not provide an introduction or a conclusion, only describe the
        topics. Do not mention the word "topic" when describing the topics.
        Use the following template for the response.

        1: (Topic Name) This topic discusses the [brief description of the topic].
        - Keywords: [Comma-separated keywords]

        2: (Topic Name) This topic focuses on the [brief description of the topic].
        - Keywords: [Comma-separated keywords]

        ...

        n: (Topic Name) This topic covers the [brief description of the topic].
        - Keywords: [Comma-separated keywords]

        Lists: """{string_bert}""" '''

    # LLM call
    prompt_template = ChatPromptTemplate.from_template(template_string)
    chain = LLMChain(llm=llm, prompt=prompt_template)
    response = chain.run({
        "string_bert" : string_bert,
        # Report the number of lists actually sent, so the prompt cannot
        # disagree with its own payload when extraction returns a different
        # number of topics than was requested.
        "num_topics" : len(list_of_topicwords)
        })

    return response

In [22]:
import os

# Read the API key from the environment so it never has to be pasted into (and
# saved with) the notebook. Falls back to an empty string, matching the
# previous placeholder value, when OPENAI_API_KEY is not set.
openai_key = os.environ.get("OPENAI_API_KEY", "")
# NOTE(review): max_tokens=-1 is presumably LangChain's "no explicit limit"
# setting for this wrapper -- confirm against the installed version.
llm = OpenAI(openai_api_key=openai_key, max_tokens=-1)

In [23]:
# Input CSV and topic-extraction settings for this run.
file = "Report Dataset.csv"
num_topics = 15
words_per_topic = 15

# Runs topic extraction followed by the LLM call; the returned text is kept in
# `summary` for display in the next cell.
summary = topics_from_corpus(llm, file, num_topics, words_per_topic)

2024-06-04 12:36:16,646 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 466/466 [00:05<00:00, 87.50it/s] 
2024-06-04 12:36:26,949 - BERTopic - Embedding - Completed ✓
2024-06-04 12:36:26,950 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-04 12:36:36,721 - BERTopic - Dimensionality - Completed ✓
2024-06-04 12:36:36,723 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-04 12:36:37,126 - BERTopic - Cluster - Completed ✓
2024-06-04 12:36:37,126 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-04 12:36:37,334 - BERTopic - Representation - Completed ✓
2024-06-04 12:36:37,335 - BERTopic - Topic reduction - Reducing number of topics
2024-06-04 12:36:37,477 - BERTopic - Topic reduction - Reduced number of topics from 219 to 15


In [24]:
# Show the LLM-generated topic names, descriptions and keywords.
print(summary)



1: (EEG Waves) This topic discusses the various types of EEG waves and their significance.
- Keywords: eeg, waves, abnormal, slow, temporal, background, focal, region, diffuse, consistent

2: (Seizure Disorder) This topic focuses on the diagnosis and characteristics of seizure disorders.
- Keywords: seizure, disorder, waves, focal, sharp, temporal, abnormal, eeg, region, head

3: (Normal EEG) This topic covers the normal range of EEG readings for different ages.
- Keywords: normal, within, limits, eeg, limit, age, alpha, compared, hyperventilation, otherwise

4: (Diffuse Brain Dysfunction) This topic discusses diffuse brain dysfunction and its impact on EEG readings.
- Keywords: diffuse, dysfunction, cortical, slow, waves, severe, encephalopathy, consistent, abnormal, eeg

5: (Sleep EEG) This topic covers the characteristics and findings of EEG readings during sleep.
- Keywords: sleep, awake, normal, eeg, discharges, epilepsy, showing, temporal, findings, background

6: (Epileptiform