In [1]:
import os
import re

import gensim
import nltk
import pandas as pd
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import ChatPromptTemplate
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
# Fetch the NLTK stop-word corpus before reading it so this cell also works on
# a fresh environment (stopwords.words raises LookupError when the corpus is
# missing). quiet=True keeps the downloader from printing progress lines.
nltk.download("stopwords", quiet=True)

# Module-level defaults: the English stop-word list as returned by NLTK, and a
# Snowball stemmer for English.
# NOTE(review): the functions visible in this notebook build their own stop-word
# set and stemmer and do not read these two names; kept in case other cells do.
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [3]:
# TEXT CLEANING
# Raw string, so the regex escape \S is not parsed as an (invalid) Python
# string escape. Alternatives are tried in this order at every position:
#   @\S+         an @mention
#   https?:\S+   a URL
#   http?:\S     kept from the original pattern
#   [^A-Za-z@]+  a run of non-letters (digits, punctuation, whitespace)
#   @            a stray "@" that does not start a mention
# "@" is excluded from the non-letter run on purpose: otherwise a run such as
# " @" swallows the "@" and the mention's name survives as an ordinary token.
TEXT_CLEANING = r"@\S+|https?:\S+|http?:\S|[^A-Za-z@]+|@"

def preprocess(text, stop_words, stem=False):
    """
    Preprocesses the text by removing unwanted characters, tokenizing, 
    removing stopwords, and optionally stemming.

    The text is lower-cased, every match of TEXT_CLEANING is replaced by a
    space, and the result is split on whitespace. Stop words are filtered
    before stemming, so they are compared against the unstemmed tokens.

    Parameters:
        text (str): The text to preprocess. Other objects are converted with
            str(); None and float NaN are treated as an empty document.
        stop_words (iterable of str or None): Stop words to remove. A set or
            frozenset is used as-is; any other iterable is copied into a
            frozenset once per call. None means "remove nothing".
        stem (bool): Whether to apply Porter stemming to the kept tokens.

    Returns:
        list: A list of processed words (tokens), in order of appearance.
    """
    # A missing value (None, or the float NaN pandas uses for empty cells)
    # would otherwise be stringified into the bogus token "none" / "nan".
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Constant-time membership tests, whatever container the caller passed
    if stop_words is None:
        stop_words = frozenset()
    elif not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)

    # Clean and tokenize text
    cleaned = re.sub(TEXT_CLEANING, ' ', str(text).lower()).strip()
    tokens = [token for token in cleaned.split() if token not in stop_words]

    if stem:
        # Imported lazily so NLTK is only required when stemming is requested
        from nltk.stem.porter import PorterStemmer

        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]

    return tokens

In [4]:
# NOTE(review): the next cell defines get_topic_lists_from_corpus again; after
# "Run All" the later definition is the one in effect. Keep only one of them.
def get_topic_lists_from_corpus(file, num_topics, words_per_topic,
                                text_column="conclusion", random_state=42):
    """
    Extracts topics and their associated words from a CSV corpus using the 
    Latent Dirichlet Allocation (LDA) algorithm.

    Every non-missing cell of ``text_column`` is treated as one document.
    Documents are cleaned with ``preprocess`` using the English and Spanish
    NLTK stop-word lists.

    Parameters:
        file (str): The path to the CSV file for topic extraction.
        num_topics (int): The number of topics to discover.
        words_per_topic (int): The number of words to include per topic.
        text_column (str): Name of the CSV column holding the documents.
        random_state (int or None): Seed passed to the LDA model so repeated
            runs give the same topics; pass None for an unseeded model.

    Returns:
        list: A list of num_topics sublists, each containing relevant words 
        for a topic.

    Raises:
        ValueError: If the CSV has no ``text_column`` column, or if no tokens
            are left after preprocessing.
    """
    # Load the CSV file (malformed rows are skipped by pandas)
    df = pd.read_csv(file, on_bad_lines='skip', encoding='latin1')

    # Check that the text column exists
    if text_column not in df.columns:
        raise ValueError(f"CSV file must contain a '{text_column}' column")

    # Each row is one document. Missing cells are dropped so they are not
    # turned into the literal word "nan".
    documents = df[text_column].dropna().astype(str).tolist()

    # Preprocess the documents
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words(['english', 'spanish']))
    processed_documents = [preprocess(doc, stop_words) for doc in documents]

    # Create a dictionary and a corpus
    dictionary = corpora.Dictionary(processed_documents)
    if len(dictionary) == 0:
        raise ValueError(
            f"No tokens left in column '{text_column}' after preprocessing"
        )
    corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

    # Build the LDA model
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15,
        random_state=random_state,
    )

    # num_topics=-1 asks for every topic (print_topics defaults to 20), and
    # formatted=False yields (topic_id, [(word, weight), ...]) pairs, so the
    # words do not have to be parsed back out of a formatted string.
    topics = lda_model.show_topics(
        num_topics=-1, num_words=words_per_topic, formatted=False
    )

    # Keep only the words of each topic
    return [[word for word, _ in word_weights] for _, word_weights in topics]

In [5]:
# NOTE(review): this cell re-defines get_topic_lists_from_corpus, which the
# previous cell already defines; this later definition is the one in effect
# after "Run All". Keep only one of the two cells.
def get_topic_lists_from_corpus(file, num_topics, words_per_topic,
                                text_column="conclusion", random_state=42):
    """
    Extracts topics and their associated words from a CSV corpus using the 
    Latent Dirichlet Allocation (LDA) algorithm.

    Every non-missing cell of ``text_column`` is treated as one document.
    Documents are cleaned with ``preprocess`` using the English and Spanish
    NLTK stop-word lists.

    Parameters:
        file (str): The path to the CSV file for topic extraction.
        num_topics (int): The number of topics to discover.
        words_per_topic (int): The number of words to include per topic.
        text_column (str): Name of the CSV column holding the documents.
        random_state (int or None): Seed passed to the LDA model so repeated
            runs give the same topics; pass None for an unseeded model.

    Returns:
        list: A list of num_topics sublists, each containing relevant words 
        for a topic.

    Raises:
        ValueError: If the CSV has no ``text_column`` column, or if no tokens
            are left after preprocessing.
    """
    # Load the CSV file (malformed rows are skipped by pandas)
    df = pd.read_csv(file, on_bad_lines='skip', encoding='latin1')

    # Check that the text column exists
    if text_column not in df.columns:
        raise ValueError(f"CSV file must contain a '{text_column}' column")

    # Each row is one document. Missing cells are dropped so they are not
    # turned into the literal word "nan".
    documents = df[text_column].dropna().astype(str).tolist()

    # Preprocess the documents
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words(['english', 'spanish']))
    processed_documents = [preprocess(doc, stop_words) for doc in documents]

    # Create a dictionary and a corpus
    dictionary = corpora.Dictionary(processed_documents)
    if len(dictionary) == 0:
        raise ValueError(
            f"No tokens left in column '{text_column}' after preprocessing"
        )
    corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

    # Build the LDA model
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15,
        random_state=random_state,
    )

    # num_topics=-1 asks for every topic (print_topics defaults to 20), and
    # formatted=False yields (topic_id, [(word, weight), ...]) pairs, so the
    # words do not have to be parsed back out of a formatted string.
    topics = lda_model.show_topics(
        num_topics=-1, num_words=words_per_topic, formatted=False
    )

    # Keep only the words of each topic
    return [[word for word, _ in word_weights] for _, word_weights in topics]

In [6]:
def topics_from_corpus(llm, file, num_topics, words_per_topic):
    """
    Asks an LLM to name and describe the LDA topics found in a CSV corpus.

    This function takes the output of `get_topic_lists_from_corpus`, 
    which consists of a list of topic-related words for each topic, writes
    one stringified word list per line into a prompt, and returns the model's
    answer, which is requested in a numbered name/description/keywords format.

    Parameters:
        llm (LLM): An instance of the Large Language Model (LLM) for generating 
        responses.
        file (str): The path to the CSV file for extracting topic-related words.
        num_topics (int): The number of topics to consider.
        words_per_topic (int): The number of words per topic to include.

    Returns:
        str: A response generated by the language model based on the provided 
        topic words.
    """

    # Extract topics and convert to string: one word list per line, each line
    # terminated by a newline
    list_of_topicwords = get_topic_lists_from_corpus(file, num_topics,
                                                     words_per_topic)
    string_lda = "".join(f"{topic_words}\n" for topic_words in list_of_topicwords)

    # Create the template
    template_string = '''Describe the topic of each of the {num_topics}
        double-quote delimited lists in a simple sentence, give the topic a name
        and provide a set of relevant keywords.
        The lists are the result of an algorithm for topic discovery.
        Do not provide an introduction or a conclusion, only describe the
        topics. Do not mention the word "topic" when describing the topics.
        Use the following template for the response.

        1: <<<(Topic Name)(sentence describing the topic)>>>
        - Keywords: <<<(Comma-separated keywords)>>>

        2: <<<(Topic Name)(sentence describing the topic)>>>
        - Keywords: <<<(Comma-separated keywords)>>>

        ...

        n: <<<(Topic Name)(sentence describing the topic)>>>
        - Keywords: <<<(Comma-separated keywords)>>>

        Lists: """{string_lda}""" '''

    # LLM call. The count given to the model is the number of lists actually
    # placed in the prompt, so the instruction can never disagree with the data.
    prompt_template = ChatPromptTemplate.from_template(template_string)
    chain = LLMChain(llm=llm, prompt=prompt_template)
    response = chain.run({
        "string_lda": string_lda,
        "num_topics": len(list_of_topicwords)
    })

    return response

In [9]:
# Take the API key from the OPENAI_API_KEY environment variable so that no
# credential is ever typed into (and saved with) the notebook.
openai_key = os.environ.get("OPENAI_API_KEY", "")
if not openai_key:
    # Fail here with a clear message rather than later inside the LLM call
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this cell."
    )

# NOTE(review): max_tokens=-1 is presumably LangChain's "as many tokens as the
# context allows" setting -- confirm against the installed LangChain version.
llm = OpenAI(openai_api_key=openai_key, max_tokens=-1)

In [12]:
# Run configuration: the input CSV and the size of the LDA model
file = "Report Dataset.csv"
num_topics, words_per_topic = 15, 15

# Extract the topics and have the LLM name and describe them
summary = topics_from_corpus(
    llm=llm,
    file=file,
    num_topics=num_topics,
    words_per_topic=words_per_topic,
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yijin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# print() rather than a bare expression, so the line breaks in the LLM
# response are rendered instead of being shown as "\n" in a string repr.
print(summary)


1: (Non-convulsive Seizures) This topic discusses the physical manifestation of non-convulsive seizures, including their status, periodic features, and induction through midazolam.
- Keywords: non-convulsive, seizures, status, periodic, midazolam, induction, burst, suppression, stimulation, features

2: (Abnormal Brain Activity) This topic focuses on abnormal brain activity and its association with changes, events, and activities, as well as its comparison to normal brain activity.
- Keywords: abnormal, brain activity, changes, events, activities, comparison, beta, mildly, excessive 

3: (Epileptiform Discharges) This topic explores the presence of epileptiform discharges in EEG recordings, including their features, improvement, and focal characteristics.
- Keywords: epileptiform, discharges, EEG recordings, features, improvement, focal, background, findings 

4: (Underlying Brain Abnormality) This topic discusses the abnormal brain activity seen in EEG recordings, specifically slow w