In [1]:
# import libraries
import os
import logging
import json
import pickle
import glob
import re
from itertools import chain
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    average_precision_score,
)
from scipy.stats import gmean

# import gensim libraries
from gensim import corpora
from gensim.models import LdaModel

# from gensim.models.ldamulticore import LdaMulticore
from gensim.similarities import MatrixSimilarity
from gensim.parsing.preprocessing import preprocess_documents

In [1]:
def create_directories(directories):
    """Create every directory in ``directories`` that does not exist yet.

    Missing parent directories are created as well. A line is printed for
    each directory this call attempted to create; paths that already exist
    are skipped silently.

    Args:
        directories: iterable of directory paths.
    """
    for directory in directories:
        if not os.path.exists(directory):
            # exist_ok=True keeps the call safe if another process creates the
            # directory between the existence check and makedirs.
            os.makedirs(directory, exist_ok=True)
            print(f"Created directory: {directory}")


def extract_pmid(links):
    """Return the last ``/``-separated segment of every link in ``links``.

    Args:
        links: iterable of URL strings.

    Returns:
        list of str, one entry per link, in the same order.
    """
    pmids = []
    for url in links:
        # Only the text after the final slash is kept.
        pmids.append(url.rsplit("/", 1)[-1])
    return pmids


def prepare_dataset(dataset_name, dataset_dir, doc_df):
    """Load a dataset's questions and keep those with documents in ``doc_df``.

    Every ``*.json`` file in ``dataset_dir`` is parsed and the entries under
    its ``"questions"`` key are concatenated into one frame. The ``body``
    column is renamed to ``query``; ``pmids`` holds the last path segment of
    each link in ``documents``; ``pmids_found`` keeps only the PMIDs that are
    present in ``doc_df["pmid"]``. Summary statistics are printed and logged
    through the module-level ``logger``.

    Args:
        dataset_name: label used in the printed/logged messages.
        dataset_dir: directory holding the ``*.json`` question files.
        doc_df: DataFrame with a ``pmid`` column describing the known corpus.

    Returns:
        A copy of the question rows that have at least one PMID in
        ``doc_df``. The row index is not reset after filtering.

    Raises:
        ValueError: if ``dataset_dir`` contains no ``*.json`` file.
    """
    # Sort so that the row order does not depend on file-system listing order.
    json_files = sorted(glob.glob(os.path.join(dataset_dir, "*.json")))
    if not json_files:
        raise ValueError(f"no JSON question files found in {dataset_dir}")

    dfs = []
    for json_file in json_files:
        with open(json_file, encoding="utf-8") as fp:
            data = json.load(fp)
        dfs.append(pd.DataFrame(data["questions"]))
    df = pd.concat(dfs, ignore_index=True)
    # rename 'body' column to 'query'
    df = df.rename(columns={"body": "query"})

    num_queries = df["query"].shape[0]
    print(f"total number of queries provided in {dataset_name}: {num_queries}")
    logger.info(
        "total number of queries provided in %s: %d", dataset_name, num_queries
    )

    df["pmids"] = df["documents"].apply(extract_pmid)
    pmids = list(chain.from_iterable(df["pmids"].to_list()))
    num_unique_pmids = len(set(pmids))
    print(
        f"total number of unique docs provided in {dataset_name}: {num_unique_pmids}"
    )
    logger.info(
        "total number of unique docs provided in %s: %d",
        dataset_name,
        num_unique_pmids,
    )

    corpus_df = doc_df[doc_df["pmid"].isin(pmids)]
    print(f"num of docs found in corpus:{corpus_df.shape[0]}")
    logger.info("num of docs found in corpus: %d", corpus_df.shape[0])

    # Build the lookup once; rebuilding a list for every single PMID made the
    # filtering quadratic in the corpus size.
    known_pmids = set(doc_df["pmid"])

    def filter_pmid(pmids):
        return [pmid for pmid in pmids if pmid in known_pmids]

    df["pmids_found"] = df["pmids"].apply(filter_pmid)

    # Explicit copy: callers add columns to the result, which would otherwise
    # be an assignment on a slice of ``df``.
    filtered_df = df[df["pmids_found"].apply(len) > 0].copy()

    # average number of docs per query
    total_num_docs = sum(len(found) for found in filtered_df["pmids_found"])
    total_num_queries = filtered_df["query"].shape[0]
    if total_num_queries:
        avg_num_docs_per_query = total_num_docs / total_num_queries
    else:
        # No query has a known document; avoid a ZeroDivisionError.
        avg_num_docs_per_query = float("nan")
    print(f"avg num of docs per query: {avg_num_docs_per_query}")
    # The average is a float; "%d" would silently truncate it.
    logger.info("avg num of docs per query: %.4f", avg_num_docs_per_query)
    return filtered_df


# retrieve the top N similar documents for a given document or query
def retrieve_documents(query, lda_model, sim_matrix, topn=10):
    """Rank indexed documents by similarity to ``query`` and keep the best.

    The query is turned into a bag of words with the module-level
    ``dictionary``, projected through ``lda_model`` and looked up in
    ``sim_matrix``.

    Args:
        query: tokens of the query (presumably a list of preprocessed
            strings, as ``doc2bow`` expects - confirm against callers).
        lda_model: model indexed with the bag-of-words vector.
        sim_matrix: similarity index queried with the model output.
        topn: maximum number of results to return.

    Returns:
        list of ``(position, similarity)`` pairs sorted by decreasing
        similarity, at most ``topn`` long.
    """
    query_bow = dictionary.doc2bow(query)
    query_topics = lda_model[query_bow]
    similarities = sim_matrix[query_topics]
    ranked = sorted(enumerate(similarities), key=lambda pair: -pair[1])
    return ranked[:topn]


# get pmids from doc indexes
def get_pmids_from_doc_indexes(doc_indexes, doc_df):
    """Map positional document hits to their PMIDs.

    Args:
        doc_indexes: iterable of sequences whose first element is the
            positional row number of a document in ``doc_df``.
        doc_df: DataFrame with a ``pmid`` column.

    Returns:
        list with the ``pmid`` value of every referenced row, in input order.
    """
    pmid_column = doc_df["pmid"]
    found = []
    for hit in doc_indexes:
        position = hit[0]
        found.append(pmid_column.iloc[position])
    return found


def calculate_metrics(df, true_col, pred_col):
    """Score predicted label lists against true label lists, row by row.

    For every row a ``MultiLabelBinarizer`` is fitted on the union of the
    true and predicted labels, both lists are turned into binary vectors and
    precision, recall, F1 and average precision are computed on them. The
    per-row scores are written to ``df`` in place as the float columns
    ``precision``, ``recall``, ``f1`` and ``avg_precision``.

    Args:
        df: DataFrame holding the label lists; it is modified in place.
        true_col: name of the column with the true labels of each row.
        pred_col: name of the column with the predicted labels of each row.

    Returns:
        One-row DataFrame with the columns ``mean_precision``,
        ``mean_recall``, ``mean_f1``, ``MAP`` and ``GMAP``.
    """
    scores = {"precision": [], "recall": [], "f1": [], "avg_precision": []}

    # Iterate over the values themselves so that the result does not depend
    # on the frame's index labels (a filtered frame has gaps in its index).
    for true_labels, pred_labels in zip(df[true_col], df[pred_col]):
        true_labels = list(true_labels)
        pred_labels = list(pred_labels)

        # Fit MultiLabelBinarizer on each row separately, on the union of
        # true and predicted labels.
        mlb = MultiLabelBinarizer()
        mlb.fit([true_labels + pred_labels])

        # Transform true and predicted labels separately
        y_true = mlb.transform([true_labels])[0]
        y_pred = mlb.transform([pred_labels])[0]

        scores["precision"].append(precision_score(y_true, y_pred, zero_division=0))
        scores["recall"].append(recall_score(y_true, y_pred, zero_division=0))
        scores["f1"].append(f1_score(y_true, y_pred, zero_division=0))
        # NOTE(review): y_pred is a binary membership vector, so the ranking
        # of the predictions is not taken into account here - confirm this is
        # the intended definition of average precision.
        scores["avg_precision"].append(average_precision_score(y_true, y_pred))

    # Assign aligned on df's own index, as float columns from the start.
    for column, values in scores.items():
        df[column] = pd.Series(values, index=df.index, dtype=float)

    # Calculate mean precision, mean recall, and mean f1
    mean_precision = df["precision"].mean()
    mean_recall = df["recall"].mean()
    mean_f1 = df["f1"].mean()

    # Calculate MAP and GMAP. A single zero average precision drives the
    # geometric mean to zero.
    map_score = df["avg_precision"].mean()
    gmap_score = gmean(df["avg_precision"])

    # Create a new dataframe to store the mean scores
    mean_scores_df = pd.DataFrame(
        {
            "mean_precision": mean_precision,
            "mean_recall": mean_recall,
            "mean_f1": mean_f1,
            "MAP": map_score,
            "GMAP": gmap_score,
        },
        index=[0],
    )

    # Only the summary is returned; the per-row scores live in ``df``.
    return mean_scores_df


def get_max(logs, metric):
    """Show the best rows of several result frames for one metric.

    Args:
        logs: iterable of DataFrames to concatenate.
        metric: name of the column to rank by.

    Returns:
        The first five rows of the concatenated frame after sorting by
        ``metric`` in descending order.
    """
    combined = pd.concat(logs)
    print(f"max {metric}:")
    ranked = combined.sort_values(by=[metric], ascending=False)
    return ranked.head()

In [3]:

# Configuration cell: resolves every path used by the notebook and sets up
# file logging. The directory constants are rebound in sequence below, so the
# statements must run in this order.
try:
    # load dir_dict from json file in home directory
    home_dir = os.path.expanduser("~")
    with open(f"{home_dir}/.biomedqa_dir.json", encoding="utf-8") as fp:
        dir_dict = json.load(fp)
except Exception as exc:
    # Tell the user how to recover, then re-raise the original error.
    print("Error: unable to load directory dictionary. Please run setup.py")
    raise exc

# set directories
BASE_DIR = dir_dict["base_dir"]
DATA_DIR = dir_dict["data_dir"]
MODEL_DIR = dir_dict["model_dir"]
LOG_DIR = dir_dict["log_dir"]
RESULTS_DIR = dir_dict["results_dir"]

# Dataset selection. Train and test currently point at the same dataset.
DATASET = "bioasq"
YEAR = "2021"
TRAIN_DATASET_NAME = "Task9BGoldenEnriched"
TEST_DATASET_NAME = "Task9BGoldenEnriched"
# Set by hand (presumably because the kernel does not define __file__ for a
# notebook); the log/model/results sub-directories and the log file name are
# derived from this value.
__file__ = "notebooks/topic_model_analysis.ipynb"

TRAIN_DATASET_DIR = f"{DATA_DIR}/raw/{DATASET}/{YEAR}/{TRAIN_DATASET_NAME}"
TRAIN_DOC_DIR = (
    f"{DATA_DIR}/processed/{DATASET}/{YEAR}/{TRAIN_DATASET_NAME}_documents/"
)
print(f"train dataset name:{TRAIN_DATASET_NAME}")
print(f"train dataset dir:{TRAIN_DATASET_DIR}")
print(f"train doc dir:{TRAIN_DOC_DIR}")

TEST_DATASET_DIR = f"{DATA_DIR}/raw/{DATASET}/{YEAR}/{TEST_DATASET_NAME}"
TEST_DOC_DIR = (
    f"{DATA_DIR}/processed/{DATASET}/{YEAR}/{TEST_DATASET_NAME}_documents/"
)
print(f"test dataset name:{TEST_DATASET_NAME}")
print(f"test dataset dir:{TEST_DATASET_DIR}")
print(f"test doc dir:{TEST_DOC_DIR}")

# get file directory
# relpath() is evaluated against the current working directory, so the value
# depends on where the kernel was started.
FILE_DIR = os.path.dirname(os.path.relpath(__file__))

# set log dir directory according to current file directory
LOG_DIR = f"{LOG_DIR}/{FILE_DIR}"
print(f"log dir:{LOG_DIR}")

# set model directory according to current file directory
# (ends with "/", later paths append file names directly)
MODEL_DIR = f"{MODEL_DIR}/{FILE_DIR}/{DATASET}/{YEAR}/{TRAIN_DATASET_NAME}/"
print(f"model dir:{MODEL_DIR}")

# set results directory according to current file directory
RESULTS_DIR = f"{RESULTS_DIR}/{FILE_DIR}/{DATASET}/{YEAR}/{TRAIN_DATASET_NAME}/"
print(f"results dir:{RESULTS_DIR}")

# create directories
create_directories([LOG_DIR, MODEL_DIR, RESULTS_DIR])

# set log file name
# The log file is named after the notebook: text before the first "." of the
# base name, plus ".log".
log_file = os.path.join(
    LOG_DIR, os.path.basename(__file__).split(".")[0] + ".log"
)
print(f"LOG_FILE: {log_file}")

# initialize logger

# Tab-separated records (process id, timestamp, level, message) written to
# log_file at DEBUG level.
logging.basicConfig(
    filename=log_file,
    format="%(process)d\t%(asctime)s\t%(levelname)s\t%(message)s",
    level=logging.DEBUG,
)
logger = logging.getLogger(__name__)
logger.info("Logger initialized")

# load documents
# NOTE(review): read_pickle executes code embedded in the pickle; only load
# document frames produced by this project.
logger.info("loading documents")
train_doc_df = pd.read_pickle(
    f"{TRAIN_DOC_DIR}{TRAIN_DATASET_NAME}_documents_df.pkl"
)
test_doc_df = pd.read_pickle(
    f"{TEST_DOC_DIR}{TEST_DATASET_NAME}_documents_df.pkl"
)

# keep only the queries that have at least one document in the corpus
train_filtered_df = prepare_dataset(
    TRAIN_DATASET_NAME, TRAIN_DATASET_DIR, train_doc_df
)
test_filtered_df = prepare_dataset(
    TEST_DATASET_NAME, TEST_DATASET_DIR, test_doc_df
)

# preprocess documents using gensim's preprocess_documents function
logger.info("preprocessing train documents")
train_doc_df["abstractText_preprocessed"] = preprocess_documents(
    train_doc_df["abstractText"]
)
logger.info("preprocessing test documents")
test_doc_df["abstractText_preprocessed"] = preprocess_documents(
    test_doc_df["abstractText"]
)

# Create a dictionary from the preprocessed documents of the training set
logger.info("creating dictionary")
dictionary = corpora.Dictionary(train_doc_df["abstractText_preprocessed"])

# create bag of words corpus of the training set
logger.info("creating bag of words for train documents")
train_corpus = [
    dictionary.doc2bow(text)
    for text in train_doc_df["abstractText_preprocessed"]
]
# Create bag of words corpus of the test set (encoded with the training
# dictionary)
logger.info("creating bag of words for test documents")
test_corpus = [
    dictionary.doc2bow(text)
    for text in test_doc_df["abstractText_preprocessed"]
]

# preprocess questions
logger.info("preprocessing test questions")
# prepare_dataset returns a boolean-filtered frame; take an explicit copy
# before adding a column so the assignment never targets a slice
# (SettingWithCopyWarning).
test_filtered_df = test_filtered_df.copy()
test_filtered_df["query_preprocessed"] = preprocess_documents(
    test_filtered_df["query"].to_list()
)


train dataset name:Task9BGoldenEnriched
train dataset dir:/workspace/data/raw/bioasq/2021/Task9BGoldenEnriched
train doc dir:/workspace/data/processed/bioasq/2021/Task9BGoldenEnriched_documents/
test dataset name:Task9BGoldenEnriched
test dataset dir:/workspace/data/raw/bioasq/2021/Task9BGoldenEnriched
test doc dir:/workspace/data/processed/bioasq/2021/Task9BGoldenEnriched_documents/
log dir:/workspace/logs/notebooks
model dir:/workspace/models/notebooks/bioasq/2021/Task9BGoldenEnriched/
results dir:/workspace/results/notebooks/bioasq/2021/Task9BGoldenEnriched/
LOG_FILE: /workspace/logs/notebooks/topic_model_analysis.log
total number of queries provided in Task9BGoldenEnriched: 497
total number of unique docs provided in Task9BGoldenEnriched: 3523
num of docs found in corpus:3522
avg num of docs per query: 7.305835010060362
total number of queries provided in Task9BGoldenEnriched: 497
total number of unique docs provided in Task9BGoldenEnriched: 3523
num of docs found in corpus:3522
av

In [None]:
from IPython.display import Image, display

# Render the dataset overview figure stored next to the notebook.
overview_image = Image(filename='/workspace/notebooks/Dataset_overview.png')
display(overview_image)

In [4]:
# train LDA model
# All hyper-parameters are collected in one mapping so they can be read (and
# logged) in one place before the model is built.
lda_params = {
    "corpus": train_corpus,
    "id2word": dictionary,
    "num_topics": 1216,
    "chunksize": 2576,
    "passes": 5,
    "update_every": 1,
    "alpha": "symmetric",
    "eta": "symmetric",
    "decay": 0.5,
    "offset": 1,
    "eval_every": 10,
    "iterations": 50,
    "gamma_threshold": 0.001,
    "minimum_probability": 0.01,
    "random_state": 1,
    "minimum_phi_value": 0.01,
}
lda_model = LdaModel(**lda_params)

# save model
logger.info("saving model")
lda_model_path = f"{MODEL_DIR}lda_model"
lda_model.save(lda_model_path)

In [9]:
# load model
# Reads back the model stored under MODEL_DIR so that training can be skipped.
logger.info("loading model")
lda_model_path = f"{MODEL_DIR}lda_model"
lda_model = LdaModel.load(lda_model_path)

In [7]:
# Calculate coherence score using different coherence measure
from gensim.models import CoherenceModel

coherence_measures = ['c_v', 'u_mass', 'c_uci', 'c_npmi']
coherence_scores = {}
for coherence_measure in coherence_measures:
    # One CoherenceModel per measure, all built on the training texts.
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=train_doc_df["abstractText_preprocessed"],
        dictionary=dictionary,
        coherence=coherence_measure,
    )
    coherence_score = coherence_model.get_coherence()
    coherence_scores.update({coherence_measure: coherence_score})

# Print the coherence score
print("Coherence Scores:", coherence_scores)

Coherence Scores: {'c_v': 0.45027881342796156, 'u_mass': -4.170647331366011, 'c_uci': -2.975153693266967, 'c_npmi': -0.061874599812039875}


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Model and corpus whose topic assignments are summarised below
trained_lda_model = lda_model
corpus = train_corpus

# Topic distribution of every document in the corpus
topic_distribution = [trained_lda_model.get_document_topics(doc) for doc in corpus]

# For each topic, count the documents whose distribution contains it
topic_counts = np.zeros(trained_lda_model.num_topics)
for doc_topics in topic_distribution:
    for topic_id, _probability in doc_topics:
        topic_counts[topic_id] += 1

# Order the topics from most to least frequent
sorted_topics = np.argsort(topic_counts)[::-1]
sorted_counts = topic_counts[sorted_topics]

# Label each topic with its most significant words
num_top_words = 5  # number of words used for each topic label
topic_labels = [
    ', '.join(
        word
        for word, _weight in trained_lda_model.show_topic(topic_id, topn=num_top_words)
    )
    for topic_id in sorted_topics
]

# Horizontal bar plot; the figure height grows with the number of topics
topics = range(trained_lda_model.num_topics)
fig, ax = plt.subplots(figsize=(10, 0.15 * trained_lda_model.num_topics))
ax.barh(topics, sorted_counts)
ax.set_xlabel('Number of Documents')
ax.set_ylabel('Topic')
ax.set_title('Topic Distribution')
ax.set_yticks(topics)
ax.set_yticklabels(topic_labels)
ax.invert_yaxis()
ax.margins(0.01)
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# A legend is only drawn up to this many topics; beyond that it would cover
# the whole figure.
MAX_LEGEND_TOPICS = 20

# Model and corpus whose per-document topic mix is plotted below
trained_lda_model = lda_model
corpus = train_corpus

# Get the topic distribution for each document in the corpus
topic_distribution = [trained_lda_model.get_document_topics(doc) for doc in corpus]

# Document x topic matrix of topic proportions. Topics that
# get_document_topics does not report for a document stay at 0, so a row
# does not necessarily sum to 1.
topic_proportions = np.zeros((len(corpus), trained_lda_model.num_topics))
for doc_idx, doc_topics in enumerate(topic_distribution):
    for topic, proportion in doc_topics:
        # Store the probability itself: the y axis is labelled as a
        # proportion, not as a presence flag.
        topic_proportions[doc_idx, topic] = proportion

# Create stacked area chart
topics = range(trained_lda_model.num_topics)
fig, ax = plt.subplots(figsize=(10, 6))
ax.stackplot(range(len(corpus)), topic_proportions.T, labels=topics)
ax.set_xlabel('Document Index')
ax.set_ylabel('Topic Proportion')
ax.set_title('Topic Distribution')
if trained_lda_model.num_topics <= MAX_LEGEND_TOPICS:
    ax.legend(loc='upper right')
plt.show()

In [7]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Let pyLDAvis visualisations render inline as cell output.
pyLDAvis.enable_notebook()

In [None]:
# Interactive topic map of the trained model, laid out with the 'mmds' option.
gensimvis.prepare(
    lda_model,
    train_corpus,
    dictionary,
    mds='mmds',
)

In [8]:
# File name of the exported pyLDAvis HTML page. The path is relative, so the
# file is written to the kernel's current working directory.
viz_path = f"optimized_lda_{TRAIN_DATASET_NAME}_on_{TEST_DATASET_NAME}.html"

In [9]:
# Build the visualisation data ('mmds' layout) and export it to viz_path as an
# HTML page.
p = gensimvis.prepare(
    lda_model,
    train_corpus,
    dictionary,
    mds='mmds',
)
pyLDAvis.save_html(p, viz_path)



In [None]:
# Same visualisation as above, laid out with the 'tsne' option instead.
gensimvis.prepare(
    lda_model,
    train_corpus,
    dictionary,
    mds='tsne',
)