In [None]:
# Install the dependencies listed in ../requirements.txt into the environment
# of the running kernel (%pip targets the kernel, unlike a bare shell `pip`).
%pip install -r ../requirements.txt
# Load the autoreload extension and use mode 2 so that imported modules are
# reloaded before each cell executes; edits to project source files are then
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Import necessary libraries
import sys
import os
import shutil
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

sys.path.append(os.path.abspath(os.path.join(os.pardir, 'src')))

# Import project-specific internal modules
from preprocessor import DataPreprocessor
from plots import PlotGenerator
from src import config
from config import RUN_CONFIGURATION, EMOTION_STATES, NLP_CONFIG, SENTIMENT_MODEL_EXPORT_PATH_RAW, \
    SENTIMENT_MODEL_EXPORT_PATH_OPTIMIZED, EMOTION_VARIATIONS_PATH, NEGATION_PATTERNS_PATH, \
    HYPERPARAMETERS, RANDOM_STATE, PREDICTION_MODEL_EXPORT_PATH, TOPIC_MODEL_EXPORT_PATH, \
    is_step_enabled
from src.sentiment_analysis import SentimentAnalysisModel
from src.emotion_postprocessor import EmotionPostProcessor
from src.clinical_notes_prediction import ClinicalNotesNoShowPredictor
from clinical_topic_model import ClinicalTopicModel
from src import config
from src.plots import PlotGenerator

# Create an instance of the preprocessing and plotting classes
# `config` is the project configuration module imported above; the
# preprocessor receives the whole module rather than individual settings.
preprocessor = DataPreprocessor(config)
# Shared plotting helper used by the later visualization cells.
plotter = PlotGenerator(style='whitegrid', palette='viridis', figsize=(10, 6))

# Global seaborn theme. NOTE(review): this runs after PlotGenerator was built;
# if PlotGenerator configures seaborn globally (e.g. the 'viridis' palette),
# this call may reset parts of that configuration - confirm the intended order.
sns.set(style='whitegrid')
# Silences every warning category for the rest of the kernel session,
# including deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Load the raw dataset and show a quick structural overview.
# The whole cell is skipped when the 'dataload' step is disabled, in which
# case `df` is not defined by this cell.
if is_step_enabled('dataload'):
    df = preprocessor.load_data(config.DATASET_PATH)

    # Dimensions and column index, each preceded by its label.
    display("shape:", df.shape, "columns:", df.columns)

    # First rows, then summary statistics.
    head_preview = df.head()
    display(head_preview)
    summary_stats = df.describe()
    display(summary_stats)

# Preprocessing

In [None]:
# Run the preprocessing pipeline and show the same overview as after loading.
# `df` is rebound to the preprocessed frame, so re-running this cell feeds the
# already-preprocessed data back into preprocess_data.
if is_step_enabled('data_preprocess'):
    df = preprocessor.preprocess_data(df)

    # Dimensions and column index, each preceded by its label.
    display("shape:", df.shape, "columns:", df.columns)

    # First rows, then summary statistics.
    head_preview = df.head()
    display(head_preview)
    summary_stats = df.describe()
    display(summary_stats)

In [None]:
# Per-condition topic modeling (diabetes, hypertension, alcohol) with ClinicalTopicModel.
# One model instance is created and retrained for every condition, so after the
# loop `model` holds the most recently trained condition and `df_cond` holds the
# notes of the last condition in the list.
if is_step_enabled('nlp_topic_modeling'):
    model = ClinicalTopicModel(config)
    conditions = ['diabetes', 'hypertension', 'alcohol']
    # Parallel result lists: entry i belongs to conditions[i].
    perplexities, silhouette_scores, all_topics = [], [], []

    for cond in conditions:
        df_cond = model.preprocess_notes(df, cond)
        if not df_cond.empty:
            model.train(df_cond['PatientNotes_clean'])
            perplexity, sil_score = model.evaluate(df_cond['PatientNotes_clean'])

            # Report the evaluation metrics; the silhouette score is optional.
            print(f"\n--- {cond.title()} ---")
            print(f"Model Perplexity: {perplexity:.2f}")
            if sil_score is not None:
                print(f"Silhouette Score: {sil_score:.2f}")

            # List the top words of every topic, numbered from 1.
            topics = model.get_topics(n_top_words=10)
            for topic_number, topic_words in enumerate(topics, start=1):
                print(f"Topic {topic_number}: {' '.join(topic_words)}")

            perplexities.append(perplexity)
            # A missing silhouette score is recorded as 0.
            silhouette_scores.append(0 if sil_score is None else sil_score)
            all_topics.append(topics)
        else:
            # Nothing to train on: record placeholders so the lists stay aligned.
            print(f"Skipping {cond}: No clinical concepts found after MedSpaCy extraction.")
            perplexities.append(None)
            silhouette_scores.append(None)
            all_topics.append([])

In [None]:
# Visualize the topic-modeling results: clinical entities in sample notes,
# word clouds, and bar charts of perplexity / silhouette score per condition.
#
# This cell reads `model`, `df_cond`, `conditions`, `perplexities` and
# `silhouette_scores` as left behind by the topic-modeling cell. `df_cond`
# therefore contains only the notes of the last condition in `conditions`.
if is_step_enabled('nlp_topic_modeling'):
    max_notes_to_show = 10

    # Render the clinical entities of the first few notes, each exactly once.
    if not df_cond.empty:
        for i in range(min(max_notes_to_show, len(df_cond))):
            sample_note = df_cond['PatientNotes'].iloc[i]
            print(f'Visualizing clinical entities for a note {i+1}:')
            model.plot_medspacy_ents(sample_note)
    else:
        print('No notes available for visualization.')

    # Word clouds per condition. A condition the training loop skipped is
    # recorded with a perplexity of None; drawing a cloud for it would show
    # another condition's model (or an unfitted one) under the wrong label.
    # NOTE(review): the training loop retrains one shared model, so
    # model.model / model.vectorizer appear to hold only the most recently
    # trained condition and the clouds below are presumably identical apart
    # from the label. Keeping one fitted model per condition in the training
    # cell would be needed to make them condition-specific - confirm.
    for cond, cond_perplexity in zip(conditions, perplexities):
        if cond_perplexity is None:
            print(f"Skipping word cloud for {cond}: no topic model was trained.")
            continue
        plotter.plot_wordclouds(model.model, model.vectorizer, cond)

    # Bar charts. None placeholders (skipped conditions) are replaced by NaN
    # so the values are purely numeric and the condition shows up as a gap.
    nan = float('nan')
    perplexity_values = [nan if value is None else value for value in perplexities]
    silhouette_values = [nan if value is None else value for value in silhouette_scores]
    plotter.plot_bar(conditions, perplexity_values, title='LDA Model Perplexity by Condition (MedSpaCy)', ylabel='Perplexity')
    plotter.plot_bar(conditions, silhouette_values, title='LDA Silhouette Score by Condition (MedSpaCy)', ylabel='Silhouette Score')

In [None]:
# Persist the topic model once the 'nlp_topic_modeling' step has run.
# `model` is the instance left behind by the training cell.
if is_step_enabled('nlp_topic_modeling'):
    model.export(TOPIC_MODEL_EXPORT_PATH)

    # Report where the artifacts were written.
    # NOTE(review): the file suffixes in this message are assumed to match what
    # model.export actually writes - confirm against ClinicalTopicModel.export.
    export_message = f"Model and vectorizer exported to: {TOPIC_MODEL_EXPORT_PATH}_lda_model.joblib and {TOPIC_MODEL_EXPORT_PATH}_vectorizer.joblib"
    print(export_message)