### Topic Modeling 

In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora, models
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import matplotlib.pyplot as plt
import pandas as pd

In [26]:
# Download stopwords
nltk.download("punkt")
nltk.download("stopwords")
stop_words = set(stopwords.words("italian"))

[nltk_data] Downloading package punkt to /Users/ziade/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ziade/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


2. Text Processing 

In [11]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]  # Remove stopwords and punctuation
    return tokens


In [12]:
def train_lda(documents, num_topics=3):
    processed_docs = [preprocess_text(doc) for doc in documents]
    dictionary = corpora.Dictionary(processed_docs)
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
    return lda_model, corpus, dictionary


In [16]:
sample_text = """ 
**Artificial Intelligence (AI): A Mid-Level Overview**

Artificial Intelligence (AI) is a branch of computer science that focuses on creating systems capable of performing tasks that typically require human intelligence. These tasks include learning from data, recognizing patterns, understanding natural language, solving problems, and making decisions. AI is driven by various technologies, including machine learning, deep learning, neural networks, and natural language processing.

One of the key components of AI is machine learning, which enables computers to improve their performance by analyzing large amounts of data without explicit programming. Deep learning, a subset of machine learning, utilizes artificial neural networks to process complex data, making it particularly effective for image and speech recognition.

AI is widely used in multiple fields, such as finance, cybersecurity, manufacturing, transportation, and entertainment. Autonomous vehicles, recommendation systems, virtual assistants, and fraud detection systems all rely on AI technologies. Ethical concerns regarding AI include issues related to data privacy, bias in decision-making, and the impact of automation on employment.

Researchers and developers continue to refine AI models, aiming to enhance their efficiency, accuracy, and adaptability. As AI advances, it holds the potential to revolutionize industries and improve productivity in ways previously unimaginable.

---

**Understanding Healthcare: A Mid-Level Overview**

Healthcare refers to the organized efforts and systems designed to maintain or improve people’s physical and mental well-being. It encompasses a wide range of services, including disease prevention, diagnosis, treatment, rehabilitation, and health education. Healthcare is provided by medical professionals such as doctors, nurses, pharmacists, and therapists across various settings, including hospitals, clinics, and long-term care facilities.

The healthcare system consists of three primary components: primary, secondary, and tertiary care. **Primary care** is the first level of contact for patients, dealing with general health concerns, preventive measures, and routine check-ups. **Secondary care** includes specialized services such as cardiology, dermatology, and orthopedics, often provided by hospitals and specialists. **Tertiary care** involves highly specialized medical treatments, such as organ transplants and cancer therapies, typically available in advanced medical centers.

Public health initiatives play a crucial role in healthcare by promoting wellness, controlling disease outbreaks, and ensuring access to vaccinations. Governments, private organizations, and international institutions work together to improve healthcare infrastructure and policies. Key challenges in healthcare include accessibility, affordability, medical research advancements, and managing chronic diseases like diabetes and heart disease.

As technology evolves, healthcare is increasingly integrating digital solutions, such as electronic health records (EHRs), telemedicine, and personalized medicine, to enhance efficiency and patient outcomes. Ensuring quality care while addressing global health disparities remains a significant priority for healthcare professionals and policymakers.



"""

In [47]:
lda_model, corpus, dictionary = train_lda([sample_text], num_topics=5)

# Show topics
for i, topic in lda_model.show_topics(num_topics=5, formatted=True):
    print(f"Topic {i+1}: {topic}")


Topic 1: 0.004*"and" + 0.004*"of" + 0.004*"healthcare" + 0.004*"to" + 0.004*"the" + 0.004*"as" + 0.004*"such" + 0.004*"by" + 0.004*"is" + 0.004*"health"
Topic 2: 0.061*"and" + 0.025*"to" + 0.023*"of" + 0.021*"healthcare" + 0.016*"is" + 0.016*"as" + 0.014*"care" + 0.014*"learning" + 0.014*"the" + 0.012*"such"
Topic 3: 0.006*"and" + 0.005*"to" + 0.004*"of" + 0.004*"healthcare" + 0.004*"learning" + 0.004*"as" + 0.004*"is" + 0.004*"care" + 0.004*"the" + 0.004*"by"
Topic 4: 0.005*"and" + 0.004*"to" + 0.004*"of" + 0.004*"healthcare" + 0.004*"by" + 0.004*"is" + 0.004*"as" + 0.004*"health" + 0.004*"care" + 0.004*"such"
Topic 5: 0.005*"and" + 0.004*"to" + 0.004*"healthcare" + 0.004*"of" + 0.004*"as" + 0.004*"the" + 0.004*"by" + 0.004*"care" + 0.004*"health" + 0.004*"is"


In [48]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)


In [46]:
# Assuming lda_model is already trained and available
topics = lda_model.print_topics(num_words=10)  # Adjust the number of words as needed

# Display each topic with its top words
for topic in topics:
    print(f"Topic {topic[0]}: {topic[1]}")


Topic 0: 0.001*"anni" + 0.001*"paziente" + 0.001*"seguenti" + 0.001*"viene" + 0.001*"dolore" + 0.001*"diagnosi" + 0.001*"soccorso" + 0.001*"uomo" + 0.001*"pronto" + 0.001*"qual"
Topic 1: 0.001*"anni" + 0.001*"paziente" + 0.001*"viene" + 0.001*"seguenti" + 0.001*"diagnosi" + 0.001*"presenta" + 0.001*"soccorso" + 0.001*"uomo" + 0.001*"pronto" + 0.001*"giunge"
Topic 2: 0.002*"anni" + 0.001*"viene" + 0.001*"paziente" + 0.001*"seguenti" + 0.001*"diagnosi" + 0.001*"uomo" + 0.001*"presenta" + 0.001*"terapia" + 0.001*"qual" + 0.001*"giunge"
Topic 3: 0.001*"anni" + 0.001*"viene" + 0.001*"paziente" + 0.001*"seguenti" + 0.001*"diagnosi" + 0.001*"soccorso" + 0.001*"sinistro" + 0.001*"uomo" + 0.001*"pronto" + 0.001*"terapia"
Topic 4: 0.026*"anni" + 0.015*"paziente" + 0.013*"viene" + 0.012*"seguenti" + 0.007*"diagnosi" + 0.007*"uomo" + 0.006*"presenta" + 0.006*"pronto" + 0.006*"qual" + 0.006*"soccorso"
