### Load MIMIC III data from OpenStack

In [5]:
# !which python
# !conda activate hack

In [170]:
# !python --version

In [171]:
# !pip install psycopg2-binary
# !pip install SQLAlchemy

In [172]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np

In [173]:
sql_engine = create_engine('postgresql://postgres:postgres123+@10.211.116.164:5432/mimic')

In [10]:
query = '''SELECT *
FROM
   mimiciii.diagnoses_icd b
   INNER JOIN mimiciii.noteevents e ON b.subject_id = e.subject_id
WHERE
   b.icd9_code IN
   ('390', '391', '392', '393', '394', '395', '396', '397', '398', '399',
    '400', '401', '402', '403', '404', '405', '406', '407', '408', '409',
    '410', '411', '412', '413', '414', '415', '416', '417', '418', '419',
    '420', '421', '422', '423', '424', '425', '426', '427', '428', '429',
    '430', '431', '432', '433', '434', '435', '436', '437', '438', '439',
    '440', '441', '442', '443', '444', '445', '446', '447', '448', '449',
    '450', '451', '452', '453', '454', '455', '456', '457', '458', '459');
'''

In [11]:
notes = pd.read_sql_query(query,con=sql_engine)

In [12]:
notes.to_csv('data/cvd_note.csv')

In [178]:
notes.shape

(364079, 16)

In [174]:
query = '''SELECT *
FROM
   mimiciii.diagnoses_icd b
   INNER JOIN mimiciii.PRESCRIPTIONS e ON b.subject_id = e.subject_id
WHERE
   b.icd9_code IN
   ('390', '391', '392', '393', '394', '395', '396', '397', '398', '399',
    '400', '401', '402', '403', '404', '405', '406', '407', '408', '409',
    '410', '411', '412', '413', '414', '415', '416', '417', '418', '419',
    '420', '421', '422', '423', '424', '425', '426', '427', '428', '429',
    '430', '431', '432', '433', '434', '435', '436', '437', '438', '439',
    '440', '441', '442', '443', '444', '445', '446', '447', '448', '449',
    '450', '451', '452', '453', '454', '455', '456', '457', '458', '459');
'''

In [175]:
prescriptions = pd.read_sql_query(query,con=sql_engine)

In [176]:
prescriptions.to_csv('data/cvd_prescriptions.csv')

In [177]:
prescriptions.shape

(877470, 24)

## Search and download PubMed data

In [19]:
# !pip install requests

In [144]:
import requests
import time
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
import os

In [112]:
def parse_pubmed_xml(xml_content):
    root = ET.fromstring(xml_content)
    
    articles = []
    
    for article in root.findall("PubmedArticle"):
        medline_citation = article.find("MedlineCitation")
        pmid = medline_citation.find("PMID").text

        article_data = medline_citation.find("Article")
        title = article_data.find("ArticleTitle").text

        journal = article_data.find("Journal")
        journal_title = journal.find("Title").text

        # Extract Authors
        authors = []
        author_list = article_data.find("AuthorList")
        if author_list:
            for author in author_list.findall("Author"):
                last_name = author.find("LastName").text if author.find("LastName") is not None else ""
                fore_name = author.find("ForeName").text if author.find("ForeName") is not None else ""
                full_name = f"{fore_name} {last_name}".strip()
                if full_name:
                    authors.append(full_name)

        # Extract Abstract
        abstract_section = article_data.find("Abstract")
        abstract_text = ""
        if abstract_section:
            for abs_text in abstract_section.findall("AbstractText"):
                label = abs_text.get("Label", "")
                text = abs_text.text if abs_text.text else ""
                abstract_text += f"{label}: {text}\n" if label else f"{text}\n"

        articles.append({
            "PMID": pmid,
            "Title": title,
            "Journal": journal_title,
            "Authors": authors,
            "Abstract": abstract_text.strip(),
        })

    return articles

In [141]:
# Estimated storage size: 185 TB – 10 PB.

In [149]:
# PubMed API settings
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
EMAIL = "wtgmme@gmail.com"  # Required for API usage
BATCH_SIZE = 50  # Max articles per request
OUTPUT_FILE = "pubmed_articles.json"

def get_article_ids(start_date, end_date):
    """Retrieve PubMed article IDs for a given date range."""
    esearch_url = f"{BASE_URL}esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": "Cardiovascular Disease",  # Change as needed
        "mindate": start_date,
        "maxdate": end_date,
        "retmax": 10000,  # Fetch max possible
        "retmode": "json",
        "usehistory": "y",
        "email": EMAIL
    }
    response = requests.get(esearch_url, params=params)
    data = response.json()
    
    return data.get("esearchresult", {}).get("idlist", []), data.get("esearchresult", {}).get("count", 0)

def fetch_articles(article_ids):
    """Fetch article metadata using PubMed efetch API."""
    if not article_ids:
        print("⚠️ No article IDs provided!")
        return None
    
    efetch_url = f"{BASE_URL}efetch.fcgi"
    params = {
        "db": "pubmed",
        "id": ",".join(article_ids),
        "retmode": "xml",
        "email": EMAIL
    }
    headers = {
        "User-Agent": f"PubMedFetcher/1.0 ({EMAIL})"
    }

    try:
        response = requests.get(efetch_url, params=params, headers=headers, timeout=10)
        
        # Check for HTTP errors
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            return None
        
        return response.text

    except requests.exceptions.RequestException as e:
        print(f"❌ Request failed: {e}")
        return None

def save_articles_to_file(data, filename):
    with open(filename, 'w') as file:
        json.dump(data, file, indent=4)

# Loop through dates (split by half-month)
start_year = 2019
end_year = 2024

for year in range(start_year, end_year + 1):
    for month in range(1, 13):
        articles_month = []
        output = 'pubmed/' + str(year) + '-' + str(month) + '-'+ OUTPUT_FILE
        if os.path.exists(output)==False:
            for half in [1, 2]:  # Split into 1st and 2nd half of the month
                start_date = f"{year}/{month:02d}/01"
                first_end_date = f"{year}/{month:02d}/14"
                mid_date = f"{year}/{month:02d}/15"
                end_date = (datetime(year, month, 1) + timedelta(days=31)).replace(day=1) - timedelta(days=1)
                end_date = end_date.strftime("%Y/%m/%d")
    
                date_range_start = start_date if half == 1 else mid_date
                date_range_end = first_end_date if half == 1 else end_date
    
                print(f"Fetching {date_range_start} to {date_range_end}...")
                article_ids, total_count = get_article_ids(date_range_start, date_range_end)
                # print(article_ids)
    
                print(f"Total articles found: {total_count}, downloaded: {len(article_ids)}")
    
                if total_count == 0:
                    continue
    
                # Fetch articles in batches of BATCH_SIZE
                for i in range(0, len(article_ids), BATCH_SIZE):
                    batch_ids = article_ids[i:i + BATCH_SIZE]
                    # print(f"Fetching batch {i + 1} to {i + len(batch_ids)}...")
                    article_data = fetch_articles(batch_ids)
                    # print(article_data)
                    articles = parse_pubmed_xml(article_data)
                    articles_month.extend(articles)
                    time.sleep(2)  # Avoid API rate limits
            save_articles_to_file(articles_month, output)

print("✅ Download complete!")


✅ Download complete!


In [152]:
with open("pubmed/2019-1-pubmed_articles.json", "r", encoding="utf-8") as file:
    data = json.load(file)

In [153]:
len(data)

8045

In [161]:
folder_path = 'pubmed'

In [165]:
combined_data = []

In [166]:
for filename in os.listdir(folder_path):
    if filename.endswith('.json'):  # Check if the file is a JSON file
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            # Load JSON data from the file
            data = json.load(file)
            # Append the data to the combined list
            combined_data.extend(data)

In [168]:
len(combined_data)

633712

In [169]:
output_file = 'data/pubmed.json'
with open(output_file, 'w', encoding='utf-8') as outfile:
    json.dump(combined_data, outfile, indent=4)

In [134]:
# TODO: store data in vector database