In [1]:
# Imports for the export cells below, plus one-time environment setup.
from sqlalchemy import create_engine, inspect, select, Table, MetaData
from typing import List
import pandas as pd
import datetime
import numpy as np
import os
from dotenv import load_dotenv
# Load key/value pairs from a local .env file into os.environ; the database
# connection settings read in the next cell are expected to come from there.
load_dotenv()

# NOTE(review): presumably redirects the Hugging Face `transformers` model cache
# to ./cache; `transformers` is not imported in the visible cells — confirm this
# setting is still needed.
os.environ['TRANSFORMERS_CACHE'] = 'cache'
from tabulate import tabulate

In [2]:
# Database connection settings, read from the process environment.
# Every key is mandatory: a missing one raises KeyError right here rather than
# surfacing later as a malformed connection URL.
# NOTE(review): on Windows, environment variable names are case-insensitive, so
# "username" can resolve to the OS login name (USERNAME); load_dotenv() does not
# override variables that already exist by default — confirm on that platform.
username, password, hostname, port = (
    os.environ[key] for key in ("username", "password", "hostname", "port")
)

In [3]:
# Export one row per clinical trial from the AACT database to ctgov.csv.

# The credentials are interpolated verbatim into the URL, so URL-reserved
# characters (e.g. "@", "/", ":") in the username or password must already be
# percent-encoded in the environment.
credentials = f"postgresql://{username}:{password}@{hostname}:{port}/aact"

# One row per study (nct_id).
# Facility columns: a study can have many facilities, and aggregating each
# facility column with its own MAX() would combine the name of one site with
# the city/state/zip/country of others. The LATERAL subquery instead picks a
# single facility per study (the one with the greatest name, ties broken
# deterministically) so all five facility columns describe the same site.
# `INNER JOIN LATERAL ... ON TRUE` still drops studies that have no facility,
# exactly like a plain INNER JOIN on ctgov.facilities would.
sql_command = """
SELECT
    studies.nct_id,
    MAX(studies.brief_title) AS brief_title,
    MAX(studies.official_title) AS official_title,
    STRING_AGG(DISTINCT brief_summaries.description, ' ') AS brief_summaries,
    STRING_AGG(DISTINCT detailed_descriptions.description, ' ') AS detailed_descriptions,
    MAX(eligibilities.criteria) AS criteria,
    MAX(eligibilities.gender) AS gender,
    MAX(eligibilities.minimum_age) AS minimum_age,
    MAX(eligibilities.maximum_age) AS maximum_age,
    MAX(facility.name) AS facilities,
    MAX(facility.city) AS city,
    MAX(facility.state) AS state,
    MAX(facility.zip) AS zip,
    MAX(facility.country) AS country,
    MAX(studies.study_type) AS study_type
FROM
    ctgov.studies
INNER JOIN ctgov.brief_summaries ON brief_summaries.nct_id = studies.nct_id
INNER JOIN ctgov.detailed_descriptions ON detailed_descriptions.nct_id = studies.nct_id
INNER JOIN ctgov.eligibilities ON eligibilities.nct_id = studies.nct_id
INNER JOIN LATERAL (
    SELECT f.name, f.city, f.state, f.zip, f.country
    FROM ctgov.facilities AS f
    WHERE f.nct_id = studies.nct_id
    ORDER BY
        f.name DESC NULLS LAST,
        f.city DESC NULLS LAST,
        f.state DESC NULLS LAST,
        f.zip DESC NULLS LAST,
        f.country DESC NULLS LAST
    LIMIT 1
) AS facility ON TRUE
GROUP BY studies.nct_id
ORDER BY studies.nct_id ASC;
"""

engine = create_engine(credentials)
try:
    # create metadata object and reflect all tables
    # NOTE(review): nothing in this cell reads `metadata`; confirm a later cell
    # needs it, otherwise this reflection round-trip can be dropped.
    metadata = MetaData()
    metadata.reflect(bind=engine)

    df = pd.read_sql_query(sql=sql_command, con=engine)
finally:
    # Release pooled connections even when reflection or the query fails.
    engine.dispose()

# Use nct_id as the index (set_index also removes it from the columns).
df = df.set_index('nct_id')

for column in df.columns:
    # Strip double quotes from string values; non-string values pass through.
    df[column] = df[column].apply(lambda x: x.replace('"', '') if isinstance(x, str) else x)
    # Parse columns whose name contains 'date' (the query above currently
    # selects none, so this only takes effect if such a column is added).
    if 'date' in column:
        df[column] = pd.to_datetime(df[column], errors='coerce')

df.to_csv("ctgov.csv")
# Preview only; the complete result is in ctgov.csv.
df.head()

In [None]:
# Export one row per patient (subject_id) from MIMIC-III noteevents to
# mimiciii.csv.

# The credentials are interpolated verbatim into the URL, so URL-reserved
# characters (e.g. "@", "/", ":") in the username or password must already be
# percent-encoded in the environment.
credentials = f"postgresql://{username}:{password}@{hostname}:{port}/mimiciii"

# One row per subject_id with every distinct note text joined by a space.
# NOTE(review): hadm_id, category and description are each aggregated with an
# independent MAX(), so they need not come from the same note (or from any
# particular note in `text`), and the STRING_AGG has no ORDER BY, so the notes
# are not concatenated chronologically — confirm this is intended.
sql_command = \
"""
SELECT
    noteevents.subject_id,
    MAX(noteevents.hadm_id) AS hadm_id,
    MAX(noteevents.category) AS category,
    MAX(noteevents.description) AS description,
    STRING_AGG(DISTINCT noteevents.text, ' ') AS text
FROM 
    mimic.noteevents
GROUP BY noteevents.subject_id
ORDER BY noteevents.subject_id ASC
"""

engine = create_engine(credentials)
try:
    # create metadata object and reflect all tables
    # NOTE(review): nothing in this cell reads `metadata`; confirm a later cell
    # needs it, otherwise this reflection round-trip can be dropped.
    metadata = MetaData()
    metadata.reflect(bind=engine)

    df = pd.read_sql_query(sql=sql_command, con=engine)
finally:
    # Release pooled connections even when reflection or the query fails.
    engine.dispose()

# Use subject_id as the index (set_index also removes it from the columns).
df = df.set_index('subject_id')

for column in df.columns:
    # Strip double quotes from string values; non-string values pass through.
    df[column] = df[column].apply(lambda x: x.replace('"', '') if isinstance(x, str) else x)
    # Parse columns whose name contains 'date' (the query above currently
    # selects none, so this only takes effect if such a column is added).
    if 'date' in column:
        df[column] = pd.to_datetime(df[column], errors='coerce')

df.to_csv("mimiciii.csv")
# Preview only; the complete result is in mimiciii.csv.
df.head()