In [1]:
import papermill as pm
import nbformat
from nbconvert import HTMLExporter
from minio import Minio
from urllib.parse import urlparse
from tempfile import TemporaryDirectory
import os
import shutil
import psycopg2 
import dotenv
import json
import io
from datetime import date 
from nbconvert.preprocessors import Preprocessor
import glob

class RemoveEmptyCodeCellsPreprocessor(Preprocessor):
    """nbconvert preprocessor that drops code cells which have no outputs.

    Non-code cells (markdown, raw, ...) are always kept. A code cell is kept
    only when its ``outputs`` entry is present and non-empty.
    """

    def preprocess(self, nb, resources):
        """Replace ``nb.cells`` with the retained cells and return ``(nb, resources)``."""
        retained = []
        for cell in nb.cells:
            is_code = cell.cell_type == 'code'
            # Skip only code cells that produced nothing to show.
            if is_code and not cell.get('outputs'):
                continue
            retained.append(cell)
        nb.cells = retained
        return nb, resources

# Load variables from a .env file into the process environment; later cells
# read DATABASE_URL and S3_URL from os.environ.
dotenv.load_dotenv()

def run_notebook(gse_id, tmpdir):
    """Execute the report template for one GSE and export the result as HTML.

    The template ``notebooks/report_template.ipynb`` is copied into ``tmpdir``,
    executed with papermill, stripped of code cells without outputs and
    exported to ``<tmpdir>/<gse_id>.html`` with inputs and prompts hidden.
    The template copy and any ``*.soft.gz`` files directly inside ``tmpdir``
    are deleted afterwards, so they are absent if ``tmpdir`` is uploaded later.

    Args:
        gse_id: GEO series accession; passed to the notebook as the ``gse``
            parameter and used to name the output files.
        tmpdir: Existing directory that receives all generated files; passed
            to the notebook as the ``working_dir`` parameter.

    Raises:
        FileNotFoundError: If the template notebook cannot be found.
        Exception: Whatever papermill / nbconvert raise on failure.
    """
    # NOTE: the template is resolved relative to the *current working directory*
    # (its parent must contain "notebooks/"), so this only works when the
    # kernel is started from a direct sub-directory of the project root.
    root_dir = os.path.realpath(os.path.join(os.getcwd(), '..'))
    print(root_dir)
    print(f"temp directory created at: {tmpdir}")
    input_path = os.path.join(root_dir, "notebooks", "report_template.ipynb")  # where the template notebook is located
    temp_input_path = os.path.join(tmpdir, "report_template.ipynb")
    shutil.copyfile(input_path, temp_input_path)  # copy it into the temp directory
    temp_output_path = os.path.join(tmpdir, f"{gse_id}.ipynb")
    output_html = os.path.join(tmpdir, f"{gse_id}.html")

    pm.execute_notebook(
        input_path=temp_input_path,
        output_path=temp_output_path,
        parameters={
            "gse": gse_id,
            "working_dir": tmpdir
        },
    )
    print(f"Notebook executed and saved at {temp_output_path}")

    # Notebook files are UTF-8 JSON; be explicit so the platform's default
    # encoding cannot corrupt non-ASCII content.
    with open(temp_output_path, 'r', encoding='utf-8') as f:
        nb = nbformat.read(f, as_version=4)

    # Drop code cells that produced no output so the report has no empty gaps.
    preprocessor = RemoveEmptyCodeCellsPreprocessor()
    nb, _ = preprocessor.preprocess(nb, {})

    html_exporter = HTMLExporter()  # optional: template
    html_exporter.exclude_input = True
    html_exporter.exclude_output_prompt = True
    html_exporter.exclude_input_prompt = True

    html_data, _ = html_exporter.from_notebook_node(nb)

    # Write as UTF-8 explicitly: the exported HTML may contain characters the
    # locale's default codec cannot encode.
    with open(output_html, 'w', encoding='utf-8') as f:
        f.write(html_data)

    print(f"HTML generated and saved at {output_html}")
    os.remove(temp_input_path)  # remove to avoid it being uploaded to S3

    for file in glob.glob(os.path.join(tmpdir, "*.soft.gz")):
        os.remove(file)  # remove the soft.gz file that GEOParse downloads

def update_postgres(tmpdir, conn, cur):
    """Upsert the contents of ``<tmpdir>/metadata.json`` into the ``reports`` table.

    Each top-level key of the JSON object is used as a column name and its
    value as the column value. Rows are matched on ``id``: an existing row has
    all other supplied columns overwritten. ``metadata.json`` is deleted after
    a successful commit.

    Args:
        tmpdir: Directory containing ``metadata.json``.
        conn: Open DB-API connection (psycopg2); committed on success and
            rolled back on failure so it stays usable for later calls.
        cur: Cursor belonging to ``conn``.

    Raises:
        ValueError: If the JSON is not a non-empty object or a key is not a
            plain SQL identifier (letters, digits, underscore).
        Exception: Database errors are re-raised after the rollback.
    """
    import re  # local import: only needed to validate column names

    json_path = os.path.join(tmpdir, "metadata.json")
    with open(json_path, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    if not isinstance(metadata, dict) or not metadata:
        raise ValueError(f"{json_path} must contain a non-empty JSON object")

    columns = list(metadata.keys())
    # Column names are interpolated into the statement (placeholders cannot be
    # used for identifiers), so refuse anything that is not a plain identifier.
    invalid = [col for col in columns if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", col)]
    if invalid:
        raise ValueError(f"metadata.json contains invalid column names: {invalid}")
    values = [metadata[col] for col in columns]

    # With only an "id" column there is nothing to update; an empty
    # "DO UPDATE SET" clause would be a syntax error.
    assignments = ', '.join(f"{col}=EXCLUDED.{col}" for col in columns if col != 'id')
    conflict_action = f"DO UPDATE SET {assignments}" if assignments else "DO NOTHING"

    query = f"""
        INSERT INTO reports ({', '.join(columns)})
        VALUES ({', '.join(['%s'] * len(columns))})
        ON CONFLICT (id) {conflict_action}
    """

    try:
        cur.execute("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public';")
        print(cur.fetchall())  # print the table names for debugging.

        cur.execute(query, values)
        conn.commit()
    except Exception:
        # A failed statement leaves the transaction aborted; roll back so the
        # shared connection can still be used for the next GSE.
        conn.rollback()
        raise

    print("successfully committed")
    os.remove(json_path)

def update_s3(gse_id, tmpdir, s3, bucket):
    """Upload every file under ``tmpdir`` to ``bucket`` beneath the ``<gse_id>/`` prefix.

    The object key is ``<gse_id>/<path relative to tmpdir>`` with forward
    slashes. Each object is stored with a Content-Type guessed from its file
    name, so HTML and images are served with the right type; unknown or
    compressed files fall back to ``application/octet-stream``.

    Args:
        gse_id: Key prefix for all uploaded objects.
        tmpdir: Local directory that is walked recursively.
        s3: MinIO client (anything exposing ``fput_object``).
        bucket: Name of the destination bucket.
    """
    import mimetypes  # local import: only needed for Content-Type detection

    for root, _, files in os.walk(tmpdir):
        for filename in files:
            local_path = os.path.join(root, filename)
            relative_path = os.path.relpath(local_path, tmpdir).replace("\\", "/")
            object_key = f"{gse_id}/{relative_path}"

            content_type, content_encoding = mimetypes.guess_type(local_path)
            # For e.g. "x.txt.gz" the guessed type describes the *decompressed*
            # payload, which is not what gets uploaded; use the generic type.
            if content_type is None or content_encoding is not None:
                content_type = "application/octet-stream"

            s3.fput_object(bucket, object_key, local_path, content_type=content_type)

    print(f"✅ Uploaded GSE {gse_id} contents to MinIO bucket '{bucket}'")

        

def process_gse(gse_id, conn, cur, s3, bucket):
    """Generate, upload and register the report for one GSE unless it already exists.

    If a row with ``id = gse_id`` is already present in ``reports`` the call
    is a no-op. Otherwise the notebook is run in a fresh temporary directory,
    the directory contents are uploaded to S3 and the metadata is written to
    Postgres, in that order. The temporary directory is removed on exit.

    Args:
        gse_id: GEO series accession to process.
        conn: Open psycopg2 connection shared across calls.
        cur: Cursor belonging to ``conn``.
        s3: MinIO client.
        bucket: Destination bucket name.

    Raises:
        Exception: Any failure from the pipeline steps is printed, the
            transaction is rolled back, and the exception is re-raised.
    """
    cur.execute("SELECT 1 FROM reports WHERE id = %s LIMIT 1;", (gse_id,))
    exists = cur.fetchone() is not None
    if exists:
        print(f"GSE {gse_id} already exists in Postgres. Skipping processing.")
        return

    with TemporaryDirectory() as tmpdir:
        print(f"started processing for {gse_id} in temp directory {tmpdir}")
        try:
            run_notebook(gse_id, tmpdir)
            update_s3(gse_id, tmpdir, s3, bucket)
            update_postgres(tmpdir, conn, cur)
            print("processing successful!")
        except Exception as e:
            print(f"Error processing {gse_id}: {e}")
            # A failed statement leaves the shared connection in an aborted
            # transaction; without a rollback the existence check of every
            # later GSE would fail as well.
            try:
                conn.rollback()
            except Exception as rollback_error:
                print(f"Rollback failed for {gse_id}: {rollback_error}")
            raise  # get rid of in production so it doesn't interrupt execution

In [10]:
# Open the shared Postgres connection and MinIO client used by the processing
# cells, and make sure the target bucket exists.
dotenv.load_dotenv()

# NOTE(review): `password` is not used by any visible cell; the connection
# below is configured entirely through DATABASE_URL.
password = os.getenv('POSTGRES_PASSWORD')
conn=psycopg2.connect(os.environ['DATABASE_URL'])
cur = conn.cursor()

# See what DB you're connected to
cur.execute("SELECT current_database(), inet_server_addr(), inet_server_port();")
print("📍 Connected to:", cur.fetchone())

# See which tables exist
cur.execute("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public';")
print("📄 Tables in 'public':", cur.fetchall())


# connect to S3 / MinIO
S3_URL_parsed = urlparse(os.environ['S3_URL'])
# Append the port only when S3_URL specifies one; "host:None" is not a valid endpoint.
s3_endpoint = (
  S3_URL_parsed.hostname
  if S3_URL_parsed.port is None
  else f"{S3_URL_parsed.hostname}:{S3_URL_parsed.port}"
)
s3 = Minio(
  s3_endpoint,
  # Pass the raw values: formatting them through an f-string would turn
  # missing credentials into the literal string "None".
  access_key=S3_URL_parsed.username,
  secret_key=S3_URL_parsed.password,
  secure=S3_URL_parsed.scheme == 'https',
)

# create the bucket if it doesn't exist (bucket = first path segment of S3_URL)
bucket, _, _ = S3_URL_parsed.path[1:].partition('/')
if not s3.bucket_exists(bucket):
  s3.make_bucket(bucket)
  # enable anonymous downloading of files in this bucket
  s3.set_bucket_policy(bucket, json.dumps({
    'Version': '2012-10-17',
    'Statement': [
      {'Effect': 'Allow', 'Principal': {'AWS': '*'}, 'Action': 's3:GetBucketLocation', 'Resource': f"arn:aws:s3:::{bucket}"},
      {'Effect': 'Allow', 'Principal': {'AWS': '*'}, 'Action': 's3:GetObject', 'Resource': f"arn:aws:s3:::{bucket}/*"},
    ],
  }))
  # create a file
  # content = b'Hello World!'
  # s3.put_object(bucket, 'test.txt', io.BytesIO(content), len(content), content_type='plain/text')
  # print(f"File available at <{os.environ['PUBLIC_S3_URL']}/test.txt>")


📍 Connected to: ('postgres', '172.18.0.3', 5432)
📄 Tables in 'public': [('kysely_migration',), ('kysely_migration_lock',), ('reports',)]


In [5]:
from minio import Minio
from urllib.parse import urlparse
import json

def reset_s3_bucket():
    """Delete every object in the bucket named by the ``S3_URL`` environment variable.

    ``S3_URL`` is expected to look like
    ``scheme://access_key:secret_key@host[:port]/bucket``. The bucket itself
    is kept; if it does not exist only a warning is printed.

    Raises:
        KeyError: If ``S3_URL`` is not set.
    """
    S3_URL_parsed = urlparse(os.environ['S3_URL'])
    bucket, _, _ = S3_URL_parsed.path[1:].partition('/')

    # Append the port only when the URL specifies one; "host:None" is not a
    # valid endpoint.
    if S3_URL_parsed.port is None:
        endpoint = S3_URL_parsed.hostname
    else:
        endpoint = f"{S3_URL_parsed.hostname}:{S3_URL_parsed.port}"

    s3 = Minio(
        endpoint,
        access_key=S3_URL_parsed.username,
        secret_key=S3_URL_parsed.password,
        secure=S3_URL_parsed.scheme == 'https',
    )

    if s3.bucket_exists(bucket):
        objects = s3.list_objects(bucket, recursive=True)
        for obj in objects:
            s3.remove_object(bucket, obj.object_name)
        print(f"✅ Cleared all objects from MinIO bucket '{bucket}'")
    else:
        print(f"⚠️ Bucket '{bucket}' does not exist.")


In [6]:
def reset_postgres():
    """Delete all rows from the ``reports`` table while keeping its schema.

    Opens a dedicated connection from ``DATABASE_URL`` and always closes the
    cursor and connection again, even when the ``TRUNCATE`` fails.

    Raises:
        KeyError: If ``DATABASE_URL`` is not set.
        Exception: Database errors are propagated after cleanup.
    """
    conn = psycopg2.connect(os.environ['DATABASE_URL'])
    try:
        cur = conn.cursor()
        try:
            cur.execute("TRUNCATE TABLE reports;")  # deletes all rows, keeps schema
            conn.commit()
        finally:
            cur.close()
    finally:
        # Closing without a commit discards the open transaction, so a failed
        # TRUNCATE leaves the table untouched.
        conn.close()
    print("✅ Postgres 'reports' table reset.")

In [8]:
# Process a batch of GSEs; a failure in one accession must not stop the others.
gses = ["GSE247883", "GSE247175"]

for gse in gses:
    print(f"processing started for {gse}")
    try:
        process_gse(gse_id=gse, conn=conn, cur=cur, s3=s3, bucket=bucket)
    except Exception:
        # `except Exception` (not a bare `except`) so Ctrl-C / kernel interrupt
        # still stops the loop. process_gse has already printed the cause.
        print(f"Error processing {gse}")
        # Clear a possibly aborted transaction so the next GSE can still query.
        conn.rollback()

# `cur` / `conn` are deliberately left open here: the single-GSE test cell
# below reuses them, and a dedicated cell further down closes them.

processing started for GSE247883
started processing for GSE247883 in temp directory /tmp/tmpij_4s2om
/home/ajy20/geo2reports/python
temp directory created at: /tmp/tmpij_4s2om


Executing:   0%|          | 0/97 [00:00<?, ?cell/s]

No handler found for comm target 'dash'


Notebook executed and saved at /tmp/tmpij_4s2om/GSE247883.ipynb
HTML generated and saved at /tmp/tmpij_4s2om/GSE247883.html
✅ Uploaded GSE GSE247883 contents to MinIO bucket 'geo2reports'
[('kysely_migration',), ('kysely_migration_lock',), ('reports',)]
successfully committed
processing successful!
processing started for GSE247175
started processing for GSE247175 in temp directory /tmp/tmpopahwsjx
/home/ajy20/geo2reports/python
temp directory created at: /tmp/tmpopahwsjx


Executing:   0%|          | 0/97 [00:00<?, ?cell/s]

No handler found for comm target 'dash'


Notebook executed and saved at /tmp/tmpopahwsjx/GSE247175.ipynb
HTML generated and saved at /tmp/tmpopahwsjx/GSE247175.html
✅ Uploaded GSE GSE247175 contents to MinIO bucket 'geo2reports'
[('kysely_migration',), ('kysely_migration_lock',), ('reports',)]
successfully committed
processing successful!


In [12]:
# Smoke test: run the whole pipeline (notebook -> S3 -> Postgres) for one GSE
# using the `conn`, `cur`, `s3` and `bucket` objects created by the setup cell.
# `gse` is reused by the verification cells further down.
gse = "GSE247303"
process_gse(gse_id=gse, conn=conn, cur=cur, s3=s3, bucket=bucket)


started processing for GSE247303 in temp directory /tmp/tmpup0gf7ji
/home/ajy20/geo2reports/python
temp directory created at: /tmp/tmpup0gf7ji


Executing:   0%|          | 0/97 [00:00<?, ?cell/s]

No handler found for comm target 'dash'


Notebook executed and saved at /tmp/tmpup0gf7ji/GSE247303.ipynb
HTML generated and saved at /tmp/tmpup0gf7ji/GSE247303.html
✅ Uploaded GSE GSE247303 contents to MinIO bucket 'geo2reports'
[('kysely_migration',), ('kysely_migration_lock',), ('reports',)]
successfully committed
processing successful!


In [None]:
# Release the shared cursor and connection. Any cell that needs `conn` / `cur`
# after this point has to open a new connection first.
cur.close()
conn.close()

In [None]:
# Test whether Postgres was updated: fetch the row for the GSE processed above.
conn = psycopg2.connect(os.environ['DATABASE_URL'])
try:
    cur = conn.cursor()
    try:
        cur.execute("SELECT * FROM reports WHERE id = %s", (gse,))  # replace with actual GSE ID
        row = cur.fetchone()
    finally:
        cur.close()
finally:
    # Always release the connection, even when the query fails.
    conn.close()

print(row)  # Should show the inserted metadata (None if the row is missing)

In [None]:
# Test whether S3 was updated: list everything stored under the GSE's prefix.
S3_URL_parsed = urlparse(os.environ['S3_URL'])
bucket, _, _ = S3_URL_parsed.path[1:].partition('/')

# Append the port only when S3_URL specifies one; "host:None" is not a valid endpoint.
s3_endpoint = (
    S3_URL_parsed.hostname
    if S3_URL_parsed.port is None
    else f"{S3_URL_parsed.hostname}:{S3_URL_parsed.port}"
)
s3 = Minio(
    s3_endpoint,
    access_key=S3_URL_parsed.username,
    secret_key=S3_URL_parsed.password,
    secure=S3_URL_parsed.scheme == 'https',
)

# List all files in the folder for this GSE
prefix = f"{gse}/"  # replace with your GSE ID
objects = s3.list_objects(bucket, prefix=prefix, recursive=True)

for obj in objects:
    print(obj.object_name)  # Shows files like GSE123456/GSE123456.html, etc.


In [None]:
# WARNING: destructive. Truncates the `reports` table and removes every object
# from the MinIO bucket. Running the notebook top to bottom wipes all results
# produced by the cells above.
reset_postgres()
reset_s3_bucket()

To do:
- Fix image resolutions and DPI; make them consistent, maybe 600-700 px wide (DONE)
- Delete the template so that it is not uploaded to S3 (DONE)
- Stop saving HTML files (DONE)
- Fix the Clustergrammer color thresholds
- Fix citations: switch from APA to AMA style (DONE)
- Fix the Clustergrammer link (DONE)
- Get the metadata matrix from a source other than the ARCHS4 H5 file (DONE)
- Create a metadata list
