# Goals

* Test out SRA BigQuery to see if it can be used as an agent tool

In [1]:
# pip install google-cloud-bigquery google-cloud-bigquery-storage db-dtypes 
from google.cloud import bigquery
import os
import json
import decimal

In [2]:
class SRABigQueryTool:
    """Query helper for the `nih-sra-datastore.sra.metadata` BigQuery table.

    Each ``get_*_metadata`` method runs a single query filtered by one
    accession and returns the resulting rows serialized as a JSON string.
    Accessions are always passed to BigQuery as bound query parameters,
    never interpolated into the SQL text.
    """

    def __init__(self, credentials_path=None, project_id=None):
        """
        Initialize the SRA BigQuery tool.
        Args:
            credentials_path (str): Path to Google Cloud credentials JSON file.
                When given, it is exported through the
                GOOGLE_APPLICATION_CREDENTIALS environment variable before
                the client is created.
            project_id (str): Your Google Cloud project ID
        """
        if credentials_path:
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
        self.client = bigquery.Client(project=project_id)

    def _run_query(self, query: str, accession: str) -> str:
        """
        Run a query that references the named parameter ``@accession``.
        Args:
            query (str): SQL text containing ``@accession``
            accession (str): value bound to ``@accession`` as a STRING
        Returns:
            json string of the query results
        """
        # Binding the value as a parameter (instead of formatting it into the
        # SQL string) keeps caller-supplied text from altering the query.
        job_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ScalarQueryParameter("accession", "STRING", accession),
            ]
        )
        return self.to_json(self.client.query(query, job_config=job_config))

    def get_study_metadata(self, study_accession: str) -> str:
        """
        Get the bioproject and the list of experiments for a study.
        Args:
            study_accession (str): SRA study accession number (SRP)
        Returns:
            json string of the query results; one row per
            (sra_study, bioproject) pair, with the distinct experiment
            accessions joined by commas in the "experiments" field
        """
        query = """
        WITH distinct_values AS (
            SELECT DISTINCT
                m.sra_study,
                m.bioproject,
                m.experiment
            FROM `nih-sra-datastore.sra.metadata` as m
            WHERE m.sra_study = @accession
        )
        SELECT
            sra_study,
            bioproject,
            STRING_AGG(experiment, ',') as experiments
        FROM distinct_values
        GROUP BY sra_study, bioproject
        """
        return self._run_query(query, study_accession)

    def get_experiment_metadata(self, exp_accession: str) -> str:
        """
        Get library/platform metadata and the list of runs for an experiment.
        Args:
            exp_accession (str): SRA experiment accession number (SRX)
        Returns:
            json string of the query results; the distinct run accessions
            are joined by commas in the "acc" field
        """
        query = """
        WITH distinct_values AS (
            SELECT DISTINCT
                m.sra_study,
                m.experiment,
                m.library_name,
                m.librarylayout,
                m.libraryselection,
                m.librarysource,
                m.platform,
                m.instrument,
                m.acc
            FROM `nih-sra-datastore.sra.metadata` as m
            WHERE m.experiment = @accession
        )
        SELECT
            experiment,
            library_name,
            librarylayout,
            libraryselection,
            librarysource,
            platform,
            instrument,
            STRING_AGG(acc, ',') as acc
        FROM distinct_values
        GROUP BY experiment, library_name, librarylayout, libraryselection, librarysource, platform, instrument
        """
        return self._run_query(query, exp_accession)

    def get_run_metadata(self, run_accession: str) -> str:
        """
        Get sample and sequencing metadata for a single run.
        Args:
            run_accession (str): SRA run accession number (SRR)
        Returns:
            json string of the query results
        """
        query = """
        SELECT
            m.experiment,
            m.acc,
            m.biosample,
            m.organism,
            m.assay_type,
            m.mbases,
            m.avgspotlen,
            m.insertsize
        FROM `nih-sra-datastore.sra.metadata` as m
        WHERE m.acc = @accession
        """
        return self._run_query(query, run_accession)

    @staticmethod
    def to_json(results, indent: int = None):
        """
        Serialize query result rows to a JSON string.
        Args:
            results: iterable of rows, each convertible with ``dict(row)``
                (e.g. a bigquery query result object)
            indent (int): passed through to ``json.dumps``; None gives the
                compact single-line form
        Returns:
            str: JSON array with one object per row
        Raises:
            TypeError: if a value is neither natively JSON serializable,
                nor has an ``isoformat`` method, nor is a ``decimal.Decimal``
        """
        def datetime_handler(obj):
            # Date/time-like values are emitted in ISO 8601 form.
            if hasattr(obj, 'isoformat'):
                return obj.isoformat()
            # Decimals are emitted as strings so no precision is lost.
            elif isinstance(obj, decimal.Decimal):
                return str(obj)
            raise TypeError(f'Object of type {type(obj)} is not JSON serializable')

        return json.dumps(
            [dict(row) for row in results],
            default=datetime_handler,
            indent=indent
        )

In [3]:
# Create the tool with no arguments: credentials_path and project_id both default to None.
sra_tool = SRABigQueryTool()

In [15]:
# Fetch run-level metadata for one SRR accession; the result is a JSON string.
sra_tool.get_run_metadata("SRR31573627")

'[{"experiment": "SRX26939191", "acc": "SRR31573627", "biosample": "SAMN45131480", "organism": "Pseudomonas aeruginosa", "assay_type": "RNA-Seq", "mbases": 1055, "avgspotlen": 94, "insertsize": null}]'

In [16]:
# Fetch experiment-level metadata for one SRX accession; the result is a JSON string.
sra_tool.get_experiment_metadata("SRX26939191")

'[{"experiment": "SRX26939191", "library_name": "GSM8660072", "librarylayout": "PAIRED", "libraryselection": "cDNA", "librarysource": "TRANSCRIPTOMIC", "platform": "ILLUMINA", "instrument": "Illumina NovaSeq 6000", "acc": "SRR31573627"}]'

In [118]:
# Fetch study-level metadata for one SRP accession; the result is a JSON string.
sra_tool.get_study_metadata("SRP548813")

'[{"sra_study": "SRP548813", "bioproject": "PRJNA1193093", "experiments": "SRX26939186,SRX26939195,SRX26939193,SRX26939190,SRX26939196,SRX26939189,SRX26939188,SRX26939187,SRX26939185,SRX26939192,SRX26939194,SRX26939191"}]'