#### Input 

In [1]:
import os  # imported here so this cell runs on a fresh kernel

ANALYSIS_SET = "IGVFDS4389OUWU" # YOUR ANALYSIS SET ID
# Credentials are read from the environment when available so that real keys
# do not need to be saved in the notebook. The literals are kept only as a
# backward-compatible fallback; prefer exporting IGVF_ACCESS_KEY and
# IGVF_SECRET_KEY before starting the kernel.
IGVF_ACCESS_KEY = os.environ.get("IGVF_ACCESS_KEY", "OMYEHTNY") # YOUR ACCESS KEY
IGVF_SECRET_KEY = os.environ.get("IGVF_SECRET_KEY", "ie6esgatnahb6tom") # YOUR SECRET KEY
# hash_library = ""

#### Analysis_set

In [2]:
import pandas as pd
import requests
from typing import List, Dict
import io

def fetch_and_process_data(analysis_set: str, access_key: str, secret_key: str) -> pd.DataFrame:
    """Build a table linking an analysis set to its measurement and auxiliary sets.

    Two kinds of ``multireport.tsv`` queries are issued against the IGVF API:
    one for the analysis sets that are ``input_for`` ``analysis_set``, and one
    per returned accession for the measurement sets that are ``input_for`` it.

    Args:
        analysis_set: Accession of the top-level analysis set.
        access_key: API access key (HTTP basic-auth user).
        secret_key: API secret key (HTTP basic-auth password).

    Returns:
        DataFrame with columns ``analysis_set_id``, ``input_file_sets``,
        ``measurement_sets`` and ``associated_auxiliary_sets``; one row per
        (input analysis set, measurement set) pair.
    """
    import re  # local import: only this function needs it

    measurement_set_re = re.compile(r'/measurement-sets/([^/]+)/')

    def fetch_data(url: str) -> pd.DataFrame:
        """Download a TSV report and parse it, dropping lines that start with '20'."""
        response = requests.get(url, auth=(access_key, secret_key))
        lines = response.text.split('\n')
        # NOTE(review): presumably this removes a leading timestamp line in the
        # report; it would also drop any data row that starts with "20".
        tsv_data = '\n'.join(line for line in lines if not line.startswith('20'))
        df = pd.read_csv(io.StringIO(tsv_data), sep='\t')
        df.columns = df.columns.str.strip()
        return df

    def extract_measurement_sets(input_str):
        """Return the measurement-set accessions found in a comma-separated path list."""
        if pd.isna(input_str):
            return []
        found = []
        for part in str(input_str).split(','):
            hit = measurement_set_re.search(part)
            # Skip tokens that do not match; the previous `is not None` test
            # let NaN through because str.extract reports misses as NaN.
            if hit:
                found.append(hit.group(1))
        return found

    # Fetch analysis sets
    url1 = f"https://api.data.igvf.org/multireport.tsv?type=AnalysisSet&input_for=/analysis-sets/{analysis_set}/&field=@id&field=accession&field=input_file_sets"
    table1 = fetch_data(url1)

    # One row per (analysis set, measurement set)
    table1['measurement_sets'] = table1['Input File Sets'].apply(extract_measurement_sets)
    expanded_df = table1.explode('measurement_sets')

    # Fetch the measurement sets (with their auxiliary sets) of each analysis set
    all_data = []
    api_base = "https://api.data.igvf.org/multireport.tsv"
    for acc in expanded_df['Accession'].unique().tolist():
        url2 = f"{api_base}?type=MeasurementSet&input_for=%2Fanalysis-sets%2F{acc}%2F&field=%40id&field=accession&field=auxiliary_sets"
        df = fetch_data(url2)
        df['measurement_set'] = df['ID'].str.extract(r'/measurement-sets/([^/]+)/', expand=False)
        all_data.append(df)

    if all_data:
        table2 = pd.concat(all_data, ignore_index=True)
    else:
        # Keep the merge key and selected columns present when nothing was fetched.
        table2 = pd.DataFrame(columns=['ID', 'Accession', 'Auxiliary Sets', 'measurement_set'])

    # Merge and select columns
    final_table = pd.merge(expanded_df, table2,
                           left_on='measurement_sets',
                           right_on='measurement_set',
                           how='left')
    # Copy so the insert/rename below act on an independent frame.
    result = final_table[['Accession_x', 'measurement_sets', 'Auxiliary Sets']].copy()

    # Add analysis_set column and rename
    result.insert(0, 'analysis_set_id', analysis_set)
    result.columns = ['analysis_set_id', 'input_file_sets', 'measurement_sets', 'associated_auxiliary_sets']

    return result

# Each assignment below rebinds a name to its current value, so this block
# changes nothing. It raises NameError if the configuration cell that defines
# these names has not been run first.
if __name__ == "__main__":
    ANALYSIS_SET = ANALYSIS_SET
    IGVF_ACCESS_KEY = IGVF_ACCESS_KEY
    IGVF_SECRET_KEY = IGVF_SECRET_KEY

In [3]:
# Query the API for the configured analysis set and keep the resulting table
# in `analysis_set`; later cells read it.
analysis_set = fetch_and_process_data(ANALYSIS_SET, IGVF_ACCESS_KEY, IGVF_SECRET_KEY)
# Bare expression so the notebook renders the table.
analysis_set

Unnamed: 0,analysis_set_id,input_file_sets,measurement_sets,associated_auxiliary_sets
0,IGVFDS4389OUWU,IGVFDS3231RKUL,IGVFDS4852MAUM,"/auxiliary-sets/IGVFDS6861GRJI/,/auxiliary-set..."
1,IGVFDS4389OUWU,IGVFDS2378SIVI,IGVFDS7733OTCA,"/auxiliary-sets/IGVFDS0997MTQA/,/auxiliary-set..."
2,IGVFDS4389OUWU,IGVFDS7715GHKL,IGVFDS2628PYOH,"/auxiliary-sets/IGVFDS8139DLEM/,/auxiliary-set..."
3,IGVFDS4389OUWU,IGVFDS8280DBSV,IGVFDS7834BXBN,"/auxiliary-sets/IGVFDS8280AMDA/,/auxiliary-set..."


#### Per-experiment file

In [4]:
import pandas as pd
import requests
from typing import List, Dict
from urllib.parse import quote
import io

def create_experiment_file(analysis_set: pd.DataFrame, access_key: str, secret_key: str) -> pd.DataFrame:
    """Look up the guide library file for every entry in ``input_file_sets``.

    For each value, the first ConstructLibrarySet that is ``input_for`` that
    analysis set is fetched, then the first File ``integrated_in`` that
    construct library set. Lookups are best-effort: on any error a message is
    printed and ``None`` is recorded for that row.

    Args:
        analysis_set: DataFrame containing an ``input_file_sets`` column.
        access_key: API access key (HTTP basic-auth user).
        secret_key: API secret key (HTTP basic-auth password).

    Returns:
        A new DataFrame with columns ``input_file_sets`` and ``guide_library``.

    Raises:
        ValueError: if ``analysis_set`` has no ``input_file_sets`` column.
    """
    if 'input_file_sets' not in analysis_set.columns:
        raise ValueError("analysis_set must contain 'input_file_sets' column")

    experiment_file = analysis_set[['input_file_sets']].copy()

    def fetch_data(url: str) -> pd.DataFrame:
        """Download a TSV report and parse it, dropping lines that start with '20'."""
        response = requests.get(url, auth=(access_key, secret_key))
        response.raise_for_status()
        lines = response.text.split('\n')
        tsv_data = '\n'.join(line for line in lines if not line.startswith('20'))
        df = pd.read_csv(io.StringIO(tsv_data), sep='\t')
        df.columns = df.columns.str.strip()
        return df

    def first_accession(url: str):
        """Return the first 'Accession' value of the report, or None if absent/NaN."""
        df = fetch_data(url)
        if df.empty:
            return None
        value = df['Accession'].iloc[0]
        # NaN is truthy, so test explicitly instead of relying on truthiness.
        if pd.isna(value) or value == '':
            return None
        return value

    def lookup_guide_library(file_set):
        """Resolve one analysis-set accession to its guide library file accession."""
        construct_url = f"https://api.data.igvf.org/multireport.tsv?type=ConstructLibrarySet&input_for=%2Fanalysis-sets%2F{file_set}%2F&field=%40id&field=accession"
        fetched_library = first_accession(construct_url)
        if fetched_library is None:
            return None
        guide_url = f"https://api.data.igvf.org/multireport.tsv?type=File&integrated_in.@id=%2Fconstruct-library-sets%2F{fetched_library}%2F&field=%40id&field=accession"
        return first_accession(guide_url)

    # Successful lookups are remembered so repeated accessions (the input table
    # can list the same analysis set on several rows) cost only one round trip.
    # Failures are not cached, so a later duplicate row retries the lookup.
    resolved = {}
    guide_libraries = []
    for ms in experiment_file['input_file_sets']:
        if ms in resolved:
            guide_libraries.append(resolved[ms])
            continue
        try:
            guide_library = lookup_guide_library(ms)
        except Exception as e:
            print(f"Error fetching guide library for {ms}: {str(e)}")
            guide_libraries.append(None)
            continue
        resolved[ms] = guide_library
        guide_libraries.append(guide_library)

    experiment_file['guide_library'] = guide_libraries
    return experiment_file

# Self-assignments: they leave both names unchanged and only raise NameError
# if the configuration cell has not been run.
if __name__ == "__main__":
    IGVF_ACCESS_KEY = IGVF_ACCESS_KEY
    IGVF_SECRET_KEY = IGVF_SECRET_KEY

In [5]:
# Resolve a guide library for every row of `analysis_set`; the result is kept
# in `experiment_file` for the download and export cells.
experiment_file = create_experiment_file(analysis_set, IGVF_ACCESS_KEY, IGVF_SECRET_KEY)
# Bare expression so the notebook renders the table.
experiment_file

Unnamed: 0,input_file_sets,guide_library
0,IGVFDS3231RKUL,IGVFFI7568FEHI
1,IGVFDS2378SIVI,IGVFFI7568FEHI
2,IGVFDS7715GHKL,IGVFFI7568FEHI
3,IGVFDS8280DBSV,IGVFFI7568FEHI


#### Download guide metadata

In [23]:
import pandas as pd
import requests
import os
from typing import List, Dict

def download_guide_library(experiment_file: pd.DataFrame, access_key: str, secret_key: str):
    """Download each distinct, non-null guide library listed in ``experiment_file``.

    Every accession in the ``guide_library`` column is fetched from
    ``/files/<accession>/@@download`` and written to
    ``guide_metadata_<accession>.tsv`` in the current working directory.
    Failures are reported with ``print`` and do not stop the loop.

    NOTE(review): a ``guide_libraries`` directory is created but the files are
    not written into it -- confirm which location is intended.

    Args:
        experiment_file: DataFrame with a ``guide_library`` column.
        access_key: API access key (HTTP basic-auth user).
        secret_key: API secret key (HTTP basic-auth password).
    """
    os.makedirs('guide_libraries', exist_ok=True)

    credentials = (access_key, secret_key)

    def _save(accession):
        # Fetch one file and write the raw bytes next to the notebook.
        reply = requests.get(
            f"https://api.data.igvf.org/files/{accession}/@@download",
            auth=credentials,
        )
        reply.raise_for_status()
        destination = f"guide_metadata_{accession}.tsv"
        with open(destination, 'wb') as handle:
            handle.write(reply.content)

    pending = experiment_file['guide_library'].dropna().unique()
    for guide_library in pending:
        try:
            _save(guide_library)
            print(f"Downloaded guide_metadata {guide_library}")
        except Exception as e:
            print(f"Error downloading guide library {guide_library}: {str(e)}")


In [24]:
# Fetch the guide metadata file(s) referenced by `experiment_file`.
download_guide_library(experiment_file, IGVF_ACCESS_KEY, IGVF_SECRET_KEY)

Downloaded guide_metadata IGVFFI7568FEHI


#### Per-sample file: sequence files (w/ cell hash)

In [12]:
import pandas as pd
import requests
import io

def fetch_data(url: str, access_key: str, secret_key: str) -> pd.DataFrame:
    """Fetch a TSV report from the API and parse it into a DataFrame.

    Lines beginning with ``'20'`` are discarded before parsing.
    NOTE(review): presumably this removes a leading timestamp line in the
    report; it would also drop any data row that starts with "20".

    Args:
        url: Full report URL.
        access_key: API access key (HTTP basic-auth user).
        secret_key: API secret key (HTTP basic-auth password).

    Returns:
        Parsed report with whitespace stripped from the column names.
    """
    response = requests.get(url, auth=(access_key, secret_key))
    lines = response.text.split('\n')
    tsv_data = '\n'.join(line for line in lines if not line.startswith('20'))
    df = pd.read_csv(io.StringIO(tsv_data), sep='\t')
    # Normalise headers like the other fetch helpers in this notebook do, so
    # column lookups by name do not depend on stray padding.
    df.columns = df.columns.str.strip()
    return df

def get_sequence_files(result: pd.DataFrame, access_key: str, secret_key: str) -> pd.DataFrame:
    """
    Fetch sequence files from measurement sets and auxiliary sets.

    For every row, the sequence files of the measurement set are fetched and
    tagged ``scRNA``. If the row lists auxiliary sets (comma-separated paths),
    the first is tagged ``gRNA`` and the second ``hash``; any further entries
    are ignored. The auxiliary-set query does not request ``flowcell_id``.

    Args:
        result: DataFrame containing measurement_sets and associated_auxiliary_sets
        access_key: API access key
        secret_key: API secret key

    Returns:
        DataFrame containing combined sequence files information, with added
        ``measurement_sets`` and ``file_modality`` columns; an empty DataFrame
        when ``result`` has no rows.
    """
    # Modality is assigned by position in the auxiliary-set list.
    aux_modalities = ('gRNA', 'hash')

    def fetch_tagged(collection, accession, extra_fields, measurement_id, modality):
        """Fetch one file set's sequence files and label the rows."""
        url = (
            "https://api.data.igvf.org/multireport.tsv?type=SequenceFile"
            f"&file_set.@id=%2F{collection}%2F{accession}%2F&illumina_read_type=*"
            "&field=%40id&field=accession&field=illumina_read_type&field=lane&field=md5sum"
            f"{extra_fields}&field=seqspecs"
        )
        df = fetch_data(url, access_key, secret_key)
        df['measurement_sets'] = measurement_id
        df['file_modality'] = modality
        return df

    all_tables = []

    for _, row in result.iterrows():
        measurement_id = row['measurement_sets']
        all_tables.append(
            fetch_tagged('measurement-sets', measurement_id, '&field=flowcell_id',
                         measurement_id, 'scRNA')
        )

        aux_value = row['associated_auxiliary_sets']
        if pd.isna(aux_value):
            continue

        # Split the path list and drop blank tokens so they cannot crash the
        # accession extraction below.
        aux_paths = [token.strip() for token in str(aux_value).split(',')]
        aux_paths = [token for token in aux_paths if token.strip('/')]

        # zip() handles one auxiliary set (gRNA only) and two (gRNA + hash).
        for modality, aux_path in zip(aux_modalities, aux_paths):
            # Last path segment, whether or not the path ends with '/'.
            aux_id = aux_path.strip('/').split('/')[-1]
            all_tables.append(
                fetch_tagged('auxiliary-sets', aux_id, '', measurement_id, modality)
            )

    return pd.concat(all_tables, ignore_index=True) if all_tables else pd.DataFrame()

def rearrange_sequence_files(sequence_files: pd.DataFrame) -> pd.DataFrame:
    """Pair R1 and R2 sequence files into one wide row per lane/modality.

    Rows whose ``Illumina Read Type`` is neither ``R1`` nor ``R2`` are ignored.
    R1 and R2 rows are outer-joined on lane, measurement set, modality,
    flowcell and seqspecs; ``Seqspecs`` is then reduced to the accession found
    in a ``/configuration-files/<accession>/`` path (NaN when absent).

    Args:
        sequence_files: Long-format table with columns ``ID``, ``Accession``,
            ``MD5sum``, ``Illumina Read Type``, ``Lane``, ``Flowcell ID``,
            ``Seqspecs``, ``measurement_sets`` and ``file_modality``.

    Returns:
        Wide table with a fresh integer index, sorted by measurement set,
        lane, modality and flowcell.
    """
    join_keys = ['Lane', 'measurement_sets', 'file_modality', 'Flowcell ID', 'Seqspecs']

    def _reads_of(read_type: str) -> pd.DataFrame:
        # Select one read type and give its accession/md5 columns read-specific names.
        path_col = f'{read_type}_path'
        md5_col = f'{read_type}_md5sum'
        subset = sequence_files[sequence_files['Illumina Read Type'] == read_type]
        subset = subset.rename(columns={'Accession': path_col, 'MD5sum': md5_col})
        return subset[['ID', 'Lane', 'measurement_sets', 'file_modality',
                       path_col, md5_col, 'Flowcell ID', 'Seqspecs']]

    paired = _reads_of('R1').merge(_reads_of('R2'), on=join_keys, how='outer')

    # Keep only the seqspec accession; missing values become NaN again after extract.
    seqspec_text = paired['Seqspecs'].fillna('').astype(str)
    paired['Seqspecs'] = seqspec_text.str.extract(r'/configuration-files/([^/]+)/', expand=False)

    paired = paired.sort_values(['measurement_sets', 'Lane', 'file_modality', 'Flowcell ID'])

    ordered = ['R1_path', 'R1_md5sum', 'R2_path', 'R2_md5sum',
               'measurement_sets', 'Lane', 'file_modality', 'Flowcell ID', 'Seqspecs']
    return paired[ordered].reset_index(drop=True)

def process_sequence_data(result: pd.DataFrame, access_key: str, secret_key: str) -> pd.DataFrame:
    """Fetch the sequence files for ``result`` and return them in wide R1/R2 form.

    Thin composition of ``get_sequence_files`` followed by
    ``rearrange_sequence_files``.
    """
    return rearrange_sequence_files(
        get_sequence_files(result, access_key, secret_key)
    )

# Self-assignments: they leave both names unchanged and only raise NameError
# if the configuration cell has not been run.
if __name__ == "__main__":

    IGVF_ACCESS_KEY = IGVF_ACCESS_KEY
    IGVF_SECRET_KEY = IGVF_SECRET_KEY

In [13]:
# Process the data: fetch sequence files for every row of `analysis_set` and
# pair R1/R2 reads into one row each.
sample_file = process_sequence_data(analysis_set, IGVF_ACCESS_KEY, IGVF_SECRET_KEY)
# Bare expression so the notebook renders the table.
sample_file

Unnamed: 0,R1_path,R1_md5sum,R2_path,R2_md5sum,measurement_sets,Lane,file_modality,Flowcell ID,Seqspecs
0,IGVFFI7706SWGW,e2ca5e0b3059d8fb64b9f6e06c17a8dc,IGVFFI7788FDIR,5c55836d72df5acab5f5c10ac3f429b6,IGVFDS2628PYOH,1,gRNA,,
1,IGVFFI5460OSRQ,8e08ada4ba4b86e7e52c05525af159d7,IGVFFI1587BLSX,5d9b02afe94f103e1f76cbe8d36d1227,IGVFDS2628PYOH,1,hash,,
2,IGVFFI1946LEGM,e74e507f714b2f1369a94f3da1a58712,IGVFFI5195OGCL,8ab3678971ee4d8926e7bdf5799f5eff,IGVFDS2628PYOH,1,scRNA,AAC5W2WHV,
3,IGVFFI0240MDNW,1d739060ff362c1c4869694cb729b35f,IGVFFI2281LVOV,f0711502952f1693a71316022f0520f3,IGVFDS2628PYOH,1,scRNA,AAC7VLGHV,
4,IGVFFI3447MKLI,cf24fbe601871dcd50352b3a2bd93924,IGVFFI6246XEYZ,e17ed90217c29b045c1bcdb190376510,IGVFDS2628PYOH,2,gRNA,,
5,IGVFFI8395IXDC,881304b96dd5b2935b9ec5221e431346,IGVFFI3244CIGW,78c36265485162980574b570b74a4dc8,IGVFDS2628PYOH,2,hash,,
6,IGVFFI7820TAWX,a0748346b12ceadf0980876daac0432f,IGVFFI7535OPQC,9e9fd89a7e18d9995daec9c6bb5351db,IGVFDS2628PYOH,2,scRNA,AAC5W2WHV,
7,IGVFFI6854CNMR,61ce5a6705d836cf62156d208a0ddd2c,IGVFFI3549FRZL,db510e471e7e8f1ae7b6ecb6aab8cdb2,IGVFDS2628PYOH,2,scRNA,AAC7VLGHV,
8,IGVFFI7902JDLP,f170871e03b7533f397e29c6ad6a0ea8,IGVFFI7335PJUM,2cf1714d0ed3004f9aa52f1f6be21359,IGVFDS4852MAUM,1,gRNA,,
9,IGVFFI3880AUGY,cc30349f966b9a2c15fc5a4351c7df9c,IGVFFI3453EWPB,595b70257e13d894297abbc32f843a58,IGVFDS4852MAUM,1,hash,,


#### (optional) Filter samples

In [14]:
### filter Lane 1 and Flowcell ID=AAC5W2WHV
# Rows with a missing Flowcell ID are kept as well (the `isna()` term).
# NOTE: this rebinds `sample_file` to the filtered table, so the unfiltered
# table is only recoverable by re-running the cell that builds it.
sample_file = sample_file[(sample_file['Lane'] == 1) & (sample_file['Flowcell ID'].isin(['AAC5W2WHV']) | sample_file['Flowcell ID'].isna())]
sample_file

Unnamed: 0,R1_path,R1_md5sum,R2_path,R2_md5sum,measurement_sets,Lane,file_modality,Flowcell ID,Seqspecs
0,IGVFFI7706SWGW,e2ca5e0b3059d8fb64b9f6e06c17a8dc,IGVFFI7788FDIR,5c55836d72df5acab5f5c10ac3f429b6,IGVFDS2628PYOH,1,gRNA,,
1,IGVFFI5460OSRQ,8e08ada4ba4b86e7e52c05525af159d7,IGVFFI1587BLSX,5d9b02afe94f103e1f76cbe8d36d1227,IGVFDS2628PYOH,1,hash,,
2,IGVFFI1946LEGM,e74e507f714b2f1369a94f3da1a58712,IGVFFI5195OGCL,8ab3678971ee4d8926e7bdf5799f5eff,IGVFDS2628PYOH,1,scRNA,AAC5W2WHV,
8,IGVFFI7902JDLP,f170871e03b7533f397e29c6ad6a0ea8,IGVFFI7335PJUM,2cf1714d0ed3004f9aa52f1f6be21359,IGVFDS4852MAUM,1,gRNA,,
9,IGVFFI3880AUGY,cc30349f966b9a2c15fc5a4351c7df9c,IGVFFI3453EWPB,595b70257e13d894297abbc32f843a58,IGVFDS4852MAUM,1,hash,,
10,IGVFFI3192CBTL,ce4fcae92f281279c9202a395639c74a,IGVFFI7829KDAL,97827a081d3fa927f844afc5215e0ed3,IGVFDS4852MAUM,1,scRNA,AAC5W2WHV,
16,IGVFFI9797JSJB,e8ae2066216fb9080d549ccf6c462a71,IGVFFI2834JYDA,8df98094e7d2f1f9e550d2790c59d203,IGVFDS7733OTCA,1,gRNA,,
17,IGVFFI8807NJZP,f1fcaa5a2401a3e0032df6b53ecade9f,IGVFFI2898JUNF,0dfe02380fe7f157ed9caa013465be31,IGVFDS7733OTCA,1,hash,,
18,IGVFFI8182EUUN,24df8700cfa8b88275c9d21b9a494a31,IGVFFI4798ZEXE,b6d7029645916edcc29058e2bbec53d5,IGVFDS7733OTCA,1,scRNA,AAC5W2WHV,
24,IGVFFI7205PTPJ,7bc2b7bf5415c8d80d347ba3a07a8fc2,IGVFFI0105QRUB,8c8a36250ae12df77dc22abc1fa6873b,IGVFDS7834BXBN,1,gRNA,,


#### Export Files

In [15]:
def export_data(analysis_set, experiment_file, sample_file):
    """Write the three tables as tab-separated files in the working directory.

    Files written (without the index column): ``analysis_sets.tsv``,
    ``per-experiment_file.tsv`` and ``per-sample_file.tsv``.
    """
    outputs = (
        ('analysis_sets.tsv', analysis_set),
        ('per-experiment_file.tsv', experiment_file),
        ('per-sample_file.tsv', sample_file),
    )
    for filename, table in outputs:
        table.to_csv(filename, sep='\t', index=False)

# Write the three tables built above to TSV files in the working directory.
export_data(analysis_set, experiment_file, sample_file)

#### Download all required files

In [None]:
# IGVF_ACCESS_KEY=YOUR_ACCESS_KEY
# IGVF_SECRET_KEY=YOUR_SECRET_KEY

# ACCESSIONS=(IGVFFI1946LEGM IGVFFI5195OGCL IGVFFI7706SWGW IGVFFI7788FDIR IGVFFI5460OSRQ IGVFFI1587BLSX)
# for ACCESSION in ${ACCESSIONS[@]};
# do
# curl -O -L \
#         -u ${IGVF_ACCESS_KEY}:${IGVF_SECRET_KEY} \
# https://api.data.igvf.org/sequence-files/${ACCESSION}/@@download/${ACCESSION}.fastq.gz
# done

In [12]:
import os
import requests
from typing import List, Tuple

def download_fastq_files(df, access_key: str, secret_key: str):
    """
    Download fastq files for pairs of R1 and R2 accessions into a fastq_files directory

    Accessions are read from the ``R1_path``/``R2_path`` columns (the names used
    by the per-sample table built in this notebook); the older ``R1``/``R2``
    names are still accepted. Missing mates (NaN/None) are skipped. Each file is
    streamed to ``fastq_files/<accession>.fastq.gz``; a non-200 response is
    reported with ``print`` and the loop continues.

    Args:
        df: Table with one row per read pair.
        access_key: API access key (HTTP basic-auth user).
        secret_key: API secret key (HTTP basic-auth password).

    Raises:
        KeyError: if neither naming of the R1/R2 columns is present.
    """
    # Create fastq_files directory if it doesn't exist
    fastq_dir = "fastq_files"
    os.makedirs(fastq_dir, exist_ok=True)

    base_url = "https://api.data.igvf.org/sequence-files"

    def _pick_column(preferred, legacy):
        # Fall back to the legacy name; df[legacy] raises KeyError if absent too.
        return preferred if preferred in df.columns else legacy

    r1_column = _pick_column('R1_path', 'R1')
    r2_column = _pick_column('R2_path', 'R2')

    def _download(accession):
        """Stream one FASTQ to disk without holding it all in memory."""
        url = f"{base_url}/{accession}/@@download/{accession}.fastq.gz"
        output_path = os.path.join(fastq_dir, f"{accession}.fastq.gz")
        partial_path = output_path + ".part"
        with requests.get(url, auth=(access_key, secret_key), stream=True) as response:
            if response.status_code != 200:
                print(f"Failed to download {accession}: {response.status_code}")
                return
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        # Rename only after a complete write so an interrupted transfer never
        # leaves a truncated file under the final name.
        os.replace(partial_path, output_path)
        print(f"Downloaded {accession} to {fastq_dir}")

    for pair in zip(df[r1_column], df[r2_column]):
        # R1 first, then R2, matching the original download order.
        for accession in pair:
            # An outer join can leave a mate missing; only real accessions are fetched.
            if not isinstance(accession, str) or not accession:
                continue
            _download(accession)

# Self-assignments: they leave both names unchanged and only raise NameError
# if the configuration cell has not been run.
if __name__ == "__main__":
    
    IGVF_ACCESS_KEY = IGVF_ACCESS_KEY
    IGVF_SECRET_KEY = IGVF_SECRET_KEY

In [47]:
import os
import requests
from typing import List

def download_yaml_files(df, access_key: str, secret_key: str):
    """
    Download the seqspec YAML archive for every distinct value in ``df['Seqspecs']``.

    Null entries are ignored. Each accession is requested from
    ``/configuration-files/<id>/@@download/<id>.yaml.gz`` and saved as
    ``seqspecs_files/<id>.yaml.gz``; any status other than 200 is reported
    with ``print`` and skipped.
    """
    yaml_dir = "seqspecs_files"
    os.makedirs(yaml_dir, exist_ok=True)

    base_url = "https://api.data.igvf.org/configuration-files"
    credentials = (access_key, secret_key)

    for seqspecs_id in df['Seqspecs'].dropna().unique():
        reply = requests.get(
            f"{base_url}/{seqspecs_id}/@@download/{seqspecs_id}.yaml.gz",
            auth=credentials,
        )

        # Guard clause: report the failure and move on to the next accession.
        if reply.status_code != 200:
            print(f"Failed to download {seqspecs_id}: {reply.status_code}")
            continue

        target = os.path.join(yaml_dir, f"{seqspecs_id}.yaml.gz")
        with open(target, 'wb') as handle:
            handle.write(reply.content)
        print(f"Downloaded {seqspecs_id} to {yaml_dir}")

# Self-assignments: they leave both names unchanged and only raise NameError
# if the configuration cell has not been run.
if __name__ == "__main__":
    IGVF_ACCESS_KEY = IGVF_ACCESS_KEY
    IGVF_SECRET_KEY = IGVF_SECRET_KEY

In [None]:
# Download files
# Both calls read the (possibly filtered) `sample_file` table: FASTQs go to
# fastq_files/ and seqspec archives to seqspecs_files/.
download_fastq_files(sample_file, IGVF_ACCESS_KEY, IGVF_SECRET_KEY)
download_yaml_files(sample_file, IGVF_ACCESS_KEY, IGVF_SECRET_KEY)

#### What we need now:

- analysis_set_id
- filter samples
- link to hash metadata