# Goal

* Create a list of processed SRX accessions per species, with file paths
* These will be used to create UMAP plots for certain species

# Init

In [19]:
import os
from glob import glob
from dotenv import load_dotenv
import pandas as pd
from pypika import Query, Table, Criterion, functions as fn
from SRAgent.db.connect import db_connect

In [2]:
# Pull variables from the local .env file; override=True lets values from
# .env replace variables that are already set in the process environment.
load_dotenv(override=True)
# Assigned after load_dotenv so this value wins over anything in .env.
# NOTE(review): presumably selects the production settings read by db_connect — confirm.
os.environ.update({"DYNACONF": "prod"})

# Load data

In [9]:
# Load STAR results: only "gene_ex50" feature rows with more than 3000
# estimated cells.
tbl = Table("screcounter_star_results")
stmt = (
    Query.from_(tbl)
    .select("*")
    .where(tbl.feature == "gene_ex50")
    .where(tbl.estimated_number_of_cells > 3000)
)
# Render the pypika statement to a SQL string for pandas.
star_sql = str(stmt)

with db_connect() as conn:
    star_results = pd.read_sql(star_sql, conn)
star_results

Unnamed: 0,sample,feature,estimated_number_of_cells,fraction_of_unique_reads_in_cells,mean_gene_per_cell,mean_umi_per_cell,mean_feature_per_cell,median_gene_per_cell,median_umi_per_cell,median_feature_per_cell,...,reads_with_valid_barcodes,sequencing_saturation,total_feature_detected,umis_in_cells,unique_reads_in_cells_mapped_to_gene,unique_reads_in_cells_mapped_to_genefull,unique_reads_in_cells_mapped_to_genefull_ex50pas,unique_reads_in_cells_mapped_to_genefull_exonoverintron,created_at,updated_at
0,ERX10396585,gene_ex50,4875,0.869061,,33160.0,5162.0,,33087.0,6150.0,...,0.977392,0.219096,25479.0,161657597,,,207340012.0,,2025-01-13 16:03:19.007361,2025-01-14 20:06:25.557352
1,ERX10855741,gene_ex50,4299,0.876242,,5126.0,2524.0,,4492.0,2416.0,...,0.961318,0.723613,28667.0,22039225,,,81002428.0,,2025-01-13 16:56:43.936232,2025-01-14 20:06:25.557352
2,ERX9427783,gene_ex50,6404,0.889150,,8624.0,2714.0,,7230.0,2694.0,...,0.975229,0.484923,29386.0,55230288,,,107808456.0,,2025-01-14 21:19:47.161614,2025-01-14 21:19:47.161614
3,ERX10855742,gene_ex50,3428,0.888622,,5346.0,2551.0,,4247.0,2329.0,...,0.959722,0.725166,28140.0,18326986,,,68051908.0,,2025-01-13 16:50:48.249748,2025-01-14 20:06:25.557352
4,ERX10981676,gene_ex50,7703,0.890946,,6834.0,2250.0,,5547.0,2094.0,...,0.975521,0.605853,23180.0,52648371,,,136342362.0,,2025-01-13 17:08:48.729106,2025-01-14 20:06:25.557352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22439,SRX25708138,gene_ex50,10802,0.936122,,8037.0,2342.0,,5020.0,1971.0,...,0.921812,0.393687,20388.0,86820516,,,143097465.0,,2025-02-20 22:57:54.885096,2025-02-20 22:57:54.885096
22440,SRX22169670,gene_ex50,8198,0.711641,,1860.0,953.0,,1425.0,862.0,...,0.967051,0.277983,10472.0,15256050,,,21361026.0,,2025-02-20 23:18:06.387448,2025-02-20 23:18:06.387448
22441,SRX25713883,gene_ex50,12881,0.904704,,20517.0,3982.0,,10257.0,3793.0,...,0.947238,0.421060,28701.0,264287183,,,461781442.0,,2025-02-20 23:40:10.887207,2025-02-20 23:40:10.887207
22442,SRX25708136,gene_ex50,11669,0.890490,,10461.0,2872.0,,6899.0,2536.0,...,0.917599,0.599476,21799.0,122069877,,,304941897.0,,2025-02-20 23:41:32.051897,2025-02-20 23:41:32.051897


In [39]:
# Load the distinct srx_metadata rows that have no CZI collection id
# (czi_collection_id IS NULL) into a pandas DataFrame.
tbl = Table("srx_metadata")
stmt = (
    Query.from_(tbl)
    .select("*")
    .distinct()
    .where(tbl.czi_collection_id.isnull())
)
# Render the pypika statement to a SQL string for pandas.
metadata_sql = str(stmt)

with db_connect() as conn:
    srx_metadata = pd.read_sql(metadata_sql, conn)
srx_metadata

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name,notes,created_at,updated_at
0,sra,100000,SRX087804,yes,no,yes,not_applicable,not_applicable,not_applicable,other,not specified,not specified,not specified,not specified,,,Metadata obtained by SRAgent,2025-02-11 11:20:27.383612,2025-02-11 11:22:05.479718
1,sra,100001,SRX087805,yes,no,yes,not_applicable,not_applicable,not_applicable,other,unsure,unsure,unsure,not_applicable,,,Metadata obtained by SRAgent,2025-02-11 11:20:27.383612,2025-02-11 11:21:54.748818
2,sra,100002,SRX087806,yes,no,yes,not_applicable,not_applicable,not_applicable,other,unsure,unsure,unsure,unsure,,,Metadata obtained by SRAgent,2025-02-11 11:20:27.383612,2025-02-11 11:22:00.892413
3,sra,100003,SRX087807,yes,no,yes,not_applicable,not_applicable,not_applicable,other,unsure,unsure,unsure,unsure,,,Metadata obtained by SRAgent,2025-02-11 11:20:27.383612,2025-02-11 11:21:55.084329
4,sra,100004,SRX087808,yes,no,yes,not_applicable,not_applicable,not_applicable,other,unsure,unsure,unsure,unsure,,,Metadata obtained by SRAgent,2025-02-11 11:20:27.383612,2025-02-11 11:21:53.522703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67598,sra,100000005,,,,,,,,,,,,,,,New dataset found by Find-Datasets agent,2025-01-16 13:20:31.752503,2025-01-16 13:20:31.752503
67599,sra,100000006,,,,,,,,,,,,,,,New dataset found by Find-Datasets agent,2025-01-16 13:20:31.752503,2025-01-16 13:20:31.752503
67600,sra,100000007,,,,,,,,,,,,,,,New dataset found by Find-Datasets agent,2025-01-16 13:20:31.752503,2025-01-16 13:20:31.752503
67601,sra,100000008,,,,,,,,,,,,,,,New dataset found by Find-Datasets agent,2025-01-16 13:20:31.752503,2025-01-16 13:20:31.752503


In [41]:
# Keep only the metadata rows whose accession appears among the STAR
# result samples.
has_star_results = srx_metadata["srx_accession"].isin(star_results["sample"])
srx_metadata = srx_metadata.loc[has_star_results]
srx_metadata.shape

(22422, 19)

In [None]:
# Cap each organism at its first 100 rows (in the current row order, not a
# random sample), then renumber the index from zero.
per_organism = srx_metadata.groupby("organism")
srx_metadata = per_organism.head(100).reset_index(drop=True)
srx_metadata

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name,notes,created_at,updated_at
0,sra,10094477,SRX7723647,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Drosophila melanogaster,ureter and lower tubules,not specified,not specified,not applicable,,,Metadata obtained by SRAgent,2025-02-10 13:20:27.590900,2025-02-10 13:23:31.772165
1,sra,10524643,SRX8082006,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Danio rerio,whole organism,not specified,not specified,not applicable,,,Metadata obtained by SRAgent,2025-02-10 14:05:29.747741,2025-02-10 14:09:52.175351
2,sra,10525137,SRX8082500,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Danio rerio,whole organism,not specified,micro-injected with gRNA pool and Cas9 mRNA mi...,not applicable,,,Metadata obtained by SRAgent,2025-02-08 05:25:29.492257,2025-02-08 05:26:38.375022
3,sra,10969899,SRX8434172,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Danio rerio,caudal fin,Strongyloidiasis,"Ivermectin, Thiabendazole, Mebendazole, Albend...",unsure,,,Metadata obtained by SRAgent,2025-02-09 18:20:32.244925,2025-02-09 18:21:47.191491
4,sra,11032409,SRX8487983,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Zea mays,"SAM (Shoot Apical Meristem), plastochrons 1-6",none,none,single cells (protoplasts),,,Metadata obtained by SRAgent,2025-02-10 18:25:31.352826,2025-02-10 18:29:03.676546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099,sra,36084756,SRX26708354,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Gallus gallus,Somite,unsure,unsure,unsure,,,Metadata obtained by SRAgent,2025-02-08 16:15:27.270171,2025-02-08 16:18:16.793937
1100,sra,36084758,SRX26708356,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Gallus gallus,somite,not specified,not specified,not specified,,,Metadata obtained by SRAgent,2025-02-10 04:45:31.519252,2025-02-10 04:46:54.953339
1101,sra,36879056,SRX27335701,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Caenorhabditis elegans,rab-3::tagRFP+ neurons,none,young adult males,MOS264,,,Metadata obtained by SRAgent,2025-02-09 15:55:28.997356,2025-02-09 15:57:05.945129
1102,sra,37124417,SRX27536440,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Solanum lycopersicum,Adventitious root,unsure,Hydroponic culture,unsure,,,Metadata obtained by SRAgent,2025-02-16 10:30:26.807923,2025-02-16 10:31:36.464400


In [47]:
# Number of metadata records per organism
srx_metadata.loc[:, "organism"].value_counts()

organism
Drosophila melanogaster    100
Danio rerio                100
Mus musculus               100
Macaca mulatta             100
Homo sapiens               100
Sus scrofa                 100
Arabidopsis thaliana       100
Gallus gallus               72
Heterocephalus glaber       68
Bos taurus                  42
Caenorhabditis elegans      35
Pan troglodytes             34
Zea mays                    31
Oryctolagus cuniculus       29
Oryza sativa                27
Callithrix jacchus          21
Ovis aries                  20
Equus caballus              11
Solanum lycopersicum         9
Schistosoma mansoni          3
Gorilla gorilla              2
Name: count, dtype: int64

In [48]:
# Find the STAR output directories whose name is one of the selected accessions
to_keep = set(srx_metadata["srx_accession"])
star_dirs = glob("/processed_datasets/scRecount/scRecounter/prod3/*/STAR/*")
files = [
    [os.path.basename(star_dir), star_dir]
    for star_dir in star_dirs
    if os.path.basename(star_dir) in to_keep
]

# Tabulate the (accession, directory path) pairs
files_df = pd.DataFrame(files, columns=["srx_accession", "path"])
files_df

Unnamed: 0,srx_accession,path
0,SRX19907345,/processed_datasets/scRecount/scRecounter/prod...
1,SRX19907347,/processed_datasets/scRecount/scRecounter/prod...
2,SRX19907342,/processed_datasets/scRecount/scRecounter/prod...
3,SRX22821312,/processed_datasets/scRecount/scRecounter/prod...
4,SRX19498729,/processed_datasets/scRecount/scRecounter/prod...
...,...,...
1085,SRX9689819,/processed_datasets/scRecount/scRecounter/prod...
1086,SRX9689818,/processed_datasets/scRecount/scRecounter/prod...
1087,ERX10138352,/processed_datasets/scRecount/scRecounter/prod...
1088,ERX10138355,/processed_datasets/scRecount/scRecounter/prod...


In [50]:
# Attach the on-disk path to each metadata record. The inner join drops
# accessions that have no matching row in files_df.
# A "path" column left over from a previous run of this cell is removed first,
# so re-running the cell does not produce "path_x"/"path_y" columns.
srx_metadata = (
    srx_metadata
    .drop(columns=["path"], errors="ignore")
    .merge(files_df, on="srx_accession", how="inner")
)
srx_metadata

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name,notes,created_at,updated_at,path
0,sra,10094477,SRX7723647,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Drosophila melanogaster,ureter and lower tubules,not specified,not specified,not applicable,,,Metadata obtained by SRAgent,2025-02-10 13:20:27.590900,2025-02-10 13:23:31.772165,/processed_datasets/scRecount/scRecounter/prod...
1,sra,10524643,SRX8082006,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Danio rerio,whole organism,not specified,not specified,not applicable,,,Metadata obtained by SRAgent,2025-02-10 14:05:29.747741,2025-02-10 14:09:52.175351,/processed_datasets/scRecount/scRecounter/prod...
2,sra,10525137,SRX8082500,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Danio rerio,whole organism,not specified,micro-injected with gRNA pool and Cas9 mRNA mi...,not applicable,,,Metadata obtained by SRAgent,2025-02-08 05:25:29.492257,2025-02-08 05:26:38.375022,/processed_datasets/scRecount/scRecounter/prod...
3,sra,10969899,SRX8434172,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Danio rerio,caudal fin,Strongyloidiasis,"Ivermectin, Thiabendazole, Mebendazole, Albend...",unsure,,,Metadata obtained by SRAgent,2025-02-09 18:20:32.244925,2025-02-09 18:21:47.191491,/processed_datasets/scRecount/scRecounter/prod...
4,sra,11032409,SRX8487983,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Zea mays,"SAM (Shoot Apical Meristem), plastochrons 1-6",none,none,single cells (protoplasts),,,Metadata obtained by SRAgent,2025-02-10 18:25:31.352826,2025-02-10 18:29:03.676546,/processed_datasets/scRecount/scRecounter/prod...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1085,sra,36084756,SRX26708354,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Gallus gallus,Somite,unsure,unsure,unsure,,,Metadata obtained by SRAgent,2025-02-08 16:15:27.270171,2025-02-08 16:18:16.793937,/processed_datasets/scRecount/scRecounter/prod...
1086,sra,36084758,SRX26708356,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Gallus gallus,somite,not specified,not specified,not specified,,,Metadata obtained by SRAgent,2025-02-10 04:45:31.519252,2025-02-10 04:46:54.953339,/processed_datasets/scRecount/scRecounter/prod...
1087,sra,36879056,SRX27335701,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Caenorhabditis elegans,rab-3::tagRFP+ neurons,none,young adult males,MOS264,,,Metadata obtained by SRAgent,2025-02-09 15:55:28.997356,2025-02-09 15:57:05.945129,/processed_datasets/scRecount/scRecounter/prod...
1088,sra,37124417,SRX27536440,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,Solanum lycopersicum,Adventitious root,unsure,Hydroponic culture,unsure,,,Metadata obtained by SRAgent,2025-02-16 10:30:26.807923,2025-02-16 10:31:36.464400,/processed_datasets/scRecount/scRecounter/prod...


In [52]:
# Number of records per organism after joining with the file paths
srx_metadata.loc[:, "organism"].value_counts()

organism
Mus musculus               100
Macaca mulatta             100
Sus scrofa                 100
Homo sapiens               100
Danio rerio                 98
Arabidopsis thaliana        98
Drosophila melanogaster     96
Gallus gallus               72
Heterocephalus glaber       68
Bos taurus                  41
Pan troglodytes             34
Zea mays                    31
Caenorhabditis elegans      30
Oryctolagus cuniculus       29
Oryza sativa                27
Callithrix jacchus          21
Ovis aries                  20
Equus caballus              11
Solanum lycopersicum         9
Schistosoma mansoni          3
Gorilla gorilla              2
Name: count, dtype: int64

In [55]:
# Save the table ordered by organism, omitting the DataFrame index
species_table = srx_metadata.sort_values(by="organism")
species_table.to_csv("species_umap.csv", index=False)