# Goals

* Check on the test and/or production databases

In [10]:
from dotenv import load_dotenv
load_dotenv(override=True)

True

In [11]:
import os
import pandas as pd
from pypika import Query, Table, Field, Column, Criterion

In [12]:
from SRAgent.db.connect import db_connect
from SRAgent.db.upsert import db_upsert
from SRAgent.db.utils import db_list_tables, db_glimpse_tables, db_get_table, execute_query
from SRAgent.db.get import db_find_srx
from SRAgent.db.create import create_table, create_table_router

In [13]:
# Select the prod settings (presumably read by db_connect -- confirm),
# open the connection, and print one table name per line.
os.environ["DYNACONF"] = "prod"
conn_prod = db_connect()
print(*db_list_tables(conn_prod), sep="\n")

screcounter_star_results
eval
screcounter_trace
srx_srr
srx_metadata
screcounter_log
screcounter_star_params


# Production

In [5]:
# Same table listing as printed earlier, displayed here as the returned list.
db_list_tables(conn_prod)

['screcounter_star_results',
 'eval',
 'screcounter_trace',
 'srx_srr',
 'srx_metadata',
 'screcounter_log',
 'screcounter_star_params']

In [6]:
# Quick look at the tables reachable through the prod connection
# (the helper prints a header per table followed by its columns).
db_glimpse_tables(conn_prod)

#-- Table: screcounter_star_results --#
sample	feature	estimated_number_of_cells	fraction_of_unique_reads_in_cells	mean_gene_per_cell	mean_umi_per_cell	mean_feature_per_cell	median_gene_per_cell	median_umi_per_cell	median_feature_per_cell	number_of_reads	median_reads_per_cell	q30_bases_in_cb_umi	q30_bases_in_rna_read	reads_mapped_to_gene__unique_gene	reads_mapped_to_gene__unique_multiple_gene	reads_mapped_to_genefull__unique_genefull	reads_mapped_to_genefull__unique_multiple_genefull	reads_mapped_to_genefull_ex50pas__unique_genefull_ex50pas	reads_mapped_to_genefull_ex50pas__unique_multiple_genefull_ex50	reads_mapped_to_genefull_exonoverintron__unique_genefull_exonov	reads_mapped_to_genefull_exonoverintron__unique_multiple_genefu	reads_mapped_to_genome__unique	reads_mapped_to_genome__unique_multiple	reads_mapped_to_velocyto__unique_velocyto	reads_mapped_to_velocyto__unique_multiple_velocyto	reads_with_valid_barcodes	sequencing_saturation	total_feature_detected	umis_in_cells	unique_reads

In [7]:
# Load the distinct srx_metadata rows into a DataFrame, leaving out the
# records whose notes read "Processed by Chris Carpenter".
# NOTE(review): under standard SQL comparison rules `!=` also filters out rows
# whose notes are NULL -- confirm that is intended.
tbl = Table("srx_metadata")
stmt = (
    Query.from_(tbl)
    .select("*")
    .distinct()
    .where(tbl.notes != "Processed by Chris Carpenter")
)
srx_metadata = pd.read_sql(str(stmt), conn_prod)
srx_metadata

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name,notes,created_at,updated_at
0,sra,8732970,SRX6621276,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,mouse,RPE/choroid single cells,not specified,cultured choroidal GLI1+ cells treated with DM...,"B6129PF1/J, 3 month-old, RPE/choroid single cells",,,Metadata obtained by SRAgent,2025-01-23 17:20:30.662485,2025-01-23 17:21:51.385908
1,sra,8732971,SRX6621277,yes,yes,yes,10x_Genomics,other,single_cell,mouse,RPE/choroid single cells,not specified,Cultured choroidal GLI1+ cells treated with DM...,"Strain background: C57BL/6J, Age: 3 month-old,...",,,Metadata obtained by SRAgent,2025-01-22 22:45:26.805249,2025-01-22 22:47:13.950104
2,sra,10263497,SRX7845469,yes,yes,yes,10x_Genomics,5_prime_gex,single_cell,human,Peripheral Blood Mononuclear Cells (PBMC),not specified,not specified,not specified,,,Metadata obtained by SRAgent,2025-01-27 07:30:27.589695,2025-01-27 07:32:05.085304
3,sra,10855787,SRX8344829,yes,yes,yes,Smart-seq2,not_applicable,single_cell,mouse,lung,unsure,unsure,unsure,,,Metadata obtained by SRAgent,2025-01-21 17:28:27.920468,2025-01-21 17:29:55.383231
4,sra,10855906,SRX8344948,yes,yes,yes,Smart-seq2,not_applicable,single_cell,mouse,lung,unsure,unsure,unsure,,,Metadata obtained by SRAgent,2025-01-21 17:28:27.920468,2025-01-21 17:36:38.159210
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40841,sra,100000005,,,,,,,,,,,,,,,New dataset found by Find-Datasets agent,2025-01-16 13:20:31.752503,2025-01-16 13:20:31.752503
40842,sra,100000006,,,,,,,,,,,,,,,New dataset found by Find-Datasets agent,2025-01-16 13:20:31.752503,2025-01-16 13:20:31.752503
40843,sra,100000007,,,,,,,,,,,,,,,New dataset found by Find-Datasets agent,2025-01-16 13:20:31.752503,2025-01-16 13:20:31.752503
40844,sra,100000008,,,,,,,,,,,,,,,New dataset found by Find-Datasets agent,2025-01-16 13:20:31.752503,2025-01-16 13:20:31.752503


In [None]:
# Any srx_accession appearing more than once? Rows without an accession are
# set aside first; keep=False shows every member of a duplicated group.
x = srx_metadata.loc[srx_metadata["srx_accession"].notna()]
x.loc[x.duplicated(subset=["srx_accession"], keep=False)]

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name,notes,created_at,updated_at


In [None]:
# duplicate srx_accession values?
# Summarise how many rows each accession has; a max of 1 means no accession repeats.
x["srx_accession"].value_counts().describe()

count    40082.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0
Name: count, dtype: float64

In [13]:
# duplicate entrez_id values?
# Summarise how many rows each entrez_id has; a max of 1 means no id repeats.
x["entrez_id"].value_counts().describe()

count    14630.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0
Name: count, dtype: float64

In [10]:
# number of samples whose lib_prep is exactly "10x_Genomics"
len(srx_metadata.loc[srx_metadata["lib_prep"].eq("10x_Genomics")])

8434

# STAR results table

In [11]:
# Load the whole screcounter_star_results table into a DataFrame.
tbl = Table("screcounter_star_results")
stmt = Query.from_(tbl).select("*")
star_results = pd.read_sql(str(stmt), conn_prod)
star_results

Unnamed: 0,sample,feature,estimated_number_of_cells,fraction_of_unique_reads_in_cells,mean_gene_per_cell,mean_umi_per_cell,mean_feature_per_cell,median_gene_per_cell,median_umi_per_cell,median_feature_per_cell,...,reads_with_valid_barcodes,sequencing_saturation,total_feature_detected,umis_in_cells,unique_reads_in_cells_mapped_to_gene,unique_reads_in_cells_mapped_to_genefull,unique_reads_in_cells_mapped_to_genefull_ex50pas,unique_reads_in_cells_mapped_to_genefull_exonoverintron,created_at,updated_at
0,ERX10024831,gene_full,2292,0.911745,,12557.0,2730.0,,10513.0,2775.0,...,0.983305,0.593299,21731.0,28780763,,71181072.0,,,2025-01-13 15:27:29.589416,2025-01-14 20:06:25.557352
1,ERX10024831,gene_ex50,2290,0.912572,,17603.0,2756.0,,11784.0,2799.0,...,0.983334,0.601129,21911.0,40311609,,,101599277.0,,2025-01-13 15:27:29.589416,2025-01-14 20:06:25.557352
2,ERX10024831,gene_ex_int,2290,0.912561,,17625.0,2767.0,,11803.0,2809.0,...,0.983322,0.601073,22001.0,40362108,,,,101712901.0,2025-01-13 15:27:29.589416,2025-01-14 20:06:25.557352
3,ERX10024831,velocyto,0,,,,,,,,...,0.984647,-inf,,0,,,,,2025-01-13 15:27:29.589416,2025-01-14 20:06:25.557352
4,ERX10086874,gene_full,2262,0.909667,,4977.0,2018.0,,4615.0,2116.0,...,0.967224,0.887163,21453.0,11258108,,108541887.0,,,2025-01-13 15:17:54.767872,2025-01-14 20:06:25.557352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15940,ERX11899886,gene,19391,0.903745,,6698.0,1871.0,,4492.0,1617.0,...,0.984381,0.761567,29240.0,129894014,576267904.0,,,,2025-01-21 17:33:06.489468,2025-01-21 17:33:06.489468
15941,ERX11899886,gene_ex50,20030,0.914242,,7989.0,2432.0,,5618.0,2163.0,...,0.983904,0.765714,32603.0,160037142,,,726711495.0,,2025-01-21 17:33:06.489468,2025-01-21 17:33:06.489468
15942,ERX11899886,gene_ex_int,20035,0.914298,,8012.0,2447.0,,5639.0,2178.0,...,0.983833,0.765700,32621.0,160539579,,,,729034278.0,2025-01-21 17:33:06.489468,2025-01-21 17:33:06.489468
15943,ERX11899886,gene_full,20016,0.914711,,7865.0,2423.0,,5565.0,2159.0,...,0.983820,0.765057,31181.0,157427372,,712975893.0,,,2025-01-21 17:33:06.489468,2025-01-21 17:33:06.489468


In [12]:
# Summarise how many result rows each sample has.
star_results["sample"].value_counts().describe()

count    3189.0
mean        5.0
std         0.0
min         5.0
25%         5.0
50%         5.0
75%         5.0
max         5.0
Name: count, dtype: float64

# Parameters table

In [11]:
# Load the screcounter_star_params rows whose star_index is an empty string.
tbl = Table("screcounter_star_params")
stmt = (
    Query.from_(tbl)
    .select("*")
    .where(tbl.star_index == "")
)
params = pd.read_sql(str(stmt), conn_prod)
params

Unnamed: 0,sample,barcodes,star_index,cell_barcode_length,umi_length,strand,created_at,updated_at


In [50]:
# Load the distinct (srx_accession, organism) pairs from srx_metadata, again
# leaving out the "Processed by Chris Carpenter" records.
# NOTE: this rebinds `srx_metadata` to a two-column frame; columns such as
# entrez_id are no longer available from it afterwards.
tbl = Table("srx_metadata")
stmt = (
    Query.from_(tbl)
    .select(tbl.srx_accession, tbl.organism)
    .distinct()
    .where(tbl.notes != "Processed by Chris Carpenter")
)
srx_metadata = pd.read_sql(str(stmt), conn_prod)
srx_metadata

Unnamed: 0,srx_accession,organism
0,SRX22822724,mouse
1,,
2,SRX20516510,mouse
3,SRX26710136,mouse
4,SRX24387545,human
...,...,...
642,ERX10668489,mouse
643,SRX19719137,human
644,SRX22006554,human
645,SRX20274306,mouse


In [51]:
# merge by srx_accession
def set_star_ref(row):
    """Return the STAR reference name matching a row's organism.

    Args:
        row: Any object exposing an ``organism`` attribute, e.g. a DataFrame
            row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns:
        ``"star_refData_2020_hg38"`` for "human" and
        ``"star_refData_2020_mm10"`` for "mouse".

    Raises:
        ValueError: If the organism is neither "human" nor "mouse".
    """
    # Checked in order; the first organism that compares equal wins.
    star_refs = (
        ("human", "star_refData_2020_hg38"),
        ("mouse", "star_refData_2020_mm10"),
    )
    for organism, star_ref in star_refs:
        if row.organism == organism:
            return star_ref
    raise ValueError(f"organism {row.organism} not supported")

# Attach each sample's organism by joining on the SRX accession. This is an
# inner join, so params rows without a matching srx_metadata row are dropped.
params = params.merge(srx_metadata, left_on="sample", right_on="srx_accession")
# Build the column from a plain list instead of DataFrame.apply(axis=1): on an
# empty frame apply() hands back a DataFrame, which cannot be assigned to a
# single column. iterrows() passes the same kind of row (a Series) that
# apply(axis=1) would.
params["star_index"] = [set_star_ref(row) for _, row in params.iterrows()]
# errors="ignore" keeps the cell re-runnable once the timestamp columns have
# already been removed from params by a previous run.
params = params.drop(
    columns=["created_at", "updated_at", "srx_accession", "organism"],
    errors="ignore",
)
params

Unnamed: 0,sample,barcodes,star_index,cell_barcode_length,umi_length,strand
0,ERX11662340,3M-february-2018.txt,star_refData_2020_hg38,16,12,Forward
1,SRX19392127,3M-february-2018.txt,star_refData_2020_mm10,16,12,Forward
2,SRX24158623,3M-february-2018.txt,star_refData_2020_hg38,16,12,Forward
3,SRX22159680,737K-august-2016.txt,star_refData_2020_hg38,16,10,Reverse
4,SRX24228136,3M-february-2018.txt,star_refData_2020_hg38,16,12,Forward
5,SRX24387553,737K-august-2016.txt,star_refData_2020_hg38,16,10,Reverse
6,SRX24461697,3M-february-2018.txt,star_refData_2020_mm10,16,12,Forward
7,SRX24523046,3M-february-2018.txt,star_refData_2020_mm10,16,12,Forward
8,SRX24585614,3M-february-2018.txt,star_refData_2020_hg38,16,12,Forward
9,SRX25852474,3M-february-2018.txt,star_refData_2020_hg38,16,12,Forward


In [52]:
# Upsert the updated parameter rows into screcounter_star_params.
# NOTE(review): conn_prod is the production connection -- check `params`
# before running this cell.
db_upsert(params, "screcounter_star_params", conn_prod)

# --OLD--

### Filter out records with no SRX accessions

Records whose `notes` value is `New dataset found by Find-Datasets agent` have no SRX accession yet.

In [8]:
# Unique entrez_ids of the records that have no SRX accession.
# NOTE: needs a srx_metadata frame that still carries the entrez_id column.
entrez_ids = (
    srx_metadata.loc[srx_metadata["srx_accession"].isna(), "entrez_id"]
    .unique()
    .tolist()
)
entrez_ids

[]

In [13]:
# Delete the collected entrez_ids from srx_metadata.
tbl = Table("srx_metadata")
stmt = (
    Query.from_(tbl)
    .delete()
    .where(tbl.entrez_id.isin(entrez_ids))
)
# An empty id list renders as `IN ()`, which PostgreSQL (among others) rejects
# as a syntax error, so only run the statement when there is something to delete.
if entrez_ids:
    try:
        with conn_prod.cursor() as cur:
            cur.execute(str(stmt))
        conn_prod.commit()
    except Exception:
        # Leave the shared connection usable for later cells instead of
        # stuck in a failed transaction.
        conn_prod.rollback()
        raise

In [16]:
# Delete the log rows of the listed samples from screcounter_log.
tbl = Table("screcounter_log")
to_rm = ["ERX11146221"]
stmt = (
    Query.from_(tbl)
    .delete()
    .where(tbl.sample.isin(to_rm))
)
try:
    with conn_prod.cursor() as cur:
        cur.execute(str(stmt))
    conn_prod.commit()
except Exception:
    # Leave the shared connection usable for later cells instead of
    # stuck in a failed transaction.
    conn_prod.rollback()
    raise