# Goals

* Check on the test and/or production databases

In [12]:
# Load variables from a local .env file into the process environment.
# override=True is passed so values from .env replace variables that are already set.
# NOTE(review): presumably the DB settings used by db_connect() come from here — confirm.
from dotenv import load_dotenv
load_dotenv(override=True)

True

In [13]:
import os
import pandas as pd
from pypika import Query, Table, Field, Column, Criterion

In [14]:
from SRAgent.db.connect import db_connect
from SRAgent.db.upsert import db_upsert
from SRAgent.db.utils import db_list_tables, db_glimpse_tables, db_get_table, execute_query
from SRAgent.db.get import db_find_srx
from SRAgent.db.create import create_table, create_table_router

In [15]:
# switch the DYNACONF environment to prod, open a connection,
# and print one production table name per line
os.environ['DYNACONF'] = 'prod'
conn_prod = db_connect()
prod_table_names = db_list_tables(conn_prod)
print(*prod_table_names, sep="\n")

screcounter_star_results
eval
screcounter_trace
srx_srr
srx_metadata
screcounter_log
screcounter_star_params


# Production

In [26]:
# same table listing as in the first section, shown as the cell's output value (a list of names)
db_list_tables(conn_prod)

['screcounter_star_results',
 'eval',
 'screcounter_trace',
 'srx_srr',
 'srx_metadata',
 'screcounter_log',
 'screcounter_star_params']

In [27]:
# quick look at the production tables; the output shows a "#-- Table: ... --#" header
# followed by that table's column names
db_glimpse_tables(conn_prod)

#-- Table: screcounter_star_results --#
sample	feature	estimated_number_of_cells	fraction_of_unique_reads_in_cells	mean_gene_per_cell	mean_umi_per_cell	mean_feature_per_cell	median_gene_per_cell	median_umi_per_cell	median_feature_per_cell	number_of_reads	median_reads_per_cell	q30_bases_in_cb_umi	q30_bases_in_rna_read	reads_mapped_to_gene__unique_gene	reads_mapped_to_gene__unique_multiple_gene	reads_mapped_to_genefull__unique_genefull	reads_mapped_to_genefull__unique_multiple_genefull	reads_mapped_to_genefull_ex50pas__unique_genefull_ex50pas	reads_mapped_to_genefull_ex50pas__unique_multiple_genefull_ex50	reads_mapped_to_genefull_exonoverintron__unique_genefull_exonov	reads_mapped_to_genefull_exonoverintron__unique_multiple_genefu	reads_mapped_to_genome__unique	reads_mapped_to_genome__unique_multiple	reads_mapped_to_velocyto__unique_velocyto	reads_mapped_to_velocyto__unique_multiple_velocyto	reads_with_valid_barcodes	sequencing_saturation	total_feature_detected	umis_in_cells	unique_reads

# Database updates

## CZI datasets

### Organism 

In [28]:
# load srx_metadata from prod into a pandas dataframe: distinct rows whose
# notes differ from "Processed by Chris Carpenter"
# NOTE(review): a SQL inequality normally also filters out rows with NULL notes — confirm intended
tbl = Table("srx_metadata")
not_carpenter = tbl.notes != "Processed by Chris Carpenter"
stmt = Query.from_(tbl).select("*").distinct().where(not_carpenter)
srx_metadata = pd.read_sql(sql=str(stmt), con=conn_prod)
srx_metadata

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name,notes,created_at,updated_at
0,sra,26659536,SRX19392127,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,mouse,S1 cortex,neuropsychiatric outcomes,maternal immune activation,"microglia, P14, male, C57Bl/6N, WT",,,Metadata obtained by SRAgent,2025-01-06 22:15:53.177758,2025-01-06 22:19:05.896057
1,sra,27454961,ERX10668462,yes,yes,yes,other,single_cell,mouse,ILC progenitors from femur,unsure,unsure,unsure,,,Metadata obtained by SRAgent,2025-01-06 22:15:53.177758,2025-01-06 22:19:02.038324,
2,sra,27454971,ERX10668472,yes,yes,yes,other,not_applicable,single_cell,mouse,ILC progenitors from femur,unsure,unsure,ILC2 cells,,,Metadata obtained by SRAgent,2025-01-06 22:15:53.177758,2025-01-06 22:38:48.116573
3,sra,27455317,ERX10668818,yes,yes,yes,other,not_applicable,single_cell,mouse,femur,unsure,Bcl11b regulatory networks,unsure,,,Metadata obtained by SRAgent,2025-01-07 00:28:36.611049,2025-01-07 00:30:22.045996
4,sra,27694563,SRX20273630,yes,yes,yes,other,not_applicable,single_cell,human,HCT116,not specified,infected,HCT116,,,Metadata obtained by SRAgent,2025-01-03 18:42:38.083215,2025-01-03 18:44:17.048627
5,sra,29919592,SRX22006379,yes,yes,yes,10x_Genomics,5_prime_gex,single_cell,human,blastocyst,unsure,unsure,inner cell mass (ICM),,,Metadata obtained by SRAgent,2025-01-03 18:42:38.083215,2025-01-03 18:43:57.602382
6,sra,30864469,SRX22822524,yes,yes,yes,MARS-seq,not_applicable,single_cell,mouse,tumor,unsure,unsure,OT1 CD8+,,,Metadata obtained by SRAgent,2025-01-06 22:15:53.177758,2025-01-06 22:18:04.534748
7,sra,30865352,SRX22823407,yes,yes,yes,other,not_applicable,single_cell,mouse,tumor,phaeohyphomycosis,anti-PD1 treatment,endogenous CD8+ T cells,,,Metadata obtained by SRAgent,2025-01-03 18:42:38.083215,2025-01-03 18:45:48.003195
8,sra,30865785,SRX22823840,yes,yes,yes,MARS-seq,not_applicable,single_cell,mouse,tumor,cancer immunotherapy,none specified,"B16-OVA, MC38, LLC",,,Metadata obtained by SRAgent,2025-01-06 22:15:53.177758,2025-01-06 22:18:33.867744
9,sra,32469529,SRX24158623,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,human,primary cell culture,glaucoma,primary hTM cells isolated from human donor co...,healthy,,,Metadata obtained by SRAgent,2025-01-06 22:37:10.538238,2025-01-06 22:40:04.684759


In [None]:
# any srx_accession that occurs more than once? (rows without an accession are ignored)
has_srx = srx_metadata["srx_accession"].notnull()
x = srx_metadata[has_srx]
is_repeated = x.duplicated(subset="srx_accession", keep=False)
x[is_repeated]

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name,notes,created_at,updated_at


In [None]:
# duplicate srx_accession values? value_counts() gives the number of rows per accession;
# a max of 1 in the summary means every accession appears exactly once
x["srx_accession"].value_counts().describe()

count    40082.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0
Name: count, dtype: float64

In [13]:
# duplicate entrez_id values? same check as the previous cell, but on entrez_id;
# a max of 1 in the summary means no entrez_id is repeated
x["entrez_id"].value_counts().describe()

count    14630.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0
Name: count, dtype: float64

In [10]:
# number of samples whose lib_prep is "10x_Genomics"
is_10x = srx_metadata["lib_prep"] == "10x_Genomics"
len(srx_metadata[is_10x])

8434

# STAR results table

In [11]:
# NOTE: this whole cell is disabled (commented out) and kept for reference only.
# It builds a small srx_accession -> organism table of manual corrections.
# # update "other" organisms to correct
# idx = [
#     ['SRX9556667', 'human'], 
#     ['ERX12060278', 'human'], 
#     ['ERX12060274', 'human'], 
#     ['ERX12060276', 'human'], 
#     ['ERX12060273', 'human'], 
#     ['ERX12060297', 'human'], 
#     ['ERX12060299', 'human'], 
#     ['ERX12060275', 'human'], 
#     ['SRX13549222', 'other'],
#     ['SRX9556651', 'human'], 
#     ['ERX12060277', 'human'], 
# ]
# idx = pd.DataFrame(idx, columns=["srx_accession", "organism"])
# idx

Unnamed: 0,sample,feature,estimated_number_of_cells,fraction_of_unique_reads_in_cells,mean_gene_per_cell,mean_umi_per_cell,mean_feature_per_cell,median_gene_per_cell,median_umi_per_cell,median_feature_per_cell,...,reads_with_valid_barcodes,sequencing_saturation,total_feature_detected,umis_in_cells,unique_reads_in_cells_mapped_to_gene,unique_reads_in_cells_mapped_to_genefull,unique_reads_in_cells_mapped_to_genefull_ex50pas,unique_reads_in_cells_mapped_to_genefull_exonoverintron,created_at,updated_at
19,sra,33249739,SRX24915001,yes,yes,yes,Smart-seq2,not_applicable,single_cell,human,blood,Hepatitis C Virus (HCV) reinfection outcomes,Botulinum toxin A injection,Memory B cells,,,Metadata obtained by SRAgent,2025-01-06 22:37:10.538238,2025-01-06 22:39:12.479315,,
20,sra,33249812,SRX24915074,yes,yes,yes,Smart-seq2,not_applicable,single_cell,human,blood,HCV reinfection,mobile health intervention post-ACL reconstruc...,Memory B cells,,,Metadata obtained by SRAgent,2025-01-06 22:37:10.538238,2025-01-06 22:39:14.822953,,
21,sra,33249944,SRX24915206,yes,yes,yes,Smart-seq2,not_applicable,single_cell,human,blood,not specified,"exercised, E2-specific B cells",human samples (Memory B cells),,,Metadata obtained by SRAgent,2025-01-06 22:15:53.177758,2025-01-06 22:17:04.727422,,
22,sra,34280289,ERX11662340,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,human,lung,non-small cell lung cancer,unsure,not_applicable,,,Metadata obtained by SRAgent,2025-01-06 22:15:53.177758,2025-01-06 22:17:23.956995,,
23,sra,34602736,SRX25601950,yes,yes,yes,10x_Genomics,other,single_cell,human,Peripheral Blood Mononuclear Cells (PBMCs),unsure,unsure,not_applicable,,,Metadata obtained by SRAgent,2025-01-07 00:28:36.611049,2025-01-07 00:30:58.868003,,


In [17]:
# load the full screcounter_star_results table from prod into a pandas dataframe
tbl = Table("screcounter_star_results")
stmt = Query.from_(tbl).select("*")
star_results = pd.read_sql(sql=str(stmt), con=conn_prod)
star_results

Unnamed: 0,database,entrez_id,srx_accession,organism,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19
24,sra,34602788,SRX25602002,yes,yes,yes,other,not_applicable,single_cell,human,PBMCs,Type 2 Diabetes Mellitus,Randomized controlled study on salt intake eff...,other,,,Metadata obtained by SRAgent,2025-01-03 18:42:38.083215,2025-01-03 18:45:27.430411
25,sra,34602910,SRX25602124,yes,yes,yes,10x_Genomics,not_applicable,single_cell,human,Peripheral Blood Mononuclear Cells (PBMCs),not specified,not specified,not specified,,,Metadata obtained by SRAgent,2025-01-07 00:28:36.611049,2025-01-07 00:30:39.316203
26,sra,34602926,SRX25602140,yes,yes,yes,other,not_applicable,single_cell,human,PBMCs (Peripheral Blood Mononuclear Cells),none,effects of salt intake in healthy humans,none,,,Metadata obtained by SRAgent,2025-01-06 22:37:10.538238,2025-01-06 22:38:49.139233
27,sra,34648698,SRX25634198,yes,yes,yes,other,not_applicable,single_cell,mouse,prostate,prostate cancer,anti-PD1 + Vehicle,Myc-CaP,,,Metadata obtained by SRAgent,2025-01-03 18:42:38.083215,2025-01-03 18:44:30.489428
28,sra,34914388,SRX25852474,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,human,epidermis,none,untreated,none,,,Metadata obtained by SRAgent,2025-01-06 22:37:10.538238,2025-01-06 22:39:43.424840
29,sra,36086779,SRX26710377,yes,yes,yes,Smart-seq2,not_applicable,single_cell,mouse,D-LN (Duodenal Lymph Nodes),Desmoid-type fibromatosis,Surgical resection,not_applicable,,,Metadata obtained by SRAgent,2025-01-07 00:28:36.611049,2025-01-07 00:31:53.332445


In [12]:
# summary of how many STAR result rows exist per sample
star_results["sample"].value_counts().describe()

count    3189.0
mean        5.0
std         0.0
min         5.0
25%         5.0
50%         5.0
75%         5.0
max         5.0
Name: count, dtype: float64

# Parameters table

In [11]:
# load the screcounter_star_params rows whose star_index is an empty string
tbl = Table("screcounter_star_params")
missing_index = tbl.star_index == ""
stmt = Query.from_(tbl).select("*").where(missing_index)
params = pd.read_sql(sql=str(stmt), con=conn_prod)
params

Unnamed: 0,sample,barcodes,star_index,cell_barcode_length,umi_length,strand,created_at,updated_at


In [50]:
# re-read srx_metadata from prod (distinct rows, excluding notes equal to
# "Processed by Chris Carpenter") so the organism lookup below uses current data
tbl = Table("srx_metadata")
stmt = (
    Query.from_(tbl)
    .select("*")
    .distinct()
    .where(tbl.notes != "Processed by Chris Carpenter")
)
srx_metadata = pd.read_sql(sql=str(stmt), con=conn_prod)
srx_metadata

Unnamed: 0,srx_accession,organism,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19
30,sra,36086788,SRX26710386,yes,yes,yes,10x_Genomics,other,single_cell,mouse,D-LN (Dendritic Lymph Nodes),unsure,SV (specific treatment not detailed),unsure,,,Metadata obtained by SRAgent,2025-01-06 22:15:53.177758,2025-01-06 22:17:31.151426
31,sra,36086856,SRX26710454,yes,yes,yes,Smart-seq2,other,single_cell,mouse,D-LN (Duodenal Lymph Nodes),none,HP (Helminth Infection),none,,,Metadata obtained by SRAgent,2025-01-03 18:42:38.083215,2025-01-03 18:45:29.642017
32,sra,36086915,SRX26710513,yes,yes,yes,10x_Genomics,other,single_cell,mouse,D-LN (Duodenal Lymph Nodes),not specified,OV (presumably refers to a treatment condition),not specified,,,Metadata obtained by SRAgent,2025-01-07 00:28:36.611049,2025-01-07 00:30:10.729039
33,sra,36087069,SRX26710667,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,mouse,D-LN (Duodenal Lymph Node),related to cancerous mouse prostate samples,OV treatment,unsure,,,Metadata obtained by SRAgent,2025-01-06 22:37:10.538238,2025-01-06 22:39:46.008134
34,sra,36087118,SRX26710716,yes,yes,yes,10x_Genomics,other,single_cell,mouse,D-LN (Duodenal Lymph Nodes),unsure,OV,unsure,,,Metadata obtained by SRAgent,2025-01-07 00:28:36.611049,2025-01-07 00:30:26.053848
35,sra,36087317,SRX26710915,yes,yes,yes,10x_Genomics,not_applicable,single_cell,mouse,duodenal lymph nodes,unsure,SV treatment,not_applicable,,,Metadata obtained by SRAgent,2025-01-07 00:28:36.611049,2025-01-07 00:30:23.602324


In [51]:
# merge by srx_accession
def set_star_ref(row):
    """Return the STAR reference name matching ``row.organism``.

    Args:
        row: any object exposing an ``organism`` attribute (e.g. a DataFrame row).

    Returns:
        "star_refData_2020_hg38" for "human", "star_refData_2020_mm10" for "mouse".

    Raises:
        ValueError: for any other organism value.
    """
    star_refs = {
        "human": "star_refData_2020_hg38",
        "mouse": "star_refData_2020_mm10",
    }
    organism = row.organism
    if organism in star_refs:
        return star_refs[organism]
    raise ValueError(f"organism {row.organism} not supported")

# Attach each sample's organism and derive its STAR index from it.
# Only the join key and organism are taken from srx_metadata: merging the full
# metadata frame would collide on created_at/updated_at (pandas renames them with
# _x/_y suffixes, so the drop below would fail) and would carry unrelated metadata
# columns into the frame that is upserted afterwards.
organism_by_srx = srx_metadata[["srx_accession", "organism"]]
params = params.merge(organism_by_srx, left_on="sample", right_on="srx_accession")
# set_star_ref raises ValueError for any organism other than human/mouse
params["star_index"] = params.apply(set_star_ref, axis=1)
# keep only the columns that belong to the params table
params = params.drop(columns=["created_at", "updated_at", "srx_accession", "organism"])
params

Unnamed: 0,sample,barcodes,star_index,cell_barcode_length,umi_length,strand
0,ERX11662340,3M-february-2018.txt,star_refData_2020_hg38,16,12,Forward
1,SRX19392127,3M-february-2018.txt,star_refData_2020_mm10,16,12,Forward
2,SRX24158623,3M-february-2018.txt,star_refData_2020_hg38,16,12,Forward
3,SRX22159680,737K-august-2016.txt,star_refData_2020_hg38,16,10,Reverse
4,SRX24228136,3M-february-2018.txt,star_refData_2020_hg38,16,12,Forward
5,SRX24387553,737K-august-2016.txt,star_refData_2020_hg38,16,10,Reverse
6,SRX24461697,3M-february-2018.txt,star_refData_2020_mm10,16,12,Forward
7,SRX24523046,3M-february-2018.txt,star_refData_2020_mm10,16,12,Forward
8,SRX24585614,3M-february-2018.txt,star_refData_2020_hg38,16,12,Forward
9,SRX25852474,3M-february-2018.txt,star_refData_2020_hg38,16,12,Forward


In [52]:
# upsert the params frame (star_index filled in by the previous cell)
# into the production screcounter_star_params table
db_upsert(params, "screcounter_star_params", conn_prod)

# --OLD--

### Filter out records with no SRX accessions

`New dataset found by Find-Datasets agent`

In [8]:
# unique entrez_id values of the records that have no srx_accession
lacks_srx = srx_metadata["srx_accession"].isna()
entrez_ids = srx_metadata.loc[lacks_srx, "entrez_id"].unique().tolist()
entrez_ids

[]

In [13]:
# delete the entrez_ids collected above (records with no SRX accession) from srx_metadata
tbl = Table("srx_metadata")
stmt = Query \
    .from_(tbl) \
    .delete() \
    .where(tbl.entrez_id.isin(entrez_ids))
if entrez_ids:
    try:
        with conn_prod.cursor() as cur:
            cur.execute(str(stmt))
        conn_prod.commit()
    except Exception:
        # undo the failed transaction so the shared prod connection stays usable
        conn_prod.rollback()
        raise
else:
    # an empty id list renders as an empty `IN ()` clause, which PostgreSQL and most
    # other databases reject as a syntax error; there is nothing to delete anyway
    print("No entrez_ids without an SRX accession; nothing to delete")

In [16]:
# delete the listed samples from the screcounter_log table
tbl = Table("screcounter_log")
to_rm = ["ERX11146221"]
stmt = Query \
    .from_(tbl) \
    .delete() \
    .where(tbl.sample.isin(to_rm))
try:
    with conn_prod.cursor() as cur:
        cur.execute(str(stmt))
    conn_prod.commit()
except Exception:
    # a failed statement would otherwise leave the shared prod connection
    # inside an aborted transaction; roll back before re-raising
    conn_prod.rollback()
    raise