# Goals

* Migrate data from test to production

In [1]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (presumably the settings/credentials db_connect() needs — TODO confirm).
load_dotenv()

True

In [2]:
import os
import pandas as pd

In [3]:
from SRAgent.db.connect import db_connect
from SRAgent.db.upsert import db_upsert
from SRAgent.db.utils import db_list_tables, db_glimpse_tables, execute_query
from SRAgent.db.get import db_find_srx
from SRAgent.db.create import create_table, create_table_router

In [4]:
# Point the session at the test database and list its tables.
# NOTE(review): db_connect() presumably picks its target from DYNACONF — confirm.
os.environ["DYNACONF"] = "test"
conn_test = db_connect()
test_tables = db_list_tables(conn_test)
print("\n".join(test_tables))

srx_srr
screcounter_star
eval
screcounter_log
srx_metadata


In [5]:
# Print the column header and a few rows of each table in the test database
db_glimpse_tables(conn_test)

#-- Table: srx_srr --#
srx_accession	srr_accession	created_at	updated_at
SRX26727599	SRR31350667	2024-12-18 17:37:23.074909	2024-12-18 17:37:23.074909
SRX23538581	SRR27876733	2024-12-18 17:37:23.074909	2024-12-18 17:37:23.074909
SRX23261451	SRR27592690	2024-12-18 17:37:23.074909	2024-12-18 17:37:23.074909
SRX23261451	SRR27592688	2024-12-18 17:37:23.074909	2024-12-18 17:37:23.074909
SRX23261451	SRR27592689	2024-12-18 17:37:23.074909	2024-12-18 17:37:23.074909

#-- Table: screcounter_star --#
sample	accession	feature	estimated_number_of_cells	fraction_of_unique_reads_in_cells	mean_gene_per_cell	mean_umi_per_cell	mean_feature_per_cell	median_gene_per_cell	median_umi_per_cell	median_feature_per_cell	number_of_reads	reads_with_valid_barcodes	sequencing_saturation	umis_in_cells	created_at	updated_at
SRX19729045	SRR23917610	gene	139	1.0		4.0	3.0		1.0	1.0	100000	0	0.171386	556	2024-12-23 21:49:11.388231	2024-12-23 21:49:11.388231
SRX19729045	SRR23917610	gene_ex50	197	1.0		3.0	3.0		1.0	1.0	1000

In [6]:
# Switch the session to the production database and list its tables.
# NOTE(review): DYNACONF is not reset in the cells shown after this one, so
# later connections and subprocesses presumably target prod — confirm.
os.environ["DYNACONF"] = "prod"
conn_prod = db_connect()
prod_tables = db_list_tables(conn_prod)
print("\n".join(prod_tables))

srx_srr
eval
screcounter_log
screcounter_star
srx_metadata


In [7]:
# Print the column header and a few rows of each table in the production
# database (all tables are empty at this point, per the output below)
db_glimpse_tables(conn_prod)

#-- Table: srx_srr --#
srx_accession	srr_accession	created_at	updated_at

#-- Table: eval --#
dataset_id	database	entrez_id	srx_accession	is_illumina	is_single_cell	is_paired_end	lib_prep	tech_10x	organism	cell_prep	created_at	updated_at

#-- Table: screcounter_log --#
sample	accession	process	step	status	message	created_at	updated_at

#-- Table: screcounter_star --#
sample	accession	feature	estimated_number_of_cells	fraction_of_unique_reads_in_cells	mean_gene_per_cell	mean_umi_per_cell	mean_feature_per_cell	median_gene_per_cell	median_umi_per_cell	median_feature_per_cell	number_of_reads	reads_with_valid_barcodes	sequencing_saturation	umis_in_cells	created_at	updated_at

#-- Table: srx_metadata --#
database	entrez_id	srx_accession	is_illumina	is_single_cell	is_paired_end	lib_prep	tech_10x	cell_prep	organism	tissue	disease	purturbation	cell_line	czi_collection_id	czi_collection_name	notes	created_at	updated_at



# Load tables

In [8]:
# Dated backup directory holding the CSV files that are listed and loaded below
data_dir = "../db_bkup/2025-01-02"

In [9]:
# List the CSV files found in the backup directory, one name per line.
# (`os` is already imported in the notebook's import cell.)
csv_files = [entry for entry in os.listdir(data_dir) if entry.endswith(".csv")]
print(*csv_files, sep="\n")

eval.csv
srx_metadata.csv
screcounter_log.csv
screcounter_star.csv
srx_srr.csv


## Metadata

In [10]:
# Load the srx_metadata backup and preview its first rows
srx_metadata = pd.read_csv(os.path.join(data_dir, "srx_metadata.csv"))
srx_metadata.head()

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,notes,created_at,updated_at
0,sra,33249542,SRX24914804,yes,yes,yes,Smart-seq2,not_applicable,single_cell,human,blood,Hepatitis C Virus (HCV) reinfection outcomes,Akebia Saponin D (ASD) and TNF-induced optic n...,Memory B cells,Metadata obtained by SRAgent,2024-12-18 18:23:36.181001,2024-12-18 18:25:14.322667
1,sra,34603139,SRX25602353,yes,yes,yes,10x_Genomics,other,single_cell,human,PBMCs,unsure,effects of salt intake in healthy humans,unsure,Metadata obtained by SRAgent,2024-12-18 18:23:36.181001,2024-12-18 18:25:33.937056
2,sra,31225478,SRX23114342,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,mouse,sciatic nerve,spontaneous autoimmune neuropathy (CIDP),unsure,immune cells (CD45+),Metadata obtained by SRAgent,2024-12-18 18:40:50.574099,2024-12-18 18:42:11.558669
3,sra,34603098,SRX25602312,yes,yes,yes,other,not_applicable,single_cell,human,PBMCs,none,effects of salt intake,none,Metadata obtained by SRAgent,2024-12-18 18:40:50.574099,2024-12-18 18:42:33.593136
4,sra,31787850,SRX23530678,yes,yes,yes,10x_Genomics,3_prime_gex,single_cell,mouse,bone marrow,neuroblastoma,Mxra7 knockout,bone marrow cells from Mxra7 knockout mice,Metadata obtained by SRAgent,2024-12-18 18:43:35.370648,2024-12-18 18:45:18.280506


#### Chris' processed records

In [11]:
# Distinct values of the "notes" column; Chris' records are identified by it
srx_metadata.loc[:, "notes"].unique()

array(['Metadata obtained by SRAgent',
       'New dataset found by Find-Datasets agent',
       'Processed by Chris Carpenter', nan], dtype=object)

In [12]:
# Keep only the rows whose note marks them as processed by Chris
is_chris = srx_metadata["notes"].eq("Processed by Chris Carpenter")
srx_metadata_chris = srx_metadata.loc[is_chris]
srx_metadata_chris

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,notes,created_at,updated_at
103,sra,29110064,ERX11148781,,,,,,,,,,,,Processed by Chris Carpenter,2024-12-18 17:31:16.385013,2024-12-18 17:31:35.121583
108,sra,19007785,SRX13670569,,,,,,,,,,,,Processed by Chris Carpenter,2024-12-18 17:31:16.385013,2024-12-18 17:31:35.121583
109,sra,27175908,SRX19828711,,,,,,,,,,,,Processed by Chris Carpenter,2024-12-18 17:31:16.385013,2024-12-18 17:31:35.121583
110,sra,27176348,SRX19829151,,,,,,,,,,,,Processed by Chris Carpenter,2024-12-18 17:31:16.385013,2024-12-18 17:31:35.121583
123,sra,12488012,SRX9556597,,,,,,,,,,,,Processed by Chris Carpenter,2024-12-18 17:31:16.385013,2024-12-18 17:31:35.121583
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16774,sra,12773490,SRX9770752,,,,,,,,,,,,Processed by Chris Carpenter,2024-12-18 17:31:16.385013,2024-12-18 17:31:35.121583
16775,sra,15435584,SRX11523521,,,,,,,,,,,,Processed by Chris Carpenter,2024-12-18 17:31:16.385013,2024-12-18 17:31:35.121583
16776,sra,27177388,SRX19830191,,,,,,,,,,,,Processed by Chris Carpenter,2024-12-18 17:31:16.385013,2024-12-18 17:31:35.121583
16777,sra,27172504,SRX19825307,,,,,,,,,,,,Processed by Chris Carpenter,2024-12-18 17:31:16.385013,2024-12-18 17:31:35.121583


In [14]:
# Load Chris' curated metadata (tab-separated) and drop exact duplicate rows.
# NOTE(review): the preview shows mis-decoded text in CZI_collection_name
# (e.g. "MSK SPECTRUM ‚Äì ...") — confirm the file's encoding before upserting.
infile = "../data/20241227_ChrisC_metadata.tsv"
chris_cur_meta = (
    pd.read_csv(infile, sep="\t")
    .drop_duplicates()
)
chris_cur_meta

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,notes,created_at,updated_at,experiment,CZI_collection_name
0,sra,29110064,ERX11148781,,,,\t10x_Genomics,5',single_cell,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,ERX11148781,Developmental cell programs are co-opted in in...
1,sra,19007785,SRX13670569,,,,,,,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,SRX13670569,Neuron type-specific effects of human aging an...
2,sra,27175908,SRX19828711,,,,,,,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,SRX19828711,A single-cell transcriptional timelapse of mou...
3,sra,27176348,SRX19829151,,,,,,,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,SRX19829151,A single-cell transcriptional timelapse of mou...
4,sra,12488012,SRX9556597,,,,\t10x_Genomics,5',single_cell,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,SRX9556597,Time-resolved Systems Immunology Reveals a Lat...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69490,sra,12773490,SRX9770752,,,,\t10x_Genomics,3'v3,single_cell,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,SRX9770752,An Integrated Single Cell Atlas of Human Perio...
69492,sra,15435584,SRX11523521,,,,,,,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,SRX11523521,MSK SPECTRUM ‚Äì Ovarian cancer mutational pro...
69494,sra,27177388,SRX19830191,,,,,,,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,SRX19830191,A single-cell transcriptional timelapse of mou...
69495,sra,27172504,SRX19825307,,,,,,,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,SRX19825307,A single-cell transcriptional timelapse of mou...


In [15]:
# Remove columns that should not be upserted:
#   - "experiment" mirrors srx_accession in the rows previewed above;
#   - "created_at"/"updated_at" hold truncated values in this file
#     (e.g. "31:16.4"); presumably the database sets them itself on upsert
#     — TODO confirm.
# Reassigning (instead of inplace=True) and ignoring already-missing columns
# keeps this cell idempotent: re-running it no longer raises KeyError.
chris_cur_meta = chris_cur_meta.drop(
    columns=["experiment", "created_at", "updated_at"],
    errors="ignore",
)

In [16]:
# Normalise every column name to lower case
# (e.g. "CZI_collection_name" -> "czi_collection_name")
chris_cur_meta.columns = [name.lower() for name in chris_cur_meta.columns]
chris_cur_meta

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,notes,czi_collection_name
0,sra,29110064,ERX11148781,,,,\t10x_Genomics,5',single_cell,,,,,,Processed by Chris Carpenter,Developmental cell programs are co-opted in in...
1,sra,19007785,SRX13670569,,,,,,,,,,,,Processed by Chris Carpenter,Neuron type-specific effects of human aging an...
2,sra,27175908,SRX19828711,,,,,,,,,,,,Processed by Chris Carpenter,A single-cell transcriptional timelapse of mou...
3,sra,27176348,SRX19829151,,,,,,,,,,,,Processed by Chris Carpenter,A single-cell transcriptional timelapse of mou...
4,sra,12488012,SRX9556597,,,,\t10x_Genomics,5',single_cell,,,,,,Processed by Chris Carpenter,Time-resolved Systems Immunology Reveals a Lat...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69490,sra,12773490,SRX9770752,,,,\t10x_Genomics,3'v3,single_cell,,,,,,Processed by Chris Carpenter,An Integrated Single Cell Atlas of Human Perio...
69492,sra,15435584,SRX11523521,,,,,,,,,,,,Processed by Chris Carpenter,MSK SPECTRUM ‚Äì Ovarian cancer mutational pro...
69494,sra,27177388,SRX19830191,,,,,,,,,,,,Processed by Chris Carpenter,A single-cell transcriptional timelapse of mou...
69495,sra,27172504,SRX19825307,,,,,,,,,,,,Processed by Chris Carpenter,A single-cell transcriptional timelapse of mou...


#### Format metadata

In [27]:
## Tally the values of each flag column.
## value_counts() skips missing values by default, so an empty Series means
## the column contains no values at all.
flag_columns = ["is_illumina", "is_single_cell", "is_paired_end"]
for col in flag_columns:
    counts = chris_cur_meta[col].value_counts()
    print(counts)

Series([], Name: count, dtype: int64)
Series([], Name: count, dtype: int64)
Series([], Name: count, dtype: int64)


In [37]:
# Translate the tech_10x labels used in Chris' file into the labels used in
# srx_metadata (e.g. "3_prime_gex"). Only exact matches are replaced; every
# other value, including missing ones, is left untouched.
idx = {
    "5'": "5_prime_gex",
    "3'v3": "3_prime_gex",
    "3'v2": "3_prime_gex",
    "multiome GEX": "multiome",
}
chris_cur_meta["tech_10x"] = chris_cur_meta["tech_10x"].replace(idx)

In [39]:
# Clean lib_prep: the file stores the value with a stray leading tab
# ("\t10x_Genomics"); map it to the plain label.
idx = {"\t10x_Genomics": "10x_Genomics"}
chris_cur_meta["lib_prep"] = chris_cur_meta["lib_prep"].replace(idx)

In [42]:
## Check the recoded library/technology columns: tally the values of each
## (missing values are not counted).
prep_columns = ["lib_prep", "tech_10x", "cell_prep"]
for col in prep_columns:
    counts = chris_cur_meta[col].value_counts()
    print(counts)

lib_prep
10x_Genomics    2367
Name: count, dtype: int64
tech_10x
3_prime_gex    1323
5_prime_gex     873
multiome        122
Name: count, dtype: int64
cell_prep
single_cell    2367
Name: count, dtype: int64


In [44]:
## Tally the values of the sample-description columns; an empty Series means
## the column has no non-missing values.
sample_columns = ["organism", "tissue", "disease", "purturbation", "cell_line"]
for col in sample_columns:
    counts = chris_cur_meta[col].value_counts()
    print(counts)

Series([], Name: count, dtype: int64)
Series([], Name: count, dtype: int64)
Series([], Name: count, dtype: int64)
Series([], Name: count, dtype: int64)
Series([], Name: count, dtype: int64)


In [45]:
# Save the cleaned metadata as a tab-separated file, without the index column
out_dir = "../data/"
out_name = "20241227_ChrisC_metadata_v2.tsv"
outfile = os.path.join(out_dir, out_name)
chris_cur_meta.to_csv(outfile, index=False, sep="\t")

In [50]:
%%bash

# Upsert the cleaned TSV into the srx_metadata table.
# NOTE(review): the target database is presumably chosen via the DYNACONF
# value inherited from the kernel (set to "prod" in an earlier cell) —
# confirm before re-running.
../scripts/db-tools.py \
  --upsert-csv ../data/20241227_ChrisC_metadata_v2.tsv \
  --upsert-target srx_metadata

Upserted ../data/20241227_ChrisC_metadata_v2.tsv into srx_metadata


In [51]:
%%bash

# Preview every table to check that the upserted rows appear in srx_metadata
../scripts/db-tools.py --glimpse

#-- Table: srx_srr --#
srx_accession	srr_accession	created_at	updated_at

#-- Table: eval --#
dataset_id	database	entrez_id	srx_accession	is_illumina	is_single_cell	is_paired_end	lib_prep	tech_10x	organism	cell_prep	created_at	updated_at

#-- Table: screcounter_log --#
sample	accession	process	step	status	message	created_at	updated_at

#-- Table: screcounter_star --#
sample	accession	feature	estimated_number_of_cells	fraction_of_unique_reads_in_cells	mean_gene_per_cell	mean_umi_per_cell	mean_feature_per_cell	median_gene_per_cell	median_umi_per_cell	median_feature_per_cell	number_of_reads	reads_with_valid_barcodes	sequencing_saturation	umis_in_cells	created_at	updated_at

#-- Table: srx_metadata --#
database	entrez_id	srx_accession	is_illumina	is_single_cell	is_paired_end	lib_prep	tech_10x	cell_prep	organism	tissue	disease	purturbation	cell_line	czi_collection_id	czi_collection_name	notes	created_at	updated_at
sra	29110064	ERX11148781	NaN	NaN	NaN	10x_Genomics	5_prime_gex	single_cell	NaN

# sessionInfo

In [52]:
# Record the packages installed in the active conda environment
!conda list

# packages in environment at /home/nickyoungblut/miniforge3/envs/SRAgent:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
aiohappyeyeballs          2.4.4                    pypi_0    pypi
aiohttp                   3.11.10                  pypi_0    pypi
aiosignal                 1.3.2                    pypi_0    pypi
annotated-types           0.7.0                    pypi_0    pypi
anyio                     4.7.0                    pypi_0    pypi
asttokens                 3.0.0              pyhd8ed1ab_1    conda-forge
attrs                     24.2.0                   pypi_0    pypi
beautifulsoup4            4.12.3                   pypi_0    pypi
biopython                 1.84                     pypi_0    pypi
build                     1.2.2.post1              pypi_0    pypi
bzip2                     1.0.8             