# Goals

* Migrate data from test to production

In [1]:
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the database settings/credentials
# that db_connect() needs further down — confirm.
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import os
import pandas as pd

In [3]:
from SRAgent.db.connect import db_connect
from SRAgent.db.upsert import db_upsert
from SRAgent.db.utils import db_list_tables, db_glimpse_tables, db_get_table, execute_query
from SRAgent.db.get import db_find_srx
from SRAgent.db.create import create_table, create_table_router

In [4]:
# Connect with DYNACONF set to 'test' and print one table name per line.
# NOTE(review): DYNACONF is presumably what db_connect() uses to pick the
# target database — confirm.
os.environ['DYNACONF'] = 'test'
conn_test = db_connect()
test_table_listing = "\n".join(db_list_tables(conn_test))
print(test_table_listing)

srx_srr
screcounter_star
eval
screcounter_log
srx_metadata


In [5]:
# Preview each table reachable through the test connection
db_glimpse_tables(conn_test)

#-- Table: srx_srr --#
srx_accession	srr_accession	created_at	updated_at
SRX26727599	SRR31350667	2024-12-18 17:37:23.074909	2024-12-18 17:37:23.074909
SRX23538581	SRR27876733	2024-12-18 17:37:23.074909	2024-12-18 17:37:23.074909
SRX23261451	SRR27592690	2024-12-18 17:37:23.074909	2024-12-18 17:37:23.074909
SRX23261451	SRR27592688	2024-12-18 17:37:23.074909	2024-12-18 17:37:23.074909
SRX23261451	SRR27592689	2024-12-18 17:37:23.074909	2024-12-18 17:37:23.074909

#-- Table: screcounter_star --#
sample	accession	feature	estimated_number_of_cells	fraction_of_unique_reads_in_cells	mean_gene_per_cell	mean_umi_per_cell	mean_feature_per_cell	median_gene_per_cell	median_umi_per_cell	median_feature_per_cell	number_of_reads	reads_with_valid_barcodes	sequencing_saturation	umis_in_cells	created_at	updated_at
SRX19729045	SRR23917610	gene	139	1.0		4.0	3.0		1.0	1.0	100000	0	0.171386	556	2024-12-23 21:49:11.388231	2024-12-23 21:49:11.388231
SRX19729045	SRR23917610	gene_ex50	197	1.0		3.0	3.0		1.0	1.0	1000

In [6]:
# Connect with DYNACONF set to 'prod' and print one table name per line.
# NOTE(review): DYNACONF stays 'prod' in os.environ from here on, so later
# subprocesses (e.g. %%bash cells) inherit it.
os.environ['DYNACONF'] = 'prod'
conn_prod = db_connect()
prod_table_listing = "\n".join(db_list_tables(conn_prod))
print(prod_table_listing)

eval
screcounter_log
screcounter_star
srx_srr
srx_metadata


In [7]:
# Preview each table reachable through the production connection
db_glimpse_tables(conn_prod)

#-- Table: eval --#
dataset_id	database	entrez_id	srx_accession	is_illumina	is_single_cell	is_paired_end	lib_prep	tech_10x	organism	cell_prep	created_at	updated_at

#-- Table: screcounter_log --#
sample	accession	process	step	status	message	created_at	updated_at

#-- Table: screcounter_star --#
sample	accession	feature	estimated_number_of_cells	fraction_of_unique_reads_in_cells	mean_gene_per_cell	mean_umi_per_cell	mean_feature_per_cell	median_gene_per_cell	median_umi_per_cell	median_feature_per_cell	number_of_reads	reads_with_valid_barcodes	sequencing_saturation	umis_in_cells	created_at	updated_at

#-- Table: srx_srr --#
srx_accession	srr_accession	created_at	updated_at

#-- Table: srx_metadata --#
database	entrez_id	srx_accession	is_illumina	is_single_cell	is_paired_end	lib_prep	tech_10x	cell_prep	organism	tissue	disease	purturbation	cell_line	czi_collection_id	czi_collection_name	notes	created_at	updated_at



# SRX Metadata

## Chris' processed records

* Add these records to the `srx_metadata` table of the production database

In [8]:
# Read the tab-separated metadata file and discard exact duplicate rows.
infile = "../data/20241227_ChrisC_metadata_v1.tsv"
chris_cur_meta = (
    pd.read_table(infile)
    .drop_duplicates()
)
chris_cur_meta

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,notes,created_at,updated_at,experiment,CZI_collection_name,CZI_collection_id
0,sra,29110064,ERX11148781,,,,\t10x_Genomics,5',single_cell,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,ERX11148781,Developmental cell programs are co-opted in in...,73f82ac8-15cc-4fcd-87f8-5683723fce7f
1,sra,19007785,SRX13670569,,,,,,,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,SRX13670569,Neuron type-specific effects of human aging an...,91c8e321-566f-4f9d-b89e-3a164be654d5
2,sra,12488012,SRX9556597,,,,\t10x_Genomics,5',single_cell,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,SRX9556597,Time-resolved Systems Immunology Reveals a Lat...,ed9185e3-5b82-40c7-9824-b2141590c7f0
14,sra,21270435,ERX8791959,,,,\t10x_Genomics,5',single_cell,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,ERX8791959,Cross-tissue immune cell analysis reveals tiss...,62ef75e4-cbea-454e-a0ce-998ec40223d3
15,sra,34493113,ERX12558784,,,,\t10x_Genomics,5',single_cell,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,ERX12558784,Early human lung immune cell development and i...,ec329aed-22bc-4d6e-8935-8282dcb1acac
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52899,sra,19008576,SRX13671360,,,,,,,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,SRX13671360,Neuron type-specific effects of human aging an...,91c8e321-566f-4f9d-b89e-3a164be654d5
52900,sra,21270500,ERX8792024,,,,\t10x_Genomics,5',single_cell,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,ERX8792024,Cross-tissue immune cell analysis reveals tiss...,62ef75e4-cbea-454e-a0ce-998ec40223d3
52901,sra,29110088,ERX11148805,,,,,,,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,ERX11148805,Developmental cell programs are co-opted in in...,73f82ac8-15cc-4fcd-87f8-5683723fce7f
52902,sra,10884261,ERX4126865,,,,,,,,,,,,Processed by Chris Carpenter,31:16.4,31:35.1,ERX4126865,"scRNA-seq assessment of the human lung, spleen...",4d74781b-8186-4c9a-b659-ff4dc4601d91


In [9]:
# Drop columns that must not be upserted:
#   * `experiment` has no counterpart among the srx_metadata columns.
#   * `created_at` / `updated_at` do exist in srx_metadata, but the values in
#     this file are truncated times (e.g. "31:16.4"); presumably the database
#     fills these in itself on upsert — confirm.
# Reassigning (instead of `inplace=True`) and ignoring labels that are already
# gone keeps this cell idempotent: re-running it is a no-op, not a KeyError.
chris_cur_meta = chris_cur_meta.drop(
    columns=["experiment", "created_at", "updated_at"],
    errors="ignore",
)

In [10]:
# Lower-case every column label (e.g. CZI_collection_id -> czi_collection_id)
chris_cur_meta = chris_cur_meta.rename(columns=str.lower)
chris_cur_meta

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,notes,czi_collection_name,czi_collection_id
0,sra,29110064,ERX11148781,,,,\t10x_Genomics,5',single_cell,,,,,,Processed by Chris Carpenter,Developmental cell programs are co-opted in in...,73f82ac8-15cc-4fcd-87f8-5683723fce7f
1,sra,19007785,SRX13670569,,,,,,,,,,,,Processed by Chris Carpenter,Neuron type-specific effects of human aging an...,91c8e321-566f-4f9d-b89e-3a164be654d5
2,sra,12488012,SRX9556597,,,,\t10x_Genomics,5',single_cell,,,,,,Processed by Chris Carpenter,Time-resolved Systems Immunology Reveals a Lat...,ed9185e3-5b82-40c7-9824-b2141590c7f0
14,sra,21270435,ERX8791959,,,,\t10x_Genomics,5',single_cell,,,,,,Processed by Chris Carpenter,Cross-tissue immune cell analysis reveals tiss...,62ef75e4-cbea-454e-a0ce-998ec40223d3
15,sra,34493113,ERX12558784,,,,\t10x_Genomics,5',single_cell,,,,,,Processed by Chris Carpenter,Early human lung immune cell development and i...,ec329aed-22bc-4d6e-8935-8282dcb1acac
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52899,sra,19008576,SRX13671360,,,,,,,,,,,,Processed by Chris Carpenter,Neuron type-specific effects of human aging an...,91c8e321-566f-4f9d-b89e-3a164be654d5
52900,sra,21270500,ERX8792024,,,,\t10x_Genomics,5',single_cell,,,,,,Processed by Chris Carpenter,Cross-tissue immune cell analysis reveals tiss...,62ef75e4-cbea-454e-a0ce-998ec40223d3
52901,sra,29110088,ERX11148805,,,,,,,,,,,,Processed by Chris Carpenter,Developmental cell programs are co-opted in in...,73f82ac8-15cc-4fcd-87f8-5683723fce7f
52902,sra,10884261,ERX4126865,,,,,,,,,,,,Processed by Chris Carpenter,"scRNA-seq assessment of the human lung, spleen...",4d74781b-8186-4c9a-b659-ff4dc4601d91


#### Format metadata

In [12]:
## tally the distinct values of each is_* flag column
flag_columns = ("is_illumina", "is_single_cell", "is_paired_end")
for flag_column in flag_columns:
    flag_counts = chris_cur_meta[flag_column].value_counts()
    print(flag_counts)

Series([], Name: count, dtype: int64)
Series([], Name: count, dtype: int64)
Series([], Name: count, dtype: int64)


In [13]:
# Recode the tech_10x labels found in the file.
# Only exact matches of a key are replaced; other values (including
# missing ones) pass through unchanged.
tech_10x_labels = {
    "5'": "5_prime_gex",
    "3'v3": "3_prime_gex",
    "3'v2": "3_prime_gex",
    "multiome GEX": "multiome",
}
chris_cur_meta["tech_10x"] = chris_cur_meta["tech_10x"].replace(tech_10x_labels)

In [14]:
# Recode lib_prep: the file's label carries a stray leading tab character.
# Only exact matches of the key are replaced.
lib_prep_labels = {"\t10x_Genomics": "10x_Genomics"}
chris_cur_meta["lib_prep"] = chris_cur_meta["lib_prep"].replace(lib_prep_labels)

In [15]:
## tally the recoded library/technology columns to verify the formatting above
library_columns = ["lib_prep", "tech_10x", "cell_prep"]
for _, library_series in chris_cur_meta[library_columns].items():
    print(library_series.value_counts())

lib_prep
10x_Genomics    2223
Name: count, dtype: int64
tech_10x
3_prime_gex    1179
5_prime_gex     873
multiome        122
Name: count, dtype: int64
cell_prep
single_cell    2223
Name: count, dtype: int64


In [16]:
## tally the sample-description columns
sample_columns = ["organism", "tissue", "disease", "purturbation", "cell_line"]
print(
    *(chris_cur_meta[sample_column].value_counts() for sample_column in sample_columns),
    sep="\n",
)

Series([], Name: count, dtype: int64)
Series([], Name: count, dtype: int64)
Series([], Name: count, dtype: int64)
Series([], Name: count, dtype: int64)
Series([], Name: count, dtype: int64)


In [17]:
# Write the formatted table as a tab-separated file, without the DataFrame
# index; this is the file passed to db-tools.py --upsert-csv in the next cell.
outfile = os.path.join("../data/", "20241227_ChrisC_metadata_v2.tsv")
chris_cur_meta.to_csv(outfile, sep="\t", index=False)

In [18]:
%%bash

# Upsert the formatted TSV into the srx_metadata table.
# NOTE(review): the target database is presumably chosen from the DYNACONF
# environment variable this shell inherits from the kernel (set to 'prod' in
# an earlier cell) — confirm it is set as intended before re-running.
../scripts/db-tools.py \
    --upsert-csv ../data/20241227_ChrisC_metadata_v2.tsv \
    --upsert-target srx_metadata

Upserted ../data/20241227_ChrisC_metadata_v2.tsv into srx_metadata


In [19]:
%%bash

# Preview every table via the CLI to confirm the upsert landed in srx_metadata
../scripts/db-tools.py --glimpse

#-- Table: eval --#
dataset_id	database	entrez_id	srx_accession	is_illumina	is_single_cell	is_paired_end	lib_prep	tech_10x	organism	cell_prep	created_at	updated_at

#-- Table: screcounter_log --#
sample	accession	process	step	status	message	created_at	updated_at

#-- Table: screcounter_star --#
sample	accession	feature	estimated_number_of_cells	fraction_of_unique_reads_in_cells	mean_gene_per_cell	mean_umi_per_cell	mean_feature_per_cell	median_gene_per_cell	median_umi_per_cell	median_feature_per_cell	number_of_reads	reads_with_valid_barcodes	sequencing_saturation	umis_in_cells	created_at	updated_at

#-- Table: srx_srr --#
srx_accession	srr_accession	created_at	updated_at

#-- Table: srx_metadata --#
database	entrez_id	srx_accession	is_illumina	is_single_cell	is_paired_end	lib_prep	tech_10x	cell_prep	organism	tissue	disease	purturbation	cell_line	czi_collection_id	czi_collection_name	notes	created_at	updated_at
sra	29110064	ERX11148781	NaN	NaN	NaN	10x_Genomics	5_prime_gex	single_cell	NaN

# Check production database

In [20]:
# Preview the production tables again, this time through the Python
# connection, after the upsert
db_glimpse_tables(conn_prod)

#-- Table: eval --#
dataset_id	database	entrez_id	srx_accession	is_illumina	is_single_cell	is_paired_end	lib_prep	tech_10x	organism	cell_prep	created_at	updated_at

#-- Table: screcounter_log --#
sample	accession	process	step	status	message	created_at	updated_at

#-- Table: screcounter_star --#
sample	accession	feature	estimated_number_of_cells	fraction_of_unique_reads_in_cells	mean_gene_per_cell	mean_umi_per_cell	mean_feature_per_cell	median_gene_per_cell	median_umi_per_cell	median_feature_per_cell	number_of_reads	reads_with_valid_barcodes	sequencing_saturation	umis_in_cells	created_at	updated_at

#-- Table: srx_srr --#
srx_accession	srr_accession	created_at	updated_at

#-- Table: srx_metadata --#
database	entrez_id	srx_accession	is_illumina	is_single_cell	is_paired_end	lib_prep	tech_10x	cell_prep	organism	tissue	disease	purturbation	cell_line	czi_collection_id	czi_collection_name	notes	created_at	updated_at
sra	29110064	ERX11148781	NaN	NaN	NaN	10x_Genomics	5_prime_gex	single_cell	NaN

In [21]:
# Fetch the srx_metadata table through the production connection and display it
srx_metadata_prod = db_get_table("srx_metadata", conn_prod)
srx_metadata_prod

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name,notes,created_at,updated_at
0,sra,29110064,ERX11148781,,,,10x_Genomics,5_prime_gex,single_cell,,,,,,73f82ac8-15cc-4fcd-87f8-5683723fce7f,Developmental cell programs are co-opted in in...,Processed by Chris Carpenter,2025-01-03 18:39:40.221883,2025-01-03 18:39:40.221883
1,sra,19007785,SRX13670569,,,,,,,,,,,,91c8e321-566f-4f9d-b89e-3a164be654d5,Neuron type-specific effects of human aging an...,Processed by Chris Carpenter,2025-01-03 18:39:40.221883,2025-01-03 18:39:40.221883
2,sra,12488012,SRX9556597,,,,10x_Genomics,5_prime_gex,single_cell,,,,,,ed9185e3-5b82-40c7-9824-b2141590c7f0,Time-resolved Systems Immunology Reveals a Lat...,Processed by Chris Carpenter,2025-01-03 18:39:40.221883,2025-01-03 18:39:40.221883
3,sra,21270435,ERX8791959,,,,10x_Genomics,5_prime_gex,single_cell,,,,,,62ef75e4-cbea-454e-a0ce-998ec40223d3,Cross-tissue immune cell analysis reveals tiss...,Processed by Chris Carpenter,2025-01-03 18:39:40.221883,2025-01-03 18:39:40.221883
4,sra,34493113,ERX12558784,,,,10x_Genomics,5_prime_gex,single_cell,,,,,,ec329aed-22bc-4d6e-8935-8282dcb1acac,Early human lung immune cell development and i...,Processed by Chris Carpenter,2025-01-03 18:39:40.221883,2025-01-03 18:39:40.221883
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6055,sra,19008576,SRX13671360,,,,,,,,,,,,91c8e321-566f-4f9d-b89e-3a164be654d5,Neuron type-specific effects of human aging an...,Processed by Chris Carpenter,2025-01-03 18:39:40.221883,2025-01-03 18:39:40.221883
6056,sra,21270500,ERX8792024,,,,10x_Genomics,5_prime_gex,single_cell,,,,,,62ef75e4-cbea-454e-a0ce-998ec40223d3,Cross-tissue immune cell analysis reveals tiss...,Processed by Chris Carpenter,2025-01-03 18:39:40.221883,2025-01-03 18:39:40.221883
6057,sra,29110088,ERX11148805,,,,,,,,,,,,73f82ac8-15cc-4fcd-87f8-5683723fce7f,Developmental cell programs are co-opted in in...,Processed by Chris Carpenter,2025-01-03 18:39:40.221883,2025-01-03 18:39:40.221883
6058,sra,10884261,ERX4126865,,,,,,,,,,,,4d74781b-8186-4c9a-b659-ff4dc4601d91,"scRNA-seq assessment of the human lung, spleen...",Processed by Chris Carpenter,2025-01-03 18:39:40.221883,2025-01-03 18:39:40.221883


In [22]:
# Keep only the rows whose `notes` value is not Chris' processing marker
# (rows with a missing note are kept as well).
not_chris_record = srx_metadata_prod["notes"].ne("Processed by Chris Carpenter")
srx_metadata_prod_noChris = srx_metadata_prod.loc[not_chris_record]
srx_metadata_prod_noChris

Unnamed: 0,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,lib_prep,tech_10x,cell_prep,organism,tissue,disease,purturbation,cell_line,czi_collection_id,czi_collection_name,notes,created_at,updated_at


# sessionInfo

In [52]:
# Record the packages installed in the conda environment, for reproducibility
!conda list

# packages in environment at /home/nickyoungblut/miniforge3/envs/SRAgent:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
aiohappyeyeballs          2.4.4                    pypi_0    pypi
aiohttp                   3.11.10                  pypi_0    pypi
aiosignal                 1.3.2                    pypi_0    pypi
annotated-types           0.7.0                    pypi_0    pypi
anyio                     4.7.0                    pypi_0    pypi
asttokens                 3.0.0              pyhd8ed1ab_1    conda-forge
attrs                     24.2.0                   pypi_0    pypi
beautifulsoup4            4.12.3                   pypi_0    pypi
biopython                 1.84                     pypi_0    pypi
build                     1.2.2.post1              pypi_0    pypi
bzip2                     1.0.8             