# Create SRA submission sheet
This Python Jupyter notebook creates the metadata submission sheet for uploading Illumina barcode-sequencing FASTQ files to the Sequence Read Archive (SRA) under an existing BioSample accession.

First, import Python modules:

In [1]:
import datetime
import glob
import itertools
import os

import natsort

import pandas as pd

import yaml

Display full columns of data frames:

In [2]:
# Never truncate long cell contents (e.g. full FASTQ paths) when rendering data frames.
pd.options.display.max_colwidth = None

Read the configuration for the uploads:

In [3]:
# Parse the upload settings; later cells read the `barcode_runs` and
# `biosample_accession` keys from this dictionary.
with open('upload_config.yaml') as config_file:
    config = yaml.safe_load(config_file)

Read the Illumina runs to submit:

In [4]:
print(f"Reading Illumina runs from {config['barcode_runs']}")

illumina_runs = pd.read_csv(config['barcode_runs'])

# Fail fast with a clear message if the sheet lacks anything the submission
# entries are built from: `library` + `sample` form the library ID, `target`
# goes into the title, and `R1` lists the FASTQ files. Without this check the
# problem only surfaces later as an opaque error inside a chained lambda, or as
# a missing (NaN) library ID in the submission sheet.
required_cols = ['library', 'sample', 'target', 'R1']
missing_cols = [col for col in required_cols if col not in illumina_runs.columns]
if missing_cols:
    raise ValueError(f"{config['barcode_runs']} lacks required columns: {missing_cols}")

cols_with_blanks = [col for col in required_cols if illumina_runs[col].isna().any()]
if cols_with_blanks:
    raise ValueError(f"{config['barcode_runs']} has missing values in columns: {cols_with_blanks}")

print('Here are the first few entries in the Illumina runs:')
illumina_runs.head()

Reading Illumina runs from barcode_runs_to_upload.csv
Here are the first few entries in the Illumina runs:


Unnamed: 0,date,experiment,target,library,antibody,concentration,sort_bin,selection,sample,experiment_type,number_cells,frac_escape,R1
0,220816,exptREF,Wuhan_Hu_1,lib12,none,0,ref,reference,exptREF-none-0-ref,ab_selection,,,/uufs/chpc.utah.edu/common/home/u6042467/starr-group1/sequencing/TNS/2022/220816_Overbaugh-mAb-MAP/WH1_lib1_ref1_S70_R1_001.fastq.gz; /uufs/chpc.utah.edu/common/home/u6042467/starr-group1/sequencing/TNS/2022/220816_Overbaugh-mAb-MAP/WH1_lib1_ref2_S71_R1_001.fastq.gz; /uufs/chpc.utah.edu/common/home/u6042467/starr-group1/sequencing/TNS/2022/220816_Overbaugh-mAb-MAP/WH1_lib1_ref3_S72_R1_001.fastq.gz; /uufs/chpc.utah.edu/common/home/u6042467/starr-group1/sequencing/TNS/2022/220816_Overbaugh-mAb-MAP/WH1_lib1_ref4_S73_R1_001.fastq.gz
1,220816,exptREF,Wuhan_Hu_1,lib13,none,0,ref,reference,exptREF-none-0-ref,ab_selection,,,/uufs/chpc.utah.edu/common/home/u6042467/starr-group1/sequencing/TNS/2022/220816_Overbaugh-mAb-MAP/WH1_lib2_ref1_S74_R1_001.fastq.gz; /uufs/chpc.utah.edu/common/home/u6042467/starr-group1/sequencing/TNS/2022/220816_Overbaugh-mAb-MAP/WH1_lib2_ref2_S75_R1_001.fastq.gz; /uufs/chpc.utah.edu/common/home/u6042467/starr-group1/sequencing/TNS/2022/220816_Overbaugh-mAb-MAP/WH1_lib2_ref3_S76_R1_001.fastq.gz; /uufs/chpc.utah.edu/common/home/u6042467/starr-group1/sequencing/TNS/2022/220816_Overbaugh-mAb-MAP/WH1_lib2_ref4_S77_R1_001.fastq.gz
2,220816,expt8,Wuhan_Hu_1,lib12,C68_88,384,abneg,escape,expt8-C68_88-384-abneg,ab_selection,204245.0,0.056,/uufs/chpc.utah.edu/common/home/u6042467/starr-group1/sequencing/TNS/2022/220816_Overbaugh-mAb-MAP/expt8_lib1_S15_R1_001.fastq.gz
3,220816,expt8,Wuhan_Hu_1,lib13,C68_88,384,abneg,escape,expt8-C68_88-384-abneg,ab_selection,210346.0,0.059,/uufs/chpc.utah.edu/common/home/u6042467/starr-group1/sequencing/TNS/2022/220816_Overbaugh-mAb-MAP/expt8_lib2_S16_R1_001.fastq.gz
4,230714,exptREF2,Wuhan_Hu_1,lib12,none,0,ref,reference,exptREF2-none-0-ref,ab_selection,,,/uufs/chpc.utah.edu/common/home/starr-group1/sequencing/TNS/2023/230925_Azenta-pools/230714_Overbaugh-DMS-batch2/2023-07-14-lib12-pre-1_R1_001.fastq.gz; /uufs/chpc.utah.edu/common/home/starr-group1/sequencing/TNS/2023/230925_Azenta-pools/230714_Overbaugh-DMS-batch2/2023-07-14-lib12-pre-2_R1_001.fastq.gz; /uufs/chpc.utah.edu/common/home/starr-group1/sequencing/TNS/2023/230925_Azenta-pools/230714_Overbaugh-DMS-batch2/2023-07-14-lib12-pre-3_R1_001.fastq.gz; /uufs/chpc.utah.edu/common/home/starr-group1/sequencing/TNS/2023/230925_Azenta-pools/230714_Overbaugh-DMS-batch2/2023-07-14-lib12-pre-4_R1_001.fastq.gz


Next make submission entries for the barcode runs, initially in "tidy" format with one FASTQ file per row:

In [5]:
def _expand_fastq_patterns(r1_entry):
    """Expand one `R1` entry into the list of FASTQ files it refers to.

    Parameters
    ----------
    r1_entry : str
        One or more file paths or glob patterns separated by `;`.
        Whitespace around each entry and empty entries (e.g. from a
        trailing `;`) are ignored.

    Returns
    -------
    list of str
        Every matching file, in the order the patterns are listed; the
        matches of a single wildcard pattern are sorted so the result does
        not depend on directory listing order.

    Raises
    ------
    ValueError
        If `r1_entry` is not a string or contains no paths at all.
    FileNotFoundError
        If any path or pattern matches no file. `glob.glob` returns an empty
        list in that case, which would otherwise silently drop the file from
        the submission.
    """
    if not isinstance(r1_entry, str):
        raise ValueError(f"`R1` entry is not a string of FASTQ paths: {r1_entry!r}")

    patterns = [pattern.strip() for pattern in r1_entry.split(';')]
    fastqs = []
    for pattern in filter(None, patterns):
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise FileNotFoundError(f"No file matches `R1` entry: {pattern}")
        fastqs.extend(matches)

    if not fastqs:
        raise ValueError(f"`R1` entry lists no FASTQ paths: {r1_entry!r}")
    return fastqs


submissions_tidy = (
    illumina_runs
    .assign(
        biosample_accession=lambda x: config['biosample_accession'],
        library_ID=lambda x: x['library'] + '_' + x['sample'],
        title=lambda x: 'Illumina barcode sequencing from C68 donor mAb (v1) mutational antigenic profiling of the SARS-CoV-2 RBD ' + x['target'],
        library_strategy='AMPLICON',
        library_source='SYNTHETIC',
        library_selection='PCR',
        library_layout='single',
        platform='ILLUMINA',
        instrument_model='NextSeq 2000',
        design_description='PCR of barcodes from RBD variants',
        filetype='fastq',
        # one list of existing files per run; raises if any listed file is missing
        filename_fullpath=lambda x: x['R1'].map(_expand_fastq_patterns),
        )
    # one row per FASTQ file
    .explode('filename_fullpath')
    .assign(filename=lambda x: x['filename_fullpath'].map(os.path.basename))
    # keep only the submission columns, not the original run annotations
    .drop(columns=illumina_runs.columns)
    .reset_index(drop=True)
    )

assert submissions_tidy['filename_fullpath'].map(os.path.isfile).all()

# The wide submission sheet and the upload list identify files by base name
# only, so two different files sharing a base name would be indistinguishable.
basename_clashes = (
    submissions_tidy
    .drop_duplicates('filename_fullpath')
    .loc[lambda x: x['filename'].duplicated(keep=False), 'filename_fullpath']
    .tolist()
    )
assert not basename_clashes, f"different FASTQ files share a file name: {basename_clashes}"

print('Here are the first few submission entries in tidy format:')
submissions_tidy.head()

Here are the first few submission entries in tidy format:


Unnamed: 0,biosample_accession,library_ID,title,library_strategy,library_source,library_selection,library_layout,platform,instrument_model,design_description,filetype,filename_fullpath,filename
0,SAMN40905401,lib12_exptREF-none-0-ref,Illumina barcode sequencing from C68 donor mAb (v1) mutational antigenic profiling of the SARS-CoV-2 RBD Wuhan_Hu_1,AMPLICON,SYNTHETIC,PCR,single,ILLUMINA,NextSeq 2000,PCR of barcodes from RBD variants,fastq,/uufs/chpc.utah.edu/common/home/u6042467/starr-group1/sequencing/TNS/2022/220816_Overbaugh-mAb-MAP/WH1_lib1_ref1_S70_R1_001.fastq.gz,WH1_lib1_ref1_S70_R1_001.fastq.gz
1,SAMN40905401,lib12_exptREF-none-0-ref,Illumina barcode sequencing from C68 donor mAb (v1) mutational antigenic profiling of the SARS-CoV-2 RBD Wuhan_Hu_1,AMPLICON,SYNTHETIC,PCR,single,ILLUMINA,NextSeq 2000,PCR of barcodes from RBD variants,fastq,/uufs/chpc.utah.edu/common/home/u6042467/starr-group1/sequencing/TNS/2022/220816_Overbaugh-mAb-MAP/WH1_lib1_ref2_S71_R1_001.fastq.gz,WH1_lib1_ref2_S71_R1_001.fastq.gz
2,SAMN40905401,lib12_exptREF-none-0-ref,Illumina barcode sequencing from C68 donor mAb (v1) mutational antigenic profiling of the SARS-CoV-2 RBD Wuhan_Hu_1,AMPLICON,SYNTHETIC,PCR,single,ILLUMINA,NextSeq 2000,PCR of barcodes from RBD variants,fastq,/uufs/chpc.utah.edu/common/home/u6042467/starr-group1/sequencing/TNS/2022/220816_Overbaugh-mAb-MAP/WH1_lib1_ref3_S72_R1_001.fastq.gz,WH1_lib1_ref3_S72_R1_001.fastq.gz
3,SAMN40905401,lib12_exptREF-none-0-ref,Illumina barcode sequencing from C68 donor mAb (v1) mutational antigenic profiling of the SARS-CoV-2 RBD Wuhan_Hu_1,AMPLICON,SYNTHETIC,PCR,single,ILLUMINA,NextSeq 2000,PCR of barcodes from RBD variants,fastq,/uufs/chpc.utah.edu/common/home/u6042467/starr-group1/sequencing/TNS/2022/220816_Overbaugh-mAb-MAP/WH1_lib1_ref4_S73_R1_001.fastq.gz,WH1_lib1_ref4_S73_R1_001.fastq.gz
4,SAMN40905401,lib13_exptREF-none-0-ref,Illumina barcode sequencing from C68 donor mAb (v1) mutational antigenic profiling of the SARS-CoV-2 RBD Wuhan_Hu_1,AMPLICON,SYNTHETIC,PCR,single,ILLUMINA,NextSeq 2000,PCR of barcodes from RBD variants,fastq,/uufs/chpc.utah.edu/common/home/u6042467/starr-group1/sequencing/TNS/2022/220816_Overbaugh-mAb-MAP/WH1_lib2_ref1_S74_R1_001.fastq.gz,WH1_lib2_ref1_S74_R1_001.fastq.gz


For the actual submission, we need a "wide" data frame that, for each unique `biosample_accession` / `library_ID`, lists all of its files, each in a separate column.
The entries should be file names without the full directory path.

First, look at how many files there are for each sample / library:

In [6]:
# Tally the FASTQ files belonging to each sample / library, most files first.
(
    submissions_tidy
    .groupby(['biosample_accession', 'library_ID'], as_index=False)
    .agg(n_files=('filename_fullpath', 'count'))
    .sort_values('n_files', ascending=False, ignore_index=True)
)

Unnamed: 0,biosample_accession,library_ID,n_files
0,SAMN40905401,lib25_exptREF3-none-0-ref,4
1,SAMN40905401,lib13_exptREF-none-0-ref,4
2,SAMN40905401,lib24_exptREF-none-0-ref,4
3,SAMN40905401,lib24_exptREF3-none-0-ref,4
4,SAMN40905401,lib13_exptREF2-none-0-ref,4
5,SAMN40905401,lib24_exptREF2-none-0-ref,4
6,SAMN40905401,lib25_exptREF2-none-0-ref,4
7,SAMN40905401,lib12_exptREF2-none-0-ref,4
8,SAMN40905401,lib12_exptREF-none-0-ref,4
9,SAMN40905401,lib25_exptREF-none-0-ref,2


Now make the wide submission data frame.
Note that we keep only the `filename` column, which holds each file name without its directory path:

In [7]:
# Number the files of each sample / library 1, 2, 3, ... in row order.
file_number = (
    submissions_tidy
    .groupby(['biosample_accession', 'library_ID'])
    ['filename']
    .cumcount()
    + 1
)

# Column that will hold each file: `filename` for the first one, then
# `filename2`, `filename3`, ... for any further files.
filename_col = 'filename' + file_number.map(lambda c: str(c) if c > 1 else '')

# One row per library, one column per file slot.
filenames_by_library = (
    submissions_tidy
    .assign(filename_col=filename_col)
    .pivot(index='library_ID', columns='filename_col', values='filename')
)
# Natural sort puts the file columns in order: filename, filename2, ..., filename10.
ordered_filename_cols = natsort.natsorted(filenames_by_library.columns)

# Per-library metadata: everything except the per-file columns, de-duplicated.
library_metadata = (
    submissions_tidy
    .drop(columns=['filename_fullpath', 'filename'])
    .drop_duplicates()
)

# `validate` makes the merge fail unless each library has exactly one metadata
# row and one row of file names.
submissions_wide = library_metadata.merge(
    filenames_by_library[ordered_filename_cols],
    on='library_ID',
    validate='one_to_one',
)

print('Here are the first few submission entries in wide format:')
submissions_wide.head()

Here are the first few submission entries in wide format:


Unnamed: 0,biosample_accession,library_ID,title,library_strategy,library_source,library_selection,library_layout,platform,instrument_model,design_description,filetype,filename,filename2,filename3,filename4
0,SAMN40905401,lib12_exptREF-none-0-ref,Illumina barcode sequencing from C68 donor mAb (v1) mutational antigenic profiling of the SARS-CoV-2 RBD Wuhan_Hu_1,AMPLICON,SYNTHETIC,PCR,single,ILLUMINA,NextSeq 2000,PCR of barcodes from RBD variants,fastq,WH1_lib1_ref1_S70_R1_001.fastq.gz,WH1_lib1_ref2_S71_R1_001.fastq.gz,WH1_lib1_ref3_S72_R1_001.fastq.gz,WH1_lib1_ref4_S73_R1_001.fastq.gz
1,SAMN40905401,lib13_exptREF-none-0-ref,Illumina barcode sequencing from C68 donor mAb (v1) mutational antigenic profiling of the SARS-CoV-2 RBD Wuhan_Hu_1,AMPLICON,SYNTHETIC,PCR,single,ILLUMINA,NextSeq 2000,PCR of barcodes from RBD variants,fastq,WH1_lib2_ref1_S74_R1_001.fastq.gz,WH1_lib2_ref2_S75_R1_001.fastq.gz,WH1_lib2_ref3_S76_R1_001.fastq.gz,WH1_lib2_ref4_S77_R1_001.fastq.gz
2,SAMN40905401,lib12_expt8-C68_88-384-abneg,Illumina barcode sequencing from C68 donor mAb (v1) mutational antigenic profiling of the SARS-CoV-2 RBD Wuhan_Hu_1,AMPLICON,SYNTHETIC,PCR,single,ILLUMINA,NextSeq 2000,PCR of barcodes from RBD variants,fastq,expt8_lib1_S15_R1_001.fastq.gz,,,
3,SAMN40905401,lib13_expt8-C68_88-384-abneg,Illumina barcode sequencing from C68 donor mAb (v1) mutational antigenic profiling of the SARS-CoV-2 RBD Wuhan_Hu_1,AMPLICON,SYNTHETIC,PCR,single,ILLUMINA,NextSeq 2000,PCR of barcodes from RBD variants,fastq,expt8_lib2_S16_R1_001.fastq.gz,,,
4,SAMN40905401,lib12_exptREF2-none-0-ref,Illumina barcode sequencing from C68 donor mAb (v1) mutational antigenic profiling of the SARS-CoV-2 RBD Wuhan_Hu_1,AMPLICON,SYNTHETIC,PCR,single,ILLUMINA,NextSeq 2000,PCR of barcodes from RBD variants,fastq,2023-07-14-lib12-pre-1_R1_001.fastq.gz,2023-07-14-lib12-pre-2_R1_001.fastq.gz,2023-07-14-lib12-pre-3_R1_001.fastq.gz,2023-07-14-lib12-pre-4_R1_001.fastq.gz


Now write the wide submissions data frame to a `*.tsv` file that can be uploaded to the SRA submission website:

In [8]:
submissions_spreadsheet = 'SRA_submission_spreadsheet.tsv'

# Tab-separated, without the data frame's integer row index.
submissions_wide.to_csv(
    path_or_buf=submissions_spreadsheet,
    index=False,
    sep='\t',
)

We also want to write a file with all of the FASTQ files that are in the submission spreadsheet:

In [9]:
fastq_file_list = 'FASTQs_to_upload.csv'

print(f"Writing list of all the FASTQ files to upload to {fastq_file_list}")

# One row per FASTQ file: its full path and the bare file name used in the
# submission spreadsheet.
fastq_paths = submissions_tidy.loc[:, ['filename_fullpath', 'filename']]
fastq_paths.to_csv(fastq_file_list, index=False)

Writing list of all the FASTQ files to upload to FASTQs_to_upload.csv
