In [14]:
import sys
import os

# Get the path to the validate_assay folder
module_path = os.path.abspath(os.path.join('..', 'validate_assay'))

# Add the folder to the system path
if module_path not in sys.path:
    sys.path.append(module_path)

In [4]:
from pcrvalidationtools import *
from Bio import Entrez, SeqIO

In [15]:
# files 
INPUT_PATH = "../Data/"
datasets_file = INPUT_PATH + 'enterovirus_metadata.tsv'
clustered_file =  INPUT_PATH + 'ev_clustered100.fasta'

In [30]:
# filter for entered collection date, human host, completeness 
df_filter1 = filter_data(datasets_file)

print(f'There are {df_filter1.shape[0]} sequences after filtering for complete human-host sequences with a collection date')

There are 6488 sequences after filtering for complete human-host sequences with a collection date


In [13]:
# create a text file of one accessions per line to retrieve sequences for
accession_output_filename = "enterovirus_filtered_acc.txt"
df_filter1['Accession'].to_csv(INPUT_PATH + accession_output_filename, index=False, header=False)

print(f'Filtered accessions saved to {accession_output_filename}')

Filtered accessions saved to enterovirus_filtered_acc.txt


In [17]:
# cluster at 100% identity 

# get list of accessions from headers of fasta file
# ex. header: >PP191126.1 Coxsackievirus A6 strain CVA6/16/GJ/KOR/2022, complete genome 
def get_acc_fasta(file_path):
    # parse headers
    headers = []
    with open(file_path, 'r') as fasta_file:
        for line in fasta_file:
            if line.startswith('>'):
                header = line.strip()[1:]
                headers.append(header)
    # retrieve accession from header
    accessions = [h.split(' ')[0] for h in headers]
    return accessions

In [32]:
# retrieve accessions after 100% dereplication
dereplicated_acc = get_acc_fasta(clustered_file)

# remove replicated sequences 
df_filter2 = df_filter1[df_filter1['Accession'].isin(dereplicated_acc)].copy()

print(f'There are {df_filter2.shape[0]} sequences after dereplication')
print(f'{df_filter1.shape[0] - df_filter2.shape[0]} sequences were removed here')

There are 5974 sequences after dereplication
514 sequences were removed here


In [19]:
# use entrez to determine the 5' utr length 

# info for NCBI Entrez
Entrez.email = "vantrinh@berkeley.edu"
Entrez.api_key = '621bd974d7f13581ac5112cf933598a06b08'

# returns a list of queries
# each query is a string of comma separated accessions
# this breaks up the requests we send to NCBI
# accessions (list): list of accession number strings
def define_query(accessions):
    # if getting invalid uid error, try lowering batch size
    BATCH_SIZE = 1000
    queries = []
    size = len(accessions)
    iters = (size // BATCH_SIZE) + (size % BATCH_SIZE > 0)
    for i in range(iters):
        start_range = i * BATCH_SIZE
        end_range = start_range + BATCH_SIZE
        accession_str = ", ".join(accessions[start_range:end_range])
        queries.append(accession_str)
    return queries

# returns genbank records for the list of accessions
# accessions : list of accession strings
def get_gb_records(accessions):
    gb_records = []
    queries = define_query(accessions)
    for query in queries:
        handle = Entrez.efetch(db="nucleotide", id=query, rettype="gb", retmode="text")
        gb_records.extend(SeqIO.parse(handle, "genbank"))
        handle.close()
    return gb_records

# returns length of 5' utr 
# record: gb record
def get_utr_len(record):
    utr_len = np.nan
    # parse through features for CDS or 5'UTR
    for feature in record.features:
        if feature.type == '5\'UTR':
            # get 5' utr location
            loc = feature.location
            # define utr length as start of cds annotation
            utr_len = loc.start
            # if direction is reverse, calculate utr length
            if loc.strand == -1:
                utr_len = len(record.seq) - loc.start
            break
        if feature.type == 'CDS':
            # get cds location
            loc = feature.location
            # define utr length as start of cds annotation
            utr_len = loc.start
            # if direction is reverse, calculate utr length
            if loc.strand == -1:
                utr_len = len(record.seq) - loc.end
            break
    return utr_len

# returns a df with accession number and corresponding 5' utr length
def accession_utr_df(gb_records):
    # parse records and get utr length record
    df_dict = {'Accession': [], '5\'UTR Length': []}
    for record in gb_records:
        df_dict['Accession'].append(record.id)
        df_dict['5\'UTR Length'].append(get_utr_len(record))

    # return df
    return pd.DataFrame(df_dict)

In [33]:
# retrieve length of 5' utr for filtered genomes
gb_records = get_gb_records(df_filter2['Accession'].tolist())
utr_df = accession_utr_df(gb_records)



In [34]:
utr_df

Unnamed: 0,Accession,5'UTR Length
0,PP191126.1,0.0
1,PP191125.1,0.0
2,PP191124.1,0.0
3,PP191123.1,0.0
4,PP191122.1,0.0
...,...,...
5969,AB705310.1,742.0
5970,AB705309.1,742.0
5971,AB705308.1,742.0
5972,AB686524.1,0.0


In [35]:
# filter genomes for 5'UTR length >= 400 
valid_utr_acc = utr_df[utr_df['5\'UTR Length'] >= 400]['Accession'].tolist()
df_filter3 = df_filter2[df_filter2['Accession'].isin(valid_utr_acc)].copy()

print(f'There are {df_filter3.shape[0]} sequences after removing 5\' UTRs < 400nt')
print(f'{df_filter2.shape[0] - df_filter3.shape[0]} sequences were removed here')

There are 4555 sequences after removing 5' UTRs < 400nt
1419 sequences were removed here


In [36]:
# save filtered datasets as tsv file 
output_filename = 'ev_datasets_filtered.tsv'
df_filter3.to_csv(INPUT_PATH + output_filename, sep='\t', index=False)

print(f'Completely filtered genomes saved to {output_filename}')

Completely filtered genomes saved to ev_datasets_filtered.tsv


In [39]:
# check that the right file was saved 
pd.read_csv(INPUT_PATH + output_filename, sep='\t')

Unnamed: 0,Accession,Virus Name,Virus Taxonomic ID,Host Taxonomic ID,Completeness,Isolate Collection date,Release date
0,PP157596.1,enterovirus D68,42789,9606.0,COMPLETE,2020,2024-01-24T00:00:00Z
1,PP157595.1,enterovirus D68,42789,9606.0,COMPLETE,2020,2024-01-24T00:00:00Z
2,PP125279.1,Rhinovirus B,147712,9606.0,COMPLETE,2021-06-07,2024-01-24T00:00:00Z
3,PP125278.1,Rhinovirus B,147712,9606.0,COMPLETE,2021-05-13,2024-01-24T00:00:00Z
4,PP028461.1,Coxsackievirus A24,12089,9606.0,COMPLETE,2023-09,2024-01-11T00:00:00Z
...,...,...,...,...,...,...,...
4550,AB769152.1,Coxsackievirus A24,12089,9606.0,COMPLETE,2011-06,2012-12-05T00:00:00Z
4551,AB705311.1,Echovirus E6,12062,9606.0,COMPLETE,2011-09-09,2012-09-14T00:00:00Z
4552,AB705310.1,Echovirus E6,12062,9606.0,COMPLETE,2011-09-07,2012-09-14T00:00:00Z
4553,AB705309.1,Echovirus E6,12062,9606.0,COMPLETE,2011-09-02,2012-09-14T00:00:00Z


In [None]:
pd.read_csv(INPUT_PATH + output_filename, sep='\t')