# Imports and setup

In [103]:
import os, sys
import re

import requests
from lxml import etree
from bs4 import BeautifulSoup

from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio import Entrez
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from io import StringIO


In [104]:
# Search term sent to Entrez; also used as the stem of the output FASTA file names.
TERM = 'Flavivirus'

# Directory holding the extracted per-accession `<uid>.FASTA` files.
# The location is machine-specific, so it can be overridden with the
# FASTA_DIR_PATH environment variable; the original path remains the default.
FASTA_DIR_PATH = os.environ.get('FASTA_DIR_PATH', '/home/timofeiryko/Downloads/fasta')

In [105]:
# Search parameters: the configured term is looked up in the `cdd` Entrez database
db = "cdd"
query = TERM

# Contact address attached to the Entrez requests made below
Entrez.email = "timofei.ryko@gmail.com"

In [106]:
! pip freeze > requirements.txt

In [107]:
! python --version

Python 3.11.10


# Search protein identifiers by term

In [108]:
# Perform the Entrez search to get the unique identifiers
handle = Entrez.esearch(db=db, term=query)
try:
    record = Entrez.read(handle)
finally:
    # Release the connection even when the response cannot be parsed
    handle.close()

# Get the list of unique identifiers
ids = record['IdList']

# ESearch returns only the first page of hits unless `retmax` is raised, so
# make it visible when the identifier list is a truncated view of the result set.
total_hits = int(record['Count'])
if total_hits > len(ids):
    print(
        f"Warning: search matched {total_hits} records but only {len(ids)} "
        f"identifiers were returned; pass retmax to Entrez.esearch to fetch more."
    )

In [109]:
# Display the identifiers returned by the Entrez search
ids

['477363', '473069', '470726', '466632', '460022', '460013', '453075', '438054', '438028', '409908', '409906', '400138', '398938', '395758', '395698', '366710', '366413', '355797', '355796', '341208']

# Convert PSSM IDs to CDD accessions using bs4

In [110]:
# Fetch and parse a single CDD page (uid 394953) as a probe of the page layout.
# `page` and `soup` are reassigned for every identifier in the scraping loop below.
# The timeout prevents a stalled connection from hanging the notebook, and
# raise_for_status() avoids parsing an HTTP error page as if it were a CDD record.
page = requests.get('https://www.ncbi.nlm.nih.gov/Structure/cdd/cddsrv.cgi?uid=394953', timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [111]:
# Accession prefixes accepted when validating a `uid` scraped from a CDD page
prefixes = [
    "Source Database",
    "cd", "sd", "NF", "pfam", "smart",
    "COG", "KOG", "PRK", "CHL", "MTH", "PHA", "PLN", "PTZ",
    "TIGR", "LOAD_", "cl",
]

# One regex alternative per prefix: the escaped prefix followed by one or more digits
prefix_pattern = '|'.join(re.escape(name) + r'\d+' for name in prefixes)

In [112]:
# For every PSSM identifier, scrape its CDD page for the accession (the `uid`
# parameter of the first CDD link in the title block) and the displayed name.
# `cd_ids` and `proteins` are kept index-aligned: entry i of both lists
# describes the same page.
cd_ids = []
proteins = []

for pssm_id in ids:

    # The timeout keeps one stalled connection from hanging the whole run, and
    # raise_for_status() stops an HTTP error page from being parsed as a record.
    page = requests.get(
        f'https://www.ncbi.nlm.nih.gov/Structure/cdd/cddsrv.cgi?uid={pssm_id}',
        timeout=30,
    )
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    div = soup.find('div', class_='title')
    if div is None:
        print(f"No title block found for PSSM ID {pssm_id}.")
        continue

    # Reset for every page: without this, a page with no matching link would
    # either raise NameError (first iteration) or silently reuse the link
    # found on the previous page.
    matching_link = None
    for link in div.find_all('a'):
        href = link.get('href', '')
        # Use regex to check if href starts with the desired pattern
        if re.match(r'^/Structure/cdd/cddsrv\.cgi\?.*', href):
            matching_link = link
            break  # Stop after finding the first matching link

    if matching_link is None:
        print(f"No CDD link found in the title block for PSSM ID {pssm_id}.")
        continue

    # Extract the href and parse the uid parameter
    href = matching_link['href']
    uid_match = re.search(r'uid=([^&]+)', href)
    if uid_match is None:
        print(f"No uid parameter in link {href!r} for PSSM ID {pssm_id}.")
        continue
    uid = uid_match.group(1)

    # Check if uid starts with any of the specified prefixes followed by digits
    if not re.match(rf'^{prefix_pattern}', uid):
        print(f"UID {uid} does not match the required pattern.")
        continue

    cd_ids.append(uid)

    span = div.find('span')
    if span and span.text.startswith('\xa0'):  # &nbsp; is \xa0 in Python
        print(f"Found span with non-breaking space: {span.text}")
        proteins.append(span.text.strip())
    else:
        print(f"Span does not start with non-breaking space for UID {uid}.")
        # Placeholder so that proteins[i] still corresponds to cd_ids[i]
        proteins.append(None)

# Every identifier must have yielded an accession; skipped pages are reported above.
assert len(ids) == len(cd_ids), (
    f"Resolved {len(cd_ids)} accessions for {len(ids)} identifiers"
)

Found span with non-breaking space:  ps-ssRNAv_RdRp-like Superfamily 
Found span with non-breaking space:  RRM_SF Superfamily 
Found span with non-breaking space:  Flavi_M Superfamily 
Found span with non-breaking space:  Flavi_NS5_thumb 
Found span with non-breaking space:  Flavi_M 
Found span with non-breaking space:  Flavi_NS5 
Found span with non-breaking space:  Peptidase_S7 Superfamily 
Found span with non-breaking space:  Flavivirus_RdRp 
Found span with non-breaking space:  ps-ssRNAv_Flaviviridae_RdRp 
Found span with non-breaking space:  RRM2_U1A 
Found span with non-breaking space:  RRM1_U1A 
Found span with non-breaking space:  Flavi_DEAD 
Found span with non-breaking space:  Peptidase_S31 
Found span with non-breaking space:  Peptidase_S7 
Found span with non-breaking space:  Flavi_glycoprot 
Found span with non-breaking space:  Flavi_propep 
Found span with non-breaking space:  Flavi_capsid 
Found span with non-breaking space:  STAT_CCD Superfamily 
Found span with non-bre

In [113]:
# Display the accessions collected by the scraping loop
cd_ids

['cl40470',
 'cl17169',
 'cl03065',
 'pfam20483',
 'pfam01004',
 'pfam00972',
 'cl29867',
 'cd23204',
 'cd23178',
 'cd12480',
 'cd12477',
 'pfam07652',
 'pfam05578',
 'pfam00949',
 'pfam00869',
 'pfam01570',
 'pfam01003',
 'cl28921',
 'cl28920',
 'cd17038']

# Get FASTA files

All the FASTA files were downloaded from https://ftp.ncbi.nih.gov/pub/mmdb/cdd and extracted from the `fasta.tar.gz` archive.

In [114]:
# Map each accession to the consensus sequence stored in its `<uid>.FASTA` file.
consensus_sequences = {}
# Accessions with no FASTA file, or whose file has no consensus record. They are
# reported below so that gaps in the output FASTA are not silent.
missing_consensus = []

for uid in cd_ids:
    filename = os.path.join(FASTA_DIR_PATH, f'{uid}.FASTA')

    # isfile (rather than exists) so a directory of that name is not opened
    if not os.path.isfile(filename):
        missing_consensus.append(uid)
        continue

    with open(filename, 'r') as fasta_file:
        # Parse the FASTA file
        for record in SeqIO.parse(fasta_file, 'fasta'):
            # Check if the record id indicates a consensus sequence
            if record.id.startswith('lcl|consensus'):
                consensus_sequences[uid] = str(record.seq)  # Store the consensus sequence
                break  # Stop after finding the consensus sequence
        else:
            # Loop finished without `break`: the file has no consensus record
            missing_consensus.append(uid)

if missing_consensus:
    print(f"No consensus sequence available for: {', '.join(missing_consensus)}")

In [115]:
# Create a list to hold SeqRecord objects
records = []

# Create SeqRecord objects for each entry in the dictionary
for uid, sequence in consensus_sequences.items():
    record = SeqRecord(Seq(sequence), id=uid, description="")
    records.append(record)

# Use StringIO to capture the FASTA formatted output
output = StringIO()
SeqIO.write(records, output, "fasta")
fasta_string = output.getvalue()

# Print the resulting FASTA string
print(fasta_string)

>pfam20483
EEVDFCSHHYEKLTFKD---G-----R--TI--V---V-----PTRDQDEIIAKSRIRP-
GGD-WSLDETAWLSKAYANMWLVNYFHLRTARALGFAYKSAVPPNWVPTGRTTGSIHRPG
PWMTPEDMLDVWNRVWFGEST-HMP-DG-FRVRSWRHVGYLK-KREEKLYDSLIGLRNRA
YWRSNLHLDV
>pfam01004
VALPPHVGLGLETRTETWMSSRGAWKQLQKVETWALRNPGFTVIALFIAHLIGSSITQKV
IIFILLLLVTPAYS
>pfam00972
TYEADVILGIGTRSVATDKEPLNKEIIGERIERIKNEHMTTWFYDEDNPYRTWAYHGSYE
TKTSGSASSMVNGVVRLLTKPWDVIEEVTRIAMTDTTPFGQQRVFKEKVDTRAPDPPAGT
RQIMKVVNRWLWRHLAREKNPRLCTKEEFIAKVRSNAAIGAYFEEEEQWKTANEAVQDPR
FWELVDRERELHQQGRCRTCVYNMMGKREKKLGEFGKAKGSRAIWYMWLGARFLEFEALG
FLNEDHWASRENSGGGVEGIGLQYLGYILRDLAAMPGGGMYADDTAGWDTRITEADLDNE
AEITNYMEPHHKKLAQAVMKMTYQNKVVKVLRPAPGGKTVMDVISRRDQRGSGQVVTYAL
NTFTNLKVQLIRMMEAEMVIHHQHLQDCDES--ERVEAWLTEHGCDRLKRMAVSGDDCVV
KPIDDRFALALSHLNDMGKVRKDISEWQPSKGW
>cd23204
MVNGVVKLLSKPWDVIEMVTQMAMTDTTPFGQQRVFKEKV-DTKAPEPPEGTRKIMRIVN
EWLWKF-LARKKKPRLCTREEFIAKVRSNAALGAVFEEQNQWKSAREAVEDPRFWELVDE
ERELHLEGKCETCVYNMMGKREKKLGEFGKAKGSRAIWYMWLGARFLEFEALGFLNEDHW
ASRENSGGGVEGIGLQYLGYILREISKKPGGKMYADDTAGWDTR

In [116]:
# Save the FASTA text to a file named after the search term
output_fasta_file = f'{TERM}.FASTA'
with open(output_fasta_file, mode='w') as fasta_file:
    print(fasta_string, end='', file=fasta_file)

print(f"FASTA file '{output_fasta_file}' has been created.")

FASTA file 'Flavivirus.FASTA' has been created.


In [117]:
# Create a list to hold SeqRecord objects
records = []

# Create SeqRecord objects for each entry in the dictionary
for uid, sequence in consensus_sequences.items():

    # Find all contiguous uppercase letter sequences
    matches = re.findall(r'[A-Z]+', sequence)  # Find all sequences of uppercase letters
    
    if matches:  # Check if there are any matches
        # Get the longest sequence
        longest_sequence = max(matches, key=len)
        record = SeqRecord(Seq(longest_sequence), id=uid, description="")
        records.append(record)

# Use StringIO to capture the FASTA formatted output
output = StringIO()
SeqIO.write(records, output, "fasta")
fasta_string = output.getvalue()

# Print the resulting FASTA string
print(fasta_string)

>pfam20483
WSLDETAWLSKAYANMWLVNYFHLRTARALGFAYKSAVPPNWVPTGRTTGSIHRPGPWMT
PEDMLDVWNRVWFGEST
>pfam01004
VALPPHVGLGLETRTETWMSSRGAWKQLQKVETWALRNPGFTVIALFIAHLIGSSITQKV
IIFILLLLVTPAYS
>pfam00972
TYEADVILGIGTRSVATDKEPLNKEIIGERIERIKNEHMTTWFYDEDNPYRTWAYHGSYE
TKTSGSASSMVNGVVRLLTKPWDVIEEVTRIAMTDTTPFGQQRVFKEKVDTRAPDPPAGT
RQIMKVVNRWLWRHLAREKNPRLCTKEEFIAKVRSNAAIGAYFEEEEQWKTANEAVQDPR
FWELVDRERELHQQGRCRTCVYNMMGKREKKLGEFGKAKGSRAIWYMWLGARFLEFEALG
FLNEDHWASRENSGGGVEGIGLQYLGYILRDLAAMPGGGMYADDTAGWDTRITEADLDNE
AEITNYMEPHHKKLAQAVMKMTYQNKVVKVLRPAPGGKTVMDVISRRDQRGSGQVVTYAL
NTFTNLKVQLIRMMEAEMVIHHQHLQDCDES
>cd23204
APRLKRVEDWLEENGEERLSRMAVSGDDCVVKPIDDRFATALTFLNDMGKVRKDIQEWEP
SKGWNDWEEVPFCSHHFHELIMKDGRTLVVPCRDQDELIGRARVSPGAGWSLRETACLSK
AYAQMWLLMYFHRRDLRLMANAICSAVPVDWVPTGRTTWSIHAKGEWMTTEDMLEVWNRV
WIEDNPWMEDKTPVTSWRDVPYLGKREDQWCGSLIGLRSRATWAKNIQTAVN
>cd23178
IRLPTMLVCGDDCVVICESDGTQEDAALLAAFTEALTRYGKPPKDPPQ
>cd12480
PPQPVSENPPNHILFLTNLPEETNELMLSMLFNQFPGFKEVRLVPGRHDIAFVEFDNEVQ
AGAAREALQGFKITQSNAMKISFAKK
>cd12477
RPNHT

In [118]:
# Save the cleaned FASTA text next to the uncleaned file, with a `_cleaned` suffix
output_fasta_file = f'{TERM}_cleaned.FASTA'
with open(output_fasta_file, mode='w') as fasta_file:
    print(fasta_string, end='', file=fasta_file)

print(f"FASTA file '{output_fasta_file}' has been created.")

FASTA file 'Flavivirus_cleaned.FASTA' has been created.


In [119]:
# Display the names scraped from the title spans of the CDD pages
proteins

['ps-ssRNAv_RdRp-like Superfamily',
 'RRM_SF Superfamily',
 'Flavi_M Superfamily',
 'Flavi_NS5_thumb',
 'Flavi_M',
 'Flavi_NS5',
 'Peptidase_S7 Superfamily',
 'Flavivirus_RdRp',
 'ps-ssRNAv_Flaviviridae_RdRp',
 'RRM2_U1A',
 'RRM1_U1A',
 'Flavi_DEAD',
 'Peptidase_S31',
 'Peptidase_S7',
 'Flavi_glycoprot',
 'Flavi_propep',
 'Flavi_capsid',
 'STAT_CCD Superfamily',
 'STAT_DBD Superfamily',
 'Flavi_M']

In [120]:
# Display the collected accessions again, for comparison with the names above
cd_ids

['cl40470',
 'cl17169',
 'cl03065',
 'pfam20483',
 'pfam01004',
 'pfam00972',
 'cl29867',
 'cd23204',
 'cd23178',
 'cd12480',
 'cd12477',
 'pfam07652',
 'pfam05578',
 'pfam00949',
 'pfam00869',
 'pfam01570',
 'pfam01003',
 'cl28921',
 'cl28920',
 'cd17038']