In [1]:
import os
from collections import defaultdict

from Bio import SeqIO
import pandas as pd

# Directory that holds the reference GenBank record read by this notebook
# and receives the per-nucleotide CSV it writes.
REFSEQ_DIR = "RefSeq"

# GenBank `product` qualifier of a mature peptide -> short "nspN" label.
# Products that are not listed here keep the name given in the annotation.
PRODUCT_NAME_MAPPING = {
    "leader protein": "nsp1",
    "3C-like proteinase": "nsp5",
    "RNA-dependent RNA polymerase": "nsp12",
    "helicase": "nsp13",
    "3'-to-5' exonuclease": "nsp14",
    "endoRNAse": "nsp15",
    "2'-O-ribose methyltransferase": "nsp16",
}

In [2]:
# Load the annotated reference genome. SeqIO.read is used (rather than
# SeqIO.parse), so the file is expected to contain exactly one record.
annoRefPath = os.path.join(REFSEQ_DIR, "SARS_CoV_2.gb")
annoRef = SeqIO.read(annoRefPath, "genbank")

In [3]:
def _iterCodons(feature):
    """Yield ``(codonIndices, aaIndex)`` for every complete codon of a feature.

    The parts of ``feature.location`` are walked in the order they are listed
    and the codon buffer is carried over from one part to the next, so a
    position that is listed in two consecutive parts is counted twice.
    Strand is not consulted.

    ``codonIndices`` is a 3-tuple of 1-based genome positions and ``aaIndex``
    is the 1-based number of that codon within the feature. Trailing
    positions that do not fill a whole codon are not yielded.
    """
    codonIndices = []
    aaIndex = 0
    for part in feature.location.parts:
        for siteIndex in range(part.start, part.end):
            # part.start is 0-based; stored positions are 1-based.
            codonIndices.append(siteIndex + 1)
            if len(codonIndices) == 3:
                aaIndex += 1
                yield tuple(codonIndices), aaIndex
                codonIndices = []


# gene name -> {codon (3 genome positions): (aa index in gene, running peptide index)}
aa2nucleotide = {}
# 1-based genome position -> mature peptide (product) name
mat2nucleotide = {}
# product name -> {codon (3 genome positions): (aa index in product, placeholder)}
matAA2nucleotide = {}
# gene name -> translation string taken from the CDS feature
proteinSeqs = {}
# gene names in order of first appearance
orfNames = []

# Running codon counter shared by all CDS features handled so far.
peptideIndex = 0

for f in annoRef.features:
    if f.type == "CDS":
        geneName = f.qualifiers['gene'][0]
        if geneName in orfNames:
            # Only the first CDS annotated for a gene is used.
            continue
        orfNames.append(geneName)
        proteinSeqs[geneName] = f.qualifiers['translation'][0]
        print(geneName)

        siteMapping = {}
        for codonIndices, aaIndex in _iterCodons(f):
            peptideIndex += 1
            siteMapping[codonIndices] = (aaIndex, peptideIndex)
        aa2nucleotide[geneName] = siteMapping
    elif f.type == 'mat_peptide':
        productName = f.qualifiers['product'][0]
        if productName == "nsp11":
            # NOTE(review): nsp11 is deliberately left out; presumably because
            # its positions are also covered by another product -- confirm.
            continue
        # Resolve the display name once per feature instead of once per site.
        productName = PRODUCT_NAME_MAPPING.get(productName, productName)

        for part in f.location.parts:
            for siteIndex in range(part.start, part.end):
                mat2nucleotide[siteIndex + 1] = productName

        # peptideIndex is not advanced for mature peptides, so the second
        # tuple slot only repeats whatever value the counter currently has.
        # It is kept so the tuple shape matches aa2nucleotide; the export
        # cell of this notebook reads element 0 only.
        matAA2nucleotide[productName] = {
            codonIndices: (aaIndex, peptideIndex)
            for codonIndices, aaIndex in _iterCodons(f)
        }

ORF1ab
S
ORF3a
E
M
ORF6
ORF7a
ORF7b
ORF8
N
ORF10


In [4]:
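# NOTE: superseded draft of the feature-parsing cell above (it lacks matAA2nucleotide,
# the nsp11 skip and the product renaming). Kept for reference only; nothing here runs.
# aa2nucleotide = {}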
# mat2nucleotide = {}
# proteinSeqs = {}
# orfNames = []

# peptideIndex = 0

# for f in annoRef.features:
#     if f.type == "CDS":
#         geneName = f.qualifiers['gene'][0]
#         if geneName not in orfNames:
#             orfNames.append(geneName)
#             aaSeq = f.qualifiers['translation'][0]
#             proteinSeqs[geneName] = aaSeq
#             print(geneName)

#             aaIndex = 0
#             codonPos = 1
#             codonIndices = []
#             siteMapping = {}
#             for i in f.location.parts:
#                 for siteIndex in range(i.start, i.end):
#                     codonIndices.append(siteIndex + 1)
#                     codonPos += 1
#                     if codonPos > 3:
#                         aaIndex += 1
#                         peptideIndex += 1
#                         siteMapping[tuple(codonIndices)] = (aaIndex, peptideIndex)
#                         codonPos = 1
#                         codonIndices = []
#                 aa2nucleotide[geneName] = siteMapping
#     elif f.type == 'mat_peptide':
#         productName = f.qualifiers['product'][0]
#         for i in f.location.parts:
#             for siteIndex in range(i.start, i.end):
#                 mat2nucleotide[siteIndex + 1] = productName

In [6]:
# Flatten the per-gene codon tables into one row per nucleotide of each codon.
rows = []

for orfName, codonTable in aa2nucleotide.items():
    orfProtein = proteinSeqs[orfName]
    for codon, (aaPos, peptidePos) in codonTable.items():
        # Codon numbers past the end of the stored translation are written
        # as '*' (presumably the terminal stop codon -- confirm).
        residue = orfProtein[aaPos - 1] if aaPos <= len(orfProtein) else '*'
        for genomePos in codon:
            if genomePos in mat2nucleotide:
                # Inside a mature peptide: report the peptide name and its own
                # residue numbering. The overridden aaPos is kept for the
                # remaining positions of this codon.
                product = mat2nucleotide[genomePos]
                aaPos = matAA2nucleotide[product][codon][0]
            else:
                product = orfName
            base = annoRef[genomePos - 1]
            rows.append(
                [orfName, product, aaPos, peptidePos, residue, genomePos, base]
            )

columnNames = [
    'gene',
    'product',
    'aaPos',
    'peptidePos',
    'aa',
    'genomePos',
    'nucleotide',
]
df = pd.DataFrame(rows, columns=columnNames)
df.to_csv(os.path.join(REFSEQ_DIR, "SARS_CoV_2.csv"))