# 读取基因组信息

In [None]:
import pandas as pd

# Working directory that holds the per-family accession tables.
DATADIR = "C:\\Users\\jinya\\Desktop\\bio\\env"

# Accession table to load (CSV). Alternatives used previously:
#   Paramyxoviridae.csv
#   Rhabdoviridae.csv
file_path = f"{DATADIR}\\Flaviviridae.csv"
df = pd.read_csv(file_path)

# Preview the first five rows of the table.
df.head()

# 从NCBI中获取序列和基因标注信息

In [None]:
from Bio import Entrez, SeqIO
import time
import tqdm

# Contact address that Bio.Entrez attaches to requests sent to NCBI. The value
# below is a placeholder -- replace it with a real address before running.
Entrez.email = "your_email@example.com"


def _feature_spans(record, feature_type):
    """Return start/end/qualifiers dicts for every feature of *feature_type*."""
    return [
        {
            "start": int(feature.location.start),
            "end": int(feature.location.end),
            "qualifiers": feature.qualifiers,
        }
        for feature in record.features
        if feature.type == feature_type
    ]


def fetch_genbank_sequence(genbank_id="V01116"):
    """Download one GenBank record from NCBI and summarise its annotations.

    Args:
        genbank_id: Accession passed to ``Entrez.efetch`` as ``id``.

    Returns:
        ``(record, info)`` on success. ``record`` is the object returned by
        ``SeqIO.read``; ``info`` is a dict with the keys ``id``, ``name``,
        ``cds``, ``mat_peptide``, ``organism`` and ``db_xref``, where ``cds``
        and ``mat_peptide`` are lists of ``{"start", "end", "qualifiers"}``
        dicts.
        ``(None, None)`` when anything fails; the error is printed. A record
        without a ``source`` feature, or whose source lacks the ``organism``
        or ``db_xref`` qualifier, is also reported as a failure.
    """
    try:
        handle = Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text")
        try:
            record = SeqIO.read(handle, "genbank")
        finally:
            # Release the connection even when parsing raises.
            handle.close()

        source = [feature for feature in record.features if feature.type == "source"]
        source_qualifiers = source[0].qualifiers
        info = {
            "id": record.id,
            "name": record.name,
            "cds": _feature_spans(record, "CDS"),
            "mat_peptide": _feature_spans(record, "mat_peptide"),
            "organism": source_qualifiers["organism"],
            "db_xref": source_qualifiers["db_xref"],
        }
        return record, info
    except Exception as e:
        # Best-effort batch download: report the failure and let the caller skip it.
        print(f"Error fetching GenBank ID {genbank_id}: {e}")
        return None, None

# Example usage: download every accession listed in the table, keep only the
# successful downloads, then save the sequences (FASTA) and the annotation
# summaries (one JSON object per line).
genbank_ids = df["Accession"].values
my_records, infos = [], []
for genbank_id in tqdm.tqdm(genbank_ids):
    record, info = fetch_genbank_sequence(genbank_id)
    # fetch_genbank_sequence returns (None, None) when the download failed.
    if record is not None and info is not None:
        my_records.append(record)
        infos.append(info)
    time.sleep(1)  # pause between requests to respect the API rate limit
from Bio import SeqIO
SeqIO.write(my_records, f"{DATADIR}\\fv.fasta", "fasta")
import jsonlines
with jsonlines.open(f"{DATADIR}\\fv.json", "w") as fp:
    for v in infos:
        fp.write(v)

# 获取特定基因的序列

In [None]:
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import jsonlines

# Build a FASTA file with the protein translation of every CDS whose gene
# qualifier is "F", reading annotation summaries from pv.json. Only the first
# record seen for each taxon id is used.
lst, s = [], set()
with jsonlines.open(f"{DATADIR}\\pv.json") as fp:
    for v in fp:
        # The taxon id is the text following the "taxon:" prefix of a db_xref
        # entry; it stays "" when no entry carries that prefix.
        taxid = ""
        for x in v["db_xref"]:
            if x.startswith("taxon:"):
                taxid = x[len("taxon:"):]
        if taxid in s:
            continue
        for x in v["cds"]:
            qualifiers = x["qualifiers"]
            if "gene" in qualifiers and qualifiers["gene"][0] == "F":
                record = SeqRecord(Seq(qualifiers["translation"][0]), id=v["organism"][0], description="")
                lst.append(record)
        # The taxon is marked as handled even when no matching CDS was found.
        s.add(taxid)

SeqIO.write(lst, f"{DATADIR}\\pvf.fasta", "fasta")

In [None]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import jsonlines
from Bio import SeqIO

# Build a FASTA file with the protein translation of every CDS annotated as
# gene "G" or product "glycoprotein", reading annotation summaries from
# rv.json. Only the first record seen for each taxon id is used, and only the
# first 100 collected sequences are written.
lst, s = [], set()
with jsonlines.open(f"{DATADIR}\\rv.json") as fp:
    for v in fp:
        # The taxon id is the text following the "taxon:" prefix of a db_xref
        # entry; it stays "" when no entry carries that prefix.
        taxid = ""
        for x in v["db_xref"]:
            if x.startswith("taxon:"):
                taxid = x[len("taxon:"):]
        if taxid in s:
            continue
        for x in v["cds"]:
            qualifiers = x["qualifiers"]
            is_gene_g = "gene" in qualifiers and qualifiers["gene"][0] == "G"
            is_glycoprotein = "product" in qualifiers and qualifiers["product"][0] == "glycoprotein"
            if is_gene_g or is_glycoprotein:
                record = SeqRecord(Seq(qualifiers["translation"][0]), id=v["organism"][0], description="")
                lst.append(record)
        # The taxon is marked as handled even when no matching CDS was found.
        s.add(taxid)

SeqIO.write(lst[:100], f"{DATADIR}\\rvg.fasta", "fasta")

In [None]:
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import jsonlines
# Build a FASTA file with the translated envelope protein of each genome.
# Nucleotide sequences come from fv.fasta and mat_peptide coordinates from
# fv.json. Only the first record seen for each taxon id is used.

# Map record id -> full nucleotide sequence as a plain string.
mseq = {}
for seq_record in SeqIO.parse(f"{DATADIR}\\fv.fasta", "fasta"):
    mseq[seq_record.id] = str(seq_record.seq)

lst, s = [], set()
with jsonlines.open(f"{DATADIR}\\fv.json") as fp:
    for v in fp:
        # The taxon id is the text following the "taxon:" prefix of a db_xref
        # entry; it stays "" when no entry carries that prefix.
        taxid = ""
        for x in v["db_xref"]:
            if x.startswith("taxon:"):
                taxid = x[len("taxon:"):]
        if taxid in s:
            continue
        for x in v["mat_peptide"]:
            qualifiers = x["qualifiers"]
            if "product" in qualifiers and qualifiers["product"][0] in ("envelope protein E1", "envelope protein E"):
                # Slice the peptide's coding region out of the genome and
                # translate it; translate() already returns a Seq.
                dna_seq = Seq(mseq[v["id"]][x["start"]:x["end"]])
                record = SeqRecord(dna_seq.translate(), id=v["organism"][0], description="")
                lst.append(record)
        # The taxon is marked as handled even when no matching peptide was found.
        s.add(taxid)

SeqIO.write(lst, f"{DATADIR}\\fve.fasta", "fasta")

In [None]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import jsonlines
from Bio import SeqIO
# Collect one L1 protein per virus name from apv.json and hpv.json, save the
# visited annotation records and the L1 sequences, then plot the distribution
# of L1 protein lengths.

# Lookup from the table's "Genbank ID" column to its "Virus" column.
nmap = dict(zip(df["Genbank ID"], df["Virus"]))
lst, infos, s = [], [], set()
lengths = []
for name in ("apv", "hpv"):
    with jsonlines.open(f"{DATADIR}\\{name}.json") as fp:
        for v in fp:
            # The record id with everything after the first "." removed is the
            # lookup key; an id missing from the table raises KeyError.
            vname = nmap[v["id"].split(".")[0]]
            infos.append(v)
            for x in v["cds"]:
                qualifiers = x["qualifiers"]
                # Keep only the first L1 CDS seen for each virus name.
                if "gene" in qualifiers and qualifiers["gene"][0] == "L1" and vname not in s:
                    s.add(vname)
                    translation = qualifiers["translation"][0]
                    record = SeqRecord(Seq(translation), id=vname, description="")
                    lengths.append(len(translation))
                    lst.append(record)
with jsonlines.open(f"{DATADIR}\\pvl1.json", "w") as fp:
    for v in infos:
        fp.write(v)
SeqIO.write(lst, f"{DATADIR}\\pvl1.fasta", "fasta")
import matplotlib.pylab as plt
import seaborn as sns
if lengths:
    print(len(lengths), max(lengths), min(lengths))
    p1 = sns.kdeplot(lengths)
    plt.show()
else:
    # max()/min() raise on an empty list, and there is nothing to plot.
    print("No L1 sequences found; nothing to plot.")