# 1. 获取序列信息

In [8]:
import csv
import tqdm
import pandas as pd
from Bio import Entrez
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
# Path template for files in the dm_final project directory; "{}" is
# replaced by the file name.
tlp = "/Volumes/PortableSSD/projects/EnvelopProtein/dm_final/{}"
# Tab-separated input table and the FASTA path used by the download step.
src = tlp.format("env.tsv")
trg = tlp.format("env.fasta")


def fetch_seq(pids):
    """Fetch GenBank protein records from NCBI Entrez for the given ids.

    Args:
        pids: Sequence of identifier strings; they are joined with commas
            into a single ``efetch`` request against the ``protein`` db.

    Returns:
        A list of ``SeqRecord`` objects, one per record parsed from the
        response. Each keeps the parsed sequence and id, and stores
        ``pids[i]`` as the description of the i-th parsed record.

    NOTE(review): the description pairing is purely positional, so it
    assumes Entrez returns exactly one record per id in request order --
    TODO confirm.
    """
    Entrez.email = "john@gmail.com"
    query = ",".join(pids)
    with Entrez.efetch(db="protein", rettype="gb", retmode="text", id=query) as handle:
        return [
            SeqRecord(record.seq, id=record.id, description=pids[position])
            for position, record in enumerate(SeqIO.parse(handle, "gb"))
        ]


# Load the tab-separated source table and drop every row that has a missing value.
df = pd.read_csv(src, sep="\t")
df.dropna(inplace=True)
# Cast both identifier columns to int and then to str before saving.
for id_column in ("Gi", "TaxId"):
    df[id_column] = df[id_column].astype(int).astype(str)
df.to_csv(tlp.format("env.csv"), index=False)
# Disabled download step: fetches the unique "Gi" ids in batches of 100 via
# fetch_seq, skips the first 230 batches, and rewrites the FASTA file after
# every batch.
# batch_size = 100
# records,lst = [], df["Gi"].drop_duplicates().tolist()
# for i in tqdm.tqdm(range(0, len(lst), batch_size)):
#     if i<230*100:
#         continue
#     ret = fetch_seq(lst[i:i+batch_size])
#     records.extend(ret)
#     SeqIO.write(records, trg, "fasta")


# 2. 利用HmmScan进行序列标注

In [None]:
import os
# Inputs and outputs for the hmmscan run: the FASTA file to scan, the
# per-target table to write (--tblout) and the HMM database to search.
tlp="/Volumes/PortableSSD/projects/EnvelopProtein/dm_final/{}"
src, trg = tlp.format("env.fasta"),tlp.format("env_pfam.txt")
hmm="/Volumes/PortableSSD/data/uniclust/align/db/pfam_a.hmm"
cmd=f"hmmscan --tblout {trg} -o envelop.log {hmm} {src}"
print(cmd)
# Use the pipe as a context manager so it is closed (and the child process
# waited on) once its output has been read; previously the handle leaked.
with os.popen(cmd) as res:
    # The command passes "-o envelop.log", so little or nothing is expected
    # on stdout here -- presumably only diagnostics; verify against hmmscan.
    print(res.read())

hmmscan --tblout /Volumes/PortableSSD/projects/EnvelopProtein/dm_final/env_pfam.txt -o envelop.log /Volumes/PortableSSD/data/uniclust/align/db/pfam_a.hmm /Volumes/PortableSSD/projects/EnvelopProtein/dm_final/env.fasta


# 3. 处理HmmScan后的结果

In [12]:
import re
import jsonlines
# A line containing "E-value" is treated as the column-title line of the table.
reg_header = re.compile("E-value")
# The separator line under the titles starts with "#-". The previous pattern,
# r"^#-| +", parsed as (^#-)|( +), so with .match() any line that merely
# started with a space was also accepted as the separator line.
fmt_line = re.compile(r"^#-")
# Path template for files in the ann_li project directory.
tlp="/Volumes/PortableSSD/projects/EnvelopProtein/ann_li/{}"
def reader_header(v):
    """Recognise the column-title line and split it into column names.

    Args:
        v: One raw line of the table file.

    Returns:
        ``(v, columns)`` when ``v`` contains ``"E-value"``: the unchanged
        line plus its column names, with the multi-word titles
        ("target name", "query name", "description of target") joined by
        underscores so that each stays a single column. ``(None, None)``
        for any other line.
    """
    multi_column_names = ("target name", "query name", "description of target")
    # Same test as the module-level reg_header pattern, which is the plain
    # literal "E-value".
    if "E-value" not in v:
        return None, None
    header = v.strip("#").strip()
    # Rewrite every multi-word title before splitting. The original returned
    # from inside this loop, so only "target name" was ever joined.
    for name in multi_column_names:
        header = header.replace(name, name.replace(" ", "_"))
    columns = re.split(" +", header)
    return v, columns

def read_format(line):
    """Return the positions of all space characters in a separator line.

    Args:
        line: One raw line of the table file.

    Returns:
        The index of every ``" "`` in ``line`` when the module-level
        ``fmt_line`` pattern matches at the start of the line; an empty
        list otherwise.
    """
    if not fmt_line.match(line):
        return []
    return [position for position, char in enumerate(line) if char == " "]

def read_line(line, fmt):
    """Cut ``line`` into stripped fields at the offsets listed in ``fmt``.

    Args:
        line: The text to split.
        fmt: Character offsets that end one field and start the next.

    Returns:
        ``len(fmt) + 1`` strings. Field ``k`` starts at the previous offset
        (0 for the first field) and runs through offset ``k`` inclusive;
        the last field runs to the end of the line. Surrounding whitespace
        is stripped from every field.
    """
    starts = [0, *fmt]
    # Each slice includes the boundary character itself (hence the +1); the
    # final slice is open-ended.
    stops = [offset + 1 for offset in fmt] + [None]
    return [line[begin:end].strip() for begin, end in zip(starts, stops)]


# Convert the fixed-width table in env_pfam.txt into one JSON object per
# data row in env_pfam.json.
src = tlp.format("env_pfam.txt")
trg = tlp.format("env_pfam.json")
columns, columns_l, fmt = None, None, None
with open(src) as fp, jsonlines.open(trg, "w") as wp:
    for line in fp:
        if not columns:
            # Phase 1: skip lines until the column-title line is found.
            columns_l, columns = reader_header(line)
            continue
        if not fmt:
            # Phase 2: take the field offsets from the separator line, then
            # re-cut the raw title line with them to get the final names.
            fmt = read_format(line)
            columns = read_line(columns_l, fmt)
            columns[0] = columns[0].strip("#").strip()
            continue
        if line.startswith("#"):
            # Remaining comment lines carry no data.
            continue
        # NOTE(review): if the title line repeats a name (hmmscan tables
        # appear to list "accession", "E-value", "score" and "bias" twice),
        # dict() keeps only the last value for that name -- confirm whether
        # the earlier columns are needed downstream.
        wp.write(dict(zip(columns, read_line(line, fmt))))

# 4. 处理原始基本数据

In [2]:
import pandas as pd
from ete3 import NCBITaxa
from Bio import SeqIO
import warnings
warnings.filterwarnings("ignore")
# Shared ete3 NCBITaxa handle; get_lineage() uses it for every lookup.
ncbi_texa = NCBITaxa()



def get_lineage(taxid=57483):
    """Return the "virus" and "viridae" names found in a taxid's lineage.

    Args:
        taxid: NCBI taxonomy id; anything ``int()`` accepts.

    Returns:
        ``(organism, viridae)``: the first lineage name ending in
        ``"virus"`` and the first ending in ``"viridae"``, each ``None``
        when no name matches. ``(None, None)`` when the lookup fails for
        any reason (for example a value that is not a valid integer).
    """
    try:
        lineage = ncbi_texa.get_lineage(int(taxid))
        # NOTE(review): this orders the taxids numerically, so "first match"
        # below means lowest taxid, not nearest to the root or the leaf --
        # confirm that this is the intended pick.
        lineage = sorted(lineage)
        names = ncbi_texa.get_taxid_translator(lineage)
        lineages = [names[k] for k in lineage]
        organism = next((v for v in lineages if v.endswith("virus")), None)
        viridae = next((v for v in lineages if v.endswith("viridae")), None)
        return organism, viridae
    except Exception:
        # Best effort: a failed lookup yields an empty result. A bare
        # ``except:`` would also swallow KeyboardInterrupt/SystemExit, which
        # made a long DataFrame.apply() over this function uninterruptible.
        return None, None
    
# Alternative input (disabled): read env.csv from the ann_li directory.
# tlp="/Volumes/PortableSSD/projects/EnvelopProtein/ann_li/{}"
# df = pd.read_csv(tlp.format("env.csv"),sep=",") # \t,,
tlp="/Volumes/PortableSSD/projects/EnvelopProtein/nc_zf/{}"
df = pd.read_json(tlp.format("env_taxid.json"),lines=True)
# Disabled variant for the CSV input, whose columns are named "Gi"/"TaxId".
# df.rename(columns={"Gi":"rid"},inplace=True)
# df["family"] = df["TaxId"].apply(lambda x:get_lineage(x)[1])
# df["species"] = df["TaxId"].apply(lambda x:get_lineage(x)[0])

# Resolve each row's lineage once and fill both columns from that result;
# previously get_lineage ran twice per row, once for each column.
lineages = [get_lineage(taxid) for taxid in df["taxid"]]
df["family"] = [viridae for _, viridae in lineages]
df["species"] = [organism for organism, _ in lineages]
# Disabled post-processing: drop polyprotein titles and attach the FASTA
# record ids.
# df["flag"] = df["Title"].apply(lambda x:"polyprotein" not in x)
# df = df[df["flag"]]
# records = list(SeqIO.parse(tlp.format("env.fasta"), "fasta"))
# pmap={v.description.split()[-1].strip():v.id for v in records}
# df["rid"] = df["rid"].astype(str)
# df["id"]=df["rid"].map(pmap)
df.to_csv(tlp.format("env_info.csv"),index=False)