In [1]:
import copy
import pandas as pd
import numpy as np
import Levenshtein
from tqdm import tqdm

In [2]:
# parameters
threshold=6  # draw tree above this emergenec risk threshold
CONSTRUCT_PHYLO=True  # use constructed tree (tree construction takes hours)
OUTPUT_DIR='./'
PHYLO_TREE_DIR='./'
PHYLO_DIR='./'
COMBINED_RESULTS='./combined_results.csv'
CONSTRUCT_PHYLO=False
num_collapsed=18  # number of mutations within which leaves are collapsed
VERBOSE=False

# class definition to hold multi sequence data

In [3]:
class SeqInfo(object):
    """Holds information regarding the sequence.
    
    """
    def __init__(self, seq, 
                 protein,
                 accession,
                 subtype=None,
                 id=None,
                 name=None,
                 host=None, 
                 date=None, 
                 erisk=None,
                 irisk=None,
                 risk_flag=None,
                 country=None):
        self.name = name
        self.id = id
        self.protein=protein
        self.subtype=subtype        
        self.seq = seq
        self.accession = accession 
        self.host = host
        self.date = date
        self.erisk = erisk
        self.irisk = irisk
        self.risk_flag = risk_flag
        self.country = country
        
class MultipleSeqInfo(object):
    """Holds information regarding multiple sequences.
    
    Args:
        dataframe (pandas.DataFrame): list of records parsed from NCBI
        accessionname (str): column name for accession id
        proteinname (str): protein name 
        risk_threshold (float): emergence risk threshold to compute distance matrix
    """
    def __init__(self,
                 dataframe,
                 accessionname,
                 proteinname,
                 risk_threshold=6.2):
        
        self.seq_infos = {}
        self.risk_threshold = risk_threshold
        for i in np.arange(dataframe.index.size):
            record=dataframe.iloc[i,:]
            seqinfo = SeqInfo(
                name=record.id,
                seq=record[proteinname], 
                protein=proteinname,
                accession=record[accessionname],
                subtype=record.subtype,
                erisk=record.predicted_emergence_score,
                irisk=record.predicted_impact_score,
                risk_flag = record.predicted_emergence_score > self.risk_threshold,
                host=None,
                date=None,
                country=None)
            #print(record.predicted_emergence_score > self.risk_threshold)
            self.seq_infos[seqinfo.accession] = seqinfo
            
    
    def compute_L_distance_matrix(self):
        highriskseq = pd.DataFrame.from_dict({key:val.seq 
                                              for (key,val) in self.seq_infos.items() 
                                              if val.risk_flag},orient='index',columns=['seq'])
        num=highriskseq.index.size
        d=np.zeros([num,num])
        for x in tqdm(np.arange(num*num)):
            j=x//num
            i=x-num*j
            if i > j:
                d[i,j] = Levenshtein.distance(highriskseq.seq.values[i],
                                                  highriskseq.seq.values[j])
        ds=pd.DataFrame(d)        
        ds=(ds+ds.transpose())
        ds.columns=highriskseq.index.values
        self.highriskdistancematrix=ds.copy()
        
        self.highriskdistancematrix.to_csv('dm'+str(self.risk_threshold)+'.csv',index=None)
        return 
    
    
    def accessions_to_subtype(self, accessions):
        """Create a dictionary mapping the accession to the host.
        """
        
        subtypes = []
        for accession in accessions:
            seqinfo = self.seq_infos[accession]
            subtypes.append(seqinfo.subtype)
            
        return subtypes

    def accessions_to_host(self, accessions):
        """Create a dictionary mapping the accession to the host.
        """
        
        hosts = []
        for accession in accessions:
            seqinfo = self.seq_infos[accession]
            hosts.append(seqinfo.host)
        return hosts
           
    

In [4]:
df=pd.read_csv(COMBINED_RESULTS,index_col=0).reset_index()
#ALLinfoHA=MultipleSeqInfo(df.reset_index(),'ha_accession','ha',risk_threshold=6.054)
ALLinfoHA=MultipleSeqInfo(df.reset_index(),'ha_accession','ha',risk_threshold=threshold)

In [5]:
df[df.predicted_emergence_score>threshold].subtype.value_counts()

H1N1    890
H3N2    812
H9N2     29
H7N9      1
Name: subtype, dtype: int64

# generate distance matrix

In [6]:
ALLinfoHA.compute_L_distance_matrix()

100%|██████████████████████████████| 2999824/2999824 [01:14<00:00, 40519.69it/s]


# tree construction from distance matrix using biopython

In [7]:
from Bio.Phylo import TreeConstruction
from Bio import Phylo
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Align import MultipleSeqAlignment
from Bio import Entrez
from Bio import SeqIO


def load_dm(file_, upper_diag=True):
    """Load the distance matrix. 
    
    Also, do some preprocessing. 
    """
    
    df = pd.read_csv(file_)
    #df.set_index('Unnamed: 0', inplace=True)
    #assert np.all(df.columns == df.index)
    
    # drop duplicate columns after reading csv
    #df = df.loc[:, ~df.columns.str.replace("(\.\d+)$", "").duplicated()]
    
    if upper_diag:
        df = df + df.T
    return df

def save_tree(tree, file_name, save_type='xml'):
    """Saved the created phylogenetic tree."""
    
    if save_type == 'pickle':
        graph = Phylo.to_networkx(tree)
        save_pickled(graph, file_name)
    elif save_type == 'xml':
        Phylo.write(tree, file_name, 'phyloxml')
    else:
        raise ValueError('Not a correct save type.')
    
def pandas_dm_to_biopython_dm(dm):
    """Convert the pandas distance matrix to the biopython distance matrix.
    
    Returns:
        biopython distance matrix
    """
    
    accessions = dm.columns
    bio_dm = []
    for i, accession in enumerate(accessions):
        bio_dm.append(list(dm.iloc[i, :i+1].values))
        
    bio_dm = TreeConstruction._DistanceMatrix(
        list(dm.columns), 
        bio_dm)
    
    return bio_dm

def distance_matrix_to_phylo_tree(dm, outfile=None):
    """Create a phylogenetic tree from the distance matrix."""
    
    dm = pandas_dm_to_biopython_dm(dm)
    
    treeConstructor = TreeConstruction.DistanceTreeConstructor()
    tree = treeConstructor.nj(dm)
    
    if outfile is not None:
        save_tree(tree, outfile)

# ete3 function, not all of these are used here

In [8]:
from ete3 import Tree, TreeStyle
from ete3 import Phyloxml
from ete3 import AttrFace, faces, Tree, NodeStyle, TreeStyle

def load_pickled(file_name):
    with open(file_name, 'rb') as f:
        return pickle.load(f, encoding='latin')


def get_farthest_node(tree, sequence):
    return (tree&sequence).get_farthest_node()

def get_all_accessions_from_tree(tree):
    return [leaf_node.name for leaf_node in tree.get_leaves()]

def remove_certain_hosts_from_tree(tree, hosts):
    """Remove leaf nodes if the host of that leaf is in `hosts`"""
    
    tree = copy.deepcopy(tree)
    
    removed_accessions = []
    for leaf_node in tree.get_leaves():
        if leaf_node.host in hosts:
            leaf_node.detach()
            
    return tree

def set_midpoint_outgroup(tree):
    tree.set_outgroup(tree.get_midpoint_outgroup())


def load_tree(filename, type_='phyloxml'):
    """Load saved phylogenetic tree.
    """
    
    if type_ == 'phyloxml':
        project = Phyloxml()
        project.build_from_file(filename)

        for tree in project.get_phylogeny():
            break

        t=tree
        
    elif type_ == 'newick':
        t = Tree(filename, format=1)
    else:
        raise ValueError('Not a correct type.')
    
    return t

# construct tree from distance matrix

In [9]:
if CONSTRUCT_PHYLO:
    ALL_dm_ldistance = load_dm(
        OUTPUT_DIR + 'dm'+str(threshold)+'.csv', 
        upper_diag=False)
    
    distance_matrix_to_phylo_tree(
        ALL_dm_ldistance, PHYLO_TREE_DIR + 'ldistance'+str(threshold)+'.xml')

# convert phyloxml tree to newick tree to manipulate trees

In [10]:

Phylo.convert(
    PHYLO_DIR + 'ldistance'+str(threshold)+'.xml','phyloxml',
    PHYLO_DIR + 'ldistance'+str(threshold)+'.nhx','newick')

ltree = load_tree(
    PHYLO_DIR + 'ldistance'+str(threshold)+'.nhx',
    type_='newick')

# label nodes in tree to add other attributes like subtype risk etc

In [11]:
def bandify(val,min=6.27,max=6.295):
    maptoten=int(np.ceil(((val-min)/(max-min))*10))
    return ' '+u'\u2580'*maptoten

def label_nodes(
        tree, 
        recordinfo):
    """Label the nodes of the tree.
    
    We label nodes on whether:
        it is covid19
    """
    
    tree = copy.deepcopy(tree)
    
    for node in tree:
        name = node.name      
        node.subtype = recordinfo.seq_infos[name].subtype
        node.erisk =recordinfo.seq_infos[name].erisk
        node.id = recordinfo.seq_infos[name].name + bandify(recordinfo.seq_infos[name].erisk,min=threshold)
        if VERBOSE:
            print(node.name,node.subtype,node.id,node.erisk)
    return tree

# construct labelled tree

In [12]:
labelled_tree=label_nodes(
    ltree, ALLinfoHA)

# functions to collapse simialr leaves

In [13]:
def prune_nodes(t):
    # collapsed nodes are labeled, so you locate them and prune them
    for n in t.search_nodes(collapsed=True):
        for ch in n.get_children():
            ch.detach()
            
            
def mean(array):
    return sum(array)/float(len(array))

def cache_distances(tree):
    ''' precalculate distances of all nodes to the root''' 
    node2rootdist = {tree:0}
    for node in tree.iter_descendants('preorder'):
        node2rootdist[node] = node.dist + node2rootdist[node.up]
    return node2rootdist

def closest_node(node, node2tips, root_distance):
    """Find the closest node."""
    
    tips = []
    distances = []
    for tip in node2tips[node]:
        distances.append(root_distance[tip]-root_distance[node])
        tips.append(tip)
        #     index = np.argmin([root_distance[tip]-root_distance[node] for tip in node2tips[node]])
    index = np.argmin(distances)
    return tips[index]

def riskiest_node(node, node2tips):
    """Find the closest node."""
    
    tips = []
    risks = []
    for tip in node2tips[node]:
        risks.append(tip.erisk)
        tips.append(tip)
        #     index = np.argmin([root_distance[tip]-root_distance[node] for tip in node2tips[node]])
    index = np.argmax(risks)
    return tips[index]

def collapse(tree, min_dist,AllrecordInfo):
    # cache the tip content of each node to reduce the number of times the tree is traversed
    
    tree = copy.deepcopy(tree)
    
    node2tips = tree.get_cached_content()
    root_distance = cache_distances(tree)

    for node in tree.get_descendants('preorder'):
        if not node.is_leaf():
            avg_distance_to_tips = mean([root_distance[tip]-root_distance[node]
                                         for tip in node2tips[node]])
            if VERBOSE:
                print(avg_distance_to_tips)
            if avg_distance_to_tips < min_dist:
                # do whatever, ete support node annotation, deletion, labeling, etc.
            
                #closest_name = closest_node(node, node2tips, root_distance).name
                closest_name = riskiest_node(node, node2tips).name
                node.subtype = AllrecordInfo.seq_infos[closest_name].subtype
                node.id = AllrecordInfo.seq_infos[closest_name].name + bandify(AllrecordInfo.seq_infos[closest_name].erisk,min=threshold)
                node.name = '%s (%g)' %(closest_name,avg_distance_to_tips)
                
            
                node.add_features(collapsed=True)

                # set drawing attribute so they look collapsed when displayed with tree.show()
                node.img_style['draw_descendants'] = False

    return tree

In [14]:
# collapse leaved

In [15]:
ltree_collapsed = collapse(
    labelled_tree, 
    min_dist=num_collapsed, 
    AllrecordInfo=ALLinfoHA)

prune_nodes(ltree_collapsed)

# code for actual rendering

In [20]:
# COLBAT='DarkRed'
# COLRAT='SteelBlue'
COLH3N2='Blue'
COLH1N1='DarkRed'
COLH7N9='DarkGreen'
COLH9N2='#BD890F'
COLDEF='BLACK'

FS=50
PW=10


def nodeAttribConstruct(color, node):
    N = AttrFace(
        "id", fsize=FS, 
        text_prefix=" ",penwidth=PW,ftype='Arial',
        fgcolor=color,fstyle='bold')
    faces.add_face_to_node(N, node, 1, position="branch-right")
    return N

def layout(node):
    if node.is_leaf():
        if  node.subtype == 'H1N1':
            N = nodeAttribConstruct(COLH1N1,node)
        elif node.subtype == 'H3N2':
            N = nodeAttribConstruct(COLH3N2,node)
        elif node.subtype == 'H7N9':
            N = nodeAttribConstruct(COLH7N9,node)
        elif node.subtype == 'H9N2':
            N = nodeAttribConstruct(COLH9N2,node)
        else:
            N = nodeAttribConstruct(COLDEF,node)
            

            
def render_tree(tree, outfile):# all_seq_data, display_type='nearest_host'):
    """Render the tree inside the file to a circular 
    phylogenetic tree.
    
    NOTE: outfile should be in .pdf for best visuals
    Returns:
    """
    #tree = Tree(nwfile,format=1)

    ts = TreeStyle()
    ns = NodeStyle()
    ts.show_leaf_name = False
    #ts.rotation = 90
    ts.mode = "r"
    #ts.arc_start = -360 # 0 degrees = 3 o'clock
    #ts.arc_span = 360
    ts.scale=5
    ts.show_scale=True
    ts.branch_vertical_margin = .5 # 10 pixels between adjacent branches
    # ts.show_branch_length=True
    #ts.min_leaf_separation=10
    #ts.optimal_scale_level='full'
    #ts.branch_vertical_margin=0
    
    ns.hz_line_width=2
    ns.vt_line_width=1
    #ts.layout_fn = layout
    ns["vt_line_width"] = 16
    ns["hz_line_width"] = 16
    #     ns['fsize'] = 20
    for n in tree.traverse():
        n.set_style(ns)
        
    #all_accessions = all_seq_data['accessions'].values
    for n in tree:
        ts.layout_fn = layout

        
    tree.set_style(ns)
    tree.set_style(ts)
    
    ax=tree.render(
        outfile, 
        dpi=300, 
        h=500,
        tree_style=ts)

# Render phylogenetic trees

In [21]:
ax=render_tree(
    labelled_tree,
    './riskyphylo'+str(threshold)+'.pdf')

In [22]:
ax=render_tree(
    ltree_collapsed,
    './riskyphylo'+str(threshold)+'_collapsed_'+str(num_collapsed)+'.pdf')

In [113]:
ltree_collapsed

Tree node 'Inner1730' (0x7fe11d5c9ba)

# Find high risk strains which are on distinct branches on phylogenetic tree

In [200]:
r=6.057
highrisknames=[]
for node in ltree_collapsed:
    if ALLinfoHA.seq_infos[node.name.split()[0]].erisk > r:
        highrisknames=np.append(highrisknames,
                                ALLinfoHA.seq_infos[node.name.split()[0]].name)
highrisknamesdf=df[df.id.isin(highrisknames)][['id','subtype',
                                               'ha_accession',
                                               'na_accession',
                                               'predicted_impact_score',
                                               'predicted_emergence_score']].sort_values('predicted_emergence_score',
                                                                                         ascending=False)
highrisknamesdf = highrisknamesdf.rename(columns={'id':'strain',
                                                  'ha_accession':'HA accession',
                                                  'na_accession':'NA accession',
                                                  'predicted_impact_score':'predicted IRAT impact',
                                                  'predicted_emergence_score':'predicted IRAT emergence'}).set_index('strain')
highrisknamesdf#.drop_duplicates()

Unnamed: 0_level_0,subtype,HA accession,NA accession,predicted IRAT impact,predicted IRAT emergence
strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A/swine/Missouri/A02524711/2020,H1N1,EPI1818121,EPI1818122,6.064677,6.294432
A/swine/Indiana/A02524710/2020,H3N2,EPI1818137,EPI1818138,6.046863,6.288258
A/Camel/Inner_Mongolia/XL/2020,H7N9,EPI2026200,EPI2026202,6.067189,6.273290
A/swine/North_Carolina/A02479173/2020,H1N1,EPI1780425,EPI1780426,6.064405,6.229882
A/swine/Kansas/A02479028/2020,H3N2,EPI1777753,EPI1777754,6.000545,6.228595
...,...,...,...,...,...
A/swine/Iowa/A02525161/2021,H3N2,EPI1909023,EPI1909024,5.890407,6.078649
A/mink/China/chick_embryo/2020,H9N2,EPI2161544,EPI2161548,5.897720,6.076332
A/swine/Italy/127069/2020,H1N1,EPI2142221,EPI2142147,5.996242,6.060233
A/chicken/China/2096/2021,H9N2,EPI2116845,EPI2116843,5.892288,6.057454


In [204]:
COLDICT={'H1N1':'Red3!20','H3N2':'Blue1!30','H7N9':'Green3!50','H9N2':'DarkOrange!40'}
def rowcolor(row):
    return '\\rowcolor{' + COLDICT[row.subtype]+'}' + row['strain']

highrisknamesdf1 = highrisknamesdf.reset_index(drop=False)
highrisknamesdf1['strain']=highrisknamesdf1.apply(rowcolor,axis=1)
highrisknamesdf1=highrisknamesdf1.set_index('strain')
highrisknamesdf1

Unnamed: 0_level_0,subtype,HA accession,NA accession,predicted IRAT impact,predicted IRAT emergence
strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
\rowcolor{Red3!20}A/swine/Missouri/A02524711/2020,H1N1,EPI1818121,EPI1818122,6.064677,6.294432
\rowcolor{Blue1!30}A/swine/Indiana/A02524710/2020,H3N2,EPI1818137,EPI1818138,6.046863,6.288258
\rowcolor{Green3!50}A/Camel/Inner_Mongolia/XL/2020,H7N9,EPI2026200,EPI2026202,6.067189,6.273290
\rowcolor{Red3!20}A/swine/North_Carolina/A02479173/2020,H1N1,EPI1780425,EPI1780426,6.064405,6.229882
\rowcolor{Blue1!30}A/swine/Kansas/A02479028/2020,H3N2,EPI1777753,EPI1777754,6.000545,6.228595
...,...,...,...,...,...
\rowcolor{Blue1!30}A/swine/Iowa/A02525161/2021,H3N2,EPI1909023,EPI1909024,5.890407,6.078649
\rowcolor{DarkOrange!40}A/mink/China/chick_embryo/2020,H9N2,EPI2161544,EPI2161548,5.897720,6.076332
\rowcolor{Red3!20}A/swine/Italy/127069/2020,H1N1,EPI2142221,EPI2142147,5.996242,6.060233
\rowcolor{DarkOrange!40}A/chicken/China/2096/2021,H9N2,EPI2116845,EPI2116843,5.892288,6.057454


In [208]:
from zedstat.textable import textable
         tabname='../../../tex/Figures/tabdata/highrisk.tex',
         FORMAT='%1.4f',INDEX=True,
         TABFORMAT='L{1.95in}|L{.25in}|L{.60in}|L{.6in}|C{1in}|C{1in}',LNTERM='\\\\\n')
! cat highrisk.tex

\begin{tabular}{L{1.2in}|L{.3in}|L{.3in}|L{.3in}|L{.3in}|L{.3in}}\hline
name&subtype& HA  accession & NA  accession & predicted  IRAT  impact & predicted  IRAT  emergence \\
A/swine/Missouri/A02524711/2020&H1N1&EPI1818121&EPI1818122&6.1&6.3\\
A/swine/Indiana/A02524710/2020&H3N2&EPI1818137&EPI1818138&6.0&6.3\\
 A/Camel/Inner\_Mongolia/XL/2020 &H7N9&EPI2026200&EPI2026202&6.1&6.3\\
 A/swine/North\_Carolina/A02479173/2020 &H1N1&EPI1780425&EPI1780426&6.1&6.2\\
A/swine/Kansas/A02479028/2020&H3N2&EPI1777753&EPI1777754&6.0&6.2\\
A/swine/Minnesota/A02635976/2021&H1N1&EPI1912208&EPI1912209&6.0&6.2\\
A/swine/Italy/56910/2020&H3N2&EPI2142217&EPI2142173&6.0&6.2\\
A/swine/Chile/VN1401-5054/2020&H3N2&EPI1974975&EPI1974978&6.0&6.2\\
A/swine/Minnesota/A02245643/2020&H3N2&EPI1769178&EPI1769179&6.0&6.2\\
A/swine/Tennessee/A02524414/2022&H1N1&EPI2149257&EPI2149258&6.1&6.2\\
A/swine/Iowa/A02479005/2020&H1N1&EPI1777621&EPI1777622&6.0&6.2\\
A/swine/Iowa/A02524878/2020&H3N2&EPI1907866&EPI1907867&

In [166]:
count=0
Subtype={'H1N1':0,'H3N2':0,'H7N9':0,'H9N2':0}
MaxriskStrain={'H1N1':None,'H3N2':None,'H7N9':None,'H9N2':None}
Subtype_strat={6:{'H1N1':0,'H3N2':0,'H7N9':0,'H9N2':0},
               6.1:{'H1N1':0,'H3N2':0,'H7N9':0,'H9N2':0},
               6.2:{'H1N1':0,'H3N2':0,'H7N9':0,'H9N2':0},
               6.25:{'H1N1':0,'H3N2':0,'H7N9':0,'H9N2':0}}
for node in ltree_collapsed:
    Subtype[node.subtype]=Subtype[node.subtype]+1
    if MaxriskStrain[node.subtype] is None:
        MaxriskStrain[node.subtype]=(ALLinfoHA.seq_infos[node.name.split()[0]].name,
                                     ALLinfoHA.seq_infos[node.name.split()[0]].erisk)
    else:
        if MaxriskStrain[node.subtype][1]<ALLinfoHA.seq_infos[node.name.split()[0]].erisk:
            MaxriskStrain[node.subtype]=(ALLinfoHA.seq_infos[node.name.split()[0]].name,
                                         ALLinfoHA.seq_infos[node.name.split()[0]].erisk)
    for r in [6.25,6.2,6.1,6]:
        if r<ALLinfoHA.seq_infos[node.name.split()[0]].erisk:
            #print(node.subtype,ALLinfoHA.seq_infos[node.name.split()[0]].erisk)
            Subtype_strat[r][node.subtype]=Subtype_strat[r][node.subtype]+1
        
    count=count+1
maxriskdf = pd.DataFrame(MaxriskStrain)
Subtype_strat_df = pd.DataFrame(Subtype_strat)
display(maxriskdf)
display(Subtype_strat_df)
print(count)
df[df.id.isin(maxriskdf.iloc[0,:].values)]

Unnamed: 0,H1N1,H3N2,H7N9,H9N2
0,A/swine/Missouri/A02524711/2020,A/swine/Indiana/A02524710/2020,A/Camel/Inner_Mongolia/XL/2020,A/mink/China/chick_embryo/2020
1,6.294432,6.288258,6.27329,6.076332


Unnamed: 0,6.00,6.10,6.20,6.25
H1N1,63,55,3,1
H3N2,17,14,4,1
H7N9,1,1,1,1
H9N2,3,0,0,0


84


Unnamed: 0,id,subtype,ha_accession,na_accession,ha,na,HA_Avg_Qdist,ha_variance,NA_Avg_Qdist,na_variance,Geometric_Mean,predicted_impact_score,predicted_emergence_score
218,A/swine/Missouri/A02524711/2020,H1N1,EPI1818121,EPI1818122,MKAILVVMLYTFTTANADTLCIGYHANNSTDTVDTVLEKNVTVTHS...,MNPNQKIITIGSICMTIGTANLILQIGNIISIWVSHSIQIGNQSQI...,0.007931,9.678707e-06,0.001876,2.43843e-06,0.003857,6.064677,6.294432
1902,A/swine/Indiana/A02524710/2020,H3N2,EPI1818137,EPI1818138,MKTIIALSYILCLVFAQKIPGNDNGTATLCLGHHAVPNGTIVKTIT...,MNPNQKIITIGSVSLTISTICFFMQIAILITTIALHFKQYEFNSPP...,0.004542,4.49795e-07,0.006619,1.048722e-06,0.005483,6.046863,6.288258
5814,A/Camel/Inner_Mongolia/XL/2020,H7N9,EPI2026200,EPI2026202,MNTQILVFALIAIIPTNADKICLGHHAVSNGTKVNTLTEREVEVVN...,MNPNQKILCTSATAITIGAIAVLIGIANLGLNIGLHLKPGCNCSHS...,0.014006,0.002854999,0.001182,2.218687e-07,0.004069,6.067189,6.27329
6049,A/mink/China/chick_embryo/2020,H9N2,EPI2161544,EPI2161548,METVSLITILLAATVSNADKICIGYQSSNSTETVDTLTENNVPVTH...,MNPNQKITAIGSVSLIIAIICLLMQIAILTMTMTLHFGQKECSNPS...,0.016908,0.00012931,0.046149,0.01553612,0.027933,5.89772,6.076332


In [33]:
def write_fasta(seqs, fasta_file, wrap=80):
    """Write sequences to a fasta file.

    Parameters
    ----------
    seqs : dict[seq_id] -> seq
        Sequences indexed by sequence id.
    fasta_file : str
        Path to write the sequences to.
    wrap: int
        Number of AA/NT before the line is wrapped.
    """
    with open(fasta_file, 'w') as f:
        for gid, gseq in seqs.items():
            f.write('>{}\n'.format(gid))
            for i in range(0, len(gseq), wrap):
                f.write('{}\n'.format(gseq[i:i + wrap])) 

for i,name in zip(df[df.id.isin(maxriskdf.iloc[0,:].values)].ha.values,
                 df[df.id.isin(maxriskdf.iloc[0,:].values)].ha_accession.values):
    write_fasta({name:i},name+'.fasta')
    
! ls -lhtr *fasta

-rw-rw-r--. 1 ishanu ishanu 164K Nov 14 12:19 ha.fasta
-rw-rw-r--. 1 ishanu ishanu  569 Nov 15 12:05 EPI2161544.fasta
-rw-rw-r--. 1 ishanu ishanu  569 Nov 15 12:05 EPI2026200.fasta
-rw-rw-r--. 1 ishanu ishanu  569 Nov 15 12:05 EPI1818137.fasta
-rw-rw-r--. 1 ishanu ishanu  569 Nov 15 12:05 EPI1818121.fasta


In [None]:
from Bio import SeqIO
from Bio.Blast import NCBIWWW
my_query = SeqIO.read("EPI1818121.fasta", format="fasta")
result_handle = NCBIWWW.qblast("blastp", "nr", my_query.seq)
blast_result = open("my_blast.xml", "w")
blast_result.write(result_handle.read())
blast_result.close()
result_handle.close()

In [92]:
def parse_blast(resultfile): #takes in the BLAST result, outputs list that can be made into csv
    from Bio.Blast import NCBIXML
    result_handle = open(resultfile)
    blast_records = NCBIXML.parse(result_handle)
    csv_list = []
    
    header = [  'accession','Query',
                'Name', 'Length', 'Score', 'Expect',
                'QueryStart', 'QueryEnd',
                'SubjectStart', 'SubjectEnd','pct'
            ]
    
    #csv_list.append(header)
    count = 0
    for blast_record in blast_records:
        '''help(blast_record.alignments[0].hsps[0])''' # these give help info for the parts 
        '''help(blast_record.alignments[0])        '''
        count +=1
        
        query = blast_record.query
        for alignment in blast_record.alignments:

            name = alignment.title
            length = alignment.length
    
            hsp = alignment.hsps[0] # I don't know if we will ever have more than one, so might as well take the first one.
            score = hsp.score
            expect = hsp.expect
            querystart = hsp.query_start
            queryend = hsp.query_end
            subjectstart = hsp.sbjct_start
            subjectend = hsp.sbjct_end
            pct=hsp.positives/hsp.align_length
            accession=alignment.accession
            row = [accession,query,name,length,score,expect,querystart,queryend,subjectstart,subjectend,pct]
            csv_list.append(row)
            
    result_handle.close()
    return pd.DataFrame(csv_list,columns=header)
 

In [94]:
csv_list=parse_blast('my_blast.xml')
csv_list

Unnamed: 0,accession,Query,Name,Length,Score,Expect,QueryStart,QueryEnd,SubjectStart,SubjectEnd,pct
0,QIA58660,unnamed protein product,gb|QIA58660.1| hemagglutinin [Influenza A viru...,566,2969.0,0.0,1,550,1,550,1.0
1,QKG28282,unnamed protein product,gb|QKG28282.1| hemagglutinin [Influenza A virus],566,2967.0,0.0,1,550,1,550,1.0
2,QIZ32536,unnamed protein product,gb|QIZ32536.1| hemagglutinin [Influenza A virus],566,2967.0,0.0,1,550,1,550,1.0
3,QJI52882,unnamed protein product,gb|QJI52882.1| hemagglutinin [Influenza A virus],566,2967.0,0.0,1,550,1,550,1.0
4,QJT24758,unnamed protein product,gb|QJT24758.1| hemagglutinin [Influenza A virus],566,2967.0,0.0,1,550,1,550,1.0
5,QKV49724,unnamed protein product,gb|QKV49724.1| hemagglutinin [Influenza A virus],566,2967.0,0.0,1,550,1,550,1.0
6,QIQ45861,unnamed protein product,gb|QIQ45861.1| hemagglutinin [Influenza A virus],566,2967.0,0.0,1,550,1,550,1.0
7,QKM75781,unnamed protein product,gb|QKM75781.1| hemagglutinin [Influenza A virus],566,2966.0,0.0,1,550,1,550,1.0
8,QJX57625,unnamed protein product,gb|QJX57625.1| hemagglutinin [Influenza A virus],566,2966.0,0.0,1,550,1,550,1.0
9,QIQ45830,unnamed protein product,gb|QIQ45830.1| hemagglutinin [Influenza A virus],566,2966.0,0.0,1,550,1,550,1.0


In [95]:
from Bio import Entrez
Entrez.email = "ishanu@uchicago.edu"     # Always tell NCBI who you are
handle = Entrez.esearch(db="protein", term="QJT24340")
record = Entrez.read(handle)
id=record["IdList"]
id

['1841676337']

In [96]:
handle = Entrez.efetch(db="protein", rettype="gb", retmode="text", id="1834373412")
#record = SeqIO.read(handle, "genbank")
handle.read()

'LOCUS       QJD23117                 566 aa            linear   VRL 28-APR-2020\nDEFINITION  hemagglutinin [Influenza A virus].\nACCESSION   QJD23117\nVERSION     QJD23117.1\nDBSOURCE    accession MT372532.1\nKEYWORDS    .\nSOURCE      Influenza A virus\n  ORGANISM  Influenza A virus\n            Viruses; Riboviria; Orthornavirae; Negarnaviricota;\n            Polyploviricotina; Insthoviricetes; Articulavirales;\n            Orthomyxoviridae; Alphainfluenzavirus.\nREFERENCE   1  (residues 1 to 566)\n  AUTHORS   Swine Surveillance,A.\n  TITLE     USDA NAHLN Voluntary Swine Influenza Surveillance\n  JOURNAL   Unpublished\nREFERENCE   2  (residues 1 to 566)\n  AUTHORS   Swine Surveillance,A.\n  TITLE     Direct Submission\n  JOURNAL   Submitted (23-APR-2020) DiVL, USDA NAHLN Voluntary Swine Influenza\n            Surveillance, 1920 Dayton Ave., Ames, IA 50010, USA\nCOMMENT     Method: conceptual translation.\nFEATURES             Location/Qualifiers\n     source          1..566\n        

H9N2: minks are probable mixing vessels
https://www.ncbi.nlm.nih.gov/nuccore/ON870694.1
 https://www.tandfonline.com/doi/full/10.1080/22221751.2021.1899058
 Pandemic influenza, typically caused by the reassortment of human and avian influenza viruses, can result in severe or fatal infections in humans. Timely identification of potential pandemic viruses must be a priority in influenza virus surveillance. However, the range of host species responsible for the generation of novel pandemic influenza viruses remains unclear. In this study, we conducted serological surveys for avian and human influenza virus infections in farmed mink and determined the susceptibility of mink to prevailing avian and human virus subtypes. The results showed that farmed mink were commonly infected with human (H3N2 and H1N1/pdm) and avian (H7N9, H5N6, and H9N2) influenza A viruses. Correlational analysis indicated that transmission of human influenza viruses occurred from humans to mink, and that feed source was a probable route of avian influenza virus transmission to farmed mink. Animal experiments showed that mink were susceptible and permissive to circulating avian and human influenza viruses, and that human influenza viruses (H3N2 and H1N1/pdm), but not avian viruses, were capable of aerosol transmission among mink. These results indicate that farmed mink could be highly permissive “mixing vessels” for the reassortment of circulating human and avian influenza viruses. 
 
 H7N9
 https://www.bmj.com/content/347/bmj.f4752?tab=responses
 HH transmission has been suspected
 Asian lineage H7N9 virus is rated by the Influenza Risk Assessment Tool as having the greatest potential to cause a pandemic, as well as potentially posing the greatest risk to severely impact public health if it were to achieve sustained human-to-human transmission.
 
 H3N2 swine
 https://www.cdc.gov/flu/swineflu/spotlights/first-human-infection-2022.htm
 Human infection detected in US this Aug
 
 H1N1
 Submitted (03-NOV-2020) USDA Swine Surveillance, USDA Swine
            Surveillance, 1920 Dayton, Ames, IA 50010, USA
COMMENT     Method: conceptual translation.

