In [1]:
import copy
import pandas as pd
import numpy as np
import Levenshtein
from tqdm import tqdm
class SeqInfo(object):
    """Holds information regarding the sequence.
    
    """
    def __init__(self, seq, 
                 protein,
                 accession,
                 id=None,
                 name=None,
                 subtype=None,
                 host=None, 
                 date=None, 
                 erisk=None,
                 irisk=None,
                 risk_flag=None,
                 country=None):
        self.name = name
        self.id = id
        self.protein=protein
        self.subtype=subtype        
        self.seq = seq
        self.accession = accession 
        self.host = host
        self.date = date
        self.erisk = erisk
        self.irisk = irisk
        self.risk_flag = risk_flag
        self.country = country
        
class MultipleSeqInfo(object):
    """Holds information regarding the sequences in the records.
    
    Args:
        records (list): list of records parsed from NCBI
        cov19_accessions (list): of accessions corresponding to cov19
    """
    def __init__(self,
                 dataframe,
                 accessionname,
                 proteinname,
                 risk_threshold=6.2):
        
        self.seq_infos = {}
        self.risk_threshold = risk_threshold
        for i in np.arange(dataframe.index.size):
            record=dataframe.iloc[i,:]
            seqinfo = SeqInfo(
                name=record.id,
                seq=record[proteinname], 
                protein=proteinname,
                accession=record[accessionname],
                subtype=record.subtype,
                erisk=record.predicted_emergence_score,
                irisk=record.predicted_impact_score,
                risk_flag = record.predicted_emergence_score > self.risk_threshold,
                host=None,
                date=None,
                country=None)
            #print(record.predicted_emergence_score > self.risk_threshold)
            self.seq_infos[seqinfo.accession] = seqinfo
            
    
    def compute_L_diatance_matrix(self):
        highriskseq = pd.DataFrame.from_dict({key:val.seq 
                                              for (key,val) in self.seq_infos.items() 
                                              if val.risk_flag},orient='index',columns=['seq'])
        num=highriskseq.index.size
        d=np.zeros([num,num])
        for x in tqdm(np.arange(num*num)):
            j=x//num
            i=x-num*j
            if i > j:
                d[i,j] = Levenshtein.distance(highriskseq.seq.values[i],
                                                  highriskseq.seq.values[j])
        ds=pd.DataFrame(d)        
        ds=(ds+ds.transpose())
        ds.columns=highriskseq.index.values
        self.highriskdistancematrix=ds.copy()
        return ds
    
    
    def accessions_to_subtype(self, accessions):
        """Create a dictionary mapping the accession to the host.
        """
        
        subtypes = []
        for accession in accessions:
            seqinfo = self.seq_infos[accession]
            subtypes.append(seqinfo.subtype)
            
        return subtypes

    def accessions_to_host(self, accessions):
        """Create a dictionary mapping the accession to the host.
        """
        
        hosts = []
        for accession in accessions:
            seqinfo = self.seq_infos[accession]
            hosts.append(seqinfo.host)
        return hosts
    

In [2]:
threshold=6
df=pd.read_csv('./combined_results.csv',index_col=0).reset_index()
#ALLinfoHA=MultipleSeqInfo(df.reset_index(),'ha_accession','ha',risk_threshold=6.054)
ALLinfoHA=MultipleSeqInfo(df.reset_index(),'ha_accession','ha',risk_threshold=threshold)

In [3]:
df[df.predicted_emergence_score>threshold].subtype.value_counts()

H1N1    890
H3N2    812
H9N2     29
H7N9      1
Name: subtype, dtype: int64

In [4]:
ds=ALLinfoHA.compute_L_diatance_matrix()

100%|██████████████████████████████| 2999824/2999824 [00:51<00:00, 58691.18it/s]


In [5]:
#df[df.id.str.contains('chicken')].sort_values('predicted_emergence_score',ascending=False).head()

In [6]:
from Bio.Phylo import TreeConstruction
from Bio import Phylo
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Align import MultipleSeqAlignment
from Bio import Entrez
from Bio import SeqIO


def load_dm(file_, upper_diag=True):
    """Load the distance matrix. 
    
    Also, do some preprocessing. 
    """
    
    df = pd.read_csv(file_)
    #df.set_index('Unnamed: 0', inplace=True)
    #assert np.all(df.columns == df.index)
    
    # drop duplicate columns after reading csv
    #df = df.loc[:, ~df.columns.str.replace("(\.\d+)$", "").duplicated()]
    
    if upper_diag:
        df = df + df.T
    return df

def save_tree(tree, file_name, save_type='xml'):
    """Saved the created phylogenetic tree."""
    
    if save_type == 'pickle':
        graph = Phylo.to_networkx(tree)
        save_pickled(graph, file_name)
    elif save_type == 'xml':
        Phylo.write(tree, file_name, 'phyloxml')
    else:
        raise ValueError('Not a correct save type.')
    
def pandas_dm_to_biopython_dm(dm):
    """Convert the pandas distance matrix to the biopython distance matrix.
    
    Returns:
        biopython distance matrix
    """
    
    accessions = dm.columns
    bio_dm = []
    for i, accession in enumerate(accessions):
        bio_dm.append(list(dm.iloc[i, :i+1].values))
        
    bio_dm = TreeConstruction._DistanceMatrix(
        list(dm.columns), 
        bio_dm)
    
    return bio_dm

def distance_matrix_to_phylo_tree(dm, outfile=None):
    """Create a phylogenetic tree from the distance matrix."""
    
    dm = pandas_dm_to_biopython_dm(dm)
    
    treeConstructor = TreeConstruction.DistanceTreeConstructor()
    tree = treeConstructor.nj(dm)
    
    if outfile is not None:
        save_tree(tree, outfile)

In [7]:
ds.to_csv('dm'+str(threshold)+'.csv',index=None)

In [8]:
from ete3 import Tree, TreeStyle
from ete3 import Phyloxml
from ete3 import AttrFace, faces, Tree, NodeStyle, TreeStyle

def load_pickled(file_name):
    with open(file_name, 'rb') as f:
        return pickle.load(f, encoding='latin')


def get_farthest_node(tree, sequence):
    return (tree&sequence).get_farthest_node()

def get_all_accessions_from_tree(tree):
    return [leaf_node.name for leaf_node in tree.get_leaves()]

def remove_certain_hosts_from_tree(tree, hosts):
    """Remove leaf nodes if the host of that leaf is in `hosts`"""
    
    tree = copy.deepcopy(tree)
    
    removed_accessions = []
    for leaf_node in tree.get_leaves():
        if leaf_node.host in hosts:
            leaf_node.detach()
            
    return tree

def set_midpoint_outgroup(tree):
    tree.set_outgroup(tree.get_midpoint_outgroup())


def load_tree(filename, type_='phyloxml'):
    """Load saved phylogenetic tree.
    """
    
    if type_ == 'phyloxml':
        project = Phyloxml()
        project.build_from_file(filename)

        for tree in project.get_phylogeny():
            break

        t=tree
        
    elif type_ == 'newick':
        t = Tree(filename, format=1)
    else:
        raise ValueError('Not a correct type.')
    
    return t

In [9]:
CONSTRUCT_PHYLO=True
OUTPUT_DIR='./'
PHYLO_TREE_DIR='./'
PHYLO_DIR='./'


In [10]:
if CONSTRUCT_PHYLO:
    ALL_dm_ldistance = load_dm(
        OUTPUT_DIR + 'dm'+str(threshold)+'.csv', 
        upper_diag=False)
    
    distance_matrix_to_phylo_tree(
        ALL_dm_ldistance, PHYLO_TREE_DIR + 'ldistance'+str(threshold)+'.xml')

In [11]:

Phylo.convert(
    PHYLO_DIR + 'ldistance'+str(threshold)+'.xml','phyloxml',
    PHYLO_DIR + 'ldistance'+str(threshold)+'.nhx','newick')

ltree = load_tree(
    PHYLO_DIR + 'ldistance'+str(threshold)+'.nhx',
    type_='newick')

In [46]:
def bandify(val,min=6.27,max=6.295):
    maptoten=int(np.ceil(((val-min)/(max-min))*10))
    return ' '+u'\u2580'*maptoten

def label_nodes(
        tree, 
        recordinfo):
    """Label the nodes of the tree.
    
    We label nodes on whether:
        it is covid19
    """
    
    tree = copy.deepcopy(tree)
    
    for node in tree:
        name = node.name      
        node.subtype = recordinfo.seq_infos[name].subtype
        node.erisk =recordinfo.seq_infos[name].erisk
        node.id = recordinfo.seq_infos[name].name + bandify(recordinfo.seq_infos[name].erisk,min=threshold)
        print(node.name,node.subtype,node.id,node.erisk)
    return tree

In [47]:
labelled_tree=label_nodes(
    ltree, ALLinfoHA)

EPI1817164 H1N1 A/swine/Iowa/A02524646/2020 ▀▀▀▀▀ 6.142233336087257
EPI1910907 H1N1 A/swine/Iowa/A02635719/2021 ▀▀▀▀▀ 6.142233336087257
EPI1910807 H1N1 A/swine/Iowa/A02525345/2021 ▀▀▀▀▀ 6.142233336087257
EPI2161576 H1N1 A/swine/Iowa/A02750621/2022 ▀▀▀▀▀ 6.136576251270402
EPI1775817 H1N1 A/swine/Iowa/A02245587/2020 ▀▀▀▀▀ 6.138714301756836
EPI1911666 H1N1 A/swine/Iowa/A02635892/2021 ▀▀▀▀▀ 6.140638691540048
EPI1911263 H1N1 A/swine/Iowa/A02635823/2021 ▀▀▀▀▀ 6.140834081760937
EPI1911015 H1N1 A/swine/Iowa/A02635781/2021 ▀▀▀▀▀ 6.140638691540048
EPI1909089 H1N1 A/swine/Iowa/A02525216/2021 ▀▀▀▀▀ 6.140638691540048
EPI1769116 H1N1 A/swine/Nebraska/A02479337/2020 ▀▀▀▀▀ 6.142233336087257
EPI1912518 H1N1 A/swine/Minnesota/A02246459/2021 ▀▀▀▀▀ 6.133933214747926
EPI1774141 H1N1 A/swine/Nebraska/A02479186/2020 ▀▀▀▀▀ 6.142197043615236
EPI2147451 H1N1 A/swine/Illinois/A02636452/2022 ▀▀▀▀▀ 6.133313575554519
EPI1932937 H1N1 A/swine/Nebraska/A02636117/2021 ▀▀▀▀▀ 6.142357189518824
EPI1780249 H1N1 A/swine/Iow

In [52]:
def prune_nodes(t):
    # collapsed nodes are labeled, so you locate them and prune them
    for n in t.search_nodes(collapsed=True):
        for ch in n.get_children():
            ch.detach()
            
            
def mean(array):
    return sum(array)/float(len(array))

def cache_distances(tree):
    ''' precalculate distances of all nodes to the root''' 
    node2rootdist = {tree:0}
    for node in tree.iter_descendants('preorder'):
        node2rootdist[node] = node.dist + node2rootdist[node.up]
    return node2rootdist

def closest_node(node, node2tips, root_distance):
    """Find the closest node."""
    
    tips = []
    distances = []
    for tip in node2tips[node]:
        distances.append(root_distance[tip]-root_distance[node])
        tips.append(tip)
        #     index = np.argmin([root_distance[tip]-root_distance[node] for tip in node2tips[node]])
    index = np.argmin(distances)
    return tips[index]

def riskiest_node(node, node2tips):
    """Find the closest node."""
    
    tips = []
    risks = []
    for tip in node2tips[node]:
        risks.append(tip.erisk)
        tips.append(tip)
        #     index = np.argmin([root_distance[tip]-root_distance[node] for tip in node2tips[node]])
    index = np.argmax(risks)
    return tips[index]

def collapse(tree, min_dist,AllrecordInfo):
    # cache the tip content of each node to reduce the number of times the tree is traversed
    
    tree = copy.deepcopy(tree)
    
    node2tips = tree.get_cached_content()
    root_distance = cache_distances(tree)

    for node in tree.get_descendants('preorder'):
        if not node.is_leaf():
            avg_distance_to_tips = mean([root_distance[tip]-root_distance[node]
                                         for tip in node2tips[node]])
            print(avg_distance_to_tips)
            if avg_distance_to_tips < min_dist:
                # do whatever, ete support node annotation, deletion, labeling, etc.
            
                #closest_name = closest_node(node, node2tips, root_distance).name
                closest_name = riskiest_node(node, node2tips).name
                node.subtype = AllrecordInfo.seq_infos[closest_name].subtype
                node.id = AllrecordInfo.seq_infos[closest_name].name + bandify(AllrecordInfo.seq_infos[closest_name].erisk,min=threshold)
                node.name = '%s (%g)' %(closest_name,avg_distance_to_tips)
                
            
                node.add_features(collapsed=True)

                # set drawing attribute so they look collapsed when displayed with tree.show()
                node.img_style['draw_descendants'] = False

    return tree
num_collapsed=18
ltree_collapsed = collapse(
    labelled_tree, 
    min_dist=num_collapsed, 
    AllrecordInfo=ALLinfoHA)

prune_nodes(ltree_collapsed)


# COLBAT='DarkRed'
# COLRAT='SteelBlue'
COLHUMAN='DarkGreen'
COLCOVID='DarkRed'
COLBAT='Red'
COLRAT='Blue'
COLCAMEL='Purple'
COLGAME='Red'
COLCATTLE='#BD890F'
# COLHUMAN='Black'
FS=50
PW=10


def nodeAttribConstruct(color, node):
    N = AttrFace(
        "id", fsize=FS, 
        text_prefix=" ",penwidth=PW,ftype='Arial',
        fgcolor=color,fstyle='bold')
    faces.add_face_to_node(N, node, 1, position="branch-right")
    return N

def layout(node):
    if node.is_leaf():
        if  node.subtype == 'H1N1':
            N = nodeAttribConstruct(COLBAT,node)
        elif node.subtype == 'H3N2':
            N = nodeAttribConstruct(COLRAT,node)
        elif node.subtype == 'H7N9':
            N = nodeAttribConstruct(COLHUMAN,node)
        elif node.subtype == 'H9N2':
            N = nodeAttribConstruct(COLCATTLE,node)
        else:
            N = nodeAttribConstruct(COLGAME,node)
            

170.79521933526004
170.89400199537297
170.99289898726843
171.09191050955405
171.1910367612977
171.29027794202892
171.38963425174006
171.4891058908879
171.5886930603948
171.68839596165012
171.78821479651154
171.8881497673065
171.98820107683346
172.08836892836334
172.18865352564094
172.2890550728862
172.38957377479574
172.490209836544
172.59096346378496
172.69183486265334
172.79223944444436
172.89276152720882
0.0
173.09415902167538
173.19503484759665
173.29602900293247
0.0
0.0
0.0
173.70119309229858
173.80278202941167
173.90449055326656
174.00630887514745
0.5
174.20972613797218
1.5
174.41239311097974
174.51439220909634
2.416176666666667
2.5
174.81800910059127
174.91759201894624
2.736244347826088
2.6357857352941187
2.497950149253733
1.0
2.434866615384616
2.2289991935483875
2.7208657142857144
0.5000000000000001
3.1937419999999994
1.0615674999999998
0.3333966666666666
0.0
1.697782545454545
2.107893181818181
0.0
0.0
1.9889500000000004
1.4955966666666667
1.5
1.4423193750000005
1.4213326666666

In [59]:
            
def render_tree(tree, outfile):# all_seq_data, display_type='nearest_host'):
    """Render the tree inside the file to a circular 
    phylogenetic tree.
    
    NOTE: outfile should be in .pdf for best visuals
    Returns:
    """
    #tree = Tree(nwfile,format=1)

    ts = TreeStyle()
    ns = NodeStyle()
    ts.show_leaf_name = False
    #ts.rotation = 90
    ts.mode = "r"
    #ts.arc_start = -360 # 0 degrees = 3 o'clock
    #ts.arc_span = 360
    ts.scale=5
    ts.show_scale=True
    ts.branch_vertical_margin = .5 # 10 pixels between adjacent branches
    # ts.show_branch_length=True
    #ts.min_leaf_separation=10
    #ts.optimal_scale_level='full'
    #ts.branch_vertical_margin=0
    
    ns.hz_line_width=2
    ns.vt_line_width=1
    #ts.layout_fn = layout
    ns["vt_line_width"] = 16
    ns["hz_line_width"] = 16
    #     ns['fsize'] = 20
    for n in tree.traverse():
        n.set_style(ns)
        
    #all_accessions = all_seq_data['accessions'].values
    for n in tree:
        ts.layout_fn = layout

        
    tree.set_style(ns)
    tree.set_style(ts)
    
    #t.show()
    tree.render(
        outfile, 
        dpi=300, 
        h=500,
        tree_style=ts)

In [60]:
render_tree(
    labelled_tree, './riskyphylo'+str(threshold)+'.pdf')
#    

In [61]:
render_tree(
    ltree_collapsed, './riskyphylo'+str(threshold)+'_collapsed_'+str(num_collapsed)+'.pdf')


In [91]:
count=0
Subtype={'H1N1':0,'H3N2':0,'H7N9':0,'H9N2':0}
MaxriskStrain={'H1N1':None,'H3N2':None,'H7N9':None,'H9N2':None}
Subtype_strat={6:{'H1N1':0,'H3N2':0,'H7N9':0,'H9N2':0},
               6.1:{'H1N1':0,'H3N2':0,'H7N9':0,'H9N2':0},
               6.2:{'H1N1':0,'H3N2':0,'H7N9':0,'H9N2':0},
               6.25:{'H1N1':0,'H3N2':0,'H7N9':0,'H9N2':0}}
for node in ltree_collapsed:
    Subtype[node.subtype]=Subtype[node.subtype]+1
    if MaxriskStrain[node.subtype] is None:
        MaxriskStrain[node.subtype]=(ALLinfoHA.seq_infos[node.name.split()[0]].name,
                                     ALLinfoHA.seq_infos[node.name.split()[0]].erisk)
    else:
        if MaxriskStrain[node.subtype][1]<ALLinfoHA.seq_infos[node.name.split()[0]].erisk:
            MaxriskStrain[node.subtype]=(ALLinfoHA.seq_infos[node.name.split()[0]].name,
                                         ALLinfoHA.seq_infos[node.name.split()[0]].erisk)
    for r in [6.25,6.2,6.1,6]:
        if r<ALLinfoHA.seq_infos[node.name.split()[0]].erisk:
            #print(node.subtype,ALLinfoHA.seq_infos[node.name.split()[0]].erisk)
            Subtype_strat[r][node.subtype]=Subtype_strat[r][node.subtype]+1
        
    count=count+1
print(MaxriskStrain)
pd.DataFrame(Subtype_strat)


{'H1N1': ('A/swine/Missouri/A02524711/2020', 6.294431696236518), 'H3N2': ('A/swine/Indiana/A02524710/2020', 6.2882576212142505), 'H7N9': ('A/Camel/Inner_Mongolia/XL/2020', 6.273290290820161), 'H9N2': ('A/mink/China/chick_embryo/2020', 6.076332437429215)}


Unnamed: 0,6.00,6.10,6.20,6.25
H1N1,63,55,3,1
H3N2,17,14,4,1
H7N9,1,1,1,1
H9N2,3,0,0,0


In [93]:
df[df.id.isin(['A/swine/Missouri/A02524711/2020','A/swine/Indiana/A02524710/2020','A/Camel/Inner_Mongolia/XL/2020','A/mink/China/chick_embryo/2020'])]

Unnamed: 0,id,subtype,ha_accession,na_accession,ha,na,HA_Avg_Qdist,ha_variance,NA_Avg_Qdist,na_variance,Geometric_Mean,predicted_impact_score,predicted_emergence_score
218,A/swine/Missouri/A02524711/2020,H1N1,EPI1818121,EPI1818122,MKAILVVMLYTFTTANADTLCIGYHANNSTDTVDTVLEKNVTVTHS...,MNPNQKIITIGSICMTIGTANLILQIGNIISIWVSHSIQIGNQSQI...,0.007931,9.678707e-06,0.001876,2.43843e-06,0.003857,6.064677,6.294432
1902,A/swine/Indiana/A02524710/2020,H3N2,EPI1818137,EPI1818138,MKTIIALSYILCLVFAQKIPGNDNGTATLCLGHHAVPNGTIVKTIT...,MNPNQKIITIGSVSLTISTICFFMQIAILITTIALHFKQYEFNSPP...,0.004542,4.49795e-07,0.006619,1.048722e-06,0.005483,6.046863,6.288258
5814,A/Camel/Inner_Mongolia/XL/2020,H7N9,EPI2026200,EPI2026202,MNTQILVFALIAIIPTNADKICLGHHAVSNGTKVNTLTEREVEVVN...,MNPNQKILCTSATAITIGAIAVLIGIANLGLNIGLHLKPGCNCSHS...,0.014006,0.002854999,0.001182,2.218687e-07,0.004069,6.067189,6.27329
6049,A/mink/China/chick_embryo/2020,H9N2,EPI2161544,EPI2161548,METVSLITILLAATVSNADKICIGYQSSNSTETVDTLTENNVPVTH...,MNPNQKITAIGSVSLIIAIICLLMQIAILTMTMTLHFGQKECSNPS...,0.016908,0.00012931,0.046149,0.01553612,0.027933,5.89772,6.076332


H9N2: minks are probable mixing vessels
https://www.ncbi.nlm.nih.gov/nuccore/ON870694.1
 https://www.tandfonline.com/doi/full/10.1080/22221751.2021.1899058
 Pandemic influenza, typically caused by the reassortment of human and avian influenza viruses, can result in severe or fatal infections in humans. Timely identification of potential pandemic viruses must be a priority in influenza virus surveillance. However, the range of host species responsible for the generation of novel pandemic influenza viruses remains unclear. In this study, we conducted serological surveys for avian and human influenza virus infections in farmed mink and determined the susceptibility of mink to prevailing avian and human virus subtypes. The results showed that farmed mink were commonly infected with human (H3N2 and H1N1/pdm) and avian (H7N9, H5N6, and H9N2) influenza A viruses. Correlational analysis indicated that transmission of human influenza viruses occurred from humans to mink, and that feed source was a probable route of avian influenza virus transmission to farmed mink. Animal experiments showed that mink were susceptible and permissive to circulating avian and human influenza viruses, and that human influenza viruses (H3N2 and H1N1/pdm), but not avian viruses, were capable of aerosol transmission among mink. These results indicate that farmed mink could be highly permissive “mixing vessels” for the reassortment of circulating human and avian influenza viruses. 
 
 H7N9
 https://www.bmj.com/content/347/bmj.f4752?tab=responses
 HH transmission has been suspected
 Asian lineage H7N9 virus is rated by the Influenza Risk Assessment Tool as having the greatest potential to cause a pandemic, as well as potentially posing the greatest risk to severely impact public health if it were to achieve sustained human-to-human transmission.
 
 H3N2 swine
 https://www.cdc.gov/flu/swineflu/spotlights/first-human-infection-2022.htm
 Human infection detected in US this Aug
 
 H1N1
 Submitted (03-NOV-2020) USDA Swine Surveillance, USDA Swine
            Surveillance, 1920 Dayton, Ames, IA 50010, USA
COMMENT     Method: conceptual translation.

