In [94]:
%matplotlib inline

import shutil
import os
from joblib import dump, load
import urllib3
import certifi
from Bio import SeqIO
import Bio
from glob import glob
import json
from IPython import display
import pandas as pd
import numpy as np
import networkx as nx
from collections import OrderedDict

import matplotlib.pyplot as plt
import seaborn as sns
import re

from Bio import Align

In [72]:
def sort_lst(lst):
    '''
    Sort a collection of numeric strings by their integer value.

    Every item is parsed with int(), the integers are ordered ascending and
    each one is converted back with str(), so the result holds canonical
    decimal strings (e.g. '007' comes back as '7').

    return list of str
    '''
    as_numbers = list(map(int, lst))
    as_numbers.sort()
    return list(map(str, as_numbers))

In [74]:
def read_fasta(path):
    '''
    Read a FASTA file of haplotype sequences.

    Records are delimited by their '>' header lines (not by line parity), so
    blank lines are skipped and a sequence wrapped over several lines is
    concatenated into one string.

    Parameters:
        path: path of the FASTA file to read

    Returns (lst_dict, days):
        lst_dict: one dict per record, in file order, with keys
            'num'  - 1-based position of the record in the file
            'desc' - header text with '>' removed and trailing whitespace
                     stripped, or exactly 'reference' when the header
                     contains the word 'reference'
            'seq'  - the sequence as a single string
        days: unique day values found in the non-reference headers (the
            digits of the first '_<digits>_' field), ordered numerically by
            sort_lst and kept as str

    Raises:
        ValueError: if sequence data appears before the first header, or a
            non-reference header has no '_<digits>_' field
    '''
    day_patt = re.compile(r'_[\d]*_')  # for days
    lst_dict = []
    days = set()
    with open(path) as fasta:
        for line in fasta:
            if not line.strip():
                # blank lines (typically a trailing one) carry no data
                continue
            if line.startswith('>'):
                record = {'num': len(lst_dict) + 1}
                if 'reference' in line:
                    record['desc'] = 'reference'
                else:
                    record['desc'] = line.replace('>', '').rstrip()
                    found = day_patt.search(record['desc'])
                    # the pattern also matches '__', which carries no digits
                    day = found.group(0).replace('_', '') if found else ''
                    if not day:
                        raise ValueError(
                            'no _<day>_ field in FASTA header: %r' % record['desc'])
                    days.add(day)
                record['seq'] = ''
                lst_dict.append(record)
            elif lst_dict:
                # continuation of the current record's (possibly wrapped) sequence
                lst_dict[-1]['seq'] += line.rstrip()
            else:
                raise ValueError(
                    'sequence data found before the first FASTA header in %r' % (path,))
    return lst_dict, sort_lst(list(days))

# Parse the FASTA file into the list of record dicts and the sorted list of days.
# The path is relative, so it is resolved against the current working directory.
haplo_seq_dict, days = read_fasta('data/fasta/hivevo_p4_V3.fasta')

In [82]:
def preparing_data(haplo_seq_dict, days):
    '''
    Index parsed FASTA records by name and by sampling day.

    Parameters:
        haplo_seq_dict: iterable of record dicts that have 'desc' and 'seq'
            keys (the first value returned by read_fasta)
        days: iterable of day strings; each one becomes a key of the
            per-day dict

    Returns (seq_dict, seq_name_days_dict):
        seq_dict: {desc: seq} for every record
        seq_name_days_dict: {day: [desc, ...]} in input order; a record whose
            desc has no '_<digits>_' field (e.g. 'reference') appears in
            seq_dict only

    Raises:
        KeyError: if the day found in a record's desc is not in days
    '''
    day_patt = re.compile(r'_[\d]*_')
    seq_dict = {}
    seq_name_days_dict = {day: [] for day in days}
    for obj in haplo_seq_dict:
        desc = obj['desc']
        seq_dict[desc] = obj['seq']
        found = day_patt.search(desc)  # searched once and reused below
        if found is not None:
            seq_name_days_dict[found.group(0).replace('_', '')].append(desc)
    return seq_dict, seq_name_days_dict

In [83]:
# Build the name -> sequence lookup and the day -> [names] lookup from the parsed records.
seq_dict, seq_name_days_dict = preparing_data(haplo_seq_dict, days)

In [193]:
class Node:
    '''
    One haplotype in the tree.

    Attributes:
        parent: parent Node, or None when not attached yet
        kids:   list of child Nodes
        name:   haplotype name
        level:  key of the level the node belongs to ('root' for the root)
        seq:    sequence handed to the aligner in find_parent
    '''
    def __init__(self, parent=None, kids=None, name=None, seq = None, level = None):
        self.parent = parent #node type
        # A fresh list per node: a literal [] default would be one list object
        # shared by every node created without an explicit kids argument.
        self.kids = [] if kids is None else kids
        self.name = name
        self.level = level
        self.seq = seq

    def __repr__(self):
        # formatting (instead of str concatenation) keeps repr from raising
        # when name is not a str, e.g. None
        return 'Node for haplo: {}'.format(self.name)

    def find_parent(self, levels_lst, levels_nodes):
        '''
        Finding parent from lower level for node

        Every node of the level that precedes self.level in levels_lst is
        scored against self.seq with a default-constructed
        Align.PairwiseAligner; the first node with the highest score becomes
        self.parent and self is appended to that node's kids.

        Parameters:
            levels_lst: ordered list of level keys
            levels_nodes: dict mapping each level key to its list of nodes

        Raises:
            ValueError: if self.level is not in levels_lst, is the first
                level (there is no lower level), or the lower level holds no
                nodes
        '''
        position = levels_lst.index(self.level)
        if position == 0:
            # position - 1 would be -1 and silently select the LAST level
            raise ValueError('level %r has no lower level to search' % (self.level,))
        low_level_nodes = levels_nodes[levels_lst[position - 1]]

        aligner = Align.PairwiseAligner()
        best = None
        score_max = -float('inf')
        for node in low_level_nodes:
            score = aligner.score(self.seq, node.seq)
            if score > score_max:
                best = node
                score_max = score
        if best is None:
            raise ValueError('no candidate parent for %r on the lower level' % (self.name,))
        self.parent = best
        best.kids.append(self)

    def _make_path(self):
        '''
        Making path from final level to reference

        Returns the node names from this node up to the root, joined by
        single spaces.
        '''
        return ' '.join(self.path())

    def path(self):
        '''
        List of node names from this node up to the root (inclusive).

        Built by walking the parent links, so a name that contains
        whitespace stays one item.
        '''
        names = []
        node = self
        while node.level != 'root':
            names.append(node.name)
            node = node.parent
        names.append(node.name)
        return names

In [194]:
class Tree:
    '''
    Level-by-level tree of haplotypes rooted at the 'reference' sequence.

    Attributes:
        root:         root Node (set by constructing_tree)
        levels_days:  {level key: day}, filled by construct_levels
        levels_lst:   level keys in order, filled by construct_levels
        levels_nodes: {level key: [Node, ...]} plus a 'root' entry
    '''
    def __init__(self, root = None):
        self.root = root
        self.levels_nodes = None
        self.levels_lst = None
        self.levels_days = None

    def construct_levels(self, days):
        '''
        Fixing construction of our tree

        Creates one level per day, named 'level_0', 'level_1', ... in the
        order of days, with an empty node list for each and an empty 'root'
        slot. Must be called before constructing_tree.
        '''
        self.levels_days = {'level_' + str(i): day for i, day in enumerate(days)}
        self.levels_lst = list(self.levels_days.keys())
        self.levels_nodes = {x: [] for x in self.levels_lst}
        self.levels_nodes['root'] = None

    def constructing_tree(self, seq_dict, seq_name_days_dict):
        '''
        Constructing tree using prepared data

        Nodes of the first level become kids of the root; nodes of every
        later level pick their parent with Node.find_parent.

        Parameters:
            seq_dict: {name: sequence}; must contain 'reference'
            seq_name_days_dict: {day: [names]} for every day given to
                construct_levels

        Raises:
            KeyError: if 'reference', a day or a haplotype name is missing
                from the given dicts
        '''
        root_node = Node(name = 'reference', seq = seq_dict['reference'], level = 'root', kids=[])
        self.root = root_node
        self.levels_nodes['root'] = root_node
        # start from empty levels so that a second call does not duplicate nodes
        for level in self.levels_lst:
            self.levels_nodes[level] = []
        for position, level in enumerate(self.levels_lst):
            on_first_level = position == 0
            for haplotype in seq_name_days_dict[self.levels_days[level]]:
                # kids=[] is passed explicitly so nodes never share a list
                node = Node(name=haplotype, seq=seq_dict[haplotype], level=level,
                            parent=root_node if on_first_level else None, kids=[])
                if on_first_level:
                    root_node.kids.append(node)
                else:
                    node.find_parent(self.levels_lst, self.levels_nodes)
                self.levels_nodes[level].append(node)

    def create_all_path(self):
        '''
        Making all paths from final nodes to reference

        Returns one path (see Node.path) per node of the last level, or an
        empty list when the tree has no levels.
        '''
        if not self.levels_lst:
            return []
        return [node.path() for node in self.levels_nodes[self.levels_lst[-1]]]
                    
# Build the tree: construct_levels must run first because constructing_tree
# fills the level structures that construct_levels creates.
phylo_tree = Tree()
phylo_tree.construct_levels(days)
phylo_tree.constructing_tree(seq_dict, seq_name_days_dict)

In [196]:
#phylo_tree.levels_nodes