In [2]:
from Bio import AlignIO
from Bio import SeqIO
from Bio.Align import MultipleSeqAlignment
from Bio import Seq
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio import Phylo
import matplotlib
import matplotlib.pyplot as plt
import os
import re

In [3]:
# Without running an MSA: read a FASTA file and right-pad the sequences with a
# filler character so that they all have the same length.
def read_fasta_as_alignment(filepath, verbose=True):
    """Read a FASTA file and return it as an alignment of equal-length sequences.

    Sequences shorter than the longest one are right-padded with '.'.
    If a record description contains a ``[label=NAME]`` tag (NAME made of word
    characters and hyphens), the record id is replaced by NAME.

    The padded records are written to ``./padded-<basename of filepath>.fasta``
    in the current working directory and the alignment is read back from
    that file.

    Args:
        filepath: path of the FASTA file to read.
        verbose: when true (the default), print the labels found and the
            description of every record.

    Returns:
        A ``MultipleSeqAlignment`` built from the padded records.

    Raises:
        ValueError: if the file contains no FASTA records.
    """
    records = list(SeqIO.parse(filepath, 'fasta'))
    if not records:
        # max() over an empty sequence would fail with an unhelpful message.
        raise ValueError('no FASTA records found in {}'.format(filepath))

    maxlen = max(len(record.seq) for record in records)

    # A single character class matches the same labels as a nested
    # "(\w+|-+)+" group but cannot backtrack exponentially.
    label_pattern = re.compile(r"\[label=([\w-]+)\]")

    for record in records:
        description = record.description
        labels = label_pattern.findall(description)
        if labels:
            record.id = labels[0]
        if verbose:
            print(labels)
            print(description)
        # Pad sequences so that they all have the same length.
        if len(record.seq) != maxlen:
            record.seq = Seq.Seq(str(record.seq).ljust(maxlen, '.'))

    output_file = './padded-{}.fasta'.format(os.path.basename(filepath))
    with open(output_file, 'w') as f:
        SeqIO.write(records, f, 'fasta')

    alignment = AlignIO.read(output_file, 'fasta')
    return MultipleSeqAlignment(alignment)

In [4]:
# Input is an alignment file.
# `format` is an AlignIO format name such as "clustal" or "fasta";
# read more at https://biopython.org/docs/1.76/api/Bio.AlignIO.html
def read_alignment(filepath, format):
    """Read the alignment stored in ``filepath``.

    Args:
        filepath: path of the alignment file.
        format: format name passed through to ``AlignIO.read``.

    Returns:
        A ``MultipleSeqAlignment`` built from the alignment that was read.
    """
    parsed = AlignIO.read(filepath, format)
    return MultipleSeqAlignment(parsed)

In [5]:
# Compute the distance matrix of an alignment.
def calculate_distance_matrix(aln):
    """Return the distance matrix of ``aln`` using the 'identity' model.

    Args:
        aln: the alignment handed to ``DistanceCalculator.get_distance``.
    """
    return DistanceCalculator('identity').get_distance(aln)

In [6]:
# Build a phylogenetic tree from a distance matrix.
# `method` is either "upgma" or "nj".
# Read more at https://homolog.us/Biopython/Bio.Phylo.TreeConstruction.DistanceTreeConstructor.html
def construct_tree(dm, method):
    """Build a tree from the distance matrix ``dm``.

    Args:
        dm: distance matrix passed to the tree constructor.
        method: "upgma" or "nj"; matched case-insensitively, ignoring
            surrounding whitespace.

    Returns:
        The tree returned by ``DistanceTreeConstructor.upgma`` or
        ``DistanceTreeConstructor.nj``.

    Raises:
        ValueError: if ``method`` is not one of the supported names. An
            unrecognised name is rejected rather than silently building a
            neighbor-joining tree.
    """
    normalized = method.strip().lower() if isinstance(method, str) else None
    if normalized not in ("upgma", "nj"):
        raise ValueError(
            'unknown tree construction method {!r}; expected "upgma" or "nj"'.format(method)
        )

    constructor = DistanceTreeConstructor()
    if normalized == "upgma":
        return constructor.upgma(dm)
    return constructor.nj(dm)

In [7]:
# Label helpers for tree clades: clades whose name contains "Inner" are
# labelled with their branch length; other clades keep their name.
import re
import json
def label_func(clade):
    """Return the label for ``clade``.

    A clade whose name contains "Inner" is labelled with its branch length
    formatted to four decimal places; any other clade is labelled with its
    own name.
    """
    if "Inner" in clade.name:
        return '{0:.4f}'.format(clade.branch_length)
    return clade.name

def label_func_2(clade):
    """Return the label for ``clade``, collapsing non-positive lengths.

    A clade whose name contains "Inner" is labelled with its branch length
    to four decimal places when that length is positive, and with '0.0'
    otherwise. Any other clade is labelled with its own name.
    """
    if "Inner" not in clade.name:
        return clade.name
    length = clade.branch_length
    return '{0:.4f}'.format(length) if length > 0.00 else '0.0'

def label_func_3(clade):
    """Return a "<branch length> - <name>" label for ``clade``.

    The branch length (four decimal places) is included only when it is
    positive. The name is included only when it does not start with "Inner".
    The parts that are present are joined with " - ", so a clade with only
    one part gets no dangling separator, and a clade with neither gets ''.
    A branch length or name of ``None`` is treated as absent.
    """
    parts = []

    length = clade.branch_length
    if length is not None and length > 0:
        parts.append('{0:.4f}'.format(length))

    name = clade.name
    if name and not name.startswith("Inner"):
        parts.append(name)

    return ' - '.join(parts)

# Example clade name: lcl|OR184928.1_cds_WJJ80277.1_1
def label_func_by_jsonref(clade, ref):
    """Return a label for ``clade`` using names looked up in ``ref``.

    The label is built from up to two parts joined with " - ":

    * the branch length to four decimal places, when it is positive;
    * when the clade name matches ``lcl|<id>_cds...``, the value ``ref[<id>]``
      converted to ``str``, or the clade name itself if the lookup fails.

    Clades whose name does not match the pattern contribute no name part.
    A branch length or name of ``None`` is treated as absent.

    Args:
        clade: object exposing ``name`` and ``branch_length``.
        ref: mapping from the extracted id to the text to display.
    """
    labels = []

    branch_length = clade.branch_length
    if branch_length is not None and branch_length > 0:
        labels.append('{0:.4f}'.format(branch_length))

    name = clade.name
    match = re.match(r"^lcl\|([\w\.]+)_cds", name) if name else None
    if match:
        try:
            labels.append(str(ref[match.group(1)]))
        except (LookupError, TypeError):
            # Missing id, or `ref` is not subscriptable by this key:
            # fall back to the raw clade name.
            labels.append(name)

    return " - ".join(labels)