# Análise de homologias por BLAST

In [1]:
# importação de módulos e bibliotecas
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 

In [2]:
def blast(accession : str, program : str, database : str, filename : str):
    """
    função que executa um blast de uma sequência do NCBI e guarda o resultado
    recebe o identificador da seq no NCBI, o tipo de programa de blast, a base de dados usada 
    e o nome do ficheiro que queremos guardar a informação
    """
    result_handle = NCBIWWW.qblast(program, database, accession, )

    save_file = open(filename, "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()

    result_handle = open(filename)

    blast_record = NCBIXML.read(result_handle)

    return blast_record

In [9]:
def blast_filter(blast_record, e_value_threshold : float, coverage_threshold : float, per_identity_threshold : float):
    filtered_alignments = []
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            # coverage = hsp.align_length / blast_record.query_length
            e_value = hsp.expect
            coverage = (hsp.query_end - hsp.query_start + 1) / blast_record.query_length * 100
            per_identity = (hsp.identities / hsp.align_length) * 100
            if e_value <= e_value_threshold and coverage >= coverage_threshold and per_identity >= per_identity_threshold:
                print('\n****Alignment****')
                print('acession:', alignment.accession)
                print('title:', alignment.title)
                print('alignment length:', alignment.length)
                print('e value:', hsp.expect)
                #print('hsp length:', hsp.align_length)
                #print('hsps:', len(alignment.hsps))
                filtered_alignments.append(alignment)
    return filtered_alignments

## Gene FLG

In [5]:
flg_blast_record = blast("60097902", "blastp", "swissprot", "flg_protein_blast")

In [7]:
print(len(flg_blast_record.alignments), "hits")

22 hits


In [13]:
flg_filtered_blast = blast_filter(flg_blast_record, 0.05, 20, 0)
flg_filtered_blast


****Alignment****
acession: P20930
title: sp|P20930.3| RecName: Full=Filaggrin [Homo sapiens]
alignment length: 4061
e value: 0.0

****Alignment****
acession: Q5D862
title: sp|Q5D862.1| RecName: Full=Filaggrin-2; Short=FLG-2; AltName: Full=Intermediate filament-associated and psoriasis-susceptibility protein; Short=Ifapsoriasin [Homo sapiens]
alignment length: 2391
e value: 7.59274e-38

****Alignment****
acession: Q5D862
title: sp|Q5D862.1| RecName: Full=Filaggrin-2; Short=FLG-2; AltName: Full=Intermediate filament-associated and psoriasis-susceptibility protein; Short=Ifapsoriasin [Homo sapiens]
alignment length: 2391
e value: 3.08341e-29

****Alignment****
acession: Q5D862
title: sp|Q5D862.1| RecName: Full=Filaggrin-2; Short=FLG-2; AltName: Full=Intermediate filament-associated and psoriasis-susceptibility protein; Short=Ifapsoriasin [Homo sapiens]
alignment length: 2391
e value: 2.61726e-15


[<Bio.Blast.Record.Alignment at 0x2af20e3bc10>,
 <Bio.Blast.Record.Alignment at 0x2af20e5e0d0>,
 <Bio.Blast.Record.Alignment at 0x2af20e5e0d0>,
 <Bio.Blast.Record.Alignment at 0x2af20e5e0d0>]

In [15]:
if len(flg_filtered_blast) == 1:
    print(len(flg_filtered_blast), "filtered sequence")
elif len(flg_filtered_blast) >= 1:
    print(len(flg_filtered_blast), "filtered sequences")

4 filtered sequences


**Não esquecer de ver a abrangencia taxonomica e os dominios conservados das seqs homologas e comparar com a query**