In [1]:
from IPython.display import clear_output

### Preparation
1. Prepare the sequence database in FASTA format and put it in an empty folder.
2. Go to the NCBI BLAST+ FTP server at https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
3. Download and extract ncbi-blast-2.11.0+-x64-linux.tar.gz.
4. Install Biopython, Flask, and Flask-CORS.

##### What is in the sequence database

In [2]:
# Illustrative FASTA records (a ">" header line followed by sequence lines) showing
# what entries in the sequence database look like. The text is only stored in a
# string and is not used by the other cells visible in this notebook.
# NOTE(review): `_` is also IPython's "last output" variable; presumably it is used
# here just to keep the cell from echoing the string — confirm before relying on `_`.
_='''
>sp|P31947|1433S_HUMAN 14-3-3 protein sigma OS=Homo sapiens OX=9606 GN=SFN PE=1 SV=1 
MERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSCEERNLLSVAYKNVVGGQRAAWR
VLSSIEQKSNEEGSEEKGPEVREYREKVETELQGVCDTVLGLLDSHLIKEAGDAESRVFY 
LKMKGDYYRYLAEVATGDDKKRIIDSARSAYQEAMDISKKEMPPTNPIRLGLALNFSVFH 
YEIANSPEEAISLAKTTFDEAMADLHTLSEDSYKDSTLIMQLLRDNLTLWTADNAGEEGG 
EAPQEPQS 
>sp|P63104|1433Z_HUMAN 14-3-3 protein zeta/delta OS=Homo sapiens OX=9606 GN=YWHAZ PE=1 SV=1 
MDKNELVQKAKLAEQAERYDDMAACMKSVTEQGAELSNEERNLLSVAYKNVVGARRSSWR 
VVSSIEQKTEGAEKKQQMAREYREKIETELRDICNDVLSLLEKFLIPNASQAESKVFYLK 
MKGDYYRYLAEVAAGDDKKGIVDQSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFYYE 
ILNSPEKACSLAKTAFDEAIAELDTLSEESYKDSTLIMQLLRDNLTLWTSDTQGDEAEAG 
EGGEN
'''

In [3]:
!mkdir /tmp/myblastdb
!wget http://127.0.0.1/software/pdbcov.fasta -O /tmp/myblastdb/pdbcov.fasta 
!wget http://127.0.0.1/software/ncbi-blast-2.11.0+-x64-linux.tar.gz
!tar zxvf ncbi-blast-2.11.0+-x64-linux.tar.gz -C /tmp
!pip3 install Bio flask flask_cors
clear_output()

### Build the blast database

In [4]:
# Import only the BLAST+ command-line wrappers this notebook uses (makeblastdb here,
# blastp in the test-search cell) instead of a namespace-polluting `import *`.
# API reference: https://biopython.org/docs/1.75/api/Bio.Blast.Applications.html
from Bio.Blast.Applications import NcbiblastpCommandline, NcbimakeblastdbCommandline

# Describe the makeblastdb invocation: build a protein database named "mydbname"
# from the downloaded FASTA file, parsing sequence IDs from the FASTA headers.
makeblastdb = NcbimakeblastdbCommandline(
    cmd='/tmp/ncbi-blast-2.11.0+/bin/makeblastdb',
    input_file='/tmp/myblastdb/pdbcov.fasta',
    out='mydbname',
    parse_seqids=True,
    title='mydbtitle',
    dbtype="prot",
)

# Calling the wrapper runs the program and returns (stdout, stderr). It is run from
# the database folder so the relative `out` name is created inside /tmp/myblastdb.
makeblastdb_stdout, makeblastdb_stderr = makeblastdb(cwd='/tmp/myblastdb')
print(makeblastdb_stdout)



Building a new DB, current time: 04/30/2021 04:04:52
New DB name:   /tmp/myblastdb/mydbname
New DB title:  mydbtitle
Sequence type: Protein
Deleted existing Protein BLAST database named /tmp/myblastdb/mydbname
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 912 sequences in 0.0955598 seconds.





### Run a test BLAST

In [5]:
# Reusable blastp command object; nothing is executed until it is called.
# Option reference: Table C3 @ https://www.ncbi.nlm.nih.gov/books/NBK279684/
blastp = NcbiblastpCommandline(
    cmd='/tmp/ncbi-blast-2.11.0+/bin/blastp',
    db="/tmp/myblastdb/mydbname",  # path prefix of the database to search
    evalue=0.001,
    word_size=3,
    matrix='BLOSUM62',
    threshold=11,
    outfmt=5,  # XML report, which the analysis cell parses with NCBIXML
)

In [6]:
query_seq = '''>Name
ADLPAPDDTGLQAVLHTALSQGAPGAMVRVDDNGTIHQLSEGVADRATGRAITTTDRFRVGSVTKSFSAV
VLLQLVDEGKLDLDASVNTYLPGLLPDDRITVRQVMSHRSGLYDYTNDMFAQTVPGFESVRNKVFSYQDL
ITLSLKHGVTNAPGAAYSYSNTNFVVAGMLIEKLTGHSVATEYQNRIFTPLNLTDTFYVHPDTVIPGTHA
NGYLTPDEAGGALVDSTEQTVSWAQSAGAVISSTQDLDTFFSALMSGQLMSAAQLAQMQQWTTVNSTQGY
GLGLRRRDLSCGISVYGHTGTVQGYYTYAFASKDGKRSVTALANTSNNVNVLNTMARTLESAFCGKPTT
'''
stdout, stderr = blastp(stdin=query_seq) 

### Analysis

In [7]:
import io
from Bio.Blast import NCBIXML

# Parse the XML report captured from blastp into a single BLAST record.
# API reference: https://biopython.org/docs/1.75/api/Bio.Blast.NCBIXML.html
blast_record = NCBIXML.read(io.StringIO(stdout))

# Only HSPs whose e-value is strictly below this cutoff are reported.
E_VALUE_THRESH = 5e-14
for alignment in blast_record.alignments:
    for hsp in alignment.hsps:
        if not hsp.expect < E_VALUE_THRESH:
            continue
        # Show a truncated title and the first 75 columns of the aligned
        # query / match / subject strings.
        print(
            "****Alignment****",
            f"seq title: {alignment.title[:19]}...",
            f"sim-score: {hsp.score}",
            f"e-value: {hsp.expect}",
            f"{hsp.query[0:75]}...",
            f"{hsp.match[0:75]}...",
            f"{hsp.sbjct[0:75]}...",
            "*****************\n\n\n",
            sep="\n",
        )

****Alignment****
seq title: sp|P15555|DAC_STRSR...
sim-score: 1856.0
e-value: 0.0
ADLPAPDDTGLQAVLHTALSQGAPGAMVRVDDNGTIHQLSEGVADRATGRAITTTDRFRVGSVTKSFSAVVLLQL...
ADLPAPDDTGLQAVLHTALSQGAPGAMVRVDDNGTIHQLSEGVADRATGRAITTTDRFRVGSVTKSFSAVVLLQL...
ADLPAPDDTGLQAVLHTALSQGAPGAMVRVDDNGTIHQLSEGVADRATGRAITTTDRFRVGSVTKSFSAVVLLQL...
*****************



****Alignment****
seq title: tr|K4HQE7|K4HQE7_9B...
sim-score: 168.0
e-value: 2.34784e-14
PGAMVRVDDNG-TIHQLSEGVADRATGRAITTTDRFRVGSVTKSFSAVVLLQLVDEGKLDLDASVNTYLPGLL--...
PGA++ +  NG T +  S G+ D  T   +T    FR+ S++K  + V  + LV+EGKL LD  V+ Y+P     ...
PGAVMMIKRNGETAYFSSFGLRDPDTKEPMTAETIFRIYSMSKPITTVAAMMLVEEGKLQLDEPVSKYIPSFANV...
*****************



