In [6]:
from Bio.Blast import NCBIWWW
from Bio import SeqIO
from Bio.Blast import NCBIXML
from Bio import Entrez

# Contact e-mail registered with both Biopython NCBI client modules used in
# this notebook; defined once so the two settings cannot drift apart.
CONTACT_EMAIL = 'wojciech1.batko@student.uj.edu.pl'
NCBIWWW.email = CONTACT_EMAIL
Entrez.email = CONTACT_EMAIL

In [7]:
# Query for Task 1: a raw nucleotide sequence kept as a multi-line string
# (the literal starts and ends with a newline). It is submitted unchanged to
# NCBIWWW.qblast with the 'blastn' program in the Task 1 cell.
task1_sequence =  """
GTACCTTGATTTCGTATTCTGAGAGGCTGCTGCTTAGCGGTAGCCCCTTGGTTTCCGTGGCAACGGAAAA
GCGCGGGAATTACAGATAAATTAAAACTGCGACTGCGCGGCGTGAGCTCGCTGAGACTTCCTGGACGGGG
GACAGGCTGTGGGGTTTCTCAGATAACTGGGCCCCTGCGCTCAGGAGGCCTTCACCCTCTGCTCTGGGTA
AAGTTCATTGGAACAGAAAGAAATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAAT
GCTATGCAGAAAATCTTAGAGTGTCCCATCTGTCTGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTG
ACCACATATTTTGCAAATTTTGCATGCTGAAACTTCTCAACCAGAAGAAAGGGCCTTCACAGTGTCCTTT
ATGTAAGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGATTTAGTCAACTTGTTGAAGAGCTA
TTGAAAATCATTTGTGCTTTTCAGCTTGACACAGGTTTGGAGTATGCAAACAGCTATAATTTTGCAAAAA
AGGAAAATAACTCTCCTGAACATCTAAAAGATGAAGTTTCTATCATCCAAAGTATGGGCTACAGAAACCG
TGCCAAAAGACTTCTACAGAGTGAACCCGAAAATCCTTCCTTGCAGGAAACCAGTCTCAGTGTCCAACTC
TCTAACCTTGGAACTGTGAGAACTCTGAGGACAAAGCAGCGGATACAACCTCAAAAGACGTCTGTCTACA
TTGAATTGGGATCTGATTCTTCTGAAGATACCGTTAATAAGGCAACTTATTGCAGTGTGGGAGATCAAGA
ATTGTTACAAATCACCCCTCAAGGAACCAGGGATGAAATCAGTTTGGATTCTGCAAAAAAGGCTGCTTGT
GAATTTTCTGAGACGGATGTAACAAATACTGAACATCATCAACCCAGTAATAATGATTTGAACACCACTG
AGAAGCGTGCAGCTGAGAGGCATCCAGAAAAGTATCAGGGTAGTTCTGTTTCAAACTTGCATGTGGAGCC
ATGTGGCACAAATACTCATGCCAGCTCATTACAGCATGAGAACAGCAGTTTATTACTCACTAAAGACAGA
ATGAATGTAGAAAAGGCTGAATTCTGTAATAAAAGCAAACAGCCTGGCTTAGCAAGGAGCCAACATAACA
GATGGGCTGGAAGTAAGGAAACATGTAATGATAGGCGGACTCCCAGCACAGAAAAAAAGGTAGATCTGAA
TGCTGATCCCCTGTGTGAGAGAAAAGAATGGAATAAGCAGAAACTGCCATGCTCAGAGAATCCTAGAGAT
ACTGAAGATGTTCCTTGGATAACACTAAATAGCAGCATTCAGAAAGTTAATGAGTGGTTTTCCAGAAGTG
ATGAACTGTTAGGTTCTGATGACTCACATGATGGGGAGTCTGAATCAAATGCCAAAGTAGCTGATGTATT
GGACGTTCTAAATGAGGTAGATGAATATTCTGGTTCTTCAGAGAAAATAGACTTACTGGCCAGTGATCCT
CATGAGGCTTTAATATGTAAAAGTGAAAGAGTTCACTCCAAATCAGTAGAGAGTAATATTGAAGACAAAA
TATTTGGGAAAACCTATCGGAAGAAGGCAAGCCTCCCCAACTTAAGCCATGTAACTGAAAATCTAATTAT
AGGAGCATTTGTTACTGAGCCACAGATAATACAAGAGCGTCCCCTC
"""

In [8]:
# Query for Task 2: a raw amino-acid sequence kept as a multi-line string
# (the literal starts and ends with a newline). It is submitted unchanged to
# NCBIWWW.qblast with the 'blastp' program in the Task 2 cell.
task2_sequence = """
MKSILDGLADTTFRTITTDLLGSPFQEKMTAGDNPQLVPADQVNITEFYNKSLSSFKENEENIQCGENFM
DIECFMVLNPSQQLAIAVLSLTLGTFTVLENLLVLCVILHSRSLRCRPSYHFIGSLAVADLLGSVIFVYS
FIDFHVFHRKDSRNVFLFKLGGVTASFTASVGSLFLTAIDRYISIHRPLAYKRIVTRPKAVVAFCLMWTI
AIVIAVLPLLGWNCEKLQSVCSDIFPHIDETYLMFWIGVTSVLLLFIVYAYMYILWKAHSHAVRMIQRGT
QKSIIIHTSEDGKVQVTRPDQARMDIRLAKTLVLILVVLIICWGPLLAIMVYDVFGKMNKLIKTVFAFCS
MLCLLNSTVNPIIYALRSKDLRHAFRSMFPSCEGTAQPLDNSMGDSDCLHKHANNAASVHRAAESCIKSTVKIAKVTMSVSTDTSAEAL
"""

In [9]:
# Accession used in Task 3; it is passed as `id` to Entrez.efetch on the
# "nucleotide" database to download the corresponding GenBank record.
task3_sequence_id = "NM_000539.3"

# TASK 1

In [3]:
# The default database (nr/nt) contains traditional GenBank and RefSeq RNA sequences
# https://www.nlm.nih.gov/ncbi/workshops/2023-08_BLAST_evol/databases.html
#
# Submit the Task 1 nucleotide query to NCBI BLAST (blastn against `nt`).
# The handle is managed by `with` so it is closed even if XML parsing fails.
with NCBIWWW.qblast('blastn', 'nt', task1_sequence) as result_handle:
    # One query was submitted, so exactly one record is expected: `read`
    # returns that single record (the plural name is kept for later cells).
    blast_records = NCBIXML.read(result_handle)

In [4]:
# Report every HSP of every hit whose e-value is below 0.01, together with the
# first 75 columns of the query / match / subject rows of the alignment.
for alignment in blast_records.alignments:
    significant_hsps = (hsp for hsp in alignment.hsps if hsp.expect < 0.01)
    for hsp in significant_hsps:
        print("****Alignment****")
        print(f"sequence: {alignment.title}")
        print(f"length: {alignment.length}")
        print(f"e value: {hsp.expect}")
        # The three rows are printed in the same order as a BLAST text report.
        for row in (hsp.query, hsp.match, hsp.sbjct):
            print(row[0:75] + "...")

****Alignment****
sequence: gi|2697746793|ref|XM_024350087.3| PREDICTED: Pan troglodytes BRCA1 DNA repair associated (BRCA1), transcript variant X11, mRNA
length: 7194
e value: 0.0
GTACCTTGATTTCGTATTCTGAGAGGCTGCTGCTTAGCGGTAGCCCCTTGGTTTCCGTGGCAACGGAAAAGCGCG...
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
GTACCTTGATTTCGTATTCTGAGAGGCTGCTGCTTAGCGGTAGCCCCTTGGTTTCCGTGGCAACGGAAAAGCGCG...
****Alignment****
sequence: gi|2697746783|ref|XM_009432080.5| PREDICTED: Pan troglodytes BRCA1 DNA repair associated (BRCA1), transcript variant X1, mRNA
length: 7200
e value: 0.0
GTACCTTGATTTCGTATTCTGAGAGGCTGCTGCTTAGCGGTAGCCCCTTGGTTTCCGTGGCAACGGAAAAGCGCG...
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
GTACCTTGATTTCGTATTCTGAGAGGCTGCTGCTTAGCGGTAGCCCCTTGGTTTCCGTGGCAACGGAAAAGCGCG...
****Alignment****
sequence: gi|2697746789|ref|XM_054669711.2| PREDICTED: Pan troglodytes BRCA1 DNA repair associated (BRCA1), transcript variant X7, mRNA
length: 719

Z otrzymanych sekwencji o e-value < 0.01 (w tym przypadku wszystkie e-value == 0.0) widzimy, że sekwencja ta pochodzi z genu BRCA1 u organizmów takich jak: goryl nizinny, szympans zwyczajny, orangutan borneański.

From the returned alignments (all with an e-value of 0.0) we can see that the sequence comes from the BRCA1 gene; the matching organisms include the gorilla, the Bornean orangutan and the chimpanzee.

# TASK 2

In [10]:
# Submit the Task 2 protein query to NCBI BLAST (blastp against `nr`).
# The handle is managed by `with` so it is closed even if XML parsing fails.
with NCBIWWW.qblast('blastp', 'nr', task2_sequence) as result_handle:
    # One query was submitted, so exactly one record is expected: `read`
    # returns that single record (the plural name is kept for later cells).
    blast_records = NCBIXML.read(result_handle)

In [16]:
# Show the five best hits.
for alignment in blast_records.alignments[:5]:
    title = alignment.title
    print("Title:", title)
    print("Length:", alignment.length)
    # Only the first HSP of each hit is reported.
    print("E-value:", alignment.hsps[0].expect)
    # The species is the text inside the first "[...]" of the title. A title
    # without a bracketed organism would make `split('[')[1]` raise
    # IndexError and abort the whole loop, so fall back to a placeholder.
    title_parts = title.split('[')
    if len(title_parts) > 1:
        species = title_parts[1].split(']')[0]
    else:
        species = "unknown"
    print("Species:", species)

Title: ref|NP_149421.2| cannabinoid receptor 1 isoform b [Homo sapiens] >ref|XP_011839841.1| PREDICTED: cannabinoid receptor 1 isoform X2 [Mandrillus leucophaeus] >ref|XP_025238412.1| cannabinoid receptor 1 isoform X2 [Theropithecus gelada] >ref|XP_030785377.1| cannabinoid receptor 1 isoform X2 [Rhinopithecus roxellana] >ref|XP_055114358.1| cannabinoid receptor 1 isoform X2 [Symphalangus syndactylus] >gb|PNI87957.1| CNR1 isoform 7 [Pan troglodytes] >gb|PNJ60190.1| CNR1 isoform 5 [Pongo abelii] >gb|AAV35030.1| cannabinoid receptor 1 splice variant CB1b [Homo sapiens] >gb|AGW25490.1| cannabinoid receptor 1 transcript variant 2 [Homo sapiens] >gb|KAI2543137.1| cannabinoid receptor 1 [Homo sapiens]
Length: 439
E-value: 0.0
Species: Homo sapiens
Title: ref|XP_023060944.1| cannabinoid receptor 1 isoform X2 [Piliocolobus tephrosceles]
Length: 439
E-value: 0.0
Species: Piliocolobus tephrosceles
Title: ref|XP_011797296.1| PREDICTED: cannabinoid receptor 1 isoform X2 [Colobus angolensis palliatu

Białko to receptor kannabinoidowy 1, odgrywa kluczową rolę w modulowaniu neuroprzekaźników, takich jak dopamina, glutaminian i GABA. Wpływa na procesy neurologiczne, takie jak pamięć, percepcja bólu, nastrój. <br>
Przykładowe inne zwierzęta mające zbliżoną sekwencję tego białka to np. jak udomowiony, palczak madagaskarski, gerezanka i wiele innych.

In [17]:
prot_id = 'NP_149421.2'

# Step 1: search the Gene database for the protein accession. The handle is
# closed as soon as the response has been parsed.
with Entrez.esearch(db='gene', term=prot_id) as handle:
    record = Entrez.read(handle)

# An empty hit list would otherwise surface as an opaque IndexError below.
if not record["IdList"]:
    raise ValueError(f"No Gene record found for {prot_id!r}")
gene_id = record["IdList"][0]

# Step 2: download the full Gene record as XML. `retmode="xml"` is the
# documented way to request XML from EFetch for the gene database.
with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle:
    gene_data = Entrez.read(handle)

# The first locus entry carries the accession of the sequence the gene is
# placed on; that accession is what gets printed.
chromosome_location = gene_data[0]["Entrezgene_locus"][0]["Gene-commentary_accession"]
print(f"Lokalizacja chromosomalna: {chromosome_location}")

Lokalizacja chromosomalna: NC_000006


# TASK 3

In [18]:
# Download the GenBank flat-file record for the Task 3 accession and keep only
# its sequence; the handle is closed before the sequence is extracted.
with Entrez.efetch(db="nucleotide", id=task3_sequence_id, rettype="gb", retmode="text") as handle:
    task3_record = SeqIO.read(handle, "gb")
task3_sequence = task3_record.seq

In [23]:
# tblastx translates the nucleotide query in all six frames and compares it
# with a six-frame translation of a *nucleotide* database, so the search must
# target `nt`; `nr` is the protein collection and is not a valid tblastx target.
# NOTE(review): tblastx against the whole of `nt` is an expensive search and
# can take a long time on the NCBI side.
with NCBIWWW.qblast('tblastx', 'nt', task3_sequence) as result_handle:
    # NCBIXML.parse yields records lazily and can only be consumed once.
    # Materialise it into a list while the handle is still open so that the
    # results cell can be re-run without silently printing nothing.
    blast_records = list(NCBIXML.parse(result_handle))

In [29]:
# Walk through every BLAST record and list each HSP of each hit: the hit
# title, reading frame, identity count, e-value and the start of the subject.
for blast_record in blast_records:
    print("Potencjalne dopasowania:")
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            report_lines = (
                f"  Dopasowanie do: {alignment.title}",
                f"  Ramka odczytu: {hsp.frame}",
                f"  Identyczność: {hsp.identities} / {hsp.align_length}",
                f"  E-value: {hsp.expect}",
                # Only the first 100 characters of the matched sequence are shown.
                f"  Sekwencja dopasowania: {hsp.sbjct[:100]}...",
                "-" * 80,
            )
            for line in report_lines:
                print(line)

Brak rozwiązań?