In [1]:
import Bio
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqUtils import GC, gc_fraction

print("biopython version: ", Bio.__version__)
my_seq = Seq("AGTACACTGGT")
print(my_seq)
print(my_seq.complement())
print("Reverse complement: ", my_seq.reverse_complement())

biopython version:  1.81
AGTACACTGGT
TCATGTGACCA
Reverse complement:  ACCAGTGTACT


In [2]:
# FASTA parsing example
for seq_record in SeqIO.parse("ls_orchid.fasta", "fasta"):
    print("Id của chuỗi: ", seq_record.id)
    print(repr(seq_record.seq))
    print(seq_record.seq, " => ", len(seq_record.seq), "nucleotides")

    print("\n", seq_record)
    break

Id của chuỗi:  gi|2765658|emb|Z78533.1|CIZ78533
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC')
CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAACGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGTGACCCTGATTTGTTGTTGGGCCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCCCGGCGCAGTTTGGGCGCCAAGCCATATGAAAGCATCACCGGCGAATGGCATTGTCTTCCCCAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGAATTTTGATGACTCTCGCAAACGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGATAAGTGGTGTGAATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCTAAGGGCACGCCTGCTTGGGCGTCGCGCTTCGTCTCTCTCCTGCCAATGCTTGCCCGGCATACAGCCAGGCCGGCGTGGTGCGGATGTGAAAGATTGGCCCCTTGTGCCTAGGTGCGGCGGGTCCAAGAGCTGGTGTTTTGATGGCCCGGAACCCGGCAAGAGGTGGACGGATGCTGGCAGCAGCTGCCGTGCGAATCCCCCATGTTGTCGTGCTTGTCGGACAGGCAGGAGAACCCTTCCGAACCCCAATGGAGGGCGGTTGACCGCCATTCGGATGTGACCCCAGGTCAGGCGGGGGCACCCGCTGAGTTTACGC  =>  740 nucleotides

 ID: gi|2765658|emb|Z78533.1|CIZ78533
Name: gi|2765658|emb|Z78533.1|CIZ78533
Description: gi|2765658|emb|Z78533.1|CIZ7853

In [3]:
# Genbank parsing example
for seq_record in SeqIO.parse("ls_orchid.gbk", "genbank"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(seq_record.seq, " => ", len(seq_record.seq))

    print("\n### Thông tin bản ghi theo định dạng GENBANK")
    print(seq_record)
    break

Z78533.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC')
CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAACGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGTGACCCTGATTTGTTGTTGGGCCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCCCGGCGCAGTTTGGGCGCCAAGCCATATGAAAGCATCACCGGCGAATGGCATTGTCTTCCCCAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGAATTTTGATGACTCTCGCAAACGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGATAAGTGGTGTGAATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCTAAGGGCACGCCTGCTTGGGCGTCGCGCTTCGTCTCTCTCCTGCCAATGCTTGCCCGGCATACAGCCAGGCCGGCGTGGTGCGGATGTGAAAGATTGGCCCCTTGTGCCTAGGTGCGGCGGGTCCAAGAGCTGGTGTTTTGATGGCCCGGAACCCGGCAAGAGGTGGACGGATGCTGGCAGCAGCTGCCGTGCGAATCCCCCATGTTGTCGTGCTTGTCGGACAGGCAGGAGAACCCTTCCGAACCCCAATGGAGGGCGGTTGACCGCCATTCGGATGTGACCCCAGGTCAGGCGGGGGCACCCGCTGAGTTTACGC  =>  740

### Thông tin bản ghi theo định dạng GENBANK
ID: Z78533.1
Name: Z78533
Description: C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA
Number of features: 5
/molecule_type=DN

In [4]:
# sequences act like strings
my_seq = Seq("GATCG")
for index, letter in enumerate(my_seq):
    print("%i %s" % (index, letter))
    # print(index, letter)
    # print(my_seq[index])
print("Len sequence: ", len(my_seq))
print(my_seq[0])
print(my_seq[2])
print(my_seq[-1])

0 G
1 A
2 T
3 C
4 G
Len sequence:  5
G
T
G


In [7]:
# non-overlapping count
from Bio.SeqUtils import gc_fraction

print("AAAA".count("AA"))
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC")
print("Len my_seq: ", len(my_seq))
print("Tổng số G: ", my_seq.count("G"))
print("Tổng số C: ", my_seq.count("C"))
gc = 100 * float(my_seq.count("G") + my_seq.count("C")) / len(my_seq)
print("Tỷ lệ GC: ", gc)
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC")
print("Tỷ lệ GC: ", gc_fraction(my_seq))

2
Len my_seq:  32
Tổng số G:  9
Tổng số C:  6
Tỷ lệ GC:  46.875
Tỷ lệ GC:  0.46875


In [9]:
# slicing a sequence
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC")
my_seq[4:12]
# get the first, second and third codon positions of this DNA sequence:
# với 3 ORF (Open Reading Frame)
print(my_seq[0::3])
print(my_seq[1::3])
print(my_seq[2::3])
# Reverse the string
print(my_seq[::-1])

GCTGTAGTAAG
AGGCATGCATC
TAGCTAAGAC
CGCTAAAAGCTAGGATATATCCGGGTAGCTAG


In [11]:
# turning Seq objects into strings
print(str(my_seq))
# Convert to FASTA format
fasta_format_string = ">Name\n%s\n" % my_seq
print("\nFASTA:\n", fasta_format_string)

GATCGATGGGCCTATATAGGATCGAAAATCGC

FASTA:
 >Name
GATCGATGGGCCTATATAGGATCGAAAATCGC



In [12]:
# Concatenating or adding sequences
dna_seq_1 = Seq("ACGT")
dna_seq_2 = Seq("CGTATG")
dna_seq = dna_seq_1 + dna_seq_2
print("dna_seq: ", dna_seq)
list_of_seqs = [Seq("ACGT"), Seq("AACC"), Seq("GGTT")]
concatenated = Seq("")
for s in list_of_seqs:
    concatenated += s
print("concatenated: ", concatenated)

dna_seq:  ACGTCGTATG
concatenated:  ACGTAACCGGTT


In [13]:
# join method
contigs = [Seq("ATG"), Seq("ATCCCG"), Seq("TTGCA")]
spacer = Seq("N" * 10)
new_seq = spacer.join(contigs)
print("new_seq: ", new_seq)

new_seq:  ATGNNNNNNNNNNATCCCGNNNNNNNNNNTTGCA


In [14]:
# Changing case
dna_seq = Seq("acgtACGT")
dna_seq_upper = dna_seq.upper()
dna_seq_lower = dna_seq.lower()
print("dna_seq_upper: ", dna_seq_upper)
print("dna_seq_lower: ", dna_seq_lower)

dna_seq_upper:  ACGTACGT
dna_seq_lower:  acgtacgt


In [15]:
# Transcription

coding_dna = Seq("ATGGCC")
print("coding_dna: ", coding_dna)
template_dna = coding_dna.reverse_complement()
print("template_dna: ", template_dna)
messenger_rna = coding_dna.transcribe()
print("messenger_rna: ", messenger_rna)

# The Seq object also includes a back-transcription method for
# going from the mRNA to the coding strand of the DNA.
# Again, this is a simple U → T substitution:
print("back_transcribe: ", messenger_rna.back_transcribe())

coding_dna:  ATGGCC
template_dna:  GGCCAT
messenger_rna:  AUGGCC
back_transcribe:  ATGGCC


In [17]:
# Translation table

from Bio.Data import CodonTable

standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
# standard_table = CodonTable.unambiguous_dna_by_id[1]

# Vertebrate Mitochondrial: ty thể của động vật có xương sống
mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]
mito_table = CodonTable.unambiguous_dna_by_id[2]
print(standard_table)
print(mito_table)
print("Stop codons: ", standard_table.stop_codons)
print("Start codons: ", standard_table.start_codons)

Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------

In [18]:
# translation
messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
protein_seq = messenger_rna.translate()
print("protein_seq: ", protein_seq)
print(messenger_rna.translate(to_stop=True))

# You can also translate directly from the coding strand DNA sequence:
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
protein_seq = coding_dna.translate(to_stop=True)
print("protein_seq: ", protein_seq)

protein_seq:  MAIVMGR*KGAR*
MAIVMGR
protein_seq:  MAIVMGR


In [19]:
# Comparing Seq objects
seq1 = Seq("ACGT")
print("ACGT" == seq1)
print(seq1 == "ACGT")

True
True


In [20]:
from Bio.Seq import Seq
my_seq = Seq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA")
my_seq[5] = "G"

TypeError: 'Seq' object does not support item assignment

In [2]:
from Bio.Seq import MutableSeq, Seq

mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA")
mutable_seq[5] = "C"
print(mutable_seq)
mutable_seq.remove("T")
print(mutable_seq)
mutable_seq.reverse()
print(mutable_seq)

# get back to a read-only Seq object
new_seq = Seq(mutable_seq)
new_seq

GCCATCGTAATGGGCCGCTGAAAGGGTGCCCGA
GCCACGTAATGGGCCGCTGAAAGGGTGCCCGA
AGCCCGTGGGAAAGTCGCCGGGTAATGCACCG


Seq('AGCCCGTGGGAAAGTCGCCGGGTAATGCACCG')

In [4]:
# unknown sequence contents
from Bio.Seq import Seq

unknown_seq = Seq(None, 20)
unknown_seq

Seq(None, length=20)

# FGFR1

In [1]:
from Bio.Seq import Seq
from Bio import SeqIO

In [3]:
for gene_record in SeqIO.parse("fgfr1_datasets/gene.fna", "fasta"):
    print("gene id:", gene_record.id)
    print(repr(gene_record.seq))
    print(len(gene_record.seq), "nucleotides")

    coding_dna = Seq("")
    exon_locations = [
        1, 655,
        11102, 11280,
        40201, 40284,
        40543, 40715,
        42391, 42514,
        43937, 44127,
        46695, 46839,
        48901, 49097,
        50263, 50408,
        50645, 50766,
        51220, 51330,
        52576, 52766,
        53735, 53857,
        54007, 54077,
        54347, 54484,
        54613, 54718,
        54832, 57493,
    ]
    for i in range(0, len(exon_locations), 2):
        coding_dna += gene_record.seq[exon_locations[i] - 1 : exon_locations[i + 1]]

    print("Coding DNA length:", len(coding_dna))
    break

gene id: NC_000008.11:c38468635-38411143
Seq('GCATAGCGCTCGGAGCGCTCTTGCGGCCACAGGCGCGGCGTCCTCGGCGGCGGG...TGA')
57493 nucleotides
Coding DNA length: 5418


In [4]:
for rna_record in SeqIO.parse("fgfr1_datasets/rna.fna", "fasta"):
    print("mRNA id:", rna_record.id)
    print(repr(rna_record.seq))
    print("mRNA length:", len(rna_record.seq))
    mRNA = rna_record.seq
    break

mRNA id: NM_001354368.2
Seq('GCATAGCGCTCGGAGCGCTCTTGCGGCCACAGGCGCGGCGTCCTCGGCGGCGGG...TGA')
mRNA length: 5418


In [4]:
print('Exact mRNA:', coding_dna == mRNA)

Exact mRNA: True


In [31]:
new_coding_dna = coding_dna[743:2933]
len(new_coding_dna)

2190

In [34]:
protein_seq = new_coding_dna.translate(to_stop=True)
len(protein_seq)

729

In [35]:
for protein_record in SeqIO.parse("fgfr1_datasets/protein.faa", "fasta"):
    if protein_record.id == "NP_001341297.1":
        print("protein id:", protein_record.id)
        print(repr(protein_record.seq))
        print("protein length:", len(protein_record.seq))
        break
    else:
        continue

protein id: NP_001341297.1
Seq('MWSWKCLLFWAVLVTATLCTARPSPTLPEQDALPSSEDDDDDDDSSSEEKETDN...KRR')
protein length: 729


In [36]:
print(protein_seq == protein_record.seq)

True
