In [41]:
from Bio import Seq, SeqRecord
from Bio.Data import CodonTable
from Bio import Entrez
import re

## 1. 读取序列

### 1.1 读取

In [3]:
# Build the example DNA sequence that the following cells operate on.
my_seq: Seq.Seq = Seq.Seq("AGCATCGTAGCATGCAC")
# A bare name on the last line makes the notebook display the object's repr.
my_seq

Seq('AGCATCGTAGCATGCAC')

### 1.2 转为RNA

将`T`转为`U`。

In [4]:
# Transcribe DNA to RNA: the result is a new Seq in which every T is replaced by U.
my_seq.transcribe()

Seq('AGCAUCGUAGCAUGCAC')

### 1.3 获得反向互补链

In [5]:
# Reverse the sequence and complement each base (A<->T, C<->G); returns a new Seq.
my_seq.reverse_complement()

Seq('GTGCATGCTACGATGCT')

我们可以连用上面的命令获得反向互补链的**RNA**。

In [6]:
# Chain the two calls: take the reverse complement first, then transcribe it to RNA.
my_seq.reverse_complement().transcribe()

Seq('GUGCAUGCUACGAUGCU')

## 2. 密码子表

### 2.1 获取标准密码子表

In [9]:
# Look up the codon table named 'Standard' and print its text-grid form.
# In the grid, "(s)" marks a start codon and "Stop" marks a stop codon.
print(CodonTable.unambiguous_dna_by_name['Standard'])

Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------+---------+---------+---------+--

### 2.2 获取起始密码子和终止密码子

In [11]:
# List of DNA codons that the Standard table marks as start codons.
CodonTable.unambiguous_dna_by_name['Standard'].start_codons

['TTG', 'CTG', 'ATG']

In [12]:
# List of DNA codons that the Standard table marks as stop codons.
CodonTable.unambiguous_dna_by_name['Standard'].stop_codons

['TAA', 'TAG', 'TGA']

## 3. 像字符串一样处理序列

### 3.1 索引与切片（slice）

In [13]:
# Integer indexing returns a single character as a plain str (not a Seq).
my_seq[0]

'A'

In [14]:
# Slicing follows Python's half-open convention (indices 1..4) and returns a Seq.
my_seq[1:5]

Seq('GCAT')

### 3.2 切割

In [15]:
# Split at every "T"; the separator is dropped and each piece is returned as a Seq.
my_seq.split("T")

[Seq('AGCA'), Seq('CG'), Seq('AGCA'), Seq('GCAC')]

### 3.3 计数

In [17]:
# Count the "T" bases once, then show the count next to its share of the full length.
t_count = my_seq.count("T")
t_count, t_count / len(my_seq)

(3, 0.17647058823529413)

### 3.4 序列合并

In [18]:
# "+" concatenates two sequences into a new Seq.
my_seq + my_seq

Seq('AGCATCGTAGCATGCACAGCATCGTAGCATGCAC')

### 3.5 查找

In [19]:
# 0-based position of the first occurrence of the substring "TA".
my_seq.find("TA")

7

### 3.6 正则查找

In [26]:
# The re module works on text, so the Seq is converted to str before searching.
# Pattern: one or more "T" immediately followed by an "A".
ta_pattern = re.compile("T+A")
ta_pattern.findall(str(my_seq))

['TA']

## 4. `SeqRecord`

`SeqRecord`用来记录带有`id`、`description`等元信息的序列。

In [38]:
# Attach an identifier and a free-text description to the sequence.
my_seq_record = SeqRecord.SeqRecord(
    my_seq,
    id="my-seq:2020-10-18",
    description="An example of a SeqRecord.",
)
# Display the record's repr as the cell output.
my_seq_record

SeqRecord(seq=Seq('AGCATCGTAGCATGCAC'), id='my-seq:2020-10-18', name='<unknown name>', description='An example of a SeqRecord.', dbxrefs=[])

In [32]:
# Read back the metadata stored on the record as an (id, description) tuple.
my_seq_record.id, my_seq_record.description

('my-seq:2020-10-18', 'An example of a SeqRecord.')

转换格式（如`fasta`和`genbank`）。

In [39]:
# Render the record as FASTA text: a ">id description" header line followed by the sequence.
print(my_seq_record.format("fasta"))

>my-seq:2020-10-18 An example of a SeqRecord.
AGCATCGTAGCATGCAC



In [52]:
# Contact address that Bio.Entrez sends along with its requests to NCBI.
# NOTE(review): a personal address is hardcoded here; consider reading it from
# configuration or an environment variable before sharing this notebook.
Entrez.email = "huangbaochenwo@live.com"

# Request document summaries (ESummary) for two PubMed IDs, returned as XML.
handle = Entrez.esummary(db="pubmed", id="19304878,14630660", retmode="xml")

res = []
try:
    # Entrez.parse returns a generator that reads from the handle as it is
    # iterated, so the handle has to stay open until the loop is done.
    records = Entrez.parse(handle)
    for record in records:
        print(record)
        res.append(record)
finally:
    # Always release the network handle, even if parsing fails part-way through.
    handle.close()

{'Item': [], 'Id': '19304878', 'PubDate': '2009 Jun 1', 'EPubDate': '2009 Mar 20', 'Source': 'Bioinformatics', 'AuthorList': ['Cock PJ', 'Antao T', 'Chang JT', 'Chapman BA', 'Cox CJ', 'Dalke A', 'Friedberg I', 'Hamelryck T', 'Kauff F', 'Wilczynski B', 'de Hoon MJ'], 'LastAuthor': 'de Hoon MJ', 'Title': 'Biopython: freely available Python tools for computational molecular biology and bioinformatics.', 'Volume': '25', 'Issue': '11', 'Pages': '1422-3', 'LangList': ['English'], 'NlmUniqueID': '9808944', 'ISSN': '1367-4803', 'ESSN': '1367-4811', 'PubTypeList': ['Journal Article'], 'RecordStatus': 'PubMed - indexed for MEDLINE', 'PubStatus': 'ppublish+epublish', 'ArticleIds': {'medline': [], 'pubmed': ['19304878'], 'pii': 'btp163', 'doi': '10.1093/bioinformatics/btp163', 'pmc': 'PMC2682512', 'rid': '19304878', 'eid': '19304878', 'pmcid': 'pmc-id: PMC2682512;'}, 'DOI': '10.1093/bioinformatics/btp163', 'History': {'medline': ['2009/07/10 09:00'], 'pubmed': ['2009/03/24 09:00'], 'entrez': '2009

In [53]:
# Inspect the first summary record collected by the previous cell.
res[0]

{'Item': [], 'Id': '19304878', 'PubDate': '2009 Jun 1', 'EPubDate': '2009 Mar 20', 'Source': 'Bioinformatics', 'AuthorList': ['Cock PJ', 'Antao T', 'Chang JT', 'Chapman BA', 'Cox CJ', 'Dalke A', 'Friedberg I', 'Hamelryck T', 'Kauff F', 'Wilczynski B', 'de Hoon MJ'], 'LastAuthor': 'de Hoon MJ', 'Title': 'Biopython: freely available Python tools for computational molecular biology and bioinformatics.', 'Volume': '25', 'Issue': '11', 'Pages': '1422-3', 'LangList': ['English'], 'NlmUniqueID': '9808944', 'ISSN': '1367-4803', 'ESSN': '1367-4811', 'PubTypeList': ['Journal Article'], 'RecordStatus': 'PubMed - indexed for MEDLINE', 'PubStatus': 'ppublish+epublish', 'ArticleIds': {'medline': [], 'pubmed': ['19304878'], 'pii': 'btp163', 'doi': '10.1093/bioinformatics/btp163', 'pmc': 'PMC2682512', 'rid': '19304878', 'eid': '19304878', 'pmcid': 'pmc-id: PMC2682512;'}, 'DOI': '10.1093/bioinformatics/btp163', 'History': {'medline': ['2009/07/10 09:00'], 'pubmed': ['2009/03/24 09:00'], 'entrez': '2009