# Biopython : Parsing Sequence

In [1]:
from Bio import SeqIO

### FASTA 파일 읽기

* sample_1.fasta - 레코드 1개
* sample_2.fasta - 레코드 2개

SeqIO.read()는 레코드가 정확히 1개인 파일만 읽을 수 있다. 여러 개의 레코드가 담긴 파일을 SeqIO.read()로 읽으면 ValueError가 발생하므로, 이런 파일은 SeqIO.parse()로 읽어 레코드별 객체(SeqRecord)를 하나씩 얻는다.

In [2]:
# SeqIO.parse hands back an iterator over the records in the FASTA file
# rather than a single record.
seq = SeqIO.parse("data/sample_1.fasta", format="fasta")
print(type(seq))

# Walk the iterator: show each record's type and summary, then its length.
for s in seq:
    print(type(s), s, sep='\n')
    print('length :', len(s), '\n')

<class 'Bio.SeqIO.FastaIO.FastaIterator'>
<class 'Bio.SeqRecord.SeqRecord'>
ID: AF501235.1
Name: AF501235.1
Description: AF501235.1 Influenzavirus A (A/duck/Shanghai/1/2000) hemagglutinin gene, complete cds
Number of features: 0
Seq('ATGGAGAAAATAGTGCTTCTTCTTGCAATAGTCAGTCTTGTTAAAAGTGATCAG...AGA', SingleLetterAlphabet())
length : 140 



In [3]:
# Same parser call as for the single-record file; the result is again an iterator.
seq2 = SeqIO.parse("data/sample_2.fasta", format="fasta")
print(type(seq2))

# Drain the iterator into a list so the records can be counted and reused later.
list_seq = [*seq2]
print('records :', len(list_seq))

<class 'Bio.SeqIO.FastaIO.FastaIterator'>
records : 2


In [4]:
# Show the type, summary and length of every record collected in list_seq.
for s in list_seq:
    print(type(s), s, sep='\n')
    print('length :', len(s), '\n')

<class 'Bio.SeqRecord.SeqRecord'>
ID: MH464856.1
Name: MH464856.1
Description: MH464856.1 Hepatitis B virus isolate MA134, complete genome
Number of features: 0
Seq('TTCCACAACATTCCACCAAGCTCTGCAGGATCCCAGAGTAAGAGGCCTGTATTT...GGG', SingleLetterAlphabet())
length : 140 

<class 'Bio.SeqRecord.SeqRecord'>
ID: CP002925.1
Name: CP002925.1
Description: CP002925.1 Streptococcus pseudopneumoniae IS7493, complete genome
Number of features: 0
Seq('TTGAAAGAAAAACAATTTTGGAATCGTATATTAGAATTTGCTCAAGAAAGACTG...ATC', SingleLetterAlphabet())
length : 140 



### FASTQ 파일 읽기

FASTQ 파일에는 보통 여러 개의 레코드(read)가 들어 있으므로 SeqIO.read()로 읽으면 ValueError가 발생한다. SeqIO.parse()를 사용한다.

In [5]:
# Parse the FASTQ file; each record carries the sequence plus per-letter
# phred quality annotations.
seq = SeqIO.parse("data/sample_1.fastq", format="fastq")
print(type(seq))

for s in seq:
    print(type(s), s, sep='\n')

    # The bare sequence is available through the record's .seq attribute.
    print('* sequence :', s.seq)
    print('* length   :', len(s), '\n')

<class 'Bio.SeqIO.QualityIO.FastqPhredIterator'>
<class 'Bio.SeqRecord.SeqRecord'>
ID: SRR000982.5E745RJU01DDHJ6length=113
Name: SRR000982.5E745RJU01DDHJ6length=113
Description: SRR000982.5E745RJU01DDHJ6length=113
Number of features: 0
Per letter annotation for: phred_quality
Seq('AAGGCACCATGCAGAGATGCAAGGCCCCTTTCTAAGCCCTAGACTTCTGGATGA...CAG', SingleLetterAlphabet())
* sequence : AAGGCACCATGCAGAGATGCAAGGCCCCTTTCTAAGCCCTAGACTTCTGGATGACACTTCTAGAAACACCCTGGGCCAGAAGTGAACCTGCTGCCTTGAAGGGAATAACTCAG
* length   : 113 

<class 'Bio.SeqRecord.SeqRecord'>
ID: SRR000982.35E745RJU01DLQBClength=53
Name: SRR000982.35E745RJU01DLQBClength=53
Description: SRR000982.35E745RJU01DLQBClength=53
Number of features: 0
Per letter annotation for: phred_quality
Seq('ATCTCTACCCAAAGATTAATGGGGATTGGTGTGATATACGGCTGAATTGTACC', SingleLetterAlphabet())
* sequence : ATCTCTACCCAAAGATTAATGGGGATTGGTGTGATATACGGCTGAATTGTACC
* length   : 53 



In [6]:
# Reading a gzip-compressed FASTQ file
import gzip

# Path to the compressed FASTQ sample used by the following cells.
gzip_file = "data/sample_1.fastq.gz"

In [7]:
# 1. Open the handle manually.
# A manually opened handle must also be closed manually; try/finally makes
# sure that happens even if parsing fails part-way through the file.
handle = gzip.open(gzip_file, "rt")  # "rt": read the decompressed stream as text
try:
    seq = SeqIO.parse(handle, "fastq")
    for s in seq:
        print(s.seq)
finally:
    handle.close()

AAGGCACCATGCAGAGATGCAAGGCCCCTTTCTAAGCCCTAGACTTCTGGATGACACTTCTAGAAACACCCTGGGCCAGAAGTGAACCTGCTGCCTTGAAGGGAATAACTCAG
ATCTCTACCCAAAGATTAATGGGGATTGGTGTGATATACGGCTGAATTGTACC


In [8]:
# 2. Open the handle in a `with` block, which closes it automatically on exit.
with gzip.open(gzip_file, mode="rt") as handle:
    seq = SeqIO.parse(handle, format="fastq")
    for s in seq:
        print(s.seq)

AAGGCACCATGCAGAGATGCAAGGCCCCTTTCTAAGCCCTAGACTTCTGGATGACACTTCTAGAAACACCCTGGGCCAGAAGTGAACCTGCTGCCTTGAAGGGAATAACTCAG
ATCTCTACCCAAAGATTAATGGGGATTGGTGTGATATACGGCTGAATTGTACC


### GenBank 파일 읽기

In [9]:
MERS_file = "data/KT225476.2.gbk"

# The file holds a single record, so SeqIO.read is used instead of SeqIO.parse.
gbk = SeqIO.read(MERS_file, format="genbank")
print(type(gbk), gbk, sep='\n')

<class 'Bio.SeqRecord.SeqRecord'>
ID: KT225476.2
Name: KT225476
Description: Middle East respiratory syndrome coronavirus isolate MERS-CoV/THA/CU/17_06_2015, complete genome
Number of features: 12
/molecule_type=RNA
/topology=linear
/data_file_division=VRL
/date=22-AUG-2017
/accessions=['KT225476']
/sequence_version=2
/keywords=['']
/source=Middle East respiratory syndrome-related coronavirus (MERS-CoV)
/organism=Middle East respiratory syndrome-related coronavirus
/taxonomy=['Viruses', 'ssRNA viruses', 'ssRNA positive-strand viruses, no DNA stage', 'Nidovirales', 'Coronaviridae', 'Coronavirinae', 'Betacoronavirus']
/references=[Reference(title='Imported case of Middle East respiratory syndrome coronavirus (MERS-CoV) infection from Oman to Thailand, June 2015', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/comment=On Sep 10, 2015 this sequence version replaced KT225476.1.
/structured_comment=OrderedDict([('Assembly-Data', OrderedDict([('Se

In [10]:
# Record-level fields first, then two entries from the annotations dict.
print(gbk.id)
print(gbk.description)
for key in ('molecule_type', 'organism'):
    print(gbk.annotations[key])

KT225476.2
Middle East respiratory syndrome coronavirus isolate MERS-CoV/THA/CU/17_06_2015, complete genome
RNA
Middle East respiratory syndrome-related coronavirus


In [11]:
# Print the record's full feature list (SeqFeature objects with their
# locations and types); the output is long.
print(gbk.features)

[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(29809), strand=1), type='source'), SeqFeature(CompoundLocation([FeatureLocation(ExactPosition(272), ExactPosition(13427), strand=1), FeatureLocation(ExactPosition(13426), ExactPosition(21508), strand=1)], 'join'), type='CDS', location_operator='join'), SeqFeature(FeatureLocation(ExactPosition(272), ExactPosition(13448), strand=1), type='CDS'), SeqFeature(FeatureLocation(ExactPosition(21449), ExactPosition(25511), strand=1), type='CDS'), SeqFeature(FeatureLocation(ExactPosition(25525), ExactPosition(25837), strand=1), type='CDS'), SeqFeature(FeatureLocation(ExactPosition(25845), ExactPosition(26175), strand=1), type='CDS'), SeqFeature(FeatureLocation(ExactPosition(26086), ExactPosition(26827), strand=1), type='CDS'), SeqFeature(FeatureLocation(ExactPosition(26833), ExactPosition(27508), strand=1), type='CDS'), SeqFeature(FeatureLocation(ExactPosition(27583), ExactPosition(27832), strand=1), type='CDS'), SeqFeature(FeatureLocati

### Entrez 모듈 : NCBI 파일 읽기

In [12]:
from Bio import Entrez

```
CCR5 : 면역세포(T세포) 표면에서 발현하는 단백질. HIV가 세포에 침입할 때 이용하는 수용체.
       CCR5 유전자에 특정 변이(예: 32bp 결실, CCR5-Δ32)가 있는 경우 HIV 감염에 저항성을 보임.
```

In [13]:
# Contact address sent along with Entrez requests.
Entrez.email = "woosa7@naver.com"

with Entrez.efetch(
    db="nucleotide",    # database name
    id="AY463215",      # CCR5 gene (HIV resistance)
    rettype="fasta",    # file type
    retmode="text",
) as handle:
    # The response holds a single FASTA record, so SeqIO.read is used.
    seq = SeqIO.read(handle, format="fasta")

print(seq, len(seq), sep='\n')

ID: AY463215.1
Name: AY463215.1
Description: AY463215.1 Homo sapiens CCR5 chemokine receptor (CCR5) gene, complete cds
Number of features: 0
Seq('ATGGATTATCAAGTGTCAAGTCCAATCTATGACATCAATTATTATACATCGGAG...TGA', SingleLetterAlphabet())
1059


In [14]:
# Fetch the mutant CCR5 allele as FASTA text and read its single record.
with Entrez.efetch(
    db="nucleotide",
    id="1575550",       # CCR5 with a mutation
    rettype="fasta",
    retmode="text",
) as handle:
    seq = SeqIO.read(handle, format="fasta")

# Record summary followed by the sequence length.
print(seq, len(seq), sep='\n')

ID: U66285.1
Name: U66285.1
Description: U66285.1 Human CC chemokine receptor CCR5 gene, mutant allele, complete cds
Number of features: 0
Seq('ATGGATTATCAAGTGTCAAGTCCAATCTATGACATCAATTATTATACATCGGAG...TGA', SingleLetterAlphabet())
1027
