In [62]:
from BioPandas import pandas as pd

# Sample files used

In [65]:
# Sample inputs (.gz files), given as paths relative to the working directory.
gbk = 'test-data/GCF_000013425.1.gbk.gz'  # read further down with format='genbank'
fasta = 'test-data/random_sequences.fasta.gz'  # read further down with format='fasta'

# Fasta to Pandas DataFrame example

In [66]:
# Load the FASTA file into a DataFrame. Judging by the preview, each row is one
# sequence record with columns such as `_seq`, `id`, `name` and `description`.
fasta_df = pd.read_seq(fasta, format='fasta')
fasta_df.head(3)  # show the first three records

Unnamed: 0,_per_letter_annotations,_seq,annotations,dbxrefs,description,features,id,name
0,{},"(C, G, A, T, A, T, T, C, G, A, T, C, C, G, C, ...",{},[],FWIRNKE01DKIF6 rank=0000177 x=1346.0 y=2772.0 ...,[],FWIRNKE01DKIF6,FWIRNKE01DKIF6
1,{},"(G, C, G, A, G, C, A, G, C, A, A, T, C, A, T, ...",{},[],FWIRNKE01CDBE3 rank=0000320 x=854.0 y=2685.0 l...,[],FWIRNKE01CDBE3,FWIRNKE01CDBE3
2,{},"(C, G, A, G, C, A, G, C, A, C, A, T, C, A, T, ...",{},[],FWIRNKE01BKZJJ rank=0000535 x=531.0 y=3933.0 l...,[],FWIRNKE01BKZJJ,FWIRNKE01BKZJJ


# Search description with little effort!

In [69]:
# Keep only the records whose description contains the rank tag of interest.
fasta_df.loc[fasta_df['description'].str.contains('rank=0000177')]

Unnamed: 0,_per_letter_annotations,_seq,annotations,dbxrefs,description,features,id,name
0,{},"(C, G, A, T, A, T, T, C, G, A, T, C, C, G, C, ...",{},[],FWIRNKE01DKIF6 rank=0000177 x=1346.0 y=2772.0 ...,[],FWIRNKE01DKIF6,FWIRNKE01DKIF6


# GenBank to Pandas DataFrame example

In [68]:
# Load the GenBank file into a DataFrame. Judging by the preview, each row is one
# annotated feature (source, gene, CDS, ...) and its qualifiers become columns.
gbk_df = pd.read_seq(gbk, format='genbank')
gbk_df.head(3)  # show the first three feature rows

Unnamed: 0,EC_number,_per_letter_annotations,_seq,annotations,codon_start,db_xref,dbxrefs,description,experiment,gene,...,organism,product,protein_id,pseudo,pseudogene,strain,sub_species,transl_table,translation,type
0,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",,[taxon:93061],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,,...,[Staphylococcus aureus subsp. aureus NCTC 8325],,,,,[NCTC 8325],[aureus],,,source
1,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",,[GeneID:3919798],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,[dnaA],...,,,,,,,,,,gene
2,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",[1],[GeneID:3919798],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,[dnaA],...,,[chromosomal replication initiation protein],[YP_498609.1],,,,,[11],[MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSS...,CDS


# See all the possible columns of metadata

In [58]:
# Display the full column index; the truncated previews hide the middle columns behind '...'.
gbk_df.columns

Index(['EC_number', '_per_letter_annotations', '_seq', 'annotations',
       'codon_start', 'db_xref', 'dbxrefs', 'description', 'experiment',
       'gene', 'gene_synonym', 'id', 'location', 'locus_tag', 'mol_type',
       'name', 'note', 'organism', 'product', 'protein_id', 'pseudo',
       'pseudogene', 'strain', 'sub_species', 'transl_table', 'translation',
       'type'],
      dtype='object')

# Search for only rows of type CDS

In [72]:
# Split the feature table on its `type` column.
cds = gbk_df.loc[gbk_df['type'] == 'CDS']
gene = gbk_df.loc[gbk_df['type'] == 'gene']
# `.shape` is a (rows, columns) tuple, so the number of features is its first element.
print('Genome has CDS count:', cds.shape)
print('Genome has GENE count:', gene.shape)

Genome has CDS count: (2767, 27)
Genome has GENE count: (2872, 27)


In [74]:
# Preview the first three CDS rows; the index keeps the row labels from gbk_df.
cds.head(3)

Unnamed: 0,EC_number,_per_letter_annotations,_seq,annotations,codon_start,db_xref,dbxrefs,description,experiment,gene,...,organism,product,protein_id,pseudo,pseudogene,strain,sub_species,transl_table,translation,type
2,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",[1],[GeneID:3919798],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,[dnaA],...,,[chromosomal replication initiation protein],[YP_498609.1],,,,,[11],[MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSS...,CDS
4,[2.7.7.7],{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",[1],[GeneID:3919799],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,,...,,[DNA polymerase III subunit beta],[YP_498610.1],,,,,[11],[MMEFTIKRDYFITQLNDTLKAISPRTTLPILTGIKIDAKEHEVIL...,CDS
6,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",[1],[GeneID:3919176],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,,...,,[hypothetical protein],[YP_498611.1],,,,,[11],[MIILVQEVVVEGDINLGQFLKTEGIIESGGQAKWFLQDVEVLING...,CDS


# Better view of the layout for a single row

In [90]:
# Dump the first CDS row as a list holding one {column: value} dict, which is
# easier to read than the truncated table view.
cds.iloc[:1].to_dict(orient='records')

[{'EC_number': nan,
  '_per_letter_annotations': {},
  '_seq': Seq('CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCAT...TAT', SingleLetterAlphabet()),
  'annotations': {'molecule_type': 'DNA',
   'topology': 'circular',
   'data_file_division': 'CON',
   'date': '03-AUG-2016',
   'accessions': ['NC_007795'],
   'sequence_version': 1,
   'keywords': ['RefSeq'],
   'source': 'Staphylococcus aureus subsp. aureus NCTC 8325',
   'organism': 'Staphylococcus aureus subsp. aureus NCTC 8325',
   'taxonomy': ['Bacteria',
    'Firmicutes',
    'Bacilli',
    'Bacillales',
    'Staphylococcaceae',
    'Staphylococcus'],
   'references': [Reference(title='The Staphylococcus aureus NCTC8325 Genome', ...),
    Reference(title='Direct Submission', ...),
    Reference(title='Direct Submission', ...)],
   'comment': 'REVIEWED REFSEQ: This record has been curated by NCBI staff. The\nreference sequence was derived from CP000253.\nRefSeq Category: Reference Genome\n            UPR: UniProt Genome\nSta

# Scikit-Learn for Machine Learning

# Extra Notes

In [82]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

# Creating a BioPython seq from scratch

In [87]:
# Build a Seq by hand, tagged with the generic single-letter alphabet.
# NOTE(review): the alphabet is reached through the IUPAC module's `Alphabet`
# attribute, and the whole `Bio.Alphabet` package is absent from newer Biopython
# releases -- confirm which Biopython version this notebook is pinned to.
seq = Seq('ATGCATGATGATGATGATGATAG', alphabet=IUPAC.Alphabet.single_letter_alphabet)
seq

Seq('ATGCATGATGATGATGATGATAG', SingleLetterAlphabet())

## You can create a DataFrame row like this

In [89]:
# Wrap the Seq in a one-row DataFrame and pull the `_seq` column back out as a Series.
pd.DataFrame([{'_seq': seq}])['_seq']

0    (A, T, G, C, A, T, G, A, T, G, A, T, G, A, T, ...
Name: _seq, dtype: object