In [53]:
import Bio
import pandas
from pandas import DataFrame
import numpy as np
from pandas.core.frame import *
from types import GeneratorType
from typing import Union, Dict, List, Tuple
from Bio import SeqIO
from copy import deepcopy
import gzip
from pathlib import Path
from Bio.Alphabet import single_letter_alphabet


class SubclassedSeries(pandas.Series):
    """ Pandas Series API to Inherit

    Series subclass that names the classes to use for derived objects through
    the ``_constructor*`` properties below (the hooks described in the pandas
    subclassing documentation).
    """
    
    @property
    def _constructor(self):
        # Class to use when a result is again one-dimensional.
        return SubclassedSeries

    @property
    def _constructor_expanddim(self):
        # Class to use when a result gains a dimension (Series -> DataFrame).
        return SubclassedDataFrame


class SubclassedDataFrame(pandas.DataFrame):
    """ Pandas DataFrame to Inherit

    DataFrame subclass that names the classes to use for derived objects
    through the ``_constructor*`` properties below (the hooks described in the
    pandas subclassing documentation).
    """
    
    @property
    def _constructor(self):
        # Class to use when a result is again two-dimensional.
        return SubclassedDataFrame

    @property
    def _constructor_sliced(self):
        # Class to use when a result loses a dimension (DataFrame -> Series).
        return SubclassedSeries


class BioDatabase(SubclassedDataFrame):
    """ Expanded Pandas DataFrame to handle BioPython SeqRecords generator or genomic file types """

    @classmethod
    def from_seqrecords(cls, seqrecords: Union[GeneratorType, list], index=None, exclude=None,
                        columns=None, coerce_float=False, nrows=None) -> pandas.DataFrame:
        """ Takes Biopython parsed output to convert to a proper DataFrame

        The records are first flattened by ``__normalize_seqrecords`` (one row per
        feature, or one row per record when the record has no features) and the
        resulting list of dictionaries is handed to ``from_records``.

        :param seqrecords: Generator or list from BioPython universal output.
            All formats are the same output.
        :param index: forwarded unchanged to ``from_records``.
        :param exclude: forwarded unchanged to ``from_records``.
        :param columns: forwarded unchanged to ``from_records``.
        :param coerce_float: forwarded unchanged to ``from_records``.
        :param nrows: forwarded unchanged to ``from_records``.
        :returns: The frame built by ``cls.from_records``.

        >>> BioDatabase.from_seqrecords(Bio.SeqIO.parse('file.fasta', format='fasta'))
        """
        data = cls.__normalize_seqrecords(seqrecords)
        return cls.from_records(data, index=index, exclude=exclude, columns=columns,
                                coerce_float=coerce_float, nrows=nrows)

    @staticmethod
    def __normalize_seqrecords(seqrecords: Union[GeneratorType, list]) -> List[dict]:
        """ Pull nested dictionaries into a single dictionary.

        Priority is given to the keys higher in the hierarchy: record attributes
        win over feature attributes, which win over feature qualifiers.
        The SeqRecord and SeqFeature objects passed in are left untouched.

        :param seqrecords: Generator or list from BioPython universal output.
            All formats are the same output.
        :returns: List of dictionaries, one per feature; a record without
            features contributes a single dictionary of its own attributes.

        >>> BioDatabase.from_seqrecords(Bio.SeqIO.parse('file.gbk', format='genbank'))
        """
        records = []
        # NOTE(review): SeqIO.to_dict keys the records by id, so the whole input
        # is materialised here -- presumably duplicate ids are rejected; confirm.
        for seqrecord in SeqIO.to_dict(seqrecords).values():
            # Shallow copy: popping 'features' below must not strip the attribute
            # from the caller's SeqRecord object.
            record = dict(vars(seqrecord))
            # If a more complicated format is used; features will be nested.
            # An empty feature list is left in place as a regular column value.
            features = record.pop('features') if record.get('features') else []
            for feature in features:
                # Meta that make up the feature (copied for the same reason as above).
                aspects = dict(vars(feature))
                # Qualifier dictionary inside each feature; an empty one stays in
                # ``aspects`` and is carried over like any other aspect.
                qualifiers = aspects.pop('qualifiers') if aspects.get('qualifiers') else {}
                merged = dict(record)
                # setdefault keeps whatever a higher level already provided.
                for aspect_key, aspect_value in aspects.items():
                    merged.setdefault(aspect_key, aspect_value)
                for qualifier_key, qualifier_value in qualifiers.items():
                    merged.setdefault(qualifier_key, qualifier_value)
                # One deep copy per feature keeps every row independent of the
                # others and of the input objects, without re-copying the whole
                # record (sequence included) once per qualifier.
                records.append(deepcopy(merged))
            # If no normalized feature collected use original seq record
            if not features:
                records.append(record)
        return records


def pathing(path: Union[str, Path], new: bool = False) -> Path:
    """ Expand a path and check that it exists (or is free to be created).

    A path whose first character is ``~`` is expanded with ``expanduser``;
    every other path is made absolute.

    :param Union[str, Path] path: path of folder or file you wish to expand.
    :param bool new: when True the destination must not exist yet while its
        parent directory must; when False the path itself must exist.
    :return: A pathlib.Path object.
    :raises ValueError: when one of the existence checks above fails.

    >>> pathing('~/Desktop/folderofgoodstuffs/')
    /home/user/Desktop/folderofgoodstuffs
    """
    path = Path(path)
    # Expand tilde-shortened paths; anchor everything else as an absolute path.
    path = path.expanduser() if str(path).startswith('~') else path.absolute()

    if not new:
        # An existing target is required.
        if path.exists():
            return path
        raise ValueError(f'ERROR ::: Path {path} does not exist.')

    # A new target: the parent has to be there, the target itself must not be.
    if not path.parent.exists():
        raise ValueError(f'ERROR ::: Parent directory of {path} does not exist.')
    if path.exists():
        raise ValueError(f'ERROR ::: {path} already exists!')
    return path


def read_seq(handle: str, format: str, alphabet: object = single_letter_alphabet) -> pandas.DataFrame:
    """ Read Bioinformatic file type

    The path is validated and expanded by ``pathing`` and that expanded path is
    what gets opened, so ``~``-prefixed and relative paths work. A file whose
    suffix is ``.gz`` is opened through ``gzip`` in text mode; any other file is
    handed to ``SeqIO.parse`` as is.

    :param str handle: str path of file to open.
    :param str format: Broad range of Bioinformatic formats ie fasta & genbank.
    :param object alphabet: Custom string from BioPython with handy methods.
        NOTE(review): Bio.Alphabet was dropped from recent Biopython releases --
        confirm the Biopython version this notebook is pinned to.
    :returns: A BioDatabase built by ``BioDatabase.from_seqrecords``.
    :raises ValueError: from ``pathing`` when the path does not exist.

    >>> read_seq('file.fasta.gz', format='fasta')
    """
    # Checks path validity and expands it; use this path from here on.
    path = pathing(handle)
    # If file is gzip compressed
    if path.suffix == '.gz':
        with gzip.open(path, "rt") as stream:
            seqrecords = SeqIO.parse(stream, format=format, alphabet=alphabet)
            # The parser is lazy: the frame must be built while the file is open.
            return BioDatabase.from_seqrecords(seqrecords)
    # Uncompressed; will break if another compression is used.
    seqrecords = SeqIO.parse(str(path), format=format, alphabet=alphabet)
    return BioDatabase.from_seqrecords(seqrecords)


# Add to the pandas module for seamless behavior.
# NOTE: this rebinds ``pandas.DataFrame`` for the whole session, so every later
# ``pandas.DataFrame(...)`` / ``pd.DataFrame(...)`` call builds a BioDatabase.
pandas.DataFrame = BioDatabase
pandas.read_seq = read_seq

pd = pandas
# Defined here so the cell runs on a fresh kernel; before, `gbk` was only
# available if a cell further down had already been executed.
gbk = 'GCF_000013425.1.gbk.gz'
df = pandas.read_seq(gbk, format='genbank')
df.head()

Unnamed: 0,EC_number,_per_letter_annotations,_seq,annotations,codon_start,db_xref,dbxrefs,description,experiment,gene,...,organism,product,protein_id,pseudo,pseudogene,strain,sub_species,transl_table,translation,type
0,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",,[taxon:93061],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,,...,[Staphylococcus aureus subsp. aureus NCTC 8325],,,,,[NCTC 8325],[aureus],,,source
1,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",,[GeneID:3919798],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,[dnaA],...,,,,,,,,,,gene
2,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",[1],[GeneID:3919798],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,[dnaA],...,,[chromosomal replication initiation protein],[YP_498609.1],,,,,[11],[MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSS...,CDS
3,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",,[GeneID:3919799],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,,...,,,,,,,,,,gene
4,[2.7.7.7],{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",[1],[GeneID:3919799],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,,...,,[DNA polymerase III subunit beta],[YP_498610.1],,,,,[11],[MMEFTIKRDYFITQLNDTLKAISPRTTLPILTGIKIDAKEHEVIL...,CDS


# Sample files used

In [55]:
# File names of the gzip-compressed sample inputs used by the examples below.
gbk = 'GCF_000013425.1.gbk.gz'
fasta = 'random_sequences.fasta.gz'

# Fasta to DF example

In [56]:
# Parse the sample FASTA file into a DataFrame and preview its first three rows.
df = pd.read_seq(fasta, format='fasta')
df.head(3)

Unnamed: 0,_per_letter_annotations,_seq,annotations,dbxrefs,description,features,id,name
0,{},"(C, G, A, T, A, T, T, C, G, A, T, C, C, G, C, ...",{},[],FWIRNKE01DKIF6 rank=0000177 x=1346.0 y=2772.0 ...,[],FWIRNKE01DKIF6,FWIRNKE01DKIF6
1,{},"(G, C, G, A, G, C, A, G, C, A, A, T, C, A, T, ...",{},[],FWIRNKE01CDBE3 rank=0000320 x=854.0 y=2685.0 l...,[],FWIRNKE01CDBE3,FWIRNKE01CDBE3
2,{},"(C, G, A, G, C, A, G, C, A, C, A, T, C, A, T, ...",{},[],FWIRNKE01BKZJJ rank=0000535 x=531.0 y=3933.0 l...,[],FWIRNKE01BKZJJ,FWIRNKE01BKZJJ


# Search description with little effort!

In [23]:
# Keep only the rows whose description contains the text 'rank=0000177'.
df[df['description'].str.contains('rank=0000177')]

Unnamed: 0,_per_letter_annotations,_seq,annotations,dbxrefs,description,features,id,name
0,{},"(C, G, A, T, A, T, T, C, G, A, T, C, C, G, C, ...",{},[],FWIRNKE01DKIF6 rank=0000177 x=1346.0 y=2772.0 ...,[],FWIRNKE01DKIF6,FWIRNKE01DKIF6


# GenBank to DF example

In [57]:
# Parse the sample GenBank file into a DataFrame and preview its first three rows.
gbk_df = pd.read_seq(gbk, format='genbank')
gbk_df.head(3)

Unnamed: 0,EC_number,_per_letter_annotations,_seq,annotations,codon_start,db_xref,dbxrefs,description,experiment,gene,...,organism,product,protein_id,pseudo,pseudogene,strain,sub_species,transl_table,translation,type
0,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",,[taxon:93061],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,,...,[Staphylococcus aureus subsp. aureus NCTC 8325],,,,,[NCTC 8325],[aureus],,,source
1,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",,[GeneID:3919798],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,[dnaA],...,,,,,,,,,,gene
2,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",[1],[GeneID:3919798],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,[dnaA],...,,[chromosomal replication initiation protein],[YP_498609.1],,,,,[11],[MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSS...,CDS


# See all the possible columns of metadata

In [58]:
# List every column that was collected from the GenBank records.
gbk_df.columns

Index(['EC_number', '_per_letter_annotations', '_seq', 'annotations',
       'codon_start', 'db_xref', 'dbxrefs', 'description', 'experiment',
       'gene', 'gene_synonym', 'id', 'location', 'locus_tag', 'mol_type',
       'name', 'note', 'organism', 'product', 'protein_id', 'pseudo',
       'pseudogene', 'strain', 'sub_species', 'transl_table', 'translation',
       'type'],
      dtype='object')

# Search for only rows of type CDS

In [59]:
# Boolean mask on the 'type' column keeps only the CDS rows.
cds = gbk_df[gbk_df.type == 'CDS']
# cds.shape is (rows, columns); the first number printed is the CDS count.
print('Genome has CDS count:', cds.shape)
cds.head(3)

Genome has CDS count: (2767, 27)


Unnamed: 0,EC_number,_per_letter_annotations,_seq,annotations,codon_start,db_xref,dbxrefs,description,experiment,gene,...,organism,product,protein_id,pseudo,pseudogene,strain,sub_species,transl_table,translation,type
2,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",[1],[GeneID:3919798],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,[dnaA],...,,[chromosomal replication initiation protein],[YP_498609.1],,,,,[11],[MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSS...,CDS
4,[2.7.7.7],{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",[1],[GeneID:3919799],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,,...,,[DNA polymerase III subunit beta],[YP_498610.1],,,,,[11],[MMEFTIKRDYFITQLNDTLKAISPRTTLPILTGIKIDAKEHEVIL...,CDS
6,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",[1],[GeneID:3919176],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,,...,,[hypothetical protein],[YP_498611.1],,,,,[11],[MIILVQEVVVEGDINLGQFLKTEGIIESGGQAKWFLQDVEVLING...,CDS


# Better view of the layout for a single row

In [60]:
# Dump the first CDS row as {column: {index: value}} so each field is shown in full.
cds.head(1).to_dict()

{'EC_number': {2: nan},
 '_per_letter_annotations': {2: {}},
 '_seq': {2: Seq('CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCAT...TAT', SingleLetterAlphabet())},
 'annotations': {2: {'molecule_type': 'DNA',
   'topology': 'circular',
   'data_file_division': 'CON',
   'date': '03-AUG-2016',
   'accessions': ['NC_007795'],
   'sequence_version': 1,
   'keywords': ['RefSeq'],
   'source': 'Staphylococcus aureus subsp. aureus NCTC 8325',
   'organism': 'Staphylococcus aureus subsp. aureus NCTC 8325',
   'taxonomy': ['Bacteria',
    'Firmicutes',
    'Bacilli',
    'Bacillales',
    'Staphylococcaceae',
    'Staphylococcus'],
   'references': [Reference(title='The Staphylococcus aureus NCTC8325 Genome', ...),
    Reference(title='Direct Submission', ...),
    Reference(title='Direct Submission', ...)],
   'comment': 'REVIEWED REFSEQ: This record has been curated by NCBI staff. The\nreference sequence was derived from CP000253.\nRefSeq Category: Reference Genome\n            UPR: UniP

# Extra Notes

# Creating a BioPython seq from scratch

In [8]:
# `Seq` is not covered by the import cell at the top of the notebook, so it is
# imported here to keep this cell runnable on a fresh kernel.
from Bio.Seq import Seq

seq = Seq('ATGCATGATGATGATGATGATAG', alphabet=single_letter_alphabet)
seq

Seq('ATGCATGATGATGATGATGATAG', SingleLetterAlphabet())

## You can create a dataframe row as such

In [12]:
# Build a one-row frame from a dict holding the Seq object and read back its '_seq' column.
pd.DataFrame([{'_seq': seq}])._seq 

0    (A, T, G, C, A, T, G, A, T, G, A, T, G, A, T, ...
Name: _seq, dtype: object