In [21]:
# BioDataFrame

In [4]:
from Bio.SeqIO.FastaIO import SequenceIterator
class FastaIterator(SequenceIterator):
    """Parser for Fasta files."""

    def __init__(self, source, alphabet=single_letter_alphabet, title2ids=None):
        """Iterate over Fasta records as SeqRecord objects.
        Arguments:
         - source - input stream opened in text mode, or a path to a file
         - alphabet - optional alphabet
         - title2ids - A function that, when given the title of the FASTA
           file (without the beginning >), will return the id, name and
           description (in that order) for the record as a tuple of strings.
           If this is not given, then the entire title line will be used
           as the description, and the first word as the id and name.
        By default this will act like calling Bio.SeqIO.parse(handle, "fasta")
        with no custom handling of the title lines:
        >>> with open("Fasta/dups.fasta") as handle:
        ...     for record in FastaIterator(handle):
        ...         print(record.id)
        ...
        alpha
        beta
        gamma
        alpha
        delta
        However, you can supply a title2ids function to alter this:
        >>> def take_upper(title):
        ...     return title.split(None, 1)[0].upper(), "", title
        >>> with open("Fasta/dups.fasta") as handle:
        ...     for record in FastaIterator(handle, title2ids=take_upper):
        ...         print(record.id)
        ...
        ALPHA
        BETA
        GAMMA
        ALPHA
        DELTA
        """
        self.title2ids = title2ids
        super().__init__(source, alphabet=alphabet, mode="t", fmt="Fasta")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator."""
        records = self.iterate(handle)
        return records

    def iterate(self, handle):
        """Parse the file and generate SeqRecord objects."""
        alphabet = self.alphabet
        title2ids = self.title2ids
        if title2ids:
            for title, sequence in SimpleFastaParser(handle):
                id, name, descr = title2ids(title)
                yield SeqRecord(
                    Seq(sequence, alphabet), id=id, name=name, description=descr
                )
        else:
            for title, sequence in SimpleFastaParser(handle):
                try:
                    first_word = title.split(None, 1)[0]
                except IndexError:
                    assert not title, repr(title)
                    # Should we use SeqRecord default for no ID?
                    first_word = ""
                yield SeqRecord(
                    Seq(sequence, alphabet),
                    id=first_word,
                    name=first_word,
                    description=title,
                )

ImportError: cannot import name 'SequenceIterator' from 'Bio.SeqIO.FastaIO' (/Users/tmsincomb/anaconda3/lib/python3.7/site-packages/Bio/SeqIO/FastaIO.py)

In [32]:
def SimpleFastaParser(handle):
    """Iterate over Fasta records as string tuples.

    For each record a tuple of two strings is returned, the FASTA title
    line (without the leading '>' character), and the sequence (with any
    whitespace removed). The title line is not divided up into an
    identifier (the first word) and comment or description.

    >>> with open("Fasta/dups.fasta") as handle:
    ...     for values in SimpleFastaParser(handle):
    ...         print(values)
    ...
    ('alpha', 'ACGTA')
    ('beta', 'CGTC')
    ('gamma', 'CCGCC')
    ('alpha (again - this is a duplicate entry to test the indexing code)', 'ACGTA')
    ('delta', 'CGCGC')

    """
    # Skip any text before the first record (e.g. blank lines, comments)
    # This matches the previous implementation where .readline() was used
    for line in handle:
        if line[0] == ">":
            title = line[1:].rstrip()
            break
        elif isinstance(line[0], int):
            # Same exception as for FASTQ files
            raise ValueError("Is this handle in binary mode not text mode?")
    else:
        # no break encountered - probably an empty file
        return

    # Main logic
    # Note, remove trailing whitespace, and any internal spaces
    # (and any embedded \r which are possible in mangled files
    # when not opened in universal read lines mode)
    lines = []
    for line in handle:
        if line[0] == ">":
            yield title, "".join(lines).replace(" ", "").replace("\r", "")
            lines = []
            title = line[1:].rstrip()
            continue
        lines.append(line.rstrip())

    yield title, "".join(lines).replace(" ", "").replace("\r", "")


In [40]:
from Bio.Seq import Seq
from Bio.Alphabet import single_letter_alphabet
import pandas as pd
pd.DataFrame([{'seq':Seq('ATGCATGATGATGATGATGATAG', alphabet=single_letter_alphabet)}]).seq

0    (A, T, G, C, A, T, G, A, T, G, A, T, G, A, T, ...
Name: seq, dtype: object

In [31]:
from Bio.SeqIO import InsdcIO
InsdcIO.GenBankIterator??

[0;31mSignature:[0m [0mInsdcIO[0m[0;34m.[0m[0mGenBankIterator[0m[0;34m([0m[0mhandle[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mGenBankIterator[0m[0;34m([0m[0mhandle[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Break up a Genbank file into SeqRecord objects.[0m
[0;34m[0m
[0;34m    Every section from the LOCUS line to the terminating // becomes[0m
[0;34m    a single SeqRecord with associated annotation and features.[0m
[0;34m[0m
[0;34m    Note that for genomes or chromosomes, there is typically only[0m
[0;34m    one record.[0m
[0;34m[0m
[0;34m    This gets called internally by Bio.SeqIO for the GenBank file format:[0m
[0;34m[0m
[0;34m    >>> from Bio import SeqIO[0m
[0;34m    >>> for record in SeqIO.parse("GenBank/cor6_6.gb", "gb"):[0m
[0;34m    ...     print(record.id)[0m
[0;34m    ...[0m
[0;34m    X55053.1[0m
[0;34m    X62281.1[0m
[0;34m    M81224.1[0m
[0;34m    AJ237582.1[0m
[

In [28]:
from Bio.SeqIO import FastaIO
FastaIO??

[0;31mType:[0m        module
[0;31mString form:[0m <module 'Bio.SeqIO.FastaIO' from '/Users/tmsincomb/anaconda3/lib/python3.7/site-packages/Bio/SeqIO/FastaIO.py'>
[0;31mFile:[0m        ~/anaconda3/lib/python3.7/site-packages/Bio/SeqIO/FastaIO.py
[0;31mSource:[0m     
[0;31m# Copyright 2006-2017 by Peter Cock.  All rights reserved.[0m[0;34m[0m
[0;34m[0m[0;31m#[0m[0;34m[0m
[0;34m[0m[0;31m# This file is part of the Biopython distribution and governed by your[0m[0;34m[0m
[0;34m[0m[0;31m# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".[0m[0;34m[0m
[0;34m[0m[0;31m# Please see the LICENSE file that should have been included as part of this[0m[0;34m[0m
[0;34m[0m[0;31m# package.[0m[0;34m[0m
[0;34m[0m[0;31m#[0m[0;34m[0m
[0;34m[0m[0;31m# This module is for reading and writing FASTA format files as SeqRecord[0m[0;34m[0m
[0;34m[0m[0;31m# objects.  The code is partly inspired  by earlier Biopython modules,[0m[0;34m

In [60]:
import Bio

class BioDataFrame(metaclass=Bio.SeqIO):
    pass 

vars(BioDataFrame)

TypeError: 'module' object is not callable

In [28]:

    @classmethod
    def from_seqrecords_raw(cls, data, index=None, exclude=None, columns=None,
                        coerce_float=False, nrows=None):
        """
        Convert structured or record ndarray to DataFrame.

        Parameters
        ----------
        data : ndarray (structured dtype), list of tuples, dict, or DataFrame
        index : string, list of fields, array-like
            Field of array to use as the index, alternately a specific set of
            input labels to use
        exclude : sequence, default None
            Columns or fields to exclude
        columns : sequence, default None
            Column names to use. If the passed data do not have names
            associated with them, this argument provides names for the
            columns. Otherwise this argument indicates the order of the columns
            in the result (any names not found in the data will become all-NA
            columns)
        coerce_float : boolean, default False
            Attempt to convert values of non-string, non-numeric objects (like
            decimal.Decimal) to floating point, useful for SQL result sets
        nrows : int, default None
            Number of rows to read if data is an iterator

        Returns
        -------
        df : DataFrame
        """

        # Make a copy of the input columns so we can modify it
        if columns is not None:
            columns = ensure_index(columns)

        if pandas.api.types.is_iterator(data):
            if nrows == 0:
                return cls()

            try:
                first_row = next(data)
            except StopIteration:
                return cls(index=index, columns=columns)

            dtype = None
            if hasattr(first_row, 'dtype') and first_row.dtype.names:
                dtype = first_row.dtype

            values = [first_row]

            if nrows is None:
                values += data
            else:
                values.extend(itertools.islice(data, nrows - 1))

            if dtype is not None:
                data = np.array(values, dtype=dtype)
            else:
                data = values

        if isinstance(data, dict):
            if columns is None:
                columns = arr_columns = ensure_index(sorted(data))
                arrays = [data[k] for k in columns]
            else:
                arrays = []
                arr_columns = []
                for k, v in compat.iteritems(data):
                    if k in columns:
                        arr_columns.append(k)
                        arrays.append(v)

                arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns)
        elif isinstance(data, (np.ndarray, DataFrame)):
            arrays, columns = to_arrays(data, columns)
            if columns is not None:
                columns = ensure_index(columns)
            arr_columns = columns
        else:
            arrays, arr_columns = to_arrays(data, columns,coerce_float=coerce_float)
            arr_columns = ensure_index(arr_columns)
            if columns is not None:
                columns = ensure_index(columns)
            else:
                columns = arr_columns

        if exclude is None:
            exclude = set()
        else:
            exclude = set(exclude)

        result_index = None
        if index is not None:
            if (isinstance(index, compat.string_types) or
                    not hasattr(index, "__iter__")):
                i = columns.get_loc(index)
                exclude.add(index)
                if len(arrays) > 0:
                    result_index = Index(arrays[i], name=index)
                else:
                    result_index = Index([], name=index)
            else:
                try:
                    to_remove = [arr_columns.get_loc(field) for field in index]
                    index_data = [arrays[i] for i in to_remove]
                    result_index = ensure_index_from_sequences(index_data,
                                                               names=index)

                    exclude.update(index)
                except Exception:
                    result_index = index

        if any(exclude):
            arr_exclude = [x for x in exclude if x in arr_columns]
            to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
            arrays = [v for i, v in enumerate(arrays) if i not in to_remove]

            arr_columns = arr_columns.drop(arr_exclude)
            columns = columns.drop(exclude)

        mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns)

        return cls(mgr)


TypeError: function() argument 1 must be code, not str

In [49]:
from Bio import SeqIO
gbk = '/Users/tmsincomb/Dropbox/school/BIOMI696-02/MRSA/data/genomes/GCF_000013425.1_ASM1342v1_genomic.gbff'
fasta = '/Users/tmsincomb/Dropbox/school/BIOMI696-02/MRSA/data/genomes/MRSA-evolved-under-FS-1-pressure.fasta'
fasta = '/Users/tmsincomb/Dropbox/school/bio-696/A1/bigdata.fna'
bioseq_gbk = SeqIO.parse(gbk, format='genbank')
bioseq_fasta = SeqIO.parse(fasta, format='fasta')
# SeqIO.to_dict(bioseq_fasta)
data = [values.__dict__ for values in SeqIO.to_dict(bioseq_fasta).values()]
data
# print(type(next(bioseq_fasta)))
# print(type(bioseq_fasta))

[{'_seq': Seq('CGATATTCGATCCGCATCGCTGCCCTACCCGTGGAGTGCCTCCCTCGGNGCAG', SingleLetterAlphabet()),
  'id': 'FWIRNKE01DKIF6',
  'name': 'FWIRNKE01DKIF6',
  'description': 'FWIRNKE01DKIF6 rank=0000177 x=1346.0 y=2772.0 length=53',
  'dbxrefs': [],
  'annotations': {},
  '_per_letter_annotations': {},
  'features': []},
 {'_seq': Seq('GCGAGCAGCAATCATGCTGCCTCCCGTCGGAGGTGGCCCTCCCCTCCCTCCGC', SingleLetterAlphabet()),
  'id': 'FWIRNKE01CDBE3',
  'name': 'FWIRNKE01CDBE3',
  'description': 'FWIRNKE01CDBE3 rank=0000320 x=854.0 y=2685.0 length=53',
  'dbxrefs': [],
  'annotations': {},
  '_per_letter_annotations': {},
  'features': []},
 {'_seq': Seq('CGAGCAGCACATCATGCCTGGCCTTCCGACGGAGTGCCTCCTCGC', SingleLetterAlphabet()),
  'id': 'FWIRNKE01BKZJJ',
  'name': 'FWIRNKE01BKZJJ',
  'description': 'FWIRNKE01BKZJJ rank=0000535 x=531.0 y=3933.0 length=45',
  'dbxrefs': [],
  'annotations': {},
  '_per_letter_annotations': {},
  'features': []},
 {'_seq': Seq('CGTATGACTGTATCATGCTGCCTCCCGTAGGAGTGCCTCCTCGAC',

In [48]:
SeqIO.to_dict(bioseq_fasta).values()

dict_values([])

In [38]:
records = [values.__dict__ for values in SeqIO.to_dict(SeqIO.parse(fasta, "fasta")).values()]
records[0]

{'_seq': Seq('CGATATTCGATCCGCATCGCTGCCCTACCCGTGGAGTGCCTCCCTCGGNGCAG', SingleLetterAlphabet()),
 'id': 'FWIRNKE01DKIF6',
 'name': 'FWIRNKE01DKIF6',
 'description': 'FWIRNKE01DKIF6 rank=0000177 x=1346.0 y=2772.0 length=53',
 'dbxrefs': [],
 'annotations': {},
 '_per_letter_annotations': {},
 'features': []}

In [77]:
for k, v in {}.items():
    print(k)

In [53]:
SeqIO.parse??

[0;31mSignature:[0m [0mSeqIO[0m[0;34m.[0m[0mparse[0m[0;34m([0m[0mhandle[0m[0;34m,[0m [0mformat[0m[0;34m,[0m [0malphabet[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mparse[0m[0;34m([0m[0mhandle[0m[0;34m,[0m [0mformat[0m[0;34m,[0m [0malphabet[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34mr"""Turn a sequence file into an iterator returning SeqRecords.[0m
[0;34m[0m
[0;34m    Arguments:[0m
[0;34m     - handle   - handle to the file, or the filename as a string[0m
[0;34m       (note older versions of Biopython only took a handle).[0m
[0;34m     - format   - lower case string describing the file format.[0m
[0;34m     - alphabet - optional Alphabet object, useful when the sequence type[0m
[0;34m       cannot be automatically inferred from the file itself[0m
[0;34m       (e.g. format="fasta" or "tab")[0m
[0;34m[0m
[0;34m    Typical usage, 

In [104]:
import Bio
import pandas as pd 
from pandas import DataFrame
import pandas 
import numpy as np
from pandas.core.frame import *
import types
from typing import List


class SubclassedSeries(pd.Series):
    """ Pandas Series API to Inherit """
    @property
    def _constructor(self):
        return SubclassedSeries

    @property
    def _constructor_expanddim(self):
        return SubclassedDataFrame


class SubclassedDataFrame(pd.DataFrame):
    """ Pandas DataFrame to Inherit """
    @property
    def _constructor(self):
        return SubclassedDataFrame

    @property
    def _constructor_sliced(self):
        return SubclassedSeries
    
    
class BioDatabase(SubclassedDataFrame):
    """ Expanded Pandas DataFrame to handle BioPython SeqRecords generator or genomic file types """
    @classmethod
    def from_seqrecords(cls, seqrecords, index=None, exclude=None, columns=None,
                        coerce_float=False, nrows=None):
        """ Takes Biopython parsed output to convert to a proper DataFrame"""
        if isinstance(seqrecords, types.GeneratorType):
            data = cls.__normalize_seqrecords(seqrecords)
        else:
            data = seqrecords
        return cls.from_records(data, index=index, exclude=exclude, columns=columns,
                                coerce_float=coerce_float, nrows=nrows)
    
    def __normalize_seqrecords(seqrecords) -> List[dict]:
        """ Pull nested dictionaries into a single dictionary. 
        
        Priority is given to the keys higher in the hierarchy.  
        """
        records = []
        for seqrecord in SeqIO.to_dict(seqrecords).values():
            _records = []
            record = seqrecord.__dict__
            # If a more complicated format is used; features will be nested.
            features = record.pop('features') if record.get('features') else []
            for feature in features:
                _record = deepcopy(record)
                # Meta that make up the feature 
                aspects = feature.__dict__
                # Qualifier dictionary inside each feature
                qualifiers = aspects.pop('qualifiers') if aspects.get('qualifiers') else {}
                # Add feature
                for aspect_key, aspect_value in aspects.items():
                    if aspect_key not in record:
                        _record[aspect_key] = aspect_value
                # Add qualifier 
                for qualifier_key, qualifier_value in qualifiers.items():
                    _record = deepcopy(_record)
                    if qualifier_key not in _record:
                        _record[qualifier_key] = qualifier_value
                        _records += [_record]
                # If no qualifiers dump feature
                if not _records:
                    _records += [_record]
            # If no feature dump original seq record
            if not _records:
                _records += [record]
            records += _records
        
        return records 
    

def read_seq(handle, format, alphabet=None):
    seqrecords = SeqIO.parse(handle, format=format, alphabet=alphabet)
    return BioDatabase.from_seqrecords(seqrecords)
    

pd.DataFrame = BioDatabase 
pd.read_seq = read_seq


# pd.DataFrame.from_seqrecords([{1:2}])
# pd.DataFrame.from_seqrecords(SeqIO.parse(fasta, format='fasta'))
df = pd.read_seq(fasta, format='fasta')
print(df.loc[0, '_seq'])
df.head()

CGATATTCGATCCGCATCGCTGCCCTACCCGTGGAGTGCCTCCCTCGGNGCAG


Unnamed: 0,_per_letter_annotations,_seq,annotations,dbxrefs,description,features,id,name
0,{},"(C, G, A, T, A, T, T, C, G, A, T, C, C, G, C, ...",{},[],FWIRNKE01DKIF6 rank=0000177 x=1346.0 y=2772.0 ...,[],FWIRNKE01DKIF6,FWIRNKE01DKIF6
1,{},"(G, C, G, A, G, C, A, G, C, A, A, T, C, A, T, ...",{},[],FWIRNKE01CDBE3 rank=0000320 x=854.0 y=2685.0 l...,[],FWIRNKE01CDBE3,FWIRNKE01CDBE3
2,{},"(C, G, A, G, C, A, G, C, A, C, A, T, C, A, T, ...",{},[],FWIRNKE01BKZJJ rank=0000535 x=531.0 y=3933.0 l...,[],FWIRNKE01BKZJJ,FWIRNKE01BKZJJ
3,{},"(C, G, T, A, T, G, A, C, T, G, T, A, T, C, A, ...",{},[],FWIRNKE01CT8MK rank=0000656 x=1047.0 y=1690.0 ...,[],FWIRNKE01CT8MK,FWIRNKE01CT8MK
4,{},"(C, G, A, G, C, A, G, C, A, C, A, T, C, A, T, ...",{},[],FWIRNKE01EP6FI rank=0000658 x=1821.0 y=1148.0 ...,[],FWIRNKE01EP6FI,FWIRNKE01EP6FI


In [107]:
df = pd.read_seq(fasta, format='fasta')
df.head()

Unnamed: 0,_per_letter_annotations,_seq,annotations,dbxrefs,description,features,id,name
0,{},"(C, G, A, T, A, T, T, C, G, A, T, C, C, G, C, ...",{},[],FWIRNKE01DKIF6 rank=0000177 x=1346.0 y=2772.0 ...,[],FWIRNKE01DKIF6,FWIRNKE01DKIF6
1,{},"(G, C, G, A, G, C, A, G, C, A, A, T, C, A, T, ...",{},[],FWIRNKE01CDBE3 rank=0000320 x=854.0 y=2685.0 l...,[],FWIRNKE01CDBE3,FWIRNKE01CDBE3
2,{},"(C, G, A, G, C, A, G, C, A, C, A, T, C, A, T, ...",{},[],FWIRNKE01BKZJJ rank=0000535 x=531.0 y=3933.0 l...,[],FWIRNKE01BKZJJ,FWIRNKE01BKZJJ
3,{},"(C, G, T, A, T, G, A, C, T, G, T, A, T, C, A, ...",{},[],FWIRNKE01CT8MK rank=0000656 x=1047.0 y=1690.0 ...,[],FWIRNKE01CT8MK,FWIRNKE01CT8MK
4,{},"(C, G, A, G, C, A, G, C, A, C, A, T, C, A, T, ...",{},[],FWIRNKE01EP6FI rank=0000658 x=1821.0 y=1148.0 ...,[],FWIRNKE01EP6FI,FWIRNKE01EP6FI


In [122]:
df[df['description'].str.contains('rank=0000177')]

Unnamed: 0,_per_letter_annotations,_seq,annotations,dbxrefs,description,features,id,name
0,{},"(C, G, A, T, A, T, T, C, G, A, T, C, C, G, C, ...",{},[],FWIRNKE01DKIF6 rank=0000177 x=1346.0 y=2772.0 ...,[],FWIRNKE01DKIF6,FWIRNKE01DKIF6


In [106]:
df = pd.read_seq(gbk, format='genbank')
df.head()

Unnamed: 0,EC_number,_per_letter_annotations,_seq,annotations,codon_start,db_xref,dbxrefs,description,experiment,gene,...,organism,product,protein_id,pseudo,pseudogene,strain,sub_species,transl_table,translation,type
0,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",,,"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,,...,[Staphylococcus aureus subsp. aureus NCTC 8325],,,,,,,,,source
1,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",,,"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,,...,[Staphylococcus aureus subsp. aureus NCTC 8325],,,,,,,,,source
2,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",,,"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,,...,[Staphylococcus aureus subsp. aureus NCTC 8325],,,,,[NCTC 8325],,,,source
3,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",,,"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,,...,[Staphylococcus aureus subsp. aureus NCTC 8325],,,,,[NCTC 8325],[aureus],,,source
4,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...","{'molecule_type': 'DNA', 'topology': 'circular...",,[taxon:93061],"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",Staphylococcus aureus subsp. aureus NCTC 8325 ...,,,...,[Staphylococcus aureus subsp. aureus NCTC 8325],,,,,[NCTC 8325],[aureus],,,source
