In [1]:
from BioPandas import pandas as pd

# Sample files used

In [29]:
# `.gz` sample inputs under test-data/ (paths are relative to the working directory).
fasta = 'test-data/random_sequences.fasta.gz'  # FASTA input
gbk = 'test-data/GCF_000013425.1.gbk.gz'  # GenBank input

# Fasta to Pandas DataFrame example

In [11]:
# Parse the FASTA file with BioPandas' `read_seq` reader. Judging by the preview,
# this yields one row per sequence record, with columns such as `id`,
# `description` and `_seq`.
fasta_df = pd.read_seq(fasta, format='fasta')
# Preview the first three records.
fasta_df.head(3)

Unnamed: 0,_per_letter_annotations,_seq,annotations,dbxrefs,description,features,id,name
0,{},"(C, G, A, T, A, T, T, C, G, A, T, C, C, G, C, ...",{},[],FWIRNKE01DKIF6 rank=0000177 x=1346.0 y=2772.0 ...,[],FWIRNKE01DKIF6,FWIRNKE01DKIF6
1,{},"(G, C, G, A, G, C, A, G, C, A, A, T, C, A, T, ...",{},[],FWIRNKE01CDBE3 rank=0000320 x=854.0 y=2685.0 l...,[],FWIRNKE01CDBE3,FWIRNKE01CDBE3
2,{},"(C, G, A, G, C, A, G, C, A, C, A, T, C, A, T, ...",{},[],FWIRNKE01BKZJJ rank=0000535 x=531.0 y=3933.0 l...,[],FWIRNKE01BKZJJ,FWIRNKE01BKZJJ


# Search description with little effort!

In [12]:
# Keep only the records whose description mentions the rank of interest.
# (`str.contains` treats the pattern as a regex by default; this one has no
# special characters, so it matches literally.)
fasta_df.loc[
    fasta_df['description'].str.contains('rank=0000177')
]

Unnamed: 0,_per_letter_annotations,_seq,annotations,dbxrefs,description,features,id,name
0,{},"(C, G, A, T, A, T, T, C, G, A, T, C, C, G, C, ...",{},[],FWIRNKE01DKIF6 rank=0000177 x=1346.0 y=2772.0 ...,[],FWIRNKE01DKIF6,FWIRNKE01DKIF6


# Better view of the layout for a single row

In [13]:
# Render the first record as a plain dict so every field is shown in full
# rather than as truncated table cells.
fasta_df.iloc[:1].to_dict(orient='records')

[{'_per_letter_annotations': {},
  '_seq': Seq('CGATATTCGATCCGCATCGCTGCCCTACCCGTGGAGTGCCTCCCTCGGNGCAG', SingleLetterAlphabet()),
  'annotations': {},
  'dbxrefs': [],
  'description': 'FWIRNKE01DKIF6 rank=0000177 x=1346.0 y=2772.0 length=53',
  'features': [],
  'id': 'FWIRNKE01DKIF6',
  'name': 'FWIRNKE01DKIF6'}]

# GenBank to Pandas DataFrame example

In [30]:
# Parse the GenBank file with BioPandas' `read_seq` reader. Judging by the preview,
# each row is one annotated feature (`type` is e.g. source / gene / CDS) and the
# record-level fields (accessions, comment, taxonomy, ...) are repeated on every row.
gbk_df = pd.read_seq(gbk, format='genbank')
# Preview the first three features.
gbk_df.head(3)

Unnamed: 0,EC_number,_per_letter_annotations,_seq,accessions,codon_start,comment,contig,data_file_division,date,db_xref,...,references,sequence_version,source,strain,sub_species,taxonomy,topology,transl_table,translation,type
0,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...",[NC_007795],,REVIEWED REFSEQ: This record has been curated ...,join(CP000253.1:1..2821361),CON,03-AUG-2016,[taxon:93061],...,"[location: [0:2821361]\nauthors: Gillaspy,A.F....",1,Staphylococcus aureus subsp. aureus NCTC 8325,[NCTC 8325],[aureus],"[Bacteria, Firmicutes, Bacilli, Bacillales, St...",circular,,,source
1,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...",[NC_007795],,REVIEWED REFSEQ: This record has been curated ...,join(CP000253.1:1..2821361),CON,03-AUG-2016,[GeneID:3919798],...,"[location: [0:2821361]\nauthors: Gillaspy,A.F....",1,Staphylococcus aureus subsp. aureus NCTC 8325,,,"[Bacteria, Firmicutes, Bacilli, Bacillales, St...",circular,,,gene
2,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...",[NC_007795],[1],REVIEWED REFSEQ: This record has been curated ...,join(CP000253.1:1..2821361),CON,03-AUG-2016,[GeneID:3919798],...,"[location: [0:2821361]\nauthors: Gillaspy,A.F....",1,Staphylococcus aureus subsp. aureus NCTC 8325,,,"[Bacteria, Firmicutes, Bacilli, Bacillales, St...",circular,[11],[MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSS...,CDS


# See all the possible columns of metadata

In [19]:
# List every column name available in the GenBank-derived frame.
gbk_df.columns

Index(['_per_letter_annotations', '_seq', 'accessions', 'codon_start',
       'collected_by', 'collection_date', 'comment', 'country',
       'culture_collection', 'data_file_division', 'date', 'db_xref',
       'dbxrefs', 'description', 'gene', 'host', 'id', 'isolation_source',
       'keywords', 'lat_lon', 'location', 'locus_tag', 'mol_type',
       'molecule_type', 'name', 'ncRNA_class', 'note', 'organism', 'product',
       'protein_id', 'references', 'sequence_version', 'source', 'strain',
       'structured_comment', 'taxonomy', 'topology', 'transl_table',
       'translation', 'type'],
      dtype='object')

# Search for only rows of type CDS

In [31]:
# Split the feature table by the value of its `type` column.
cds = gbk_df.loc[gbk_df['type'] == 'CDS']
gene = gbk_df.loc[gbk_df['type'] == 'gene']

# `.shape` is a (rows, columns) tuple, so the first number printed is the
# number of features of that type.
print('Genome has CDS count:', cds.shape)
print('Genome has GENE count:', gene.shape)

Genome has CDS count: (2767, 38)
Genome has GENE count: (2872, 38)


In [32]:
# Preview the first three CDS rows; the row labels are carried over from
# gbk_df (filtering does not renumber them).
cds.head(3)

Unnamed: 0,EC_number,_per_letter_annotations,_seq,accessions,codon_start,comment,contig,data_file_division,date,db_xref,...,references,sequence_version,source,strain,sub_species,taxonomy,topology,transl_table,translation,type
2,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...",[NC_007795],[1],REVIEWED REFSEQ: This record has been curated ...,join(CP000253.1:1..2821361),CON,03-AUG-2016,[GeneID:3919798],...,"[location: [0:2821361]\nauthors: Gillaspy,A.F....",1,Staphylococcus aureus subsp. aureus NCTC 8325,,,"[Bacteria, Firmicutes, Bacilli, Bacillales, St...",circular,[11],[MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSS...,CDS
4,[2.7.7.7],{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...",[NC_007795],[1],REVIEWED REFSEQ: This record has been curated ...,join(CP000253.1:1..2821361),CON,03-AUG-2016,[GeneID:3919799],...,"[location: [0:2821361]\nauthors: Gillaspy,A.F....",1,Staphylococcus aureus subsp. aureus NCTC 8325,,,"[Bacteria, Firmicutes, Bacilli, Bacillales, St...",circular,[11],[MMEFTIKRDYFITQLNDTLKAISPRTTLPILTGIKIDAKEHEVIL...,CDS
6,,{},"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...",[NC_007795],[1],REVIEWED REFSEQ: This record has been curated ...,join(CP000253.1:1..2821361),CON,03-AUG-2016,[GeneID:3919176],...,"[location: [0:2821361]\nauthors: Gillaspy,A.F....",1,Staphylococcus aureus subsp. aureus NCTC 8325,,,"[Bacteria, Firmicutes, Bacilli, Bacillales, St...",circular,[11],[MIILVQEVVVEGDINLGQFLKTEGIIESGGQAKWFLQDVEVLING...,CDS


# Better view of the layout for a single row

In [33]:
# Render the first CDS row as a plain dict so every field is shown in full
# rather than as truncated table cells.
cds.iloc[:1].to_dict(orient='records')

[{'EC_number': nan,
  '_per_letter_annotations': {},
  '_seq': Seq('CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCAT...TAT', IUPACAmbiguousDNA()),
  'accessions': ['NC_007795'],
  'codon_start': ['1'],
  'comment': 'REVIEWED REFSEQ: This record has been curated by NCBI staff. The\nreference sequence was derived from CP000253.\nRefSeq Category: Reference Genome\n            UPR: UniProt Genome\nStaphylococcus aureus subsp. aureus NCTC 8325 is available from\nwww.narsa.net.\nCOMPLETENESS: full length.',
  'contig': 'join(CP000253.1:1..2821361)',
  'data_file_division': 'CON',
  'date': '03-AUG-2016',
  'db_xref': ['GeneID:3919798'],
  'dbxrefs': ['BioProject:PRJNA57795', 'Assembly:GCF_000013425.1'],
  'description': 'Staphylococcus aureus subsp. aureus NCTC 8325 chromosome, complete genome',
  'experiment': nan,
  'gene': ['dnaA'],
  'gene_synonym': nan,
  'id': 'NC_007795.1',
  'keywords': ['RefSeq'],
  'location': FeatureLocation(ExactPosition(516), ExactPosition(1878), strand=1)

In [None]:
# Let's say we can predict the probability of a disease if a person has a ... (TODO: this note was left unfinished)

------

# Machine Learning Example Application

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense 
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# NOTE(review): neither `train_test_split` nor `df` is defined in any cell visible
# in this notebook. Presumably these are scikit-learn's splitter and a table with
# a `status` column; confirm and add the missing import/load cell before running.
#
# Features are the columns at positions 1-4; the target is the `status` column.
# 33% of the rows are held out for testing, with a fixed random_state for the split.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 1:5],
    df['status'],
    test_size=0.33,
    random_state=100,
)

# Create a general sequential model with 4 layers

In [None]:
# Small fully-connected network: three hidden layers plus a single-unit output.
model = Sequential([
    Dense(4, activation='relu'),  # width of 4 is an arbitrary guess
    Dense(4, activation='relu'),
    Dense(4, activation='relu'),
    Dense(1),  # single output: the healthy status
])

# Mean squared error is used as a simple sanity-check loss.
model.compile(loss='mse', optimizer='rmsprop')

In [None]:
# Callback that would stop training once the validation loss fails to improve
# for one epoch. It is created here but not passed to fit() below.
early_stop = EarlyStopping(patience=1, monitor='val_loss')

model.fit(
    x=X_train.astype('float32'),  # float32 saves memory and runtime; 64 bits are not needed here
    y=y_train.astype('float32'),
    epochs=100,  # number of training epochs
    verbose=1,  # set to 0 to silence the per-epoch printout
    # Early stopping and validation are disabled because the data set is too
    # small to use as a validator:
    # callbacks=[early_stop],
    # validation_data=(X_test.astype('float32'), y_test.astype('float32')),
)

# Check how well your model was built

In [None]:
# Turn the metrics recorded by the fit() call into a frame, one row per epoch.
# Only the training loss should be present, since fit() was called without
# validation data and compile() set no extra metrics.
loss = pd.DataFrame(model.history.history)
# Plot the recorded values against the epoch index.
loss.plot()

In [None]:
model # That's exciting! This is also a word of caution: you can make a model from anything.

--------

# Extra Notes

In [10]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

# Creating a BioPython seq from scratch

In [11]:
# Build a Seq directly from a string, tagging it with the generic single-letter alphabet.
# NOTE(review): the Bio.Alphabet module was removed in Biopython 1.78, so this cell
# presumably needs an older Biopython release; confirm the pinned version.
seq = Seq('ATGCATGATGATGATGATGATAG', alphabet=IUPAC.Alphabet.single_letter_alphabet)
# Display the Seq repr.
seq

Seq('ATGCATGATGATGATGATGATAG', SingleLetterAlphabet())

## You can create a simple dataframe as such

In [12]:
# Wrap the Seq in a one-row frame and pull the `_seq` column back out as a Series.
pd.DataFrame([{'_seq': seq}])['_seq']

0    (A, T, G, C, A, T, G, A, T, G, A, T, G, A, T, ...
Name: _seq, dtype: object