In [1]:
import seqpandas as spd
from Bio import SeqIO
import gzip
import numpy as np

# Sample files used

In [2]:
# Gzip-compressed GenBank record used by the GenBank examples in this notebook.
gbk = "tests/test-data/GCF_000013425.1.gbk.gz"
# Gzip-compressed FASTA file used by the FASTA examples in this notebook.
fasta = "tests/test-data/random_sequences.fasta.gz"

# Reading Different Genomic File Formats

## FASTA Files

In [3]:
# Parse the FASTA file into a DataFrame; the .gz path is handed to read_seq as-is
# (no manual gzip.open in this cell).
fasta_df = spd.read_seq(fasta, format="fasta")
# Preview the first three rows (one row per sequence record in the output below).
fasta_df.head(3)

Unnamed: 0,_seq,id,name,description,dbxrefs,features,annotations,_per_letter_annotations
0,"(C, G, A, T, A, T, T, C, G, A, T, C, C, G, C, ...",FWIRNKE01DKIF6,FWIRNKE01DKIF6,FWIRNKE01DKIF6 rank=0000177 x=1346.0 y=2772.0 ...,[],[],{},
1,"(G, C, G, A, G, C, A, G, C, A, A, T, C, A, T, ...",FWIRNKE01CDBE3,FWIRNKE01CDBE3,FWIRNKE01CDBE3 rank=0000320 x=854.0 y=2685.0 l...,[],[],{},
2,"(C, G, A, G, C, A, G, C, A, C, A, T, C, A, T, ...",FWIRNKE01BKZJJ,FWIRNKE01BKZJJ,FWIRNKE01BKZJJ rank=0000535 x=531.0 y=3933.0 l...,[],[],{},


# You can also import SeqRecords directly from Biopython's parsing output.

In [4]:
# Open the gzipped FASTA in text mode ("rt") so the parser receives str lines.
with gzip.open(fasta, "rt") as handle:
    # SeqIO.parse yields records lazily from the handle, so the DataFrame has to be
    # built inside the with-block, before the file is closed.
    seqrecords = SeqIO.parse(handle, format="fasta")
    fasta_df = spd.BioDataFrame.from_seqrecords(seqrecords)
# Preview the first three rows of the resulting DataFrame.
fasta_df.head(3)

Unnamed: 0,_seq,id,name,description,dbxrefs,features,annotations,_per_letter_annotations
0,"(C, G, A, T, A, T, T, C, G, A, T, C, C, G, C, ...",FWIRNKE01DKIF6,FWIRNKE01DKIF6,FWIRNKE01DKIF6 rank=0000177 x=1346.0 y=2772.0 ...,[],[],{},
1,"(G, C, G, A, G, C, A, G, C, A, A, T, C, A, T, ...",FWIRNKE01CDBE3,FWIRNKE01CDBE3,FWIRNKE01CDBE3 rank=0000320 x=854.0 y=2685.0 l...,[],[],{},
2,"(C, G, A, G, C, A, G, C, A, C, A, T, C, A, T, ...",FWIRNKE01BKZJJ,FWIRNKE01BKZJJ,FWIRNKE01BKZJJ rank=0000535 x=531.0 y=3933.0 l...,[],[],{},


# Search description with little effort!

In [5]:
# Keep only the records whose description mentions one specific rank value.
# The boolean mask comes from a text match on the "description" column.
fasta_df[fasta_df["description"].str.contains("rank=0000177")]

Unnamed: 0,_seq,id,name,description,dbxrefs,features,annotations,_per_letter_annotations
0,"(C, G, A, T, A, T, T, C, G, A, T, C, C, G, C, ...",FWIRNKE01DKIF6,FWIRNKE01DKIF6,FWIRNKE01DKIF6 rank=0000177 x=1346.0 y=2772.0 ...,[],[],{},


# Better view of the layout for a single row

In [6]:
# Turn the first row into a one-element list of dicts so every field is shown
# in full instead of being truncated by the table view.
fasta_df.head(1).to_dict("records")

[{'_seq': Seq('CGATATTCGATCCGCATCGCTGCCCTACCCGTGGAGTGCCTCCCTCGGNGCAG'),
  'id': 'FWIRNKE01DKIF6',
  'name': 'FWIRNKE01DKIF6',
  'description': 'FWIRNKE01DKIF6 rank=0000177 x=1346.0 y=2772.0 length=53',
  'dbxrefs': [],
  'features': [],
  'annotations': {},
  '_per_letter_annotations': None}]

## SAM/BAM (Sequence Alignment Map) Files

In [7]:
# Read a SAM file of sequence alignments into a DataFrame.
# Note: SAM reading requires the pysam library and a properly formatted SAM file.
sam_file = "tests/test-data/example.sam"
sam_df = spd.read_seq(sam_file, format="sam")
# Report the (rows, columns) shape before previewing the first alignments.
print(f"SAM shape: {sam_df.shape}")
sam_df.head()

SAM shape: (5, 12)


Unnamed: 0,name,flag,ref_name,ref_pos,map_quality,cigar,next_ref_name,next_ref_pos,length,seq,qual,tags
0,r001,99,ref1,7,30,8M2I4M1D3M,=,37,39,TTAGATAAAGGATACTG,*,[]
1,r002,0,ref1,9,30,3S6M1P1I4M,*,0,0,AAAAGATAAGGATA,*,[]
2,r003,0,ref1,9,30,5S6M,*,0,0,GCCTAAGCTAA,*,"[SA:Z:ref2,29,-,6H5M,17,0;]"
3,r004,0,ref1,16,30,6M14N5M,*,0,0,ATAGCTTCAGC,*,[]
4,r003,2064,ref2,29,17,6H5M,*,0,0,TAGGC,*,"[SA:Z:ref1,9,+,5S6M,30,1;]"


## BED (Browser Extensible Data) Files

In [8]:
# Read a BED file of genomic intervals into a DataFrame.
bed_file = "tests/test-data/example.bed"
bed_df = spd.read_bed(bed_file)
# Report the (rows, columns) shape before previewing the first intervals.
print(f"BED shape: {bed_df.shape}")
bed_df.head()

BED shape: (6, 12)


Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts
0,chr1,1000,2000,feature1,100,+,,,,,,
1,chr1,3000,4000,feature2,200,-,,,,,,
2,chr2,5000,6000,feature3,150,+,,,,,,
3,chr2,7000,8000,feature4,300,.,,,,,,
4,chr3,9000,10000,feature5,250,-,,,,,,


## VCF (Variant Call Format) Files

In [9]:
# Read a VCF file of variant calls into a DataFrame.
vcf_file = "tests/test-data/vcf-test-data/example.vcf"
vcf_df = spd.read_vcf(vcf_file)
# Report the (rows, columns) shape, then preview the first three variants.
print(f"VCF shape: {vcf_df.shape}")
vcf_df.head(3)

VCF shape: (6, 12)


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,Sample1,Sample2,Sample3
0,1,14370,rs6054257,G,A,29,PASS,NS=3;DP=14;AF=0.5;DB;H2,GT:GQ:DP:HQ,"{'GT': '0|0', 'GQ': '48', 'DP': '1', 'HQ': '51...","{'GT': '1|0', 'GQ': '48', 'DP': '8', 'HQ': '51...","{'GT': '1/1', 'GQ': '43', 'DP': '5', 'HQ': '.,.'}"
1,1,17330,.,T,A,3,q10,NS=3;DP=11;AF=0.017,GT:GQ:DP:HQ,"{'GT': '0|0', 'GQ': '49', 'DP': '3', 'HQ': '58...","{'GT': '0|1', 'GQ': '3', 'DP': '5', 'HQ': '65,3'}","{'GT': '0/0', 'GQ': '41', 'DP': '3', 'HQ': '.,.'}"
2,1,1110696,rs6040355,A,"G,T",67,PASS,"NS=2;DP=10;AF=0.333,0.667;AA=T;DB",GT:GQ:DP:HQ,"{'GT': '1|2', 'GQ': '21', 'DP': '6', 'HQ': '23...","{'GT': '2|1', 'GQ': '2', 'DP': '0', 'HQ': '18,2'}","{'GT': '2/2', 'GQ': '35', 'DP': '4', 'HQ': '.,.'}"


In [10]:
# VCF files contain variant information together with per-sample genotypes.
# Each sample column holds a dictionary keyed by the FORMAT fields (GT, GQ, DP, ...).
print("Sample1 genotype info for first variant:")
# .iloc[0] picks the first variant by position.
print(vcf_df["Sample1"].iloc[0])

Sample1 genotype info for first variant:
{'GT': '0|0', 'GQ': '48', 'DP': '1', 'HQ': '51,51'}


# GenBank to Pandas DataFrame example

In [11]:
# Parse the gzipped GenBank record into a DataFrame. In the preview below each row
# is one feature (source, gene, CDS, ...) and repeats the record-level metadata.
gbk_df = spd.read_seq(gbk, format="genbank")
gbk_df.head(3)

Unnamed: 0,_seq,id,name,description,dbxrefs,_per_letter_annotations,molecule_type,topology,data_file_division,date,accessions,sequence_version,keywords,source,organism,taxonomy,references,comment,contig,location,type,mol_type,strain,sub_species,db_xref,gene,locus_tag,note,codon_start,transl_table,product,protein_id,translation,EC_number,pseudogene,gene_synonym,experiment,pseudo
0,"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...",NC_007795.1,NC_007795,Staphylococcus aureus subsp. aureus NCTC 8325 ...,"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",{},DNA,circular,CON,03-AUG-2016,[NC_007795],1,[RefSeq],Staphylococcus aureus subsp. aureus NCTC 8325,Staphylococcus aureus subsp. aureus NCTC 8325,"[Bacteria, Firmicutes, Bacilli, Bacillales, St...","[location: [0:2821361]\nauthors: Gillaspy,A.F....",REVIEWED REFSEQ: This record has been curated ...,join(CP000253.1:1..2821361),"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",source,[genomic DNA],[NCTC 8325],[aureus],[taxon:93061],,,,,,,,,,,,,
1,"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...",NC_007795.1,NC_007795,Staphylococcus aureus subsp. aureus NCTC 8325 ...,"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",{},DNA,circular,CON,03-AUG-2016,[NC_007795],1,[RefSeq],Staphylococcus aureus subsp. aureus NCTC 8325,Staphylococcus aureus subsp. aureus NCTC 8325,"[Bacteria, Firmicutes, Bacilli, Bacillales, St...","[location: [0:2821361]\nauthors: Gillaspy,A.F....",REVIEWED REFSEQ: This record has been curated ...,join(CP000253.1:1..2821361),"(516, 517, 518, 519, 520, 521, 522, 523, 524, ...",gene,,,,[GeneID:3919798],[dnaA],[SAOUHSC_00001],,,,,,,,,,,
2,"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...",NC_007795.1,NC_007795,Staphylococcus aureus subsp. aureus NCTC 8325 ...,"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",{},DNA,circular,CON,03-AUG-2016,[NC_007795],1,[RefSeq],Staphylococcus aureus subsp. aureus NCTC 8325,Staphylococcus aureus subsp. aureus NCTC 8325,"[Bacteria, Firmicutes, Bacilli, Bacillales, St...","[location: [0:2821361]\nauthors: Gillaspy,A.F....",REVIEWED REFSEQ: This record has been curated ...,join(CP000253.1:1..2821361),"(516, 517, 518, 519, 520, 521, 522, 523, 524, ...",CDS,,,,[GeneID:3919798],[dnaA],[SAOUHSC_00001],[binds to the dnaA-box as an ATP-bound complex...,[1],[11],[chromosomal replication initiation protein],[YP_498609.1],[MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSS...,,,,,


# See all the possible columns of metadata

In [12]:
# List the name of every column available in the GenBank DataFrame.
gbk_df.columns

Index(['_seq', 'id', 'name', 'description', 'dbxrefs',
       '_per_letter_annotations', 'molecule_type', 'topology',
       'data_file_division', 'date', 'accessions', 'sequence_version',
       'keywords', 'source', 'organism', 'taxonomy', 'references', 'comment',
       'contig', 'location', 'type', 'mol_type', 'strain', 'sub_species',
       'db_xref', 'gene', 'locus_tag', 'note', 'codon_start', 'transl_table',
       'product', 'protein_id', 'translation', 'EC_number', 'pseudogene',
       'gene_synonym', 'experiment', 'pseudo'],
      dtype='object')

# Search for only rows of type CDS

In [13]:
# Split the feature table by GenBank feature type, reusing a single lookup of
# the "type" column for both filters.
feature_type = gbk_df["type"]
cds = gbk_df.loc[feature_type == "CDS"]
gene = gbk_df.loc[feature_type == "gene"]
# .shape is (rows, columns): the first number is the feature count.
print("Genome has CDS count:", cds.shape)
print("Genome has GENE count:", gene.shape)

Genome has CDS count: (2767, 38)
Genome has GENE count: (2872, 38)


In [14]:
# Preview the first three CDS rows; the row labels shown are the ones carried
# over from gbk_df, so they are not consecutive.
cds.head(3)

Unnamed: 0,_seq,id,name,description,dbxrefs,_per_letter_annotations,molecule_type,topology,data_file_division,date,accessions,sequence_version,keywords,source,organism,taxonomy,references,comment,contig,location,type,mol_type,strain,sub_species,db_xref,gene,locus_tag,note,codon_start,transl_table,product,protein_id,translation,EC_number,pseudogene,gene_synonym,experiment,pseudo
2,"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...",NC_007795.1,NC_007795,Staphylococcus aureus subsp. aureus NCTC 8325 ...,"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",{},DNA,circular,CON,03-AUG-2016,[NC_007795],1,[RefSeq],Staphylococcus aureus subsp. aureus NCTC 8325,Staphylococcus aureus subsp. aureus NCTC 8325,"[Bacteria, Firmicutes, Bacilli, Bacillales, St...","[location: [0:2821361]\nauthors: Gillaspy,A.F....",REVIEWED REFSEQ: This record has been curated ...,join(CP000253.1:1..2821361),"(516, 517, 518, 519, 520, 521, 522, 523, 524, ...",CDS,,,,[GeneID:3919798],[dnaA],[SAOUHSC_00001],[binds to the dnaA-box as an ATP-bound complex...,[1],[11],[chromosomal replication initiation protein],[YP_498609.1],[MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSS...,,,,,
4,"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...",NC_007795.1,NC_007795,Staphylococcus aureus subsp. aureus NCTC 8325 ...,"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",{},DNA,circular,CON,03-AUG-2016,[NC_007795],1,[RefSeq],Staphylococcus aureus subsp. aureus NCTC 8325,Staphylococcus aureus subsp. aureus NCTC 8325,"[Bacteria, Firmicutes, Bacilli, Bacillales, St...","[location: [0:2821361]\nauthors: Gillaspy,A.F....",REVIEWED REFSEQ: This record has been curated ...,join(CP000253.1:1..2821361),"(2155, 2156, 2157, 2158, 2159, 2160, 2161, 216...",CDS,,,,[GeneID:3919799],,[SAOUHSC_00002],[binds the polymerase to DNA and acts as a sli...,[1],[11],[DNA polymerase III subunit beta],[YP_498610.1],[MMEFTIKRDYFITQLNDTLKAISPRTTLPILTGIKIDAKEHEVIL...,[2.7.7.7],,,,
6,"(C, G, A, T, T, A, A, A, G, A, T, A, G, A, A, ...",NC_007795.1,NC_007795,Staphylococcus aureus subsp. aureus NCTC 8325 ...,"[BioProject:PRJNA57795, Assembly:GCF_000013425.1]",{},DNA,circular,CON,03-AUG-2016,[NC_007795],1,[RefSeq],Staphylococcus aureus subsp. aureus NCTC 8325,Staphylococcus aureus subsp. aureus NCTC 8325,"[Bacteria, Firmicutes, Bacilli, Bacillales, St...","[location: [0:2821361]\nauthors: Gillaspy,A.F....",REVIEWED REFSEQ: This record has been curated ...,join(CP000253.1:1..2821361),"(3669, 3670, 3671, 3672, 3673, 3674, 3675, 367...",CDS,,,,[GeneID:3919176],,[SAOUHSC_00003],[conserved hypothetical protein],[1],[11],[hypothetical protein],[YP_498611.1],[MIILVQEVVVEGDINLGQFLKTEGIIESGGQAKWFLQDVEVLING...,,,,,


# Better view of the layout for a single row

In [15]:
# Show the first CDS row as a one-element list of dicts so each field can be
# read in full rather than truncated by the table view.
cds.head(1).to_dict("records")

[{'_seq': Seq('CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCAT...TAT'),
  'id': 'NC_007795.1',
  'name': 'NC_007795',
  'description': 'Staphylococcus aureus subsp. aureus NCTC 8325 chromosome, complete genome',
  'dbxrefs': ['BioProject:PRJNA57795', 'Assembly:GCF_000013425.1'],
  '_per_letter_annotations': {},
  'molecule_type': 'DNA',
  'topology': 'circular',
  'data_file_division': 'CON',
  'date': '03-AUG-2016',
  'accessions': ['NC_007795'],
  'sequence_version': 1,
  'keywords': ['RefSeq'],
  'source': 'Staphylococcus aureus subsp. aureus NCTC 8325',
  'organism': 'Staphylococcus aureus subsp. aureus NCTC 8325',
  'taxonomy': ['Bacteria',
   'Firmicutes',
   'Bacilli',
   'Bacillales',
   'Staphylococcaceae',
   'Staphylococcus'],
  'references': [Reference(title='The Staphylococcus aureus NCTC8325 Genome', ...),
   Reference(title='Direct Submission', ...),
   Reference(title='Direct Submission', ...)],
  'comment': 'REVIEWED REFSEQ: This record has been curated by NCBI 

------

# Machine Learning Example Application

### For simplicity, this example pretends we have a large sample set, although we are really just using the single GenBank genome. The following steps create the feature table we can use for a simple Sequential model.
1. Sample the first 4 genes
2. Give them a random mutation value from 0 to 20
3. Give each mutation a random "observed in healthy patient"

In [16]:
# Take the names of the first 4 "gene" features that actually carry a gene name.
is_named_gene = (gbk_df["type"] == "gene") & gbk_df["gene"].notna()
first_gene_qualifiers = gbk_df.loc[is_named_gene, "gene"].iloc[:4]
# Each "gene" cell is a one-element list (e.g. ['dnaA']); keep just the name.
genes = [qualifier[0] for qualifier in first_gene_qualifiers.values]
genes

['dnaA', 'recF', 'rplI', 'argJ']

In [17]:
# Create fake observed mutation rates for target genomes compared to a reference genome.
# Seed NumPy's global RNG first so this synthetic table (and any later np.random
# call in the notebook) comes out identical on every Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


def mutation_array(n_samples=100, low=0, high=20):
    """Return a list of random mutation values for one gene.

    Parameters
    ----------
    n_samples : int
        Number of values (one per simulated genome). Defaults to 100.
    low, high : float
        Bounds of the uniform distribution; values fall in [low, high).
        Default to 0 and 20.

    Returns
    -------
    list
        ``n_samples`` floats drawn from NumPy's global random state.
    """
    return list(np.random.uniform(low=low, high=high, size=n_samples))


# One column per sampled gene, one row per simulated genome.
observed_gene_mutation_per_genome = spd.BioDataFrame.from_dict(
    {gene_name: mutation_array() for gene_name in genes}
)
observed_gene_mutation_per_genome.head(3)

Unnamed: 0,dnaA,recF,rplI,argJ
0,19.926619,15.783713,17.04123,7.861119
1,7.200247,13.553658,15.47005,0.735451
2,13.83196,8.450581,7.379233,4.611984


In [18]:
# Label every patient with 1 (has the fake disease) or 0 (does not).
# The sample is deliberately biased: 30 of the 100 patients have the disease.
patients = np.concatenate([np.ones(30), np.zeros(70)])
# Shuffle in place so the diseased patients are spread randomly across the rows.
np.random.shuffle(patients)

# Attach the labels as a new column next to the per-gene mutation values.
observed_gene_mutation_per_genome["fake_disease"] = list(patients)
observed_gene_mutation_per_genome.head(10)

Unnamed: 0,dnaA,recF,rplI,argJ,fake_disease
0,19.926619,15.783713,17.04123,7.861119,1.0
1,7.200247,13.553658,15.47005,0.735451,0.0
2,13.83196,8.450581,7.379233,4.611984,0.0
3,6.591249,6.134858,12.047699,10.040812,0.0
4,9.614278,10.150649,15.374298,14.12476,0.0
5,16.860632,12.342782,10.999112,15.806463,0.0
6,0.6206,14.627543,4.200134,14.739281,0.0
7,6.340793,19.087236,8.459715,9.066288,0.0
8,8.825462,12.633047,7.450039,2.275944,0.0
9,19.892919,4.359332,18.910079,2.968047,0.0


--------