In [1]:
from Bio.SeqFeature import FeatureLocation
from Bio.SeqRecord  import SeqRecord
import _pickle as pickle
import pandas as pd
import sys

Import custom package (see https://github.com/zpedro27/operon-analysis)

In [2]:
# Make the sibling operon-analysis checkout importable. The path is relative,
# so this only works when the notebook is run from its own directory.
sys.path.append("../operon-analysis")
from locations import GenomeLocation

Load the E. coli BW25113 genome (GenBank accession CP009273.1, https://www.ncbi.nlm.nih.gov/nuccore/CP009273.1) from a local pickle file:

In [3]:
# Load the pickled genome record from disk.
# SECURITY: pickle.load can execute arbitrary code while deserializing, so
# only point this at a file you created yourself or otherwise trust.
# The handle is deliberately not called `input`, which would shadow the
# builtin input() for the rest of the kernel session. The result is still
# bound to `chr` (also a builtin name) because later cells refer to it.
with open("data/BW25113_genome.pkl", "rb") as genome_file:
    chr = pickle.load(genome_file)
chr

SeqRecord(seq=Seq('AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAG...TTC'), id='CP009273.1', name='CP009273', description='Escherichia coli BW25113, complete genome', dbxrefs=['BioProject:PRJNA257976', 'BioSample:SAMN03013572'])

Obtain a table matching each gene to its location in the genome:

In [4]:
def create_gene_location_table(genome: SeqRecord) -> dict:
    """
    Creates a dictionary of {"gene": location in genome}

    Only features whose ``type`` is "CDS" are considered. A CDS that has no
    "gene" qualifier (or an empty one) is skipped instead of raising
    ``KeyError``, since there is no gene name to use as a key.

    Args:
        genome: record whose ``features`` each expose ``type``,
            ``qualifiers`` (a mapping of qualifier name -> list of values)
            and ``location``.

    Returns:
        dict mapping each gene name to the ``location`` of its CDS feature.
        If the same gene name occurs on several CDS features, the last one
        encountered wins.

    Raises:
        ValueError: if a CDS lists more than one value under "gene".
    """
    hashtable = {}
    for feature in genome.features:
        if feature.type != "CDS":
            continue
        gene_names = feature.qualifiers.get("gene")
        if not gene_names:
            # Unnamed CDS: nothing to key the table on, so leave it out.
            continue
        # Single-element unpack: fails loudly if the name is ambiguous.
        gene, = gene_names
        hashtable[gene] = feature.location
    return hashtable

In [5]:
# Build the {gene name: location} lookup for the loaded genome record.
genetable = create_gene_location_table(chr)

In [6]:
# Wrap every (gene name, location) pair of the lookup table in a
# GenomeLocation; each location is handed over as a one-element list.
gene_locations = [
    GenomeLocation(name=label, location=[span], genome=chr)
    for label, span in genetable.items()
]

In [7]:
# One record per gene name: (start, end, strand), coordinates cast to float.
data = dict(
    (loc.name, (float(loc.start), float(loc.end), loc.strand))
    for loc in gene_locations
)
# The dict is keyed by gene, so the frame is built with genes as columns and
# then transposed to get one row per gene.
df = pd.DataFrame(data).transpose()
df

Unnamed: 0,0,1,2
thrL,189.0,255.0,1.0
thrA,336.0,2799.0,1.0
thrB,2800.0,3733.0,1.0
thrC,3733.0,5020.0,1.0
yaaX,5233.0,5530.0,1.0
...,...,...,...
creC,4626512.0,4627937.0,1.0
creD,4627994.0,4629347.0,1.0
arcA,4629406.0,4630123.0,-1.0
yjjY,4630218.0,4630359.0,1.0


In [8]:
# Write the gene table to CSV; the gene-name index is written as the first column.
df.to_csv("data/genes.csv")