In [1]:
# Libraries

import numpy
import os
import shutil
import warnings

In [2]:
# Code

from src.data.feature import feature as data_feature
from src.io           import loader  as data_loader
from src.io           import writer  as data_writer

# 1. Setup

In [3]:
# Set up the directory paths used by the cells below

OUT_SUBFOLDER = 'nbp04-feature'
RES_SUBFOLDER = ''

# Output and resource roots both live directly under the working directory
CWD = os.getcwd()
OUT, RES = (os.path.join(CWD, name) for name in ('out', 'res'))

# Note that RES_NBP01 is located under the output root, not the resource root
RES_NBP01 = os.path.join(OUT, 'data',  'nbp01-analysis')
RES_PLANT = os.path.join(RES, 'plant', RES_SUBFOLDER)
OUT_DATA  = os.path.join(OUT, 'data',  OUT_SUBFOLDER)

# Start from an empty output folder : remove any previous one, then recreate it
shutil.rmtree(OUT_DATA, ignore_errors = True)
os.makedirs(OUT_DATA, exist_ok = True)

# The default separator of print() supplies the single space after the colon
print('     Root Directory :', CWD)
print('   Output Directory :', OUT)
print(' Resource Directory :', RES)

     Root Directory : C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis
   Output Directory : C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\out
 Resource Directory : C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\res


In [4]:
# Load the gene assembly and the gene annotation table

gene_assembly = data_loader.load_faidx(filename = os.path.join(RES_PLANT, 'arabidopsis-r36', 'gene-assembly.fa'))

# The annotation csv is read from the RES_NBP01 folder
gene_annotation = data_loader.load_csv(
	low_memory = False,
	filename   = os.path.join(RES_NBP01, 'gene-annotation.csv')
)

# 2. Lengths

In [5]:
# Region lengths; the two '*_full' entries hold a pair of values, the rest a single value

lengths = {
	'prom_full' : [1000, 500],
	'prom'      : 1000,
	'utr5'      : 300,
	'cds'       : 9000,
	'utr3'      : 350,
	'term'      : 500,
	'term_full' : [500, 500]
}

# 3. Gene Regions

In [6]:
# Build the regions from the annotation using the configured lengths

regions = data_feature.annotation_to_regions(annotation = gene_annotation, lengths = lengths)

# Leading newline leaves an empty line before the summary
print(f'\nTotal unique mRNA regions : {len(regions)}')

Passed 1st assertion : True
Passed 2nd assertion : True

Total unique mRNA regions : 31685


# 4. Gene Features

In [7]:
# Turn the regions into sequences and features, reading from the loaded assembly

sequences, features = data_feature.regions_to_features(faidx = gene_assembly, dataframe = regions, lengths = lengths)

  0%|          | 0/31685 [00:00<?, ?it/s]

[AT1G33612.1 ] : unknown codon [GYG]
[AT2G01120.1 ] : unknown codon [GKT]
[AT2G01120.1 ] : unknown codon [STA]
[AT2G01120.1 ] : unknown codon [GAK]
[AT2G01120.1 ] : unknown codon [KTG]
[AT2G01120.2 ] : unknown codon [GKT]
[AT2G01120.2 ] : unknown codon [STA]
[AT2G01120.2 ] : unknown codon [GAK]
[AT2G01120.2 ] : unknown codon [KTG]
[AT2G48110.1 ] : unknown codon [GCM]
[AT3G63540.1 ] : out of bounds at sequence end


In [8]:
# Re-shape the sequences table (presumably a pandas DataFrame) into a dictionary keyed by mRNA

sequences = (
	sequences
	.copy()
	.set_index('mRNA', drop = False)    # the 'mRNA' column is kept alongside the new index
	.rename_axis(None, axis = 'index')  # clear the index name
	.to_dict('index')                   # {index value -> {column -> value}}
)

In [9]:
# Extend the sequence entries using the regions and a six-placeholder header template

sequences = data_feature.sequences_extend_kvpair(
	header    = '{} | {} | {}:{}-{} | {}',
	regions   = regions,
	sequences = sequences
)

In [10]:
# Show the extracted sequence of a transcript on the positive strand

data_feature.print_extracted_sequence(mrna = 'AT1G01010.1', sequences = sequences, space = True)

AT1G01010.1 | + | 1:2631-3630 | 1000
ATATTGCTAT TTCTGCCAAT ATTAAAACTT CACTTAGGAA GACTTGAACC TACCACACGT TAGTGACTAA TGAGAGCCAC TAGATAATTG CATGCATCCC 
ACACTAGTAC TAATTTTCTA GGGATATTAG AGTTTTCTAA TCACCTACTT CCTACTATGT GTATGTTATC TACTGGCGTG GATGCTTTTA AAGATGTTAC 
GTTATTATTT TGTTCGGTTT GGAAAACGGC TCAATCGTTA TGAGTTCGTA AGACACATAC ATTGTTCCAT GATAAAATGC AACCCCACGA ACCATTTGCG 
ACAAGCAAAA CAACATGGTC AAAATTAAAA GCTAACAATT AGCCAGCGAT TCAAAAAGTC AACCTTCTAG ATGGATTTAA CAACATATCG ATAGGATTCA 
AGATTAAAAA TAAGCACACT CTTATTAATG TTAAAAAACG AATGAGATGA AAATATTTGG CGTGTTCACA CACATAATCT AGAAGACAGA TTCGAGTTGC 
TCTCCTTTGT TTTGCTTTGG GAGGGACCCA TTATTACCGC CCAGCAGCTT CCCAGCCTTC CTTTATAAGG CTTAATTTAT ATTTATTTAA ATTTTATATG 
TTCTTCTATT ATAATACTAA AAGGGGAATA CAAATTTCTA CAGAGGATGA TATTCAATCC ACGGTTCACC CAAACCGATT TTATAAAATT TATTATTAAA 
TCTTTTTTAA TTGTTAAATT GGTTTAAATC TGAACTCTGT TTACTTACAT TGATTAAAAT TCTAAACCAT CATAAGTAAA AAATAATATG ATTAAGACTA 
ATAAATCTTA ATAGTTAATA CTACTCGGTT TACTACATGA AATTTCATAC CATCAATTGT TTTAATAAT

In [11]:
# Show the extracted sequence of a transcript on the negative strand

data_feature.print_extracted_sequence(mrna = 'AT1G01020.1', sequences = sequences, space = True)

AT1G01020.1 | - | 1:9129-10128 | 1000
ATAAATATAT GAACCTACAT CATTATAAGT AGGGTTAAGT GTGTATGATT GTGTATGCGT ATAAAAATAC TCCCTTGACC GTAAACATGA AACATGTAAT 
ATATAAGATA TATAGACATG GAGACTATAT CATATAAACA TACATATATA TATATATGTT AGTTATATGT GTAGCCCACA TTTTTCGATA TAGGGGAACA 
AAGTTACGAA TAATGTATAT GTTAGTTATA TGTATAAAAC ATTTGCAAAG GGATGATACA ATGGAATATG TAAATACGTA TTCCAAATAG TCTACAAGTA 
GCAATGATAT TGTGTAGATA TGTCTCATAA AGGCTTGTGC TATTATATTG TGTAGATGTG TCTCAAAAGG CTTGTGCATA GTTGTCAAAA GAATATAAAA 
TGCATTTTAA TATTCAAAAG AATGCATGCA TAGACTGATA GAAAAGAAGA AATAATGAAA ATTAAAAGGG GAGGTATTAG TGAAAAGTAC GGAGGTATTA 
AAATTATGAA AATGACTAAA GGAAATAATA TTGTAACACA TCTAAATGAT TAGTTGTGTA AGAAGAATAA AGAGAATTAA TGACATGCTC CAATTATTGT 
TAAATAAAAT AAAATAATTC ATTCTCATTA GATTCCTGAC CTAAAAGAGA CTGCAAATAT GTAGGCCGGT TAGGGTTTTC ATAGGCATGA TTAATGATGA 
TAAGGAAACT TCGTATTGGG GGTTAATGGG TTCTTTCCCT TTCTAATACA TTTCTAAATA AAGTAATAAT CAAATATTAA ACTTTTAACT TACCGTTTCA 
TTTTTATGTT GAGTTTACTT ATATTGAAAG GAAACTATTT TATCTAAACC TCATTGTCAA ATCTTTCA

In [12]:
# Write one fasta file per gene region (non-padded)

# Pairs of (region key inside each sequence entry, suffix of the output filename)
tuples = [
	('Prom_Full', 'promoter-full'),
	('Prom',      'promoter'),
	('UTR5',      'utr5'),
	('CDS',       'cds'),
	('UTR3',      'utr3'),
	('Term',      'terminator'),
	('Term_Full', 'terminator-full')
]

for region, filename in tuples :
	data_writer.write_fasta(
		filename = os.path.join(OUT_DATA, 'sequences-{}.fasta'.format(filename)),
		data     = dict(
			(item[region]['key'], item[region]['seq'])
			for item in sequences.values()
		)
	)

In [13]:
# Merge the regions of every transcript, padding each listed region on the given side

bp2150 = data_feature.merge_and_pad_sequences(
	sequences = sequences,
	lengths   = lengths,
	padding   = dict(
		prom = 'left',
		utr5 = 'left',
		utr3 = 'left',
		term = 'right'
	)
)

In [14]:
# Show the merged and padded sequence of a single transcript

data_feature.print_padded_sequence(mrna = 'AT1G01010.1', sequences = bp2150, space = True)

AT1G01010.1 | + | 1:2631-6399 | 2150
ATATTGCTAT TTCTGCCAAT ATTAAAACTT CACTTAGGAA GACTTGAACC TACCACACGT TAGTGACTAA TGAGAGCCAC TAGATAATTG CATGCATCCC 
ACACTAGTAC TAATTTTCTA GGGATATTAG AGTTTTCTAA TCACCTACTT CCTACTATGT GTATGTTATC TACTGGCGTG GATGCTTTTA AAGATGTTAC 
GTTATTATTT TGTTCGGTTT GGAAAACGGC TCAATCGTTA TGAGTTCGTA AGACACATAC ATTGTTCCAT GATAAAATGC AACCCCACGA ACCATTTGCG 
ACAAGCAAAA CAACATGGTC AAAATTAAAA GCTAACAATT AGCCAGCGAT TCAAAAAGTC AACCTTCTAG ATGGATTTAA CAACATATCG ATAGGATTCA 
AGATTAAAAA TAAGCACACT CTTATTAATG TTAAAAAACG AATGAGATGA AAATATTTGG CGTGTTCACA CACATAATCT AGAAGACAGA TTCGAGTTGC 
TCTCCTTTGT TTTGCTTTGG GAGGGACCCA TTATTACCGC CCAGCAGCTT CCCAGCCTTC CTTTATAAGG CTTAATTTAT ATTTATTTAA ATTTTATATG 
TTCTTCTATT ATAATACTAA AAGGGGAATA CAAATTTCTA CAGAGGATGA TATTCAATCC ACGGTTCACC CAAACCGATT TTATAAAATT TATTATTAAA 
TCTTTTTTAA TTGTTAAATT GGTTTAAATC TGAACTCTGT TTACTTACAT TGATTAAAAT TCTAAACCAT CATAAGTAAA AAATAATATG ATTAAGACTA 
ATAAATCTTA ATAGTTAATA CTACTCGGTT TACTACATGA AATTTCATAC CATCAATTGT TTTAATAAT

In [15]:
# Save the merged and padded sequences (bp2150) as a single fasta file

data_writer.write_fasta(
	data     = bp2150,
	# Plain string literal : the name has no placeholders, so no f-string is needed
	filename = os.path.join(OUT_DATA, 'features-bp2150.fasta')
)

In [16]:
# Re-shape the features table (presumably a pandas DataFrame) into a dictionary keyed by mRNA

features = (
	features
	.copy()
	.set_index('mRNA', drop = False)    # the 'mRNA' column is kept alongside the new index
	.rename_axis(None, axis = 'index')  # clear the index name
	.to_dict('index')                   # {index value -> {column -> value}}
)


In [17]:
# Write one npz archive per feature, holding one numpy array per transcript

# Pairs of (feature key inside each features entry, suffix of the output filename)
tuples = [('Frequency', 'frequency'), ('Stability', 'stability')]

for feature, filename in tuples :
	data_writer.write_npz(
		filename = os.path.join(OUT_DATA, 'features-{}'.format(filename)),
		data     = dict(
			(key, numpy.array(value[feature]))
			for key, value in features.items()
		)
	)