In [1]:
# Libraries

import itertools
import numpy
import os
import random
import shutil
import sys
import warnings

In [2]:
# Root directory

# The project root may be overridden with the UPOLANC_ROOT environment variable, so the
# notebook is not tied to a single machine. Without the variable, the original location is used.
ROOT = os.environ.get(
	'UPOLANC_ROOT',
	'C:\\Developer\\Workspace\\PyCharm\\Projects\\upolanc-thesis'
)

# Make the project packages (imported as `src.*` below) importable
if ROOT not in sys.path :
	sys.path.append(ROOT)

# All relative paths in the notebook are resolved against the project root
os.chdir(ROOT)

In [3]:
# Code

from src.data.feature import feature  as data_feature
from src.data.synth   import sequence as data_synth
from src.io           import loader   as data_loader
from src.io           import writer   as data_writer

# 1. Setup

In [4]:
# Resolve the directories this notebook reads from and writes to

CWD = ROOT
OUT = os.path.join(CWD, 'out')
RES = os.path.join(CWD, 'res')

# Input locations : one below the resource directory, one below the output directory
RES_PLANT = os.path.join(RES, 'plant')
RES_NBP01 = os.path.join(OUT, 'nbp01-analysis')

# Output locations of this notebook; the plot directory is nested inside the data directory
OUT_DATA = os.path.join(OUT, 'nbp04-feature')
OUT_PLOT = os.path.join(OUT_DATA, 'plot')

# Wipe any previous output of this notebook (removal errors are ignored), then recreate the folders
shutil.rmtree(OUT_DATA, ignore_errors = True)

os.makedirs(OUT_DATA, exist_ok = True)
os.makedirs(OUT_PLOT, exist_ok = True)

print(f'     Root Directory : {CWD}')
print(f'   Output Directory : {OUT_DATA}')
print(f'   Output Directory : {OUT_PLOT}')
print(f' Resource Directory : {RES_PLANT}')
print(f' Resource Directory : {RES_NBP01}')

     Root Directory : C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis
   Output Directory : C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\out\nbp04-feature
   Output Directory : C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\out\nbp04-feature\plot
 Resource Directory : C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\res\plant
 Resource Directory : C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\out\nbp01-analysis


In [5]:
# Load the gene assembly (.fa) and the gene annotation (.csv) used by the rest of the notebook

gene_assembly = data_loader.load_faidx(filename = os.path.join(RES_PLANT, 'arabidopsis-r36', 'gene-assembly.fa'))

gene_annotation = data_loader.load_csv(
	low_memory = False,
	filename   = os.path.join(RES_NBP01, 'gene-annotation.csv')
)

In [6]:
# Region lengths and padding sides, keyed by region name

# NOTE(review): prom + utr5 + utr3 + term = 1000 + 300 + 350 + 500 = 2150, which presumably is
# where the `bp2150` name used further down comes from - confirm against merge_and_pad_sequences.
lengths = {
	'prom_full' : [1000, 500],
	'prom'      : 1000,
	'utr5'      : 300,
	'cds'       : 9999,
	'utr3'      : 350,
	'term'      : 500,
	'term_full' : [500, 500]
}

# Padding side per region; 'cds' is the only region marked as 'none'
padding = {
	'prom_full' : 'left',
	'prom'      : 'left',
	'utr5'      : 'left',
	'cds'       : 'none',
	'utr3'      : 'left',
	'term'      : 'right',
	'term_full' : 'right'
}

# 2. Transcript Regions

In [7]:
# Build the region table from the gene annotation, using the configured region lengths

regions = data_feature.annotation_to_regions(
	lengths    = lengths,
	annotation = gene_annotation
)

# Report how many distinct genes and transcripts the region table contains
print()
print('Gene       : {:}'.format(regions['Gene'].nunique()))
print('Transcript : {:}'.format(regions['Transcript'].nunique()))

Passed 1st assertion : True
Passed 2nd assertion : True

Gene       : 20726
Transcript : 30415


# 3. Transcript Features

In [8]:
# Extract the sequences and the features of every region from the assembly

sequences, features = data_feature.regions_to_features(
	dataframe = regions,
	faidx     = gene_assembly,
	lengths   = lengths
)

  0%|          | 0/30415 [00:00<?, ?it/s]

[AT1G33612.1 ] : unknown codon [GYG]
[AT2G01120.1 ] : unknown codon [GKT]
[AT2G01120.1 ] : unknown codon [STA]
[AT2G01120.1 ] : unknown codon [GAK]
[AT2G01120.1 ] : unknown codon [KTG]
[AT2G01120.2 ] : unknown codon [GKT]
[AT2G01120.2 ] : unknown codon [STA]
[AT2G01120.2 ] : unknown codon [GAK]
[AT2G01120.2 ] : unknown codon [KTG]
[AT2G48110.1 ] : unknown codon [GCM]
[AT3G63540.1 ] : out of bounds at sequence end


In [9]:
# Turn the sequence dataframe into a {transcript : row} dictionary for ease of use

# NOTE: this rebinds `sequences` from a dataframe to a dict, so the cell can not simply be re-run
sequences = (
	sequences
	.copy()
	.set_index('Transcript', drop = False)  # keep 'Transcript' as a regular column as well
	.rename_axis(None, axis = 'index')
	.to_dict('index')
)

In [10]:
# Turn the feature dataframe into a {transcript : row} dictionary for ease of use

# NOTE: this rebinds `features` from a dataframe to a dict, so the cell can not simply be re-run
features = (
	features
	.copy()
	.set_index('Transcript', drop = False)  # keep 'Transcript' as a regular column as well
	.rename_axis(None, axis = 'index')
	.to_dict('index')
)

## 3.1 Fasta

In [11]:
# Extend every sequence entry with a header that carries more transcript information
# (judging by the examples printed below : transcript | strand | chromosome:start-end | length)

sequences = data_feature.sequences_extend_kvpair(
	header    = '{} | {} | {}:{}-{} | {}',
	regions   = regions,
	sequences = sequences
)

  0%|          | 0/30414 [00:00<?, ?it/s]

In [12]:
# Show the extracted sequence of a transcript located on the positive strand

data_feature.print_extracted_sequence(
	sequences  = sequences,
	transcript = 'AT1G01010.1',
	space      = True
)

AT1G01010.1 | + | 1:2631-3630 | 1000
ATATTGCTAT TTCTGCCAAT ATTAAAACTT CACTTAGGAA GACTTGAACC TACCACACGT TAGTGACTAA TGAGAGCCAC TAGATAATTG CATGCATCCC 
ACACTAGTAC TAATTTTCTA GGGATATTAG AGTTTTCTAA TCACCTACTT CCTACTATGT GTATGTTATC TACTGGCGTG GATGCTTTTA AAGATGTTAC 
GTTATTATTT TGTTCGGTTT GGAAAACGGC TCAATCGTTA TGAGTTCGTA AGACACATAC ATTGTTCCAT GATAAAATGC AACCCCACGA ACCATTTGCG 
ACAAGCAAAA CAACATGGTC AAAATTAAAA GCTAACAATT AGCCAGCGAT TCAAAAAGTC AACCTTCTAG ATGGATTTAA CAACATATCG ATAGGATTCA 
AGATTAAAAA TAAGCACACT CTTATTAATG TTAAAAAACG AATGAGATGA AAATATTTGG CGTGTTCACA CACATAATCT AGAAGACAGA TTCGAGTTGC 
TCTCCTTTGT TTTGCTTTGG GAGGGACCCA TTATTACCGC CCAGCAGCTT CCCAGCCTTC CTTTATAAGG CTTAATTTAT ATTTATTTAA ATTTTATATG 
TTCTTCTATT ATAATACTAA AAGGGGAATA CAAATTTCTA CAGAGGATGA TATTCAATCC ACGGTTCACC CAAACCGATT TTATAAAATT TATTATTAAA 
TCTTTTTTAA TTGTTAAATT GGTTTAAATC TGAACTCTGT TTACTTACAT TGATTAAAAT TCTAAACCAT CATAAGTAAA AAATAATATG ATTAAGACTA 
ATAAATCTTA ATAGTTAATA CTACTCGGTT TACTACATGA AATTTCATAC CATCAATTGT TTTAATAAT

In [13]:
# Show the extracted sequence of a transcript located on the negative strand

data_feature.print_extracted_sequence(
	sequences  = sequences,
	transcript = 'AT1G01020.1',
	space      = True
)

AT1G01020.1 | - | 1:9129-10128 | 1000
ATAAATATAT GAACCTACAT CATTATAAGT AGGGTTAAGT GTGTATGATT GTGTATGCGT ATAAAAATAC TCCCTTGACC GTAAACATGA AACATGTAAT 
ATATAAGATA TATAGACATG GAGACTATAT CATATAAACA TACATATATA TATATATGTT AGTTATATGT GTAGCCCACA TTTTTCGATA TAGGGGAACA 
AAGTTACGAA TAATGTATAT GTTAGTTATA TGTATAAAAC ATTTGCAAAG GGATGATACA ATGGAATATG TAAATACGTA TTCCAAATAG TCTACAAGTA 
GCAATGATAT TGTGTAGATA TGTCTCATAA AGGCTTGTGC TATTATATTG TGTAGATGTG TCTCAAAAGG CTTGTGCATA GTTGTCAAAA GAATATAAAA 
TGCATTTTAA TATTCAAAAG AATGCATGCA TAGACTGATA GAAAAGAAGA AATAATGAAA ATTAAAAGGG GAGGTATTAG TGAAAAGTAC GGAGGTATTA 
AAATTATGAA AATGACTAAA GGAAATAATA TTGTAACACA TCTAAATGAT TAGTTGTGTA AGAAGAATAA AGAGAATTAA TGACATGCTC CAATTATTGT 
TAAATAAAAT AAAATAATTC ATTCTCATTA GATTCCTGAC CTAAAAGAGA CTGCAAATAT GTAGGCCGGT TAGGGTTTTC ATAGGCATGA TTAATGATGA 
TAAGGAAACT TCGTATTGGG GGTTAATGGG TTCTTTCCCT TTCTAATACA TTTCTAAATA AAGTAATAAT CAAATATTAA ACTTTTAACT TACCGTTTCA 
TTTTTATGTT GAGTTTACTT ATATTGAAAG GAAACTATTT TATCTAAACC TCATTGTCAA ATCTTTCA

In [14]:
# Write one FASTA file per transcript region

# (region column in `sequences`, suffix of the output filename)
tuples = [
	('Prom_Full', 'promoter-full'),
	('Prom',      'promoter'),
	('UTR5',      'utr5'),
	('CDS',       'cds'),
	('UTR3',      'utr3'),
	('Term',      'terminator'),
	('Term_Full', 'terminator-full')
]

for region, filename in tuples :
	# Each record is written under its own 'key' with its 'seq' as the content
	data_writer.write_fasta(
		filename = os.path.join(OUT_DATA, f'sequences-{filename}.fasta'),
		data     = dict(
			(record[region]['key'], record[region]['seq'])
			for record in sequences.values()
		)
	)

## 3.2 Sequences

In [15]:
# Merge the regions of every transcript into one sequence, padded as configured in `padding`

bp2150 = data_feature.merge_and_pad_sequences(
	padding   = padding,
	lengths   = lengths,
	sequences = sequences
)

In [16]:
# Print one merged transcript sequence as an example

data_feature.print_padded_sequence(
	sequences  = bp2150,
	transcript = 'AT1G03730.1',
	space      = True
)

AT1G03730.1 | - | 1:930217-932983 | 2150
ACTAAAATAG AATAATTTAT GAAATACTTT AATTTACTTG GCTCACACGA CAGCCCGATG TCTTTAGAAC TAATTCATTT ACTTATTTTA AAAAAAAATC 
AAAGTGTATT AAATATAGAG ATCGTCAATA GGATGCCGAC ACCTTCATCG GCATGCGGAC CATGCGGTGT TGCCAATTTG CCATCGCGAC CGTGTTGTAT 
GGCAAAATTG GACTCCGTTT CCGGTCATGG GATTATGAAC TAGAAGATTC TTTGAAGTGT ACATCAATAG AAGACTACAA AACTGGAAGA TTATTTGTTT 
AATGTTCTTA CAAATTAATT CTCAAAGTCG TGTCAACTTC CATATTAGAA AAGCGTTTGA CACATAACAA AGTCGTTGCA ATCGTTCGTA CGTGAAATTG 
TTACAAATGT CAAGGGACGT TAAATTGTTT CATATGCCAA GTAAATCTTA GTATTCCAAA TGTCCAATTA CTTGGAAGAT GGTTTACCAT ACAATACCAC 
ATCCATATCC AATTTTAAAA ATGTTTGATT GTAGATTCTC AACTTATATG ATTAGTGTTC CATATATTGT AGATAAATGC TTAATCTCAT TTGATGACTA 
GACGGAAGAA TATTAAAATC CATATTCAAG AGAAAAAGCT AAGACTTCCA TAAGAAAAAG ACAATCGCAT GCATTCAAGA ATTTGCTAGC GGTGAACGTC 
AAGAAGTTTC GTTCCTTTAG AGTCTTCTTC ATAAAAGATA GACAAAACAA AACAAAACAA AAAAGGTAAA ATGTCATTAA AGGTTAGAAT AAATAGACGA 
TTCTTTTATT TTAGTTTGTC GGCTATAATA AGCGTAGAGG CGCACTATCA AGCCAAATCG CATTA

In [17]:
# Write the merged transcript sequences to a FASTA file

data_writer.write_fasta(
	filename = os.path.join(OUT_DATA, 'features-bp2150.fasta'),
	data     = bp2150
)

## 3.3 Mutations

In [18]:
# Select random transcripts to mutate

# Sample WITHOUT replacement : random.choices() may draw the same transcript more than once, and
# since duplicates collapse in the dictionary below, fewer than 25 transcripts could be selected.
# The sample size is capped so that a small `sequences` dictionary does not raise a ValueError.
# NOTE(review): no random seed is set in this notebook, so the selection differs between runs.
mutation_transcripts = set(random.sample(list(sequences.keys()), k = min(25, len(sequences))))

# Keep the selected entries in the original order of `sequences`
mutation_transcripts = {key : value for key, value in sequences.items() if key in mutation_transcripts}

In [19]:
# Create mutated variants of the selected transcripts

# Rates handed to the mutation routine through its `rates` argument
rates = [0.01, 0.05, 0.10, 0.15, 0.25]

# NOTE(review): how `rates` interacts with params['mutation_rate'] is decided inside
# data_synth.mutate_sequences - confirm there. Insertions and deletions are disabled (rate 0.0).
params = dict(
	mutation_rate     = 0.1,
	insertion_rate    = 0.0,
	deletion_rate     = 0.0,
	substitution_rate = 1.0,
	max_length        = 6
)

result = data_synth.mutate_sequences(
	sequences = mutation_transcripts,
	method    = 'random',
	variants  = 20,
	rates     = rates,
	params    = params
)

# The first two items of the result hold the mutated sequences and their features
mutation_sequences, mutation_features = result[0], result[1]

  0%|          | 0/125 [00:00<?, ?it/s]

In [20]:
# Merge the regions of every mutated transcript into one sequence, padded like the originals

mutation_bp2150 = data_feature.merge_and_pad_sequences(
	padding   = padding,
	lengths   = lengths,
	sequences = mutation_sequences
)

In [21]:
# Compute the similarity between the original and the mutated transcript sequences

data = dict()

for key, value in mutation_bp2150.items() :
	# Header fields are separated by ' | '; the first one is the mutated transcript identifier
	head, *tail = key.split(' | ')

	# Dropping everything after the first '-' of the identifier yields the header of the original
	source = head.partition('-')[0] + ' | ' + ' | '.join(tail)
	origin = bp2150[source]

	# Fraction of positions with an identical character, relative to the original length
	identity = sum(1 for x, y in zip(origin, value) if x == y) / len(origin)

	# Store the mutated sequence under its header extended with the similarity
	data['{} | {:.5f}'.format(key, identity)] = value

mutation_bp2150 = data

In [22]:
# Write the merged mutation transcript sequences to a FASTA file

data_writer.write_fasta(
	filename = os.path.join(OUT_DATA, 'mutation-features-bp2150.fasta'),
	data     = mutation_bp2150
)

In [23]:
# Write one archive per feature type for the mutated transcripts

# (feature key in `mutation_features`, suffix of the output filename)
tuples = [('Frequency', 'frequency'), ('Stability', 'stability')]

for feature, filename in tuples :
	# Every entry is stored as a numpy array under its own key
	data_writer.write_npz(
		filename = os.path.join(OUT_DATA, f'mutation-features-{filename}'),
		data     = dict(
			(name, numpy.array(entry[feature]))
			for name, entry in mutation_features.items()
		)
	)

## 3.4 Features

In [24]:
# Write one archive per feature type for the original transcripts

# (feature key in `features`, suffix of the output filename)
tuples = [('Frequency', 'frequency'), ('Stability', 'stability')]

for feature, filename in tuples :
	# Every entry is stored as a numpy array under its own key
	data_writer.write_npz(
		filename = os.path.join(OUT_DATA, f'features-{filename}'),
		data     = dict(
			(name, numpy.array(entry[feature]))
			for name, entry in features.items()
		)
	)