In [1]:
# Libraries

import numpy
import os
import platform
import random
import shutil
import sys

In [2]:
# Ensure source path
#
# Walk upwards from the current working directory until a path ending with
# 'upolanc-thesis' is found, then make that directory importable (so that the
# `source.python` imports resolve) and switch the working directory to it.

ROOT = os.getcwd()

while not ROOT.endswith('upolanc-thesis') :
	ROOT = os.path.abspath(os.path.join(ROOT, os.pardir))

	# A path shorter than the project name can no longer end with it (and neither
	# can any of its parents), so give up and use a platform specific default
	if len(ROOT) < len('upolanc-thesis') :
		if   platform.system() == 'Linux' :
			ROOT = '/d/hpc/home/up4472/workspace/upolanc-thesis'
		elif platform.system() == 'Windows' :
			ROOT = 'C:\\Developer\\Workspace\\PyCharm\\Projects\\upolanc-thesis'
		else :
			raise ValueError(f'Could not find project directory and no default is defined for platform : {platform.system()}')

		print(f'Warning : could not find correct directory, using default : {ROOT}')
		break

if ROOT not in sys.path :
	sys.path.append(ROOT)

os.chdir(ROOT)

In [3]:
# Code

from source.python.data.mutation import mutation_sequence
from source.python.data.feature  import feature_extractor
from source.python.io            import loader
from source.python.io            import writer
from source.python               import runtime

# Apply the settings exposed by the project's runtime module
# NOTE(review) : presumably these configure numpy / pandas printing and the plotting theme; confirm in source.python.runtime
runtime.set_numpy_format()
runtime.set_pandas_format()
runtime.set_plot_theme()

# 1. Setup

In [4]:
# Setup some directory paths
#
# All paths are derived from ROOT and the selected filter subfolder.
# NOTE : the output directory of this notebook is deleted and recreated on every run.

FILTER_ID = 0
SUBFOLDER = 'filter' + str(FILTER_ID)

CWD = ROOT
OUT = os.path.join(CWD, 'output')
RES = os.path.join(CWD, 'resources')

OUT_DATA   = os.path.join(OUT, 'nbp04-feature', SUBFOLDER)
RES_GENOME = os.path.join(RES, 'genome')
RES_NBP01  = os.path.join(OUT, 'nbp01-filter',  SUBFOLDER)
RES_NBP02  = os.path.join(OUT, 'nbp02-anndata', SUBFOLDER)

# Start from a clean output directory; a missing directory is not an error
shutil.rmtree(OUT_DATA, ignore_errors = True)

os.makedirs(OUT_DATA, exist_ok = True)

# Report every directory that is read from or written to below
print(f'     Root Directory : {CWD}')
print(f'   Output Directory : {OUT_DATA}')
print(f' Resource Directory : {RES_GENOME}')
print(f' Resource Directory : {RES_NBP01}')
print(f' Resource Directory : {RES_NBP02}')

     Root Directory : C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis
   Output Directory : C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\output\nbp04-feature\filter0
 Resource Directory : C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\resources\genome
 Resource Directory : C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\output\nbp01-filter\filter0


In [5]:
# Load the annotated and cleaned data : genome assembly, gene annotation table and h5ad data

gene_assembly   = loader.load_faidx(filename = os.path.join(RES_GENOME, 'arabidopsis-r36', 'gene-assembly.fa'))
gene_annotation = loader.load_csv(filename = os.path.join(RES_NBP01, 'gene-annotation.csv'), low_memory = False)
anndata         = loader.load_h5ad(filename = os.path.join(RES_NBP02, 'arabidopsis-r36.h5ad'))

In [6]:
# Define the region lengths and the padding mode of every region
#
# Both dictionaries share the same keys; the *_full entries hold a pair of lengths.

lengths = dict(
	prom_full = [1000, 500],
	prom      = 1000,
	utr5      = 300,
	cds       = 9999,
	utr3      = 350,
	term      = 500,
	term_full = [500, 500]
)

padding = dict(
	prom_full = 'left',
	prom      = 'left',
	utr5      = 'left',
	cds       = 'none',
	utr3      = 'left',
	term      = 'right',
	term_full = 'right'
)

# 2. Transcript Regions

In [7]:
# Group annotations into regions

regions = feature_extractor.annotation_to_regions(annotation = gene_annotation, lengths = lengths)

# Report how many distinct genes and transcripts the regions cover
print()
print(f"Gene       : {regions['Gene'].nunique()}")
print(f"Transcript : {regions['Transcript'].nunique()}")

Passed 1st assertion : True
Passed 2nd assertion : True

Gene       : 19022
Transcript : 19022


# 3. Transcript Features

In [8]:
# Convert the regions into sequences and features
#
# The call returns a pair that is unpacked into `sequences` and `features`; both
# are indexed by their 'Transcript' column in the following cells.
# NOTE(review) : the sequences are presumably read from the indexed assembly using the region coordinates — confirm in regions_to_features

sequences, features = feature_extractor.regions_to_features(
	faidx     = gene_assembly,
	dataframe = regions,
	lengths   = lengths
)

[AT1G33612.1 ] : unknown codon [GYG]
[AT2G01120.1 ] : unknown codon [GKT]
[AT2G01120.1 ] : unknown codon [STA]
[AT2G01120.1 ] : unknown codon [GAK]
[AT2G01120.1 ] : unknown codon [KTG]
[AT2G48110.1 ] : unknown codon [GCM]
[AT3G63540.1 ] : out of bounds at sequence end


In [9]:
# Convert the dataframe into a dictionary for ease of use
#
# The result maps every transcript to a dictionary of its row (the 'Transcript' column is kept).
# NOTE : rebinding `sequences` means this cell can not be re-run without re-running the extraction first.

sequences = (
	sequences
	.copy()
	.set_index('Transcript', drop = False)
	.rename_axis(None, axis = 'index')
	.to_dict('index')
)

In [10]:
# Convert the dataframe into a dictionary for ease of use
#
# The result maps every transcript to a dictionary of its row (the 'Transcript' column is kept).
# NOTE : rebinding `features` means this cell can not be re-run without re-running the extraction first.

features = (
	features
	.copy()
	.set_index('Transcript', drop = False)
	.rename_axis(None, axis = 'index')
	.to_dict('index')
)

## 3.1 Fasta

In [11]:
# Add a header field with more transcript information
#
# The header template has six positional placeholders.
# NOTE(review) : judging by the printed examples the rendered header looks like
# 'transcript | strand | chromosome:start-end | length' — confirm in sequences_extend_kvpair

sequences = feature_extractor.sequences_extend_kvpair(
	sequences = sequences,
	regions   = regions,
	header    = '{} | {} | {}:{}-{} | {}'
)

In [12]:
# Example of a positive strand
#
# Print the extracted regions of a single transcript, provided that it is
# present in the extracted sequences. The identifier has to be a plain string
# in order to match the dictionary keys.

transcript = 'AT1G01010.1'

if transcript in sequences.keys() :
	feature_extractor.print_extracted_sequence(
		transcript = transcript,
		sequences  = sequences,
		space      = True
	)

In [13]:
# Example of a negative strand
#
# Print the extracted regions of a single transcript, provided that it is
# present in the extracted sequences.

transcript = 'AT1G01020.1'

if transcript in sequences :
	feature_extractor.print_extracted_sequence(transcript = transcript, sequences = sequences, space = True)

AT1G01020.1 | - | 1:9129-10128 | 1000
ATAAATATAT GAACCTACAT CATTATAAGT AGGGTTAAGT GTGTATGATT GTGTATGCGT ATAAAAATAC TCCCTTGACC GTAAACATGA AACATGTAAT 
ATATAAGATA TATAGACATG GAGACTATAT CATATAAACA TACATATATA TATATATGTT AGTTATATGT GTAGCCCACA TTTTTCGATA TAGGGGAACA 
AAGTTACGAA TAATGTATAT GTTAGTTATA TGTATAAAAC ATTTGCAAAG GGATGATACA ATGGAATATG TAAATACGTA TTCCAAATAG TCTACAAGTA 
GCAATGATAT TGTGTAGATA TGTCTCATAA AGGCTTGTGC TATTATATTG TGTAGATGTG TCTCAAAAGG CTTGTGCATA GTTGTCAAAA GAATATAAAA 
TGCATTTTAA TATTCAAAAG AATGCATGCA TAGACTGATA GAAAAGAAGA AATAATGAAA ATTAAAAGGG GAGGTATTAG TGAAAAGTAC GGAGGTATTA 
AAATTATGAA AATGACTAAA GGAAATAATA TTGTAACACA TCTAAATGAT TAGTTGTGTA AGAAGAATAA AGAGAATTAA TGACATGCTC CAATTATTGT 
TAAATAAAAT AAAATAATTC ATTCTCATTA GATTCCTGAC CTAAAAGAGA CTGCAAATAT GTAGGCCGGT TAGGGTTTTC ATAGGCATGA TTAATGATGA 
TAAGGAAACT TCGTATTGGG GGTTAATGGG TTCTTTCCCT TTCTAATACA TTTCTAAATA AAGTAATAAT CAAATATTAA ACTTTTAACT TACCGTTTCA 
TTTTTATGTT GAGTTTACTT ATATTGAAAG GAAACTATTT TATCTAAACC TCATTGTCAA ATCTTTCA

In [14]:
# Save the transcript region sequences, one fasta file per region

# Pairs of (region name inside `sequences`, suffix of the output filename)
tuples = [
	('Prom_Full', 'promoter-full'),
	('Prom',      'promoter'),
	('UTR5',      'utr5'),
	('CDS',       'cds'),
	('UTR3',      'utr3'),
	('Term',      'terminator'),
	('Term_Full', 'terminator-full')
]

for region, filename in tuples :
	# Map the key of every transcript region to its sequence
	records = dict()

	for entry in sequences.values() :
		records[entry[region]['key']] = entry[region]['seq']

	writer.write_fasta(
		data     = records,
		filename = os.path.join(OUT_DATA, f'sequences-{filename}.fasta')
	)

## 3.2 Sequences

In [15]:
# Merge transcript regions and pad accordingly
#
# The result is used as a dictionary keyed by the sequence header in the following cells.
# NOTE(review) : the name bp2150 presumably reflects the merged length of
# prom + utr5 + utr3 + term (1000 + 300 + 350 + 500) — confirm which regions are merged

bp2150 = feature_extractor.merge_and_pad_sequences(
	sequences = sequences,
	lengths   = lengths,
	padding   = padding
)

In [16]:
# Display an example of a merged transcript sequence
#
# Nothing is printed when the key is not present in the merged sequences.

transcript = 'AT1G03730.1'

if transcript in bp2150 :
	feature_extractor.print_padded_sequence(transcript = transcript, sequences = bp2150, space = True)

In [17]:
# Save the merged transcript sequences as a single fasta file

writer.write_fasta(data = bp2150, filename = os.path.join(OUT_DATA, 'sequences-bp2150.fasta'))

## 3.3 Mutations

In [18]:
# Select random transcripts to mutate
#
# random.sample draws without replacement, so exactly MUTATION_COUNT distinct
# transcripts are selected (or all of them when fewer are available). The
# selection keeps the iteration order of `sequences`.

MUTATION_COUNT = 25

mutation_transcripts = set(random.sample(list(sequences.keys()), k = min(MUTATION_COUNT, len(sequences))))
mutation_transcripts = {key : value for key, value in sequences.items() if key in mutation_transcripts}

In [19]:
# Mutate transcripts multiple times
#
# NOTE(review) : every selected transcript is presumably mutated once per entry of
# `rates`, producing `variants` variants each — confirm in generate_multi

rates = [0.01, 0.05, 0.10, 0.15, 0.25]

params = dict(
	mutation_rate     = 0.1,
	insertion_rate    = 0.0,
	deletion_rate     = 0.0,
	substitution_rate = 1.0,
	max_length        = 6
)

result = mutation_sequence.generate_multi(
	sequences = mutation_transcripts,
	variants  = 20,
	method    = 'random',
	rates     = rates,
	params    = params
)

# The first item holds the mutated sequences, the second one their features
mutation_sequences, mutation_features = result[0], result[1]

  0%|          | 0/125 [00:00<?, ?it/s]

In [20]:
# Merge mutation transcript regions and pad accordingly
#
# Uses the same `lengths` and `padding` configuration as the original transcripts,
# and the result is iterated as a dictionary of key -> sequence in the next cell.

mutation_bp2150 = feature_extractor.merge_and_pad_sequences(
	sequences = mutation_sequences,
	lengths   = lengths,
	padding   = padding
)

In [21]:
# Compute similarity between original and mutated transcript sequences
#
# Every key of the mutated sequences is extended with the fraction of positions
# that are identical to the original sequence.
# NOTE : rebinding `mutation_bp2150` means this cell can not be re-run on its own.

data = dict()

for mutkey, mutseq in mutation_bp2150.items() :
	fields = mutkey.split(' | ')

	# Key of the original : first field cut at its first '-', remaining fields unchanged
	orgkey = fields[0].split('-')[0] + ' | ' + ' | '.join(fields[1:])
	orgseq = bp2150[orgkey]

	# Count identical positions (zip stops at the shorter of the two sequences)
	same  = sum(1 for x, y in zip(orgseq, mutseq) if x == y)
	score = same / len(orgseq)

	data[f'{mutkey} | {score:.5f}'] = mutseq

mutation_bp2150 = data

In [22]:
# Save the merged mutation transcript sequences as a single fasta file

writer.write_fasta(data = mutation_bp2150, filename = os.path.join(OUT_DATA, 'mutation-sequences-bp2150.fasta'))

In [23]:
# Extract mutation features
#
# Split the per-key feature entries into one array dictionary per feature type.

mutation_features_frequency = {
	name : numpy.array(mutation_features[name]['Frequency'])
	for name in mutation_features
}

mutation_features_stability = {
	name : numpy.array(mutation_features[name]['Stability'])
	for name in mutation_features
}

In [24]:
# Save the mutation features, one npz archive per feature type

for payload, basename in (
	(mutation_features_frequency, 'mutation-features-frequency'),
	(mutation_features_stability, 'mutation-features-stability')
) :
	writer.write_npz(data = payload, filename = os.path.join(OUT_DATA, basename))

In [25]:
# Save merged features
#
# For every key the frequency array is followed by the stability array along the first axis.

mutation_features_base = {
	name : numpy.concatenate((frequency, mutation_features_stability[name]), axis = 0)
	for name, frequency in mutation_features_frequency.items()
}

writer.write_npz(data = mutation_features_base, filename = os.path.join(OUT_DATA, 'mutation-features-base'))

## 3.4 Features

In [26]:
# Extract features
#
# Split the per-transcript feature entries into one array dictionary per feature type.

features_frequency = {
	name : numpy.array(features[name]['Frequency'])
	for name in features
}

features_stability = {
	name : numpy.array(features[name]['Stability'])
	for name in features
}

In [27]:
# Save the features, one npz archive per feature type

for payload, basename in (
	(features_frequency, 'features-frequency'),
	(features_stability, 'features-stability')
) :
	writer.write_npz(data = payload, filename = os.path.join(OUT_DATA, basename))

In [28]:
# Save merged features
#
# For every transcript the frequency array is followed by the stability array along the first axis.

features_base = {
	name : numpy.concatenate((frequency, features_stability[name]), axis = 0)
	for name, frequency in features_frequency.items()
}

writer.write_npz(data = features_base, filename = os.path.join(OUT_DATA, 'features-base'))

## 3.5 Anndata

In [29]:
# Save the annotated data with multiple layers
#
# Only the entries on the second axis that match a transcript with extracted
# features are kept; the selection is copied before it is written.

writer.write_h5ad(
	data     = anndata[:, list(features)].copy(),
	filename = os.path.join(OUT_DATA, 'arabidopsis-r36.h5ad')
)