In [1]:
import os
# Comma-separated GPU indices this notebook is allowed to use; the same
# string is later split to derive the number of worker processes.
devices = "12,13,14,15"

# Export the process-wide settings in one step.
# NOTE(review): MASTER_ADDR / MASTER_PORT are presumably consumed by the
# torch.distributed initialisation inside thesis_code.convert - confirm.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": devices,
        "MASTER_ADDR": "localhost",
        "MASTER_PORT": "12356",
    }
)

In [2]:
import torch.multiprocessing as mp
from thesis_code.convert import train, cross_validate, cross_validate_, cross_validate_stratified, MODE_GREEDY, MODE_BEAM
from thesis_code.model import Converter, TransformerConverter, FastAutoregressiveConverter
from sample.gMLP.model import MultigMLP
import torch.nn as nn
# Hyperparameters for the transformer converter; this dict is what the
# notebook later binds to `config`.
config_transformer = dict(
    n_tokens=66,
    seq_len=700,
    n_layers=6,
    n_heads=2,
    query_dimensions=128,
    value_dimensions=128,
    feed_forward_dimensions=256,
    attention_type="full",
    n_species=34,
    pretrain=True,
)

# Optimisation settings handed to the training entry points.
n_epochs, batch_size = 2000, 64
# NOTE(review): the factor 14 is unexplained in this notebook - document
# where it comes from (e.g. a scaling rule) or name it as a constant.
lr = 1e-4 * 14
warmup = 0.1
use_apex = False

# Source strain identifiers. The list holds 34 entries, the same number as
# config_transformer["n_species"].
strain_ids = [
    22096, 15376, 22118, 22146, 8415, 21918, 20123, 452,
    18655, 6750, 17659, 421, 22191, 21978, 12722, 17400,
    15093, 20120, 20313, 20114, 22204, 19272, 17982, 19601,
    21259, 22091, 1375, 10427, 18739, 18441, 22200, 22201,
    22202, 22203,
]

direction = 2
# Kept as a separate flag in addition to config_transformer["pretrain"];
# both are currently True.
pretrain = True
# Model class and constructor settings for the pre-trained network whose
# weights are stored at `pretrain_path`.
pretrain_class = MultigMLP
config_pretrain = dict(
    n_tokens=67,
    d_in=512,
    d_ffn=1024,
    max_len=701,
    n_layers=32,
    act=nn.Tanh(),
    n_species=278,
)
pretrain_path = "./Result/pretrain/protein_family.pt"

# Logging / checkpoint cadence and the directory results are written under.
log_interval = 100
save_interval = 1000
output_dir = "./Result/pretrain/protein_family_stratified"

# Converter architecture used by every later cell, with its configuration.
converter_class = FastAutoregressiveConverter
config = config_transformer

# One worker process per entry in the comma-separated `devices` string.
nprocs = devices.count(",") + 1

In [None]:
# Training for k-fold cross-validation.
# mp.spawn starts `nprocs` processes and calls
# cross_validate(process_index, *args) in each of them.
# NOTE(review): the meaning of the positional literals (`False`, `[35]`) is
# defined by thesis_code.convert.cross_validate - confirm against its signature.
mp.spawn(
    cross_validate,
    nprocs=nprocs,
    args=(
        nprocs,
        converter_class,
        config,
        "{}/weight/checkpoint_12000.pt".format(output_dir),
        n_epochs, batch_size, lr, warmup, use_apex, False,
        strain_ids, [35], direction, MODE_BEAM,
        pretrain, pretrain_class, config_pretrain, pretrain_path,
        log_interval, save_interval, output_dir,
    ),
)

In [3]:
# Training for k-fold cross-validation (stratified k-fold variant).
# mp.spawn starts `nprocs` processes and calls
# cross_validate_stratified(process_index, *args) in each of them.
# NOTE(review): the meaning of the positional literals (`False`, `[35]`) is
# defined by thesis_code.convert.cross_validate_stratified - confirm against
# its signature.
mp.spawn(
    cross_validate_stratified,
    nprocs=nprocs,
    args=(
        nprocs,
        converter_class,
        config,
        "{}/weight/checkpoint_12000.pt".format(output_dir),
        n_epochs, batch_size, lr, warmup, use_apex, False,
        strain_ids, [35], direction, MODE_BEAM,
        pretrain, pretrain_class, config_pretrain, pretrain_path,
        log_interval, save_interval, output_dir,
    ),
)

In [3]:
from thesis_code.convert import test, MODE_BEAM, MODE_GREEDY
# Run the evaluation entry point with beam-search decoding.
# NOTE(review): 12000 presumably selects the same checkpoint step as the
# checkpoint_12000.pt file referenced by the training cells; the meaning of
# the positional 3 and 0 is defined by thesis_code.convert.test - confirm.
test(
    converter_class,
    config,
    strain_ids,
    [35],
    direction,
    12000,
    3,
    MODE_BEAM,
    0,
    output_dir=output_dir,
)

In [None]:
from thesis_code.convert import test_from_fasta, MODE_GREEDY, MODE_BEAM
# Generate sequences from a source FASTA file using beam-search decoding.
# NOTE(review): 22096 is also the first entry of `strain_ids`; the meaning of
# the positional 2000, 2, 0 and 100 is defined by
# thesis_code.convert.test_from_fasta - confirm against its signature.
test_from_fasta(
    converter_class,
    config,
    strain_ids,
    22096,
    2000,
    2,
    MODE_BEAM,
    0,
    pretrain,
    pretrain_class,
    config_pretrain,
    pretrain_path,
    output_dir,
    100,
)

 50%|█████     | 1/2 [20:48<20:48, 1248.25s/it]

In [3]:
from thesis_code.visualize import scatter_plot, scatter_plot_beam, METRIC_IDENTITY_NC, METRIC_IDENTITY_AA
from thesis_code.analysis import gc_content, log_cai, rscu
from Bio import SeqIO
import numpy as np
# Visualization: for result sets 1 and 2, load the source / target /
# generated FASTA records and draw one beam-search scatter plot per set.

reference = "./sample/data/bacillus_subtilis/GCF_000009045.1_ASM904v1_cds_from_genomic.fna"
# `ref` is only needed by the (currently disabled) CAI print below.
ref = rscu(reference)

for i in range(1, 3):
    tgt_path = "{}/finetune/tgt_{}.fna".format(output_dir, i)
    gen_path = "{}/finetune/gen_{}.fna".format(output_dir, i)
    src_path = "{}/finetune/src_{}.fna".format(output_dir, i)

    # Materialise each parser into a list while its file is still open.
    with open(tgt_path, "r") as f:
        tgt = [record for record in SeqIO.parse(f, "fasta")]
    with open(gen_path, "r") as f:
        gen = [record for record in SeqIO.parse(f, "fasta")]
    with open(src_path, "r") as f:
        src = [record for record in SeqIO.parse(f, "fasta")]

    # Optional summary statistics, disabled for now:
    #print(np.exp([log_cai(str(x.seq), ref) for x in gen]).mean())
    #print(np.array([gc_content(str(x.seq)) for x in gen]).mean())

    # NOTE(review): the output file is named "beam_search_nc_*" while the
    # metric is METRIC_IDENTITY_AA - confirm the name is intentional.
    plot_path = "{}/beam_search_nc_{}.png".format(output_dir, i)
    scatter_plot_beam(src, tgt, gen, plot_path, metric=METRIC_IDENTITY_AA)

287it [00:22, 12.98it/s]
287it [00:21, 13.58it/s]
