In [1]:

from __future__ import print_function
from math import ceil
import numpy as np
import sys
import pdb

import torch
import torch.optim as optim
import torch.nn as nn

import generator
import discriminator
import generator_kmer
import discriminator_kmer

import helpers

import kmer_embedding
CUDA = False
VOCAB_SIZE = 5000
MAX_SEQ_LEN = 20
START_LETTER = 0
BATCH_SIZE = 1
MLE_TRAIN_EPOCHS = 2
ADV_TRAIN_EPOCHS = 2
POS_NEG_SAMPLES = 10000

GEN_EMBEDDING_DIM = 32
GEN_HIDDEN_DIM = 32
DIS_EMBEDDING_DIM = 64
DIS_HIDDEN_DIM = 64

oracle_samples_path = './oracle_samples.trc'
oracle_state_dict_path = './oracle_EMBDIM32_HIDDENDIM32_VOCAB5000_MAXSEQLEN20.trc'
pretrained_gen_path = './gen_MLEtrain_EMBDIM32_HIDDENDIM32_VOCAB5000_MAXSEQLEN20.trc'
pretrained_dis_path = './dis_pretrain_EMBDIM_64_HIDDENDIM64_VOCAB5000_MAXSEQLEN20.trc'


def train_generator_MLE(gen, gen_opt, oracle, real_data_samples, epochs):
    # print("oracle = ",oracle)
    # print("real_data_samples = ",real_data_samples)
    """
    Max Likelihood Pretraining for the generator
    """
    for epoch in range(epochs):
        print('epoch %d : ' % (epoch + 1), end='')
        sys.stdout.flush()
        total_loss = 0

        for i in range(0, POS_NEG_SAMPLES, BATCH_SIZE):
            inp, target = helpers.prepare_generator_batch(real_data_samples[i:i + BATCH_SIZE], start_letter=START_LETTER,
                                                          gpu=CUDA)
            gen_opt.zero_grad()
            loss = gen.batchNLLLoss(inp, target)
            loss.backward()
            gen_opt.step()

            total_loss += loss.data.item()

            if (i / BATCH_SIZE) % ceil(
                            ceil(POS_NEG_SAMPLES / float(BATCH_SIZE)) / 10.) == 0:  # roughly every 10% of an epoch
                print('.', end='')
                sys.stdout.flush()

        # each loss in a batch is loss per sample
        total_loss = total_loss / ceil(POS_NEG_SAMPLES / float(BATCH_SIZE)) / MAX_SEQ_LEN

        # sample from generator and compute oracle NLL
        oracle_loss = helpers.batchwise_oracle_nll(gen, oracle, POS_NEG_SAMPLES, BATCH_SIZE, MAX_SEQ_LEN,
                                                   start_letter=START_LETTER, gpu=CUDA)

        print(' average_train_NLL = %.4f, oracle_sample_NLL = %.4f' % (total_loss, oracle_loss))


def train_generator_PG(gen, gen_opt, oracle, dis, num_batches):
    """
    The generator is trained using policy gradients, using the reward from the discriminator.
    Training is done for num_batches batches.
    """

    for batch in range(num_batches):
        s = gen.sample(BATCH_SIZE*2)        # 64 works best
        inp, target = helpers.prepare_generator_batch(s, start_letter=START_LETTER, gpu=CUDA)
        rewards = dis.batchClassify(target)

        gen_opt.zero_grad()
        pg_loss = gen.batchPGLoss(inp, target, rewards)
        pg_loss.backward()
        gen_opt.step()

    # sample from generator and compute oracle NLL
    oracle_loss = helpers.batchwise_oracle_nll(gen, oracle, POS_NEG_SAMPLES, BATCH_SIZE, MAX_SEQ_LEN,
                                                   start_letter=START_LETTER, gpu=CUDA)

    print(' oracle_sample_NLL = %.4f' % oracle_loss)


def train_discriminator(discriminator, dis_opt, real_data_samples, generator, oracle, d_steps, epochs):
    """
    Training the discriminator on real_data_samples (positive) and generated samples from generator (negative).
    Samples are drawn d_steps times, and the discriminator is trained for epochs epochs.
    """

    # generating a small validation set before training (using oracle and generator)
    pos_val = oracle.sample(100)
    neg_val = generator.sample(100)
    val_inp, val_target = helpers.prepare_discriminator_data(pos_val, neg_val, gpu=CUDA)

    for d_step in range(d_steps):
        s = helpers.batchwise_sample(generator, POS_NEG_SAMPLES, BATCH_SIZE)
        dis_inp, dis_target = helpers.prepare_discriminator_data(real_data_samples, s, gpu=CUDA)
        for epoch in range(epochs):
            print('d-step %d epoch %d : ' % (d_step + 1, epoch + 1), end='')
            sys.stdout.flush()
            total_loss = 0
            total_acc = 0

            for i in range(0, 2 * POS_NEG_SAMPLES, BATCH_SIZE):
                inp, target = dis_inp[i:i + BATCH_SIZE], dis_target[i:i + BATCH_SIZE]
                dis_opt.zero_grad()
                out = discriminator.batchClassify(inp)
                loss_fn = nn.BCELoss()
                loss = loss_fn(out, target)
                loss.backward()
                dis_opt.step()

                total_loss += loss.data.item()
                total_acc += torch.sum((out>0.5)==(target>0.5)).data.item()

                if (i / BATCH_SIZE) % ceil(ceil(2 * POS_NEG_SAMPLES / float(
                        BATCH_SIZE)) / 10.) == 0:  # roughly every 10% of an epoch
                    print('.', end='')
                    sys.stdout.flush()

            total_loss /= ceil(2 * POS_NEG_SAMPLES / float(BATCH_SIZE))
            total_acc /= float(2 * POS_NEG_SAMPLES)

            val_pred = discriminator.batchClassify(val_inp)
            print(' average_loss = %.4f, train_acc = %.4f, val_acc = %.4f' % (
                total_loss, total_acc, torch.sum((val_pred>0.5)==(val_target>0.5)).data.item()/200.))

# MAIN
if __name__ == '__main__':
    
    oracle = generator.Generator(GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA)
    oracle.load_state_dict(torch.load(oracle_state_dict_path))
    oracle_samples = torch.load(oracle_samples_path).type(torch.LongTensor)
    # a new oracle can be generated by passing oracle_init=True in the generator constructor
    # samples for the new oracle can be generated using helpers.batchwise_sample()
    print(oracle_samples)
    gen = generator.Generator(GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA)
    dis = discriminator.Discriminator(DIS_EMBEDDING_DIM, DIS_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA)

    # if CUDA:
    #     oracle = oracle.cuda()
    #     gen = gen.cuda()
    #     dis = dis.cuda()
    #     oracle_samples = oracle_samples.cuda()

    # # GENERATOR MLE TRAINING
    # print('Starting Generator MLE Training...')
    gen_optimizer = optim.Adam(gen.parameters(), lr=1e-2)
    train_generator_MLE(gen, gen_optimizer, oracle, oracle_samples, MLE_TRAIN_EPOCHS)

    # # torch.save(gen.state_dict(), pretrained_gen_path)
    # # gen.load_state_dict(torch.load(pretrained_gen_path))

    # # PRETRAIN DISCRIMINATOR
    # print('\nStarting Discriminator Training...')
    # dis_optimizer = optim.Adagrad(dis.parameters())
    # train_discriminator(dis, dis_optimizer, oracle_samples, gen, oracle, 2, 2)

    # # torch.save(dis.state_dict(), pretrained_dis_path)
    # # dis.load_state_dict(torch.load(pretrained_dis_path))

    # # ADVERSARIAL TRAINING
    # print('\nStarting Adversarial Training...')
    # oracle_loss = helpers.batchwise_oracle_nll(gen, oracle, POS_NEG_SAMPLES, BATCH_SIZE, MAX_SEQ_LEN,
    #                                            start_letter=START_LETTER, gpu=CUDA)
    # print('\nInitial Oracle Sample Loss : %.4f' % oracle_loss)

    # for epoch in range(ADV_TRAIN_EPOCHS):
    #     print('\n--------\nEPOCH %d\n--------' % (epoch+1))
    #     # TRAIN GENERATOR
    #     print('\nAdversarial Training Generator : ', end='')
    #     sys.stdout.flush()
    #     train_generator_PG(gen, gen_optimizer, oracle, dis, 1)

    #     # TRAIN DISCRIMINATOR
    #     print('\nAdversarial Training Discriminator : ')
    #     train_discriminator(dis, dis_optimizer, oracle_samples, gen, oracle, 2, 2)


  from .autonotebook import tqdm as notebook_tqdm


tensor([[  87, 4410, 3560,  ..., 4767, 4973,  619],
        [4766,  468, 2145,  ..., 2707, 1370, 2515],
        [4665, 4080, 2572,  ..., 1166, 4257, 3687],
        ...,
        [ 897, 2187,  557,  ...,  973, 3541, 4199],
        [1877, 4241,  295,  ..., 4311, 1110, 4311],
        [1144, 3528, 2032,  ..., 2318, 1271, 3479]])
epoch 1 : ..

In [None]:

kt_embd= kmer_embedding.kmer_tensor('test.fa',3,100,3)
ori = generator_kmer.Generator(kt_embd, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA)
print(kt_embd)
kt_embd1= kmer_embedding.kmer_tensor('test1.fa',3,100,3)
gen = generator_kmer.Generator(kt_embd1, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA)
dis = discriminator_kmer.Discriminator(kt_embd1, DIS_HIDDEN_DIM, MAX_SEQ_LEN, gpu=CUDA)
print(kt_embd1)
if CUDA:
    oracle = ori.cuda()
    gen = gen.cuda()
    dis = dis.cuda()
    # oracle_samples = oracle_samples.cuda()

gen_optimizer = optim.Adam(gen.parameters(), lr=1e-2)
train_generator_MLE(gen, gen_optimizer, kt_embd, kt_embd1, MLE_TRAIN_EPOCHS)




Loading Word2Vec model...
tensor([[-0.0009,  0.0013,  0.0040,  ..., -0.0059,  0.0008,  0.0069],
        [-0.0093,  0.0056,  0.0033,  ..., -0.0004, -0.0099,  0.0054],
        [-0.0003,  0.0043, -0.0081,  ...,  0.0019,  0.0080, -0.0062],
        ...,
        [-0.0063,  0.0052, -0.0095,  ...,  0.0064,  0.0052, -0.0070],
        [-0.0030,  0.0007,  0.0015,  ...,  0.0061,  0.0069, -0.0037],
        [ 0.0003,  0.0078,  0.0047,  ...,  0.0084, -0.0036, -0.0073]])
Loading Word2Vec model...
tensor([[-0.0009,  0.0013,  0.0040,  ..., -0.0059,  0.0008,  0.0069],
        [-0.0093,  0.0056,  0.0033,  ..., -0.0004, -0.0099,  0.0054],
        [-0.0003,  0.0043, -0.0081,  ...,  0.0019,  0.0080, -0.0062],
        ...,
        [-0.0063,  0.0052, -0.0095,  ...,  0.0064,  0.0052, -0.0070],
        [-0.0030,  0.0007,  0.0015,  ...,  0.0061,  0.0069, -0.0037],
        [ 0.0003,  0.0078,  0.0047,  ...,  0.0084, -0.0036, -0.0073]])
oracle =  tensor([[-0.0009,  0.0013,  0.0040,  ..., -0.0059,  0.0008,  0.0069],


IndexError: index out of range in self