In [107]:
!pip install bcolz



In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
cd drive/My\ Drive/Projects/QuestionGeneration

/content/drive/My Drive/Projects/QuestionGeneration


In [0]:
import os
import math
import copy
import json
import bcolz
import random
import argparse
import numpy as np
import pandas as po
#from tqdm import tqdm
from tqdm.notebook import tqdm

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F

In [0]:
#from transformers import BertModel
#from transformers import BertTokenizer

In [0]:
#torch.set_default_tensor_type(torch.cuda.FloatTensor)

In [0]:
#from generator import Generator
#from discriminator import Discriminator
#from target_lstm import TargetLSTM
#from rollout import Rollout
#from data_iter  import GenDataIter, DisDataIter

In [0]:
vectors = bcolz.open(f'conceptnet_embeddings/300.data')[:]

In [0]:
conceptnet = {w: vectors[i] for i, w in enumerate(words)}

In [0]:
# Basic Training Paramters
SEED = 88
BATCH_SIZE = 64
TOTAL_BATCH = 200
GENERATED_NUM = 10000
POSITIVE_FILE = 'real.data'
NEGATIVE_FILE = 'generated_questions.data'
QUESTION_FILE = 'questions_real.data'
EVAL_FILE = 'eval.data'
PRE_EPOCH_NUM = 120

In [0]:
random.seed(SEED)
np.random.seed(SEED)

In [0]:
device = torch.device('cuda:0' if torch.cuda. is_available() else 'cpu')

In [0]:
train_df = po.read_csv('data/train_df.csv')

In [0]:
pickle.dump(words, open(f'conceptnet_embeddings/300_vocab.pkl', 'wb'))
pickle.dump(word2idx, open(f'conceptnet_embeddings/300_lookup_table.pkl', 'wb'))

In [0]:
train_df.head()

Unnamed: 0,Paragraph,Question,Answer
0,You fellas above says its the best Taiwanese r...,how many taiwanese restaurants are there in th...,ONLY
1,You fellas above says its the best Taiwanese r...,what do you think of the price ?,mediocre
2,BEST DINING EXPERIENCE IN THE WEST VILLAGE ! S...,do you like their decor ?,Highly impressed
3,If you are the type of person who likes being ...,how many kinds of beers do they offer ?,over 100
4,If you are the type of person who likes being ...,do you like their food ?,delicious


In [0]:
tokenized_questions = []
for ques in train_df['Question']:
  tokenized_ques = bert_tokenizer.encode(ques, add_special_tokens=True, max_length = 6, pad_to_max_length=True, pad_token = '<pad>')
  tokenized_questions.append(tokenized_ques)

In [0]:
tokenized_questions = tokenized_questions[:576]

In [0]:
with open(QUESTION_FILE, 'w') as q_file:
  for tokeneized_ques in tokenized_questions:
      ques = ' '.join([str(token) for token in tokeneized_ques])
      q_file.write('%s\n' % ques)
q_file.close()

In [0]:
# Genrator Parameters
g_emb_dim = 768 # input_dim
g_hidden_dim = 768
g_sequence_len = 6

In [0]:
def bert_embedding(tokenized_sentences, length):
  return bert_model(tokenized_sentences)[0].view(length, -1, g_emb_dim)

In [0]:
def train_epoch(model, data_iter, criterion, optimizer):
    total_loss = 0.
    total_words = 0.
    for (data, target) in data_iter:
        data = Variable(data)
        target = Variable(target)
        data, target = data.to(device), target.to(device)
        target = target.contiguous().view(-1)
        pred = model.forward(data)
        loss = criterion(pred, target)
        total_loss += loss.item()
        total_words += data.size(0) * data.size(1)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    data_iter.reset()
    return math.exp(total_loss / total_words)

In [0]:
def eval_epoch(model, data_iter, criterion):
    total_loss = 0.
    total_words = 0.
    with torch.no_grad():
        for (data, target) in data_iter:
            data = Variable(data)
            target = Variable(target)
            data, target = data.to(device), target.to(device)
            target = target.contiguous().view(-1)
            pred = model.forward(data)
            loss = criterion(pred, target)
            total_loss += loss.item()
            total_words += data.size(0) * data.size(1)
        data_iter.reset()

    assert total_words > 0  # Otherwise NullpointerException
    return math.exp(total_loss / total_words)

In [0]:
def generate_samples(model, batch_size, generated_num, output_file):
    samples = []
    for _ in range(int(generated_num / batch_size)):
        sample = model.sample(batch_size, g_sequence_len).cpu().data.numpy().tolist()
        samples.extend(sample)
    with open(output_file, 'w') as fout:
        for sample in samples:
            string = ' '.join([str(s) for s in sample])
            fout.write('%s\n' % string)

In [0]:
class GenDataIter(object):
    """ Toy data iter to load digits"""
    def __init__(self, data_file, batch_size):
        super(GenDataIter, self).__init__()
        self.batch_size = batch_size
        self.data_lis = self.read_file(data_file)
        self.data_num = len(self.data_lis)
        self.indices = range(self.data_num)
        self.num_batches = int(math.ceil(float(self.data_num)/self.batch_size))
        self.idx = 0

    def __len__(self):
        return self.num_batches

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def reset(self):
        self.idx = 0
        random.shuffle(self.data_lis)

    def next(self):
        if self.idx >= self.data_num:
            raise StopIteration
        index = self.indices[self.idx:self.idx+self.batch_size]
        d = [self.data_lis[i] for i in index]
        d = torch.LongTensor(np.asarray(d, dtype='int64')).to(device)
        #data = d
        data = torch.cat([torch.zeros(self.batch_size, 1, device=device).long(), d], dim=1).to(device)
        target = torch.cat([d, torch.zeros(self.batch_size, 1, device=device).long()], dim=1).to(device)
        #target = torch.zeros(self.batch_size, 1).long()
        self.idx += self.batch_size
        return data, target

    def read_file(self, data_file):
        with open(data_file, 'r') as f:
            lines = f.readlines()
        lis = []
        for line in lines:
            l = line.strip().split(' ')
            l = [int(s) for s in l]
            lis.append(l)
        return lis

In [0]:
class Generator(nn.Module):
    """Generator """
    def __init__(self, num_emb, emb_dim, hidden_dim, device):
        super(Generator, self).__init__()
        self.num_emb = num_emb
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.device = device
        #self.emb = nn.Embedding(num_emb, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim)
        self.lin = nn.Linear(hidden_dim, num_emb)
        self.softmax = nn.LogSoftmax(dim = 1)
        self.init_params()

    def forward(self, x):
        """
        Args:
            x: (batch_size, seq_len), sequence of tokens generated by generator
        """
        #emb = self.emb(x).to(self.device)
        emb = bert_embedding(x, x.size(1))
        h0, c0 = self.init_hidden(x.size(0))
        h0, c0 = h0.to(self.device), c0.to(self.device)
        self.lstm.flatten_parameters()
        output, (h, c) = self.lstm(emb, (h0, c0))
        output, (h, c) = output.to(self.device), (h.to(self.device), c.to(self.device))
        pred = self.softmax(self.lin(output.contiguous().view(-1, self.hidden_dim))).to(self.device)
        #pred = self.softmax(self.lin(output.contiguous().view(-1, self.hidden_dim))).to(self.device)
        return pred

    def step(self, x, h, c):
        """
        Args:
            x: (batch_size,  1), sequence of tokens generated by generator
            h: (1, batch_size, hidden_dim), lstm hidden state
            c: (1, batch_size, hidden_dim), lstm cell state
        """
        #emb = self.emb(x).to(self.device)
        emb = bert_embedding(x, x.size(1))
        self.lstm.flatten_parameters()
        output, (h, c) = self.lstm(emb, (h, c))
        h, c = h.to(self.device), c.to(self.device)
        pred = F.softmax(self.lin(output.view(-1, self.hidden_dim)), dim=1).to(self.device)
        return pred, h, c


    def init_hidden(self, batch_size):
        h = Variable(torch.zeros((1, batch_size, self.hidden_dim))).to(self.device)
        c = Variable(torch.zeros((1, batch_size, self.hidden_dim))).to(self.device)
        return h, c

    def init_params(self):
        for param in self.parameters():
            param.data.uniform_(-0.05, 0.05)

    def sample(self, batch_size, seq_len, x=None):
        res = []
        flag = False # whether sample from zero
        if x is None:
            flag = True
        if flag:
            x = Variable(torch.zeros((batch_size, 1)).long()).to(self.device)
        h, c = self.init_hidden(batch_size)
        samples = []
        if flag:
            for i in range(seq_len):
                output, h, c = self.step(x, h, c)
                x = output.multinomial(1)
                samples.append(x)
        else:
            given_len = x.size(1)
            lis = x.chunk(x.size(1), dim=1)
            for i in range(given_len):
                output, h, c = self.step(lis[i], h, c)
                samples.append(lis[i])
            x = output.multinomial(1)
            for i in range(given_len, seq_len):
                samples.append(x)
                output, h, c = self.step(x, h, c)
                x = output.multinomial(1)
        output = torch.cat(samples, dim=1).to(self.device)
        return output

In [0]:
# Define Networks
generator = Generator(bert_tokenizer.vocab_size, g_emb_dim, g_hidden_dim, device)

In [0]:
questions_data_iter = GenDataIter(QUESTION_FILE, BATCH_SIZE)

In [0]:
gen_criterion = nn.NLLLoss(reduction='sum').to(device)
gen_optimizer = optim.Adam(generator.parameters(), lr = 0.0001)

In [0]:
# Pretrain Generator using MLE #make the generator generate sentences similar to real sentences
print('Pretrain with MLE ...')
for epoch in range(PRE_EPOCH_NUM):
    loss = train_epoch(generator, questions_data_iter, gen_criterion, gen_optimizer)
    if epoch%10 == 0:
      print('Epoch [%d] Model Loss: %f'% (epoch, loss))

Pretrain with MLE ...
Epoch [0] Model Loss: 22838.567841
Epoch [10] Model Loss: 9.060800
Epoch [20] Model Loss: 4.020726
Epoch [30] Model Loss: 2.814698
Epoch [40] Model Loss: 2.319436
Epoch [50] Model Loss: 2.003290
Epoch [60] Model Loss: 1.787635
Epoch [70] Model Loss: 1.652103
Epoch [80] Model Loss: 1.541392
Epoch [90] Model Loss: 1.456236
Epoch [100] Model Loss: 1.382376
Epoch [110] Model Loss: 1.326253


In [0]:
bert_tokenizer.convert_ids_to_tokens(generator.sample(1, 6).tolist()[0])

['[unused584]', '[SEP]', '[PAD]', '[CLS]', 'reservation', 'how']

In [0]:
# Discriminator Parameters
d_emb_dim = 768 
d_filter_sizes = [1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5]
d_num_filters = [100, 200, 200, 200, 200, 100, 100, 100, 100, 100, 160, 160]

In [0]:
d_dropout = 0.75
d_num_class = 2

In [0]:
class GANLoss(nn.Module):
    """Reward-Refined NLLLoss Function for adversial training of Generator"""
    def __init__(self):
        super(GANLoss, self).__init__()

    def forward(self, prob, target, reward):
        """
        Args:
            prob: (N, C), torch Variable
            target : (N, ), torch Variable
            reward : (N, ), torch Variable
        """
        N = target.size(0)
        C = prob.size(1)
        one_hot = torch.zeros((N, C))
        one_hot = one_hot.to(device)
        one_hot.scatter_(1, target.data.view((-1,1)), 1)
        one_hot = one_hot.type(torch.BoolTensor)
        one_hot = Variable(one_hot)
        one_hot = one_hot.to(device)
        loss = torch.masked_select(prob, one_hot)
        loss = loss * reward
        loss =  -torch.sum(loss)
        return loss

In [0]:
class Discriminator(nn.Module):
    """A CNN for text classification

    architecture: Embedding >> Convolution >> Max-pooling >> Softmax
    """

    def __init__(self, num_classes, vocab_size, emb_dim, filter_sizes, num_filters, dropout):
        super(Discriminator, self).__init__()
        #self.emb = nn.Embedding(vocab_size, emb_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, n, (f, emb_dim)) for (n, f) in zip(num_filters, filter_sizes)
        ])
        self.highway = nn.Linear(sum(num_filters), sum(num_filters))
        self.dropout = nn.Dropout(p=dropout)
        self.lin = nn.Linear(sum(num_filters), num_classes)
        self.softmax = nn.LogSoftmax(dim = 1)
        self.init_parameters()

    def forward(self, x):
        """
        Args:
            x: (batch_size * seq_len)
        """
        device = torch.device('cuda:0' if torch.cuda. is_available() else 'cpu')
        
        #emb = self.emb(x).unsqueeze(1).to(device)  # batch_size * 1 * seq_len * emb_dim
        emb = bert_embedding(x, x.size(1)).unsqueeze(1).view(BATCH_SIZE, 1, -1, d_emb_dim)
        #print(emb.shape)
        convs = [F.relu(conv(emb)).squeeze(3).to(device) for conv in self.convs]  # [batch_size * num_filter * length]
        pools = [F.max_pool1d(conv, conv.size(2)).squeeze(2).to(device) for conv in convs] # [batch_size * num_filter]
        pred = torch.cat(pools, 1).to(device)  # batch_size * num_filters_sum
        highway = self.highway(pred).to(device)
        pred = torch.sigmoid(highway) *  F.relu(highway) + (1. - torch.sigmoid(highway)) * pred
        pred = self.softmax(self.lin(self.dropout(pred))).to(device)
        return pred

    def init_parameters(self):
        for param in self.parameters():
            param.data.uniform_(-0.05, 0.05)

In [0]:
from data_iter import DisDataIter

In [0]:
discriminator = Discriminator(d_num_class, bert_tokenizer.vocab_size, d_emb_dim, d_filter_sizes, d_num_filters, d_dropout).to(device)

In [0]:
dis_criterion = nn.NLLLoss(reduction='sum').to(device)
dis_optimizer = optim.Adam(discriminator.parameters())

In [0]:
# Pretrain Discriminator
print('Pretrain Discriminator ...')
for epoch in range(5):
    generate_samples(generator, BATCH_SIZE, GENERATED_NUM, NEGATIVE_FILE)
    dis_data_iter = DisDataIter(QUESTION_FILE, NEGATIVE_FILE, BATCH_SIZE)
    for _ in range(3):
        loss = train_epoch(discriminator, dis_data_iter, dis_criterion, dis_optimizer)
        print('Epoch [%d], loss: %f' % (epoch, loss))

Pretrain Discriminator ...
Epoch [0], loss: 1.166704
Epoch [0], loss: 1.045532
Epoch [0], loss: 1.000227
Epoch [1], loss: 1.000042
Epoch [1], loss: 1.000668
Epoch [1], loss: 1.000112
Epoch [2], loss: 1.000088
Epoch [2], loss: 1.000600
Epoch [2], loss: 1.000264
Epoch [3], loss: 1.000020
Epoch [3], loss: 1.000077
Epoch [3], loss: 1.000055
Epoch [4], loss: 1.000260
Epoch [4], loss: 1.001459
Epoch [4], loss: 1.000147


In [0]:
test_sentences = generator.sample(64, 6)

In [0]:
nn.Softmax(dim = 1)(discriminator(test_sentences)).argmax(dim = 1)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [0]:
class Rollout(object):
    """Roll-out policy"""
    def __init__(self, model, update_rate):
        self.ori_model = model
        self.own_model = copy.deepcopy(model)
        self.update_rate = update_rate

    def get_reward(self, x, num, discriminator):
        """
        Args:
            x : (batch_size, seq_len) input data
            num : roll-out number
            discriminator : discrimanator model
        """
        rewards = []
        batch_size = x.size(0)
        seq_len = x.size(1)
        for i in range(num):
            for l in range(1, seq_len):
                data = x[:, 0:l]
                samples = self.own_model.sample(batch_size, seq_len, data)
                pred = discriminator(samples)
                pred = pred.cpu().data[:,1].numpy()
                if i == 0:
                    rewards.append(pred)
                else:
                    rewards[l-1] += pred

            # for the last token
            pred = discriminator(x)
            pred = pred.cpu().data[:, 1].numpy()
            if i == 0:
                rewards.append(pred)
            else:
                rewards[seq_len-1] += pred
        rewards = np.transpose(np.array(rewards)) / (1.0 * num) # batch_size * seq_len
        return rewards

    def update_params(self):
        dic = {}
        for name, param in self.ori_model.named_parameters():
            dic[name] = param.data
        for name, param in self.own_model.named_parameters():
            if name.startswith('emb'):
                param.data = dic[name]
            else:
                param.data = self.update_rate * param.data + (1 - self.update_rate) * dic[name]

In [0]:
# Adversarial Training
rollout = Rollout(generator, 0.8)
print('Start Adeversatial Training...\n')
gen_gan_loss = GANLoss()
gen_gan_optm = optim.Adam(generator.parameters())
gen_gan_loss = gen_gan_loss.to(device)
gen_criterion = nn.NLLLoss(reduction='sum').to(device)
dis_criterion = nn.NLLLoss(reduction='sum').to(device)
dis_optimizer = optim.Adam(discriminator.parameters())

Start Adeversatial Training...



In [0]:
samples = generator.sample(BATCH_SIZE, g_sequence_len)
zeros = torch.zeros((BATCH_SIZE, 1)).type(torch.LongTensor).to(device)
inputs = Variable(torch.cat([zeros, samples.data], dim = 1)[:, :-1].contiguous())
targets = Variable(samples.data).contiguous().view((-1,))
rewards = rollout.get_reward(samples, 16, discriminator)
rewards = Variable(torch.Tensor(rewards))
rewards = torch.exp(rewards).contiguous().view((-1,))
rewards = rewards.to(device)
prob = generator.forward(inputs)

In [0]:
from tqdm import tqdm_notebook

In [0]:
for total_batch in tqdm_notebook(range(TOTAL_BATCH)):
    ## Train the generator for one step

    for it in range(1):
        samples = generator.sample(BATCH_SIZE, g_sequence_len)
        # construct the input to the genrator, add zeros before samples and delete the last column
        zeros = torch.zeros((BATCH_SIZE, 1)).type(torch.LongTensor).to(device)

        inputs = Variable(torch.cat([zeros, samples.data], dim = 1)[:, :-1].contiguous())
        targets = Variable(samples.data).contiguous().view((-1,))
        # calculate the reward
        rewards = rollout.get_reward(samples, 16, discriminator)
        rewards = Variable(torch.Tensor(rewards))
        rewards = torch.exp(rewards).contiguous().view((-1,))
        rewards = rewards.to(device)
        prob = generator.forward(inputs)
        loss = gen_gan_loss(prob, targets, rewards)
        gen_gan_optm.zero_grad()
        loss.backward()
        gen_gan_optm.step()

        if total_batch % 20 == 0:
          print('Loss when TOTAL_BATCH = {} is {}'.format(total_batch, loss))

    for _ in range(4):
        generate_samples(generator, BATCH_SIZE, GENERATED_NUM, NEGATIVE_FILE)
        dis_data_iter = DisDataIter(QUESTION_FILE, NEGATIVE_FILE, BATCH_SIZE)
        for _ in range(2):
            loss = train_epoch(discriminator, dis_data_iter, dis_criterion, dis_optimizer)

    if total_batch % 20 == 0:
      print('Saving Generator and Discriminator')
      torch.save(generator, 'generator_model.pt')
      torch.save(discriminator, 'discriminator_model.pt')

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

Loss when TOTAL_BATCH = 0 is -0.0
Saving Generator and Discriminator


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Loss when TOTAL_BATCH = 20 is -0.0
Saving Generator and Discriminator
Loss when TOTAL_BATCH = 40 is -0.0
Saving Generator and Discriminator
Loss when TOTAL_BATCH = 60 is -0.0
Saving Generator and Discriminator
Loss when TOTAL_BATCH = 80 is -0.0
Saving Generator and Discriminator
Loss when TOTAL_BATCH = 100 is -0.0
Saving Generator and Discriminator
Loss when TOTAL_BATCH = 120 is -0.0
Saving Generator and Discriminator
Loss when TOTAL_BATCH = 140 is -0.0
Saving Generator and Discriminator
