In [1]:
# Mount Google Drive inside the Colab runtime; every path used below lives
# under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
sys.path.append('/content/drive/MyDrive/ANLP21')

import os
import argparse
import logging

import torch
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
import torchtext

import seq2seq

from seq2seq.trainer import SupervisedTrainer, SelfCriticalTrainer
from seq2seq.models import EncoderRNN, DecoderRNN, Seq2seq, TopKDecoder
from seq2seq.loss import Perplexity, NLLLoss, PositiveLoss
from seq2seq.optim import Optimizer
from seq2seq.dataset import SourceField, TargetField
from seq2seq.evaluator import Predictor, Evaluator
from seq2seq.util.checkpoint import Checkpoint
import torch.nn.functional as F



In [33]:

import warnings

# Key of the corpus to train on. It is looked up in the `datasets` table
# defined in the data-preparation cell, so it must be one of that table's keys.
dataset = 'NL-RX-Turk'

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

In [34]:


# Python 2/3 compatibility shim: after this cell the name `raw_input` is
# always defined. If looking it up raises NameError, it is bound to `input`.
try:
    raw_input          # Python 2
except NameError:
    raw_input = input  # Python 3

In [35]:


# Prepare dataset
src = SourceField()
tgt = TargetField()

# data/kb/train/data.txt
#data/NL-RX-Synth/train/data.txt
#data/NL-RX-Turk/train/data.txt

datasets = {
    'kb13': ('KB13', 35, 60),
    'NL-RX-Synth': ('NL-RX-Synth', 10, 40),
    'NL-RX-Turk': ('NL-RX-Turk', 10, 40)
}

data_tuple = datasets[dataset]

# max_len = 60
max_len = data_tuple[2]
def len_filter(example):
    return len(example.src) <= max_len and len(example.tgt) <= max_len
train = torchtext.legacy.data.TabularDataset(
    path='/content/drive/MyDrive/ANLP21/data/' + data_tuple[0] + '/train/data.txt', format='tsv',
    fields=[('src', src), ('tgt', tgt)],
    filter_pred=len_filter
)
dev = torchtext.legacy.data.TabularDataset(
    path='/content/drive/MyDrive/ANLP21/data/' + data_tuple[0] + '/val/data.txt', format='tsv',
    fields=[('src', src), ('tgt', tgt)],
    filter_pred=len_filter
)
test = torchtext.legacy.data.TabularDataset(
    path='/content/drive/MyDrive/ANLP21/data/' + data_tuple[0] + '/test/data.txt', format='tsv',
    fields=[('src', src), ('tgt', tgt)],
    filter_pred=len_filter
)
src.build_vocab(train, max_size=500)
tgt.build_vocab(train, max_size=500)
input_vocab = src.vocab
output_vocab = tgt.vocab

In [36]:


# Prepare loss
weight = torch.ones(len(tgt.vocab))
pad = tgt.vocab.stoi[tgt.pad_token]

loss = NLLLoss(weight, pad)

if torch.cuda.is_available():
    loss.cuda()
    
seq2seq_model = None
optimizer = None

In [37]:


# Model hyper-parameters.
hidden_size = 256
word_embedding_size = 128  # defined here but not passed to any constructor in this cell
bidirectional = True

# The decoder is given twice the encoder hidden size when the encoder is
# bidirectional, otherwise the same size.
if bidirectional:
    decoder_hidden_size = hidden_size * 2
else:
    decoder_hidden_size = hidden_size

encoder = EncoderRNN(
    len(src.vocab), max_len, hidden_size,
    dropout_p=0.1,
    rnn_cell='lstm',
    bidirectional=bidirectional,
    n_layers=2,
    variable_lengths=True,
)
decoder = DecoderRNN(
    len(tgt.vocab), max_len, decoder_hidden_size,
    rnn_cell='lstm',
    dropout_p=0.25,
    use_attention=True,
    bidirectional=bidirectional,
    n_layers=2,
    eos_id=tgt.eos_id,
    sos_id=tgt.sos_id,
)

seq2seq_model = Seq2seq(encoder, decoder)
if torch.cuda.is_available():
    seq2seq_model.cuda()

# Overwrite every parameter with samples drawn uniformly from [-0.1, 0.1].
with torch.no_grad():
    for param in seq2seq_model.parameters():
        param.uniform_(-0.1, 0.1)

# Adam with its default settings, wrapped so that max_grad_norm=5 is applied.
optimizer = Optimizer(torch.optim.Adam(seq2seq_model.parameters()), max_grad_norm=5)

In [38]:


# Wrap the model in DataParallel. The isinstance guard makes the cell
# idempotent: re-running it no longer nests DataParallel inside DataParallel.
if not isinstance(seq2seq_model, torch.nn.DataParallel):
    seq2seq_model = torch.nn.DataParallel(seq2seq_model)

In [39]:
# Output directory passed to the supervised trainer as expt_dir.
model_out_dir = '/content/drive/MyDrive/ANLP21/lstm_model/'+data_tuple[0]+'/Deepregex'
# Pure-Python replacement for `!mkdir -p $model_out_dir`: no shell
# interpolation (safe with spaces in the path) and no error if it exists.
os.makedirs(model_out_dir, exist_ok=True)

In [40]:


# train

# Supervised trainer using the loss built above; expt_dir points at
# model_out_dir.
t = SupervisedTrainer(
    loss=loss,
    batch_size=8,
    checkpoint_every=200,
    print_every=10000,
    expt_dir=model_out_dir,
)

In [41]:
 

# Supervised training run. The epoch count comes from the second entry of the
# selected dataset tuple; the trained model is rebound to `seq2seq_model`.
num_supervised_epochs = data_tuple[1]
seq2seq_model = t.train(
    seq2seq_model, train,
    num_epochs=num_supervised_epochs,
    dev_data=dev,
    optimizer=optimizer,
    teacher_forcing_ratio=0.5,
    resume=False,
)


# ### Self-critical training (the cells below continue training `seq2seq_model` with SelfCriticalTrainer)

KeyboardInterrupt: ignored

In [12]:


class compare_regex(torch.nn.Module):
    """Two-branch BiLSTM classifier over a pair of token-id sequences.

    Both inputs go through one shared embedding table. Each is then encoded
    by its own single-layer bidirectional LSTM (``batch_first=True``). The
    LSTM outputs are averaged over dimension 1, concatenated, and fed to a
    three-layer tanh MLP that ends in a log-softmax over ``target_size``
    classes.

    Args:
        vocab_size: number of rows in the embedding table (index 0 is padding).
        embedding_dim: size of each embedding vector.
        hidden_dim: hidden size of each LSTM direction.
        target_size: number of output classes.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, target_size):
        super(compare_regex, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        # Layers are referenced through torch.nn so the class can be
        # instantiated without relying on `from torch.nn import ...` having
        # been executed in a later cell.
        self.embed = torch.nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm1 = torch.nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, num_layers=1, batch_first=True)
        self.lstm2 = torch.nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, num_layers=1, batch_first=True)
        # Each BiLSTM yields 2*hidden_dim features; two branches are concatenated.
        self.fc1 = torch.nn.Linear(hidden_dim*2*2, 60)
        self.fc2 = torch.nn.Linear(60, 20)
        self.fc3 = torch.nn.Linear(20, target_size)

    def init_hidden(self, bs):
        """Return a zero ``(h0, c0)`` pair, each of shape ``(2, bs, hidden_dim)``.

        The tensors take the device and dtype of the module's own weights, so
        the state always matches the model. Choosing the device from
        ``torch.cuda.is_available()`` broke a CPU-resident model on a CUDA host.
        """
        reference = self.embed.weight
        return (reference.new_zeros(2, bs, self.hidden_dim),
                reference.new_zeros(2, bs, self.hidden_dim))

    def forward(self, bs, line1, line2, input1_lengths, input2_lengths):
        """Score a batch of sequence pairs.

        Args:
            bs: batch size, used to size the initial LSTM state.
            line1, line2: integer id tensors, batch first.
            input1_lengths, input2_lengths: accepted for interface
                compatibility; not used, so padded positions are included in
                the mean.

        Returns:
            Log-probabilities of shape ``(bs, target_size)``.
        """
        embeded1 = self.embed(line1)
        embeded2 = self.embed(line2)

        lstm1_out, last_hidden1 = self.lstm1(embeded1, self.init_hidden(bs))
        lstm2_out, last_hidden2 = self.lstm2(embeded2, self.init_hidden(bs))

        # Concatenate the mean of each encoder's outputs. The original note
        # recorded "97.8%" for this variant (presumably an accuracy figure).
        pooled = torch.cat((lstm1_out.mean(1), lstm2_out.mean(1)), 1)

        # torch.tanh replaces F.tanh, which is deprecated in the PyTorch
        # releases that ship torchtext.legacy.
        fc1_out = torch.tanh(self.fc1(pooled))
        fc2_out = torch.tanh(self.fc2(fc1_out))
        fc3_out = self.fc3(fc2_out)
        score = F.log_softmax(fc3_out, dim=1)
        return score

In [42]:
!pip install transformers




In [43]:
import torch
import torch.nn as nn
from torch.nn import LSTM,Embedding,Linear
from torch.nn import Module
import torch.nn.functional as F
from torch.autograd import Variable
from transformers import BertModel, BertTokenizerFast, RobertaTokenizerFast, RobertaModel
# Directory from which both RoBERTa encoders of the comparison model are loaded.
reberth_path = "/content/drive/MyDrive/ANLP21/ReBERTh"
# Sub-directory of the ReBERTh folder from which the tokenizer is loaded.
compare_model_dir = reberth_path + "/compare/base"

class compare_regex_bert(torch.nn.Module):
    """Pairwise classifier built on two RoBERTa encoders.

    Each input is encoded by its own ``RobertaModel`` loaded from
    ``reberth_path``. The last hidden states are averaged over dimension 1,
    concatenated, and classified by a three-layer tanh MLP that ends in a
    log-softmax over ``target_size`` classes.

    Args:
        vocab_size: rows of the (unused) ``embed`` table.
        embedding_dim: width of one pooled encoder vector. ``fc1`` expects
            ``2 * embedding_dim`` inputs, so this must equal the encoders'
            hidden size.
        hidden_dim: only used by ``init_hidden``.
        target_size: number of output classes.
    """

    # Shared by all instances; loaded once, when the class body is executed.
    tokenizer = RobertaTokenizerFast.from_pretrained(compare_model_dir, do_lower_case=False, do_basic_tokenize=False)

    def __init__(self, vocab_size, embedding_dim, hidden_dim, target_size):
        super(compare_regex_bert, self).__init__()

        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        # Not used by forward(). Kept so the module's set of parameters (and
        # therefore any saved checkpoint) stays unchanged.
        self.embed = Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.bert1 = RobertaModel.from_pretrained(reberth_path)
        self.bert2 = RobertaModel.from_pretrained(reberth_path)

        # Resize both encoders' token embeddings to the tokenizer's vocabulary size.
        self.bert1.resize_token_embeddings(len(self.tokenizer))
        self.bert2.resize_token_embeddings(len(self.tokenizer))

        self.fc1 = Linear(embedding_dim*2, 60)
        self.fc2 = Linear(60, 20)
        self.fc3 = Linear(20, target_size)

    def init_hidden(self, bs):
        """Return a zero ``(h0, c0)`` pair, each of shape ``(2, bs, hidden_dim)``.

        Not called by ``forward``; retained for interface compatibility with
        ``compare_regex``. The tensors follow the device and dtype of the
        module's weights instead of being chosen by CUDA availability.
        """
        reference = self.fc1.weight
        return (reference.new_zeros(2, bs, self.hidden_dim),
                reference.new_zeros(2, bs, self.hidden_dim))

    def forward(self, bs, line1, line2, input1_lengths, input2_lengths):
        """Score a batch of input pairs.

        Args:
            bs: unused; kept for signature compatibility.
            line1, line2: mappings unpacked as keyword arguments into the
                encoders (presumably tokenizer output such as ``input_ids``
                and ``attention_mask`` -- confirm against the caller).
            input1_lengths, input2_lengths: unused.

        Returns:
            Log-probabilities with one row per example and ``target_size``
            columns.
        """
        bert1_out = self.bert1(**line1).last_hidden_state
        bert2_out = self.bert2(**line2).last_hidden_state

        # Concatenate the mean of each encoder's outputs. The original note
        # recorded "97.8%" for this variant (presumably an accuracy figure).
        # NOTE(review): the mean runs over every position, padding included.
        pooled = torch.cat((bert1_out.mean(1), bert2_out.mean(1)), 1)

        # torch.tanh replaces F.tanh, which is deprecated in the PyTorch
        # releases that ship torchtext.legacy.
        fc1_out = torch.tanh(self.fc1(pooled))
        fc2_out = torch.tanh(self.fc2(fc1_out))
        fc3_out = self.fc3(fc2_out)
        score = F.log_softmax(fc3_out, dim=1)
        return score

In [44]:


# Load the vocabulary handed to PositiveLoss: one "<token>\t<integer id>" entry per line.
sc_loss_vocab = dict()
# The context manager closes the file even if a malformed line raises.
with open('/content/drive/MyDrive/ANLP21/compare_vocab.txt') as f:
    for line in f.read().splitlines():
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing empty line
        line = line.split('\t')
        sc_loss_vocab[line[0]] = int(line[1])

# Fall back to the CPU when no GPU is attached instead of failing inside torch.load.
compare_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The checkpoint stores the whole pickled module, so the compare_regex_bert
# class must already be defined in this session before loading.
# NOTE(review): torch.load unpickles arbitrary objects -- only load checkpoints you trust.
compare_regex_model = torch.load('/content/drive/MyDrive/ANLP21/ReBERTh/compare/base/epoch-0.pth', map_location=compare_device)
compare_regex_model.eval()

compare_regex_bert(
  (embed): Embedding(32, 768, padding_idx=0)
  (bert1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(1009, 768)
      (position_embeddings): Embedding(512, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
        

In [45]:
# Output directory passed to the self-critical trainer as expt_dir.
model_out_dir = '/content/drive/MyDrive/ANLP21/lstm_model/'+data_tuple[0]+'/SoftRegex'
# Pure-Python replacement for `!mkdir -p $model_out_dir`: no shell
# interpolation (safe with spaces in the path) and no error if it exists.
os.makedirs(model_out_dir, exist_ok=True)

In [46]:


# Fresh optimizer for the self-critical stage: Adadelta with lr=0.05.
optimizer_new = Optimizer(torch.optim.Adadelta(seq2seq_model.parameters(), lr=0.05))

#if you want to train by oracle, put mode to None
positive_loss = PositiveLoss(
    mode='prob',
    prob_model=compare_regex_model,
    loss_vocab=sc_loss_vocab,
)
sc_t = SelfCriticalTrainer(
    loss=positive_loss,
    batch_size=32,
    checkpoint_every=100,
    print_every=100,
    expt_dir=model_out_dir,
    output_vocab=output_vocab,
)

# Continue training the current model; the result is rebound to `seq2seq_model`.
seq2seq_model = sc_t.train(
    seq2seq_model, train,
    num_epochs=10,
    dev_data=dev,
    optimizer=optimizer_new,
    teacher_forcing_ratio=0.5,
    resume=False,
)

epoch: 1, time: 6.75519307454427e-08
Progress: 9%, Train Random Positive Acceptance Reward: 18.8617
Finished epoch 1: Train Random Positive Acceptance Reward: 9.2470, Dev Random Positive Acceptance Reward: 13.8043, Accuracy: 0.2077
epoch: 2, time: 5.707986787954966
Progress: 14%, Train Random Positive Acceptance Reward: 0.0367
Progress: 19%, Train Random Positive Acceptance Reward: 0.0020
Finished epoch 2: Train Random Positive Acceptance Reward: 0.0206, Dev Random Positive Acceptance Reward: 15.5908, Accuracy: 0.2077
epoch: 3, time: 11.440361018975576
Progress: 24%, Train Random Positive Acceptance Reward: 0.0104
Progress: 29%, Train Random Positive Acceptance Reward: 0.0140
Finished epoch 3: Train Random Positive Acceptance Reward: 0.0094, Dev Random Positive Acceptance Reward: 16.3350, Accuracy: 0.2077
epoch: 4, time: 17.125614364941914
Progress: 34%, Train Random Positive Acceptance Reward: 0.0117
Progress: 39%, Train Random Positive Acceptance Reward: 0.0045
Finished epoch 4: Trai

In [None]:


# Evaluator constructed with no arguments (library defaults).
evaluator = Evaluator()

In [None]:


# Evaluate the current model on the test split.
# The tuple in the trailing comment appears to be the result of an earlier run.
# NOTE(review): presumably (loss, accuracy) -- confirm against seq2seq.evaluator.
evaluator.evaluate(seq2seq_model, test) # (5.799417234628771, 0.6468332123976366)