In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('./preprocessing')
sys.path.append('./seq2seq')

In [3]:
from processor import Code_Intent_Pairs, sub_slotmap
from model import Seq2Seq
from data import get_train_loader, get_test_loader, write_answer_json

### Define Hyperparameters

In [4]:
# Every tunable setting for this run lives in this one dict. It is passed
# whole to get_train_loader() and Seq2Seq(), and read directly by the
# training cells further down.
hyperP = dict(
    # -- training schedule / optimiser --
    batch_size=32,
    lr=1e-3,                         # initial learning rate handed to Adam
    teacher_force_rate=0.90,
    max_epochs=50,                   # upper bound on passes over the training set
    lr_keep_rate=0.95,               # per-epoch lr multiplier; 1.0 switches the decay off
    load_pretrain_code_embed=False,  # load saved weights into the decoder embedding
    freeze_embed=False,              # only consulted when the pretrained embedding is loaded

    # -- encoder architecture --
    encoder_layers=2,
    encoder_embed_size=128,
    encoder_hidden_size=384,
    encoder_dropout_rate=0.3,

    # -- decoder architecture --
    decoder_layers=2,
    decoder_embed_size=128,
    decoder_hidden_size=384,
    decoder_dropout_rate=0.3,

    # -- attention architecture --
    attn_hidden_size=384,

    # -- progress reporting --
    print_every=10,                  # train() prints running stats every N batches
)

### Load Data

In [5]:
# Helper object reused throughout the notebook: it loads the vocab, loads and
# pads the corpus entries, and later turns decoded sequences back into code tokens.
code_intent_pair = Code_Intent_Pairs()

In [6]:
# Load the saved dictionaries from `vocab/`, then pull out the values the
# model and the loaders need.
path = 'vocab/'
code_intent_pair.load_dict(path)
# Indexed later by 'code_sos' / 'code_eos' / 'code_unk' when decoding.
special_symbols = code_intent_pair.get_special_symbols()
# The two sizes are passed to Seq2Seq(word_size, code_size, ...);
# presumably the intent-side and code-side vocabulary sizes — confirm in processor.py.
word_size = code_intent_pair.get_word_size()
code_size = code_intent_pair.get_code_size()

In [7]:
# Training split. pad() takes no arguments, so it presumably acts in place on
# the entries that load_entries() just loaded — confirm in processor.py.
train_path = 'processed_corpus/train.json'
train_entries = code_intent_pair.load_entries(train_path)
code_intent_pair.pad()

In [8]:
# Training batches; train() unpacks each one as
# (inp_seq, original_out_seq, padded_out_seq, out_lens).
trainloader = get_train_loader(train_entries, special_symbols, hyperP)

In [9]:
# Validation split, prepared with the same load-then-pad sequence as the
# training split. pad() presumably acts on the entries just loaded — confirm.
valid_path = 'processed_corpus/valid.json'
valid_entries = code_intent_pair.load_entries(valid_path)
code_intent_pair.pad()

In [10]:
# Validation goes through get_train_loader (not get_test_loader): valid()
# unpacks the same four-field batches that train() does.
validloader = get_train_loader(valid_entries, special_symbols, hyperP)

In [11]:
# Test split. Unlike the train/valid splits, pad() is not called here.
test_path = 'processed_corpus/test.json'
test_entries = code_intent_pair.load_entries(test_path)

In [12]:
# Test loader is built from the entries alone; the decoding cells unpack each
# item as (src_seq, slot_map, code, intent).
testloader = get_test_loader(test_entries)

### Define Model

In [13]:
# Build the seq2seq model from the two sizes reported by the vocab and the
# full hyperparameter dict.
model = Seq2Seq(word_size, code_size, hyperP)

In [14]:
import torch
# Optionally initialise the decoder's embedding (model.decoder.embed[0]) from
# a saved state dict, and optionally freeze it so the optimiser leaves it alone.
if hyperP['load_pretrain_code_embed']:
    # map_location='cpu' lets a checkpoint that was saved from a GPU be read on
    # a CPU-only machine; load_state_dict then copies the values into the
    # module's existing parameters on whatever device they live on.
    model.decoder.embed[0].load_state_dict(
        torch.load('./pretrain_code_lm/embedding-1556211835.t7', map_location='cpu')
    )
    # Freezing is nested on purpose: it only applies to a loaded embedding.
    if hyperP['freeze_embed']:
        for param in model.decoder.embed[0].parameters():
            param.requires_grad = False

In [15]:
# model = model.cuda()

In [16]:
# inp_seq, original_out_seq, padded_out_seq, out_lens = next(iter(trainloader))

In [17]:
# logits = model(inp_seq, padded_out_seq, out_lens)

### Training

In [18]:
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
# Adam over all model parameters, starting at hyperP['lr'], with weight decay 1e-4.
optimizer = optim.Adam(model.parameters(), lr=hyperP['lr'], weight_decay = 1e-4)
# Loss shared by train() and valid(); both call it as loss_f(logits, original_out_seq).
# NOTE(review): no ignore_index is set — confirm whether the targets can contain padding.
loss_f = torch.nn.CrossEntropyLoss()

In [19]:
# Per-epoch learning-rate decay. A keep rate of exactly 1.0 means "no decay",
# in which case no scheduler is created at all (the training loop applies the
# same check before stepping it).
lr_keep_rate = hyperP['lr_keep_rate']
if lr_keep_rate != 1.0:
    def lr_reduce_f(epoch):
        """Return the factor LambdaLR multiplies the initial lr by at ``epoch``."""
        return lr_keep_rate ** epoch

    scheduler = LambdaLR(optimizer, lr_lambda=lr_reduce_f)

In [20]:
def train(model, trainloader, optimizer, loss_f, hyperP):
    """Run one optimisation pass over ``trainloader`` and return the mean batch loss.

    Args:
        model: called as ``model(inp_seq, padded_out_seq, out_lens)``; must
            return logits whose ``dim=1`` holds the class scores.
        trainloader: iterable of ``(inp_seq, original_out_seq, padded_out_seq,
            out_lens)`` batches.
        optimizer: optimiser stepped once per batch.
        loss_f: callable ``loss_f(logits, original_out_seq)`` returning a scalar
            tensor.
        hyperP: hyperparameter dict; only ``'print_every'`` is read here.

    Returns:
        float: the loss averaged over the batches that were processed, or
        ``nan`` when the loader yields no batches.
    """
    model.train()
    print_every = hyperP['print_every']

    n_batches = 0
    total_loss = 0.0       # summed over the whole epoch (drives the return value)
    window_loss = 0.0      # the three window_* counters reset after every report
    window_correct = 0
    window_size = 0

    for i, (inp_seq, original_out_seq, padded_out_seq, out_lens) in enumerate(trainloader):
        logits = model(inp_seq, padded_out_seq, out_lens)
        loss = loss_f(logits, original_out_seq)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Bookkeeping uses plain Python numbers (.item()) so that no tensors
        # are carried from one batch to the next.
        batch_loss = loss.item()
        n_batches += 1
        total_loss += batch_loss
        window_loss += batch_loss
        predictions = logits.argmax(dim=1)
        window_correct += (predictions == original_out_seq).sum().item()
        window_size += len(original_out_seq)

        if (i+1) % print_every == 0:
            # '\r' keeps the running stats on a single, overwritten line.
            print('Train: loss:{}\tacc:{}'.format(window_loss/print_every, window_correct/window_size), end='\r')
            window_loss = 0.0
            window_correct = 0
            window_size = 0
    print()

    # Count batches ourselves: works for loaders without __len__ and avoids a
    # ZeroDivisionError when the loader is empty.
    if n_batches == 0:
        return float('nan')
    return total_loss / n_batches

In [21]:
def valid(model, validloader, loss_f, hyperP):
    """Evaluate ``model`` on ``validloader`` without teacher forcing.

    The model is put in eval mode and its teacher-forcing rate is set to 0.0
    for the duration of the pass. The previous rate is restored afterwards,
    including when a batch raises or the run is interrupted.

    Args:
        model: called as ``model(inp_seq, padded_out_seq, out_lens)``; must
            expose ``change_teacher_force_rate(rate)`` returning the old rate.
        validloader: iterable of ``(inp_seq, original_out_seq, padded_out_seq,
            out_lens)`` batches.
        loss_f: callable ``loss_f(logits, original_out_seq)`` returning a scalar
            tensor.
        hyperP: hyperparameter dict; accepted for signature parity with
            ``train`` and not read here.

    Returns:
        float: fraction of target positions predicted correctly, or ``nan``
        when the loader yields nothing to score.
    """
    model.eval()
    old_rate = model.change_teacher_force_rate(0.0)
    loss_sum = 0.0
    total_correct = 0
    size = 0
    n_batches = 0

    try:
        with torch.no_grad():
            for inp_seq, original_out_seq, padded_out_seq, out_lens in validloader:
                logits = model(inp_seq, padded_out_seq, out_lens)
                loss = loss_f(logits, original_out_seq)

                # Accumulate plain Python numbers rather than tensors.
                n_batches += 1
                loss_sum += loss.item()
                predictions = logits.argmax(dim=1)
                total_correct += (predictions == original_out_seq).sum().item()
                size += len(original_out_seq)
    finally:
        # Always hand the model back with its original teacher-forcing rate,
        # otherwise a failed/interrupted validation would silently leave
        # subsequent training running at rate 0.0.
        model.change_teacher_force_rate(old_rate)

    # Guard both averages so an empty loader reports nan instead of crashing.
    mean_loss = loss_sum / n_batches if n_batches else float('nan')
    accuracy = total_correct / size if size else float('nan')
    print('Valid: loss:{}\tacc:{}'.format(mean_loss, accuracy))
    return accuracy

In [22]:
# Best validation accuracy seen so far. It lives in its own cell, so
# re-running the training cell below does not reset it.
best_acc = 0.0

In [23]:
# Main loop: one training pass followed by one validation pass per epoch.
# `losses` collects the mean training loss returned by train() for each epoch.
losses = []
for e in range(hyperP['max_epochs']):
    loss = train(model, trainloader, optimizer, loss_f, hyperP)
    losses.append(loss)
    acc = valid(model, validloader, loss_f, hyperP)
    # Checkpoint only when validation accuracy beats the best seen so far.
    if acc > best_acc:
        best_acc = acc
        model.save()
        print('model saved')
    # `scheduler` only exists when decay is enabled, hence the same guard
    # that was used when it was created.
    if lr_keep_rate != 1.0:
        scheduler.step()

Train: loss:2.724354648590088	acc:0.384744543055888755
Valid: loss:4.012266546487808	acc:0.17816396922260547
model saved
Train: loss:2.256230866909027	acc:0.44950827536579513
Valid: loss:4.213101878762245	acc:0.17471477845582384
Train: loss:2.0012880206108092	acc:0.4857279923242984
Valid: loss:4.322030588984489	acc:0.17166357123905546
Train: loss:1.726535177230835	acc:0.545214679779323666
Valid: loss:4.347757741808891	acc:0.1840010612894667
model saved
Train: loss:1.686847949028015	acc:0.56440393379707372
Valid: loss:4.506983742117882	acc:0.18386840010612895
Train: loss:1.5568177223205566	acc:0.5799952026864955
Valid: loss:4.583953887224197	acc:0.19474661713982488
model saved
Train: loss:1.44965341091156	acc:0.598224994003358215
Valid: loss:4.521488204598427	acc:0.1952772618731759
model saved
Train: loss:1.3718526721000672	acc:0.6253298153034301
Valid: loss:4.490613698959351	acc:0.20230830459007695
model saved
Train: loss:1.258021068572998	acc:0.64955624850083952
Valid: loss:4.64624489

KeyboardInterrupt: 

In [24]:
# Reload saved weights before decoding. No path is passed here or to
# model.save() above, so this presumably restores the best checkpoint written
# during training — confirm in model.py.
model.load()

### Decoding

In [25]:
# Preview: run the model's own beam search on the first six test examples and
# print each prediction next to its reference snippet.
sos, eos, unk = (special_symbols[key] for key in ('code_sos', 'code_eos', 'code_unk'))
code_list = []
for i, (src_seq, slot_map, code, intent) in enumerate(testloader):
    model.eval()
    seq = model.beam_decode(src_seq, sos, eos, unk, beam_width=10)
    # Decoded sequence -> code tokens -> final string with slot values substituted back.
    gen_code_tokens = code_intent_pair.idx2code(seq)
    gen_code = sub_slotmap(gen_code_tokens, slot_map)
    code_list.append(gen_code)

    # One call emits the three labelled lines plus the blank separator line.
    print('intent:\t'+intent,
          'predicted_beam: \t'+gen_code,
          'ground_truth:   \t'+code,
          '',
          sep='\n')

    # Stop once the sixth example (index 5) has been shown.
    if i >= 5:
        break

intent:	send a signal `signal.SIGUSR1` to the current process
predicted_beam: 	sys . exit ( 'signal.SIGUSR1' )
ground_truth:   	os.kill(os.getpid(), signal.SIGUSR1)

intent:	decode a hex string '4a4b4c' to UTF-8.
predicted_beam: 	"""4a4b4c""" . decode ( 'utf8' )
ground_truth:   	bytes.fromhex('4a4b4c').decode('utf-8')

intent:	check if all elements in list `myList` are identical
predicted_beam: 	all ( isinstance ( word ) for word in myList )
ground_truth:   	all(x == myList[0] for x in myList)

intent:	format number of spaces between strings `Python`, `:` and `Very Good` to be `20`
predicted_beam: 	re . sub ( ':' , ':' , Python )
ground_truth:   	print('%*s : %*s' % (20, 'Python', 20, 'Very Good'))

intent:	How to convert a string from CP-1251 to UTF-8?
predicted_beam: 	datetime . encode ( 'utf-8' )
ground_truth:   	d.decode('cp1251').encode('utf8')

intent:	get rid of None values in dictionary `kwargs`
predicted_beam: 	list ( iter ( kwargs . values ( ) ) , key = lambda x : sum ( x [ 1

In [28]:
# Standalone decoder wrapped around the trained model; the next cell calls its
# decode() with the same arguments that model.beam_decode() took above.
from decoder import Decoder
beam_decoder = Decoder(model)

In [34]:
# Decode the test set with the standalone beam decoder and collect one
# generated snippet per test entry in `code_list`, which a later cell passes to
# write_answer_json(). The loop must therefore run over *every* entry: stopping
# after the preview would leave only six predictions in the answer file.
n_preview = 6  # number of leading examples echoed for a visual sanity check

code_list = []
model.eval()  # inference mode; set once rather than on every iteration
for i, (src_seq, slot_map, code, intent) in enumerate(testloader):
    seq = beam_decoder.decode(src_seq, sos, eos, unk, beam_width=10)
    # Decoded sequence -> code tokens -> final string with slot values substituted back.
    gen_code_tokens = code_intent_pair.idx2code(seq)
    gen_code = sub_slotmap(gen_code_tokens, slot_map)
    code_list.append(gen_code)

    # Print only the first few examples; decoding continues silently after that.
    if i < n_preview:
        print('intent:\t'+intent)
        print('predicted_beam: \t'+gen_code)
        print('ground_truth:   \t'+code)
        print()

intent:	send a signal `signal.SIGUSR1` to the current process
predicted_beam: 	sys . exit ( 'signal.SIGUSR1' )
ground_truth:   	os.kill(os.getpid(), signal.SIGUSR1)

intent:	decode a hex string '4a4b4c' to UTF-8.
predicted_beam: 	"""4a4b4c""" . decode ( 'utf8' )
ground_truth:   	bytes.fromhex('4a4b4c').decode('utf-8')

intent:	check if all elements in list `myList` are identical
predicted_beam: 	all ( isinstance ( word ) for word in myList )
ground_truth:   	all(x == myList[0] for x in myList)

intent:	format number of spaces between strings `Python`, `:` and `Very Good` to be `20`
predicted_beam: 	re . sub ( ':' , ':' , Python )
ground_truth:   	print('%*s : %*s' % (20, 'Python', 20, 'Very Good'))

intent:	How to convert a string from CP-1251 to UTF-8?
predicted_beam: 	datetime . encode ( 'utf-8' )
ground_truth:   	d.decode('cp1251').encode('utf8')

intent:	get rid of None values in dictionary `kwargs`
predicted_beam: 	list ( iter ( kwargs . values ( ) ) , key = lambda x : sum ( x [ 1

In [37]:
# Persist the predictions gathered in `code_list` by the decoding cell above.
# NOTE(review): the output path is chosen inside write_answer_json; the next
# cell zips `answer.txt`, so presumably that is the file written — confirm in data.py.
write_answer_json(code_list)

In [38]:
!zip answer.zip answer.txt

updating: answer.txt (deflated 73%)
