In [1]:
%load_ext autoreload
%autoreload 2

In [172]:
import json
import json_lines
import preprocess_temp as P
import model.parsers as M

### Load Data

In [7]:
# Paths of the train/test JSON splits inside the corpus directory.
directory = './conala-corpus/'
train_file = '{}train.json'.format(directory)
test_file = '{}test.json'.format(directory)

# Parse each split completely into memory; the handles close when the
# `with` blocks exit.
with open(train_file) as train_fh:
    train_data = json.load(train_fh)

with open(test_file) as test_fh:
    test_data = json.load(test_fh)

In [8]:
# Load the mined corpus: one JSON record per line, read through json_lines.
mine_file = directory + 'mined.jsonl'
with open(mine_file) as mined_fh:
    # Materialise the reader into a list before the file is closed.
    mine_data = list(json_lines.reader(mined_fh))

### Let's preprocess the data. Everything is in `preprocess_temp.py` (imported as `P`)
### Adding mined data

In [9]:
# Split each corpus into an (intents, codes) pair via P.process_data.
# Per the original author's note, intent processing lowercases the text and
# removes the '?' punctuation (implementation lives in P — not visible here).
train_intent, train_codes = P.process_data(train_data)
test_intent, test_codes = P.process_data(test_data)

In [10]:
# Same preprocessing for the mined corpus; mine=True is passed so process_data
# can treat the mined records differently (presumably a different record
# schema — TODO confirm in P).
mine_intent, mine_codes = P.process_data(mine_data, mine=True)

In [180]:
# Helper object used in both directions: code2actions() turns a code snippet
# into an action sequence, actions2code() turns an action sequence back into code.
ast_action = P.Ast_Action()

In [12]:
# Convert every training snippet into its action sequence, keeping the
# same order as train_codes.
train_actions = [ast_action.code2actions(snippet) for snippet in train_codes]

In [13]:
# Build the vocabularies from the training split only. cut_freq=5 is the
# frequency threshold handed to both helpers (exact cutoff semantics are
# defined in P — TODO confirm).
word_lst = P.vocab_list(train_intent, cut_freq=5)
act_lst, token_lst = P.action_list(train_actions, cut_freq=5)

In [14]:
# Lookup tables: every vocabulary entry maps to its position in its list.
word2num = {word: idx for idx, word in enumerate(word_lst)}
act2num = {action: idx for idx, action in enumerate(act_lst)}
token2num = {token: idx for idx, token in enumerate(token_lst)}

In [15]:
# Training batches: built from the intents and their action sequences, with
# the three lookup tables supplied for index conversion inside P.
train_loader = P.get_train_loader(train_intent, train_actions, word2num, act2num, token2num)

In [28]:
# Test batches carry intents only (no actions are passed); batch_size=1 so
# each iteration yields a single example.
test_loader = P.get_test_loader(test_intent, word2num, batch_size=1)

In [17]:
# Integer ids that act2num assigns to the two GenToken actions ('copy' and
# 'token'); both ids are later handed to the model constructor.
action_index_copy = act2num[P.GenTokenAction('copy')]
action_index_gen = act2num[P.GenTokenAction('token')]

### Model

In [18]:
import torch
import time

In [70]:
from collections import namedtuple

# Run configuration in one place. The dict is frozen into an immutable
# namedtuple below so settings are read as attributes (hyperParams.lr, ...).
hyperParamMap = {
    #### General configuration ####
    'cuda': True,      # Use gpu
    'mode': 'train',   # train or test

    #### Embedding sizes ####
    'embed_size': 128,         # Size of word embeddings
    'action_embed_size': 128,  # Size of ApplyRule/GenToken action embeddings
    'field_embed_size': 64,    # Embedding size of ASDL fields
    'type_embed_size': 64,     # Embedding size of ASDL types

    #### Decoding sizes ####
    'hidden_size': 256,        # Size of LSTM hidden states

    #### training schedule details ####
    'valid_metric': 'acc',                # Metric used for validation
    'valid_every_epoch': 1,               # Perform validation every x epoch
    'log_every': 30,                      # Log training statistics every n iterations
    'save_to': 'model',                   # Save trained model to
    'clip_grad': 5.,                      # Clip gradients
    'max_epoch': 10,                      # Maximum number of training epochs
    'optimizer': 'Adam',                  # optimizer
    'lr': 0.001,                          # Learning rate
    'lr_decay': 0.5,                      # decay learning rate if the validation performance drops
    'verbose': False,                     # Verbose mode

    #### decoding/validation/testing ####
    'load_model': None,                   # Load a pre-trained model
    'beam_size': 1,                       # Beam size for beam search
    'decode_max_time_step': 100,          # Maximum number of time steps used in decoding and sampling
    'sample_size': 5,                     # Sample size
    'test_file': '',                      # Path to the test file
    'save_decode_to': None,               # Save decoding results to file
}

# Field names follow the dict's key order. The `verbose` keyword of
# namedtuple() was removed in Python 3.7 and raises TypeError there, so it is
# not passed (False was its default anyway).
HyperParams = namedtuple('HyperParams', list(hyperParamMap.keys()))
hyperParams = HyperParams(**hyperParamMap)

In [71]:
# Instantiate the parser. Vocabulary sizes are the lengths of the lists built
# earlier; the copy/gen indices are the GenToken ids looked up in act2num.
model = M.Model(
    hyperParams,
    action_size=len(act_lst),
    token_size=len(token_lst),
    word_size=len(word_lst),
    action_index_copy=action_index_copy,
    action_index_gen=action_index_gen,
)

In [21]:
# Adam over all model parameters. The learning rate is read from the shared
# hyper-parameter tuple (0.001) instead of repeating the literal here, so the
# optimizer cannot drift from the configured value.
optimizer = torch.optim.Adam(model.parameters(), lr=hyperParams.lr)
# Cross-entropy criterion shared by the action, copy and token losses in the
# training loop.
lossFunc = torch.nn.CrossEntropyLoss()

In [22]:
# Training loop: for every batch, sum the action / copy / token cross-entropy
# losses, back-propagate, clip gradients and take an optimizer step.
# NOTE(review): the epoch count (20) is hard-coded and differs from
# hyperParams.max_epoch (10) — confirm which one is intended.
for e in range(20):
    # Restart the timer each epoch so the report below is per-epoch time
    # rather than time accumulated since the loop started.
    epoch_begin = time.time()
    for batch_ind, x in enumerate(train_loader):
        optimizer.zero_grad()

        (action_logits, action_labels), (copy_logits, copy_labels), (token_logits, token_labels) = model(x)

        loss1 = lossFunc(action_logits, action_labels)

        # Stand-in for a head that has no entries in this batch. It is created
        # on loss1's device so the sum below also works when the model runs
        # on the GPU (a CPU-only placeholder would not mix with CUDA losses).
        zero_loss = torch.zeros(1, dtype=torch.double, device=loss1.device)
        loss2 = lossFunc(copy_logits, copy_labels) if len(copy_logits) > 0 else zero_loss
        loss3 = lossFunc(token_logits, token_labels) if len(token_logits) > 0 else zero_loss

        total_loss = loss1 + loss2.double() + loss3.double()
        total_loss.backward()

        # clip gradient
        if hyperParams.clip_grad > 0.:
            torch.nn.utils.clip_grad_norm_(model.parameters(), hyperParams.clip_grad)

        optimizer.step()

        # Log on the last batch of every `log_every`-sized window.
        if batch_ind % hyperParams.log_every == hyperParams.log_every - 1:
            # .item() extracts the plain Python number from each one-element loss.
            print("Action loss: {}".format(loss1.item()))
            print("Copy loss: {}".format(loss2.item()))
            print("Token loss: {}".format(loss3.item()))
            print('-------------------------------------------------------')

    print('epoch elapsed %ds' % (time.time() - epoch_begin))

Action loss: 1.7764659366050006
Copy loss: 2.4241466522216797
Token loss: 4.501692295074463
-------------------------------------------------------
Action loss: 1.3015571054586172
Copy loss: 2.3114724159240723
Token loss: 4.114023685455322
-------------------------------------------------------
Action loss: 1.3431359340564457
Copy loss: 2.422819137573242
Token loss: 3.7972452640533447
-------------------------------------------------------
Action loss: 1.1891318483576232
Copy loss: 2.785924196243286
Token loss: 3.851574182510376
-------------------------------------------------------
epoch elapsed 130s
Action loss: 0.9339700970681257
Copy loss: 2.290177583694458
Token loss: 3.2537734508514404
-------------------------------------------------------
Action loss: 0.8002638876686249
Copy loss: 2.2067904472351074
Token loss: 3.208761215209961
-------------------------------------------------------
Action loss: 0.7388962956140596
Copy loss: 2.517590284347534
Token loss: 3.245136022567749
---

KeyboardInterrupt: 

In [24]:
# Snapshot the current model weights and write them to the checkpoint file.
model_weights = model.state_dict()
torch.save(model_weights, 'Parameters/frist.t7')

In [173]:
# Read the saved weights back from the checkpoint file and restore them into
# the model.
saved_weights = torch.load('Parameters/frist.t7')
model.load_state_dict(saved_weights)

In [210]:
# Take the first batch from a fresh iterator over test_loader (batch_size=1).
# It unpacks into the model input and the matching raw intent tokens
# (sample_sent is presumably the index-encoded sentence — TODO confirm in P).
sample_sent, sample_sent_txt = next(iter(test_loader))

In [207]:
# Decode the sampled sentence. parse() receives the sentence, its raw tokens,
# the action/token vocabularies and the AST helper; the returned hypothesis
# exposes `.actions`, which is converted back to code afterwards.
sample_hypothesis = model.parse(sample_sent, sample_sent_txt, act_lst, token_lst, ast_action)

In [204]:
# Turn the decoded action sequence back into source text; as the last
# expression of the cell, the resulting string is shown as the cell output.
ast_action.actions2code(sample_hypothesis.actions)

"os.system(\n    '<UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>'\n    )"

In [208]:
# Show the raw intent tokens of the sampled example, for comparison with the
# code decoded from it.
sample_sent_txt

[['send',
  'a',
  'signal',
  '`signal.SIGUSR1`',
  'to',
  'the',
  'current',
  'process']]