In [1]:
%load_ext autoreload
%autoreload 2

In [27]:
from preprocessing.processor import Code_Intent_Pairs, sub_slotmap
from seq2seq2.model import Seq2Seq
from seq2seq2.data import get_train_loader, get_test_loader, write_answer_json

### Define Hyperparameters

In [5]:
# Single source of truth for every tunable setting. The same mapping is handed
# to the train loader, the Seq2Seq constructor and the train/valid helpers.
hyperP = dict(
    # --- optimisation ---
    batch_size=32,
    lr=1e-4,
    teacher_force_rate=0.85,
    max_epochs=20,
    lr_keep_rate=0.95,  # per-epoch LR multiplier; exactly 1.0 disables the decay

    # --- encoder ---
    encoder_layers=2,
    encoder_embed_size=128,
    encoder_hidden_size=384,
    encoder_dropout_rate=0.3,

    # --- decoder ---
    decoder_layers=2,
    decoder_embed_size=128,
    decoder_hidden_size=384,
    decoder_dropout_rate=0.3,

    # --- attention ---
    attn_hidden_size=384,

    # --- logging ---
    print_every=5,  # number of batches between two training progress lines
)

### Load Data

In [6]:
# Corpus helper object. Later cells call load_dict / load_entries / pad on it
# and use it again at decode time (idx2code) to map indices back to code tokens.
code_intent_pair = Code_Intent_Pairs()

In [7]:
# Load the dictionaries stored under `vocab/` into the corpus helper
# (presumably the word and code vocabularies -- confirm in preprocessing.processor),
# then read back the values the rest of the notebook needs:
#   * special_symbols -> passed to the train loader; its 'code_sos' / 'code_eos'
#                        entries are used by the decoding cell
#   * word_size, code_size -> passed to the Seq2Seq constructor
path = 'vocab/'
code_intent_pair.load_dict(path)
special_symbols = code_intent_pair.get_special_symbols()
word_size = code_intent_pair.get_word_size()
code_size = code_intent_pair.get_code_size()

In [8]:
# Read the preprocessed training examples.
train_path = 'processed_corpus/train.json'
train_entries = code_intent_pair.load_entries(train_path)
# NOTE(review): pad() takes no arguments, so it presumably pads the entries that
# were just loaded, in place -- confirm in preprocessing.processor. Keep this call
# directly after the training load and before the train loader is built.
code_intent_pair.pad()

In [9]:
# Batch iterator over the training entries. train() unpacks each batch as
# (inp_seq, original_out_seq, padded_out_seq, out_lens).
trainloader = get_train_loader(train_entries, special_symbols, hyperP)

In [10]:
# Read the preprocessed test examples with the same helper object.
# Unlike the training entries, no pad() call follows this load.
# NOTE(review): load_entries is called a second time on the same object --
# confirm it does not overwrite state that the training data still relies on.
test_path = 'processed_corpus/test.json'
test_entries = code_intent_pair.load_entries(test_path)

In [11]:
# Iterator over the test entries. The decoding cell unpacks each item as
# (src_seq, slot_map, code, intent), which differs from the training batch layout.
testloader = get_test_loader(test_entries)

### Define Model

In [12]:
# Build the seq2seq network from the two vocabulary sizes and the shared
# hyperparameter mapping (encoder / decoder / attention settings live in hyperP).
model = Seq2Seq(word_size, code_size, hyperP)

### Training

In [13]:
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
# Adam over all model parameters, starting at the configured base learning rate.
optimizer = optim.Adam(model.parameters(), lr=hyperP['lr'])
# Cross-entropy with default settings; train() and valid() apply it to the
# model's logits and `original_out_seq`.
loss_f = torch.nn.CrossEntropyLoss()

In [14]:
# Optional exponential learning-rate decay: after `epoch` scheduler steps the
# optimizer runs at base_lr * lr_keep_rate ** epoch. A keep rate of exactly 1.0
# means "no decay", and in that case no scheduler object is created at all
# (the training loop guards scheduler.step() with the same condition).
lr_keep_rate = hyperP['lr_keep_rate']
if lr_keep_rate != 1.0:
    def lr_reduce_f(epoch):
        """Return the factor applied to the base learning rate at `epoch`."""
        return lr_keep_rate ** epoch

    scheduler = LambdaLR(optimizer, lr_lambda=lr_reduce_f)

In [22]:
def train(model, trainloader, optimizer, loss_f, hyperP):
    """Run one optimisation pass over ``trainloader`` and log windowed stats.

    Args:
        model: module called as ``model(inp_seq, padded_out_seq, out_lens)``;
            its output is treated as logits with classes along dim 1.
        trainloader: iterable yielding
            ``(inp_seq, original_out_seq, padded_out_seq, out_lens)`` batches.
        optimizer: optimizer stepped once per batch.
        loss_f: criterion called as ``loss_f(logits, original_out_seq)``.
        hyperP: settings mapping; only ``'print_every'`` is read here.

    Returns:
        None. Every ``print_every`` batches the mean loss of that window and
        the fraction of target positions whose arg-max prediction matches
        ``original_out_seq`` are printed, then the window counters restart.
        Batches left over after the last full window are not reported.
    """
    model.train()
    report_interval = hyperP['print_every']

    window_loss = 0
    window_hits = 0
    window_count = 0

    for step, batch in enumerate(trainloader, start=1):
        inp_seq, original_out_seq, padded_out_seq, out_lens = batch

        # forward pass and parameter update
        logits = model(inp_seq, padded_out_seq, out_lens)
        loss = loss_f(logits, original_out_seq)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # accumulate statistics for the current reporting window
        window_loss += loss.item()
        predictions = torch.max(logits, dim=1)[1]
        window_hits += (predictions == original_out_seq).sum()
        window_count += len(original_out_seq)

        if step % report_interval != 0:
            continue

        mean_loss = window_loss / report_interval
        accuracy = float(window_hits) / window_count
        print('Train: loss:{}\tacc:{}'.format(mean_loss, accuracy))
        window_loss = 0
        window_hits = 0
        window_count = 0

In [23]:
def valid(model, validloader, loss_f, hyperP):
    """Evaluate ``model`` on ``validloader`` without updating any parameters.

    Args:
        model: module called as ``model(inp_seq, padded_out_seq, out_lens)``;
            its output is treated as logits with classes along dim 1.
        validloader: iterable yielding
            ``(inp_seq, original_out_seq, padded_out_seq, out_lens)`` batches.
            It does not need to support ``len()``.
        loss_f: criterion called as ``loss_f(logits, original_out_seq)``.
        hyperP: accepted so the signature mirrors ``train``; not read here.

    Returns:
        float: fraction of target positions whose arg-max prediction equals
        ``original_out_seq``. Returns ``0.0`` (and prints a notice) when the
        loader yields nothing to score, instead of dividing by zero.

    Side effects:
        Puts the model in eval mode and prints one summary line with the mean
        per-batch loss and the accuracy.
    """
    model.eval()
    loss_sum = 0.0
    total_correct = 0
    size = 0
    num_batches = 0  # counted by hand so loaders without __len__ also work

    with torch.no_grad():
        for inp_seq, original_out_seq, padded_out_seq, out_lens in validloader:
            logits = model(inp_seq, padded_out_seq, out_lens)
            loss = loss_f(logits, original_out_seq)

            # accumulate plain Python numbers rather than tensors
            loss_sum += loss.item()
            _, predictions = torch.max(logits, dim=1)
            total_correct += (predictions == original_out_seq).sum().item()
            size += len(original_out_seq)
            num_batches += 1

    if num_batches == 0 or size == 0:
        # Nothing was scored; avoid ZeroDivisionError and report the worst accuracy.
        print('Valid: no data to evaluate')
        return 0.0

    acc = float(total_correct) / size
    print('Valid: loss:{}\tacc:{}'.format(loss_sum / num_batches, acc))
    return acc

In [24]:
# Train for the configured number of epochs and checkpoint whenever the
# measured accuracy improves on the best one seen so far.
#
# NOTE(review): this notebook loads no held-out validation split, so `acc`
# below is computed on the training loader itself. The checkpoint that gets
# saved is therefore the one that fits the training data best, which is not
# necessarily the one that generalises best.
best_acc = 0.0
for e in range(hyperP['max_epochs']):  # was a hard-coded 20 that ignored the config
    train(model, trainloader, optimizer, loss_f, hyperP)
    acc = valid(model, trainloader, loss_f, hyperP)
    if acc > best_acc:
        best_acc = acc
        model.save()
        print()
        print('model saved')
    # `scheduler` only exists when decay is enabled (see the scheduler cell),
    # so it must be guarded by the same condition here.
    if lr_keep_rate != 1.0:
        scheduler.step()

Train: loss:3.624136543273926	acc:0.23060796645702306


Train: loss:3.6429473876953127	acc:0.22873512560599382


Train: loss:3.7524927616119386	acc:0.21129622104835433


Train: loss:3.6546747207641603	acc:0.23501303214596003


Train: loss:3.6199349880218508	acc:0.22411474675033619


Train: loss:3.585883045196533	acc:0.2661113546690048


Train: loss:3.5825339794158935	acc:0.2607052896725441


Train: loss:3.7155691146850587	acc:0.21247960848287112


KeyboardInterrupt: 

In [33]:
# Restore model weights before decoding -- presumably the checkpoint written by
# model.save() in the training loop; confirm the path used in seq2seq2.model.
model.load()

### Decoding

In [30]:
# Greedy-decode every test intent, print the prediction next to the reference
# snippet, and finally write all predictions out via write_answer_json.
sos = special_symbols['code_sos']
eos = special_symbols['code_eos']
code_list = []

# Eval mode is loop-invariant, so set it once instead of once per example.
model.eval()

# Inference only: no autograd graph is needed while decoding.
with torch.no_grad():
    for src_seq, slot_map, code, intent in testloader:
        seq = model.greedy_decode(src_seq, sos, eos)
        gen_code_tokens = code_intent_pair.idx2code(seq)
        # Put the original identifiers/literals back in place of slot placeholders.
        gen_code = sub_slotmap(gen_code_tokens, slot_map)
        code_list.append(gen_code)
        print('intent:\t'+intent)
        print('predicted:\t'+gen_code+'\nground_truth:\t'+code)
        print()

write_answer_json(code_list)

intent:	send a signal `signal.SIGUSR1` to the current process
predicted:	. . ( ( )
ground_truth:	os.kill(os.getpid(), signal.SIGUSR1)

intent:	decode a hex string '4a4b4c' to UTF-8.
predicted:	. ( ( )
ground_truth:	bytes.fromhex('4a4b4c').decode('utf-8')

intent:	check if all elements in list `myList` are identical
predicted:	myList . ( ( )
ground_truth:	all(x == myList[0] for x in myList)

intent:	format number of spaces between strings `Python`, `:` and `Very Good` to be `20`
predicted:	Python . ( ( )
ground_truth:	print('%*s : %*s' % (20, 'Python', 20, 'Very Good'))

intent:	How to convert a string from CP-1251 to UTF-8?
predicted:	. . ( ( )
ground_truth:	d.decode('cp1251').encode('utf8')

intent:	get rid of None values in dictionary `kwargs`
predicted:	kwargs . ( ( )
ground_truth:	res = {k: v for k, v in list(kwargs.items()) if v is not None}

intent:	get rid of None values in dictionary `kwargs`
predicted:	kwargs . ( ( )
ground_truth:	res = dict((k, v) for k, v in kwargs.items() i

intent:	find intersection data between series `s1` and series `s2`
predicted:	s1 . ( ( )
ground_truth:	pd.Series(list(set(s1).intersection(set(s2))))

intent:	sending http headers to `client`
predicted:	. . ( ( )
ground_truth:	client.send('HTTP/1.0 200 OK\r\n')

intent:	Format a datetime string `when` to extract date only
predicted:	when . ( ( )
ground_truth:	then = datetime.datetime.strptime(when, '%Y-%m-%d').date()

intent:	split a multi-line string `inputString` into separate strings
predicted:	. . ( ( )
ground_truth:	inputString.split('\n')

intent:	Split a multi-line string ` a \n b \r\n c ` by new line character `\n`
predicted:	. . ( ( )
ground_truth:	' a \n b \r\n c '.split('\n')

intent:	concatenate elements of list `b` by a colon ":"
predicted:	b . ( ( )
ground_truth:	""":""".join(str(x) for x in b)

intent:	get the first object from a queryset in django model `Entry`
predicted:	. . ( ( )
ground_truth:	Entry.objects.filter()[:1].get()

intent:	Calculate sum over all rows of 2D


predicted:	. . ( ( <unk> )

intent:	concatenate items of list `l` with a space ' '
predicted:	l . ( ( )
ground_truth:	print(' '.join(map(str, l)))

intent:	run script 'hello.py' with argument 'htmlfilename.htm' on terminal using python executable
predicted:	. . ( ( <unk> )
ground_truth:	subprocess.call(['python.exe', 'hello.py', 'htmlfilename.htm'])

intent:	How can I parse a time string containing milliseconds in it with python?
predicted:	. . ( ( )
ground_truth:	time.strptime('30/03/09 16:31:32.123', '%d/%m/%y %H:%M:%S.%f')

intent:	convert a string `my_string` with dot and comma into a float number `my_float`
predicted:	my_string . ( ( )
ground_truth:	my_float = float(my_string.replace(',', ''))

intent:	convert a string `123,456.908` with dot and comma into a floating number
predicted:	. ( ( )
ground_truth:	float('123,456.908'.replace(',', ''))

intent:	set pythonpath in python script.
predicted:	. . ( ( )
ground_truth:	sys.path.append('/path/to/whatever')

intent:	split string 'W

intent:	download a file "http://www.example.com/songs/mp3.mp3" over HTTP and save to "mp3.mp3"
predicted:	. ( ( )
ground_truth:	urllib.request.urlretrieve('http://www.example.com/songs/mp3.mp3', 'mp3.mp3')

intent:	download a file `url` over HTTP and save to `file_name`
predicted:	url . ( ( )
ground_truth:	u = urllib.request.urlopen(url)
f = open(file_name, 'wb')
meta = u.info()
file_size = int(meta.getheaders('Content-Length')[0])
print(('Downloading: %s Bytes: %s' % (file_name, file_size)))
file_size_dl = 0
block_sz = 8192
while True:
    buffer = u.read(block_sz)
    if (not buffer):
        break
    file_size_dl += len(buffer)
    f.write(buffer)
    status = ('%10d  [%3.2f%%]' % (file_size_dl, ((file_size_dl * 100.0) / file_size)))
    status = (status + (chr(8) * (len(status) + 1)))
    print(status, end=' ')
f.close()

intent:	download a file 'http://www.example.com/' over HTTP
predicted:	. ( ( )
ground_truth:	response = urllib.request.urlopen('http://www.example.com/')
html = 

intent:	read keyboard-input
predicted:	. ( ( )
ground_truth:	input('Enter your input:')

intent:	enable debug mode on Flask application `app`
predicted:	. . ( ( )
ground_truth:	app.run(debug=True)

intent:	python save list `mylist` to file object 'save.txt'
predicted:	mylist . ( ( )
ground_truth:	pickle.dump(mylist, open('save.txt', 'wb'))

intent:	Multiply a matrix `P` with a 3d tensor `T` in scipy
predicted:	P . ( ( )
ground_truth:	scipy.tensordot(P, T, axes=[1, 1]).swapaxes(0, 1)

intent:	Create 3d array of zeroes of size `(3,3,3)`
predicted:	. . ( ( )
ground_truth:	numpy.zeros((3, 3, 3))

intent:	cut off the last word of a sentence `content`
predicted:	. . ( ( )
ground_truth:	""" """.join(content.split(' ')[:-1])

intent:	convert scalar `x` to array
predicted:	. . ( ( )
ground_truth:	x = np.asarray(x).reshape(1, -1)[(0), :]

intent:	sum all elements of nested list `L`
predicted:	L . ( ( )
ground_truth:	sum(sum(i) if isinstance(i, list) else i for i in L)

intent:	convert hex string

intent:	Join elements of list `l` with a comma `,`
predicted:	l . ( ( )
ground_truth:	""",""".join(l)

intent:	make a comma-separated string from a list `myList`
predicted:	myList . ( ( )
ground_truth:	myList = ','.join(map(str, myList))

intent:	reverse the list that contains 1 to 10
predicted:	var_0 . ( ( )
ground_truth:	list(reversed(list(range(10))))

intent:	remove substring 'bag,' from a string 'lamp, bag, mirror'
predicted:	. ( ( )
ground_truth:	print('lamp, bag, mirror'.replace('bag,', ''))

intent:	Reverse the order of words, delimited by `.`, in string `s`
predicted:	s . ( ( )
ground_truth:	""".""".join(s.split('.')[::-1])

