In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Make the project-local directories importable; the `processor`, `model` and
# `data` modules imported in the next cell are presumably found here -- TODO confirm.
# Note: re-running this cell appends the same entries to sys.path again.
sys.path.append('./preprocessing')
sys.path.append('./seq2seq2')

In [3]:
from processor import Code_Intent_Pairs, sub_slotmap
from model import Seq2Seq
from data import get_train_loader, get_test_loader, write_answer_json

### Define Hyperparameters

In [32]:
# Central hyperparameter table. It is handed as a whole to Seq2Seq,
# get_train_loader, train() and valid(); how the architecture entries are
# consumed is decided inside those project modules (not visible here).
hyperP = dict(
    # ---- training ----
    batch_size=32,
    lr=1e-3,                        # initial learning rate given to Adam
    teacher_force_rate=0.90,
    max_epochs=50,                  # number of train/valid rounds in the epoch loop
    lr_keep_rate=0.95,              # per-epoch lr multiplier; 1.0 disables the lr scheduler
    load_pretrain_code_embed=True,  # load pretrained weights into the decoder embedding
    freeze_embed=True,              # only honoured when load_pretrain_code_embed is True

    # ---- encoder architecture ----
    encoder_layers=2,
    encoder_embed_size=128,
    encoder_hidden_size=384,
    encoder_dropout_rate=0.3,

    # ---- decoder architecture ----
    decoder_layers=2,
    decoder_embed_size=128,
    decoder_hidden_size=384,
    decoder_dropout_rate=0.3,

    # ---- attention architecture ----
    attn_hidden_size=384,

    # ---- logging ----
    print_every=10,                 # train() reports running stats every this many batches
)

### Load Data

In [33]:
# Project helper object used throughout this notebook: it loads the vocab
# (load_dict), loads and pads corpus entries (load_entries / pad), and maps
# decoded indices back to code tokens (idx2code).
code_intent_pair = Code_Intent_Pairs()

In [34]:
# Load the vocabulary from `path`, then read the values needed to build the
# data loaders and the model.
path = 'vocab/'
code_intent_pair.load_dict(path)
# Indexed later with 'code_sos' / 'code_eos' / 'code_unk' in the decoding cell.
special_symbols = code_intent_pair.get_special_symbols()
# Both sizes are passed to Seq2Seq; presumably the intent-word and code-token
# vocabulary sizes respectively -- TODO confirm in processor.py.
word_size = code_intent_pair.get_word_size()
code_size = code_intent_pair.get_code_size()

In [35]:
# Load the training split. pad() takes no arguments and is called right after
# load_entries(); presumably it pads the entries that were just loaded, so the
# call order matters -- TODO confirm in processor.py.
train_path = 'processed_corpus/train.json'
train_entries = code_intent_pair.load_entries(train_path)
code_intent_pair.pad()

In [36]:
# Batch iterator over the training entries. train() unpacks each batch as
# (inp_seq, original_out_seq, padded_out_seq, out_lens). hyperP is passed
# through, presumably for 'batch_size' -- TODO confirm in data.py.
trainloader = get_train_loader(train_entries, special_symbols, hyperP)

In [37]:
# Load the validation split with the same load_entries() -> pad() sequence
# used for the training split.
valid_path = 'processed_corpus/valid.json'
valid_entries = code_intent_pair.load_entries(valid_path)
code_intent_pair.pad()

In [38]:
# NOTE(review): validation reuses get_train_loader, so any training-only
# behaviour that loader has (e.g. shuffling, if it shuffles) also applies to
# the validation batches -- confirm that this is intended.
validloader = get_train_loader(valid_entries, special_symbols, hyperP)

In [39]:
# Load the test split. Unlike the train/valid cells, pad() is not called here.
test_path = 'processed_corpus/test.json'
test_entries = code_intent_pair.load_entries(test_path)

In [40]:
# Iterator over test examples; the decoding cell unpacks each item as
# (src_seq, slot_map, code, intent).
testloader = get_test_loader(test_entries)

### Define Model

In [41]:
# Build the sequence-to-sequence model from the two vocabulary sizes and the
# hyperparameter table. Later cells rely on model.decoder.embed[0],
# change_teacher_force_rate(), save()/load(), and beam_decode().
model = Seq2Seq(word_size, code_size, hyperP)

In [42]:
import torch

# Optionally warm-start the decoder's first embedding module from a pretrained
# state dict and, if requested, exclude it from training.
if hyperP['load_pretrain_code_embed']:
    # map_location='cpu' lets a checkpoint that was saved from a GPU be read on
    # a CPU-only machine; load_state_dict() then copies the values into the
    # module's existing parameters, wherever those live.
    model.decoder.embed[0].load_state_dict(
        torch.load('./pretrain_code_lm/embedding-1556211835.t7', map_location='cpu'))
    # Freezing is only applied to a pretrained embedding: 'freeze_embed' has
    # no effect when 'load_pretrain_code_embed' is False.
    if hyperP['freeze_embed']:
        for param in model.decoder.embed[0].parameters():
            param.requires_grad = False

### Training

In [43]:
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
# Adam over every model parameter, starting at hyperP['lr']. Parameters whose
# requires_grad was set to False in the pretrained-embedding cell are still
# passed in here; they never receive a gradient.
optimizer = optim.Adam(model.parameters(), lr=hyperP['lr'])
# Cross-entropy with default settings; train() and valid() call it as
# loss_f(logits, original_out_seq).
loss_f = torch.nn.CrossEntropyLoss()

In [44]:
# Exponential learning-rate decay: after `epoch` scheduler steps the base lr is
# scaled by lr_keep_rate ** epoch. With a keep rate of exactly 1.0 no scheduler
# is created at all, and the epoch loop skips scheduler.step() accordingly.
lr_keep_rate = hyperP['lr_keep_rate']
if lr_keep_rate != 1.0:
    def lr_reduce_f(epoch):
        """Return the multiplicative lr factor for the given epoch index."""
        return lr_keep_rate ** epoch

    scheduler = LambdaLR(optimizer, lr_lambda=lr_reduce_f)

In [45]:
def train(model, trainloader, optimizer, loss_f, hyperP):
    """Run one optimisation pass over ``trainloader``, printing running stats.

    Args:
        model: callable as ``model(inp_seq, padded_out_seq, out_lens)`` and
            returning logits with the class scores along dim 1.
        trainloader: iterable of ``(inp_seq, original_out_seq, padded_out_seq,
            out_lens)`` batches.
        optimizer: optimizer stepped once per batch.
        loss_f: called as ``loss_f(logits, original_out_seq)``.
        hyperP: dict; only ``'print_every'`` (report interval in batches) is read.

    Returns:
        None. Statistics are printed, not returned.
    """
    model.train()
    report_interval = hyperP['print_every']

    # Statistics for the current reporting window; reset after every report.
    window_loss, window_hits, window_targets = 0, 0, 0

    for step, batch in enumerate(trainloader, start=1):
        inp_seq, original_out_seq, padded_out_seq, out_lens = batch

        # Forward pass and parameter update.
        logits = model(inp_seq, padded_out_seq, out_lens)
        loss = loss_f(logits, original_out_seq)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate loss and the number of arg-max predictions that match.
        window_loss += loss.item()
        predictions = torch.max(logits, dim=1)[1]
        window_hits += (predictions == original_out_seq).sum()
        window_targets += len(original_out_seq)

        if step % report_interval != 0:
            continue

        # '\r' makes each report overwrite the previous one on the same line.
        print('Train: loss:{}\tacc:{}'.format(window_loss/report_interval, float(window_hits)/window_targets), end='\r')
        window_loss, window_hits, window_targets = 0, 0, 0

    # Finish the carriage-return line so later output starts on a fresh line.
    print()

In [46]:
def valid(model, validloader, loss_f, hyperP):
    """Evaluate ``model`` on ``validloader`` with teacher forcing disabled.

    The model is switched to eval mode and its teacher-forcing rate is set to
    0.0 for the duration of the evaluation. The previous rate is restored
    afterwards, even if evaluation raises. The model is left in eval mode;
    train() switches it back with ``model.train()``.

    Args:
        model: callable as ``model(inp_seq, padded_out_seq, out_lens)``; must
            provide ``eval()`` and ``change_teacher_force_rate(rate)``, whose
            return value is passed back to it to restore the old rate.
        validloader: iterable of ``(inp_seq, original_out_seq, padded_out_seq,
            out_lens)`` batches.
        loss_f: called as ``loss_f(logits, original_out_seq)``.
        hyperP: accepted for signature parity with train(); not read here.

    Returns:
        float: fraction of targets whose arg-max prediction (over dim 1 of the
        logits) is correct, or 0.0 when the loader yields no targets.
    """
    model.eval()
    old_rate = model.change_teacher_force_rate(0.0)
    loss_sum = 0.0
    total_correct = 0
    size = 0
    num_batches = 0

    try:
        with torch.no_grad():
            for inp_seq, original_out_seq, padded_out_seq, out_lens in validloader:
                logits = model(inp_seq, padded_out_seq, out_lens)
                loss = loss_f(logits, original_out_seq)

                # accumulate stats as plain Python numbers
                loss_sum += loss.item()
                _, predictions = torch.max(logits, dim=1)
                total_correct += (predictions == original_out_seq).sum().item()
                size += len(original_out_seq)
                num_batches += 1
    finally:
        # Always hand the training-time rate back, even on failure, so an
        # interrupted evaluation cannot leave teacher forcing switched off.
        model.change_teacher_force_rate(old_rate)

    # An empty loader would otherwise end in a ZeroDivisionError.
    if num_batches == 0 or size == 0:
        print('Valid: no examples to evaluate')
        return 0.0

    acc = float(total_correct)/size
    print('Valid: loss:{}\tacc:{}'.format(loss_sum/num_batches, acc))
    return acc

In [47]:
# Best validation accuracy seen so far. Kept in its own cell, so re-running
# the training cell alone does not reset it.
best_acc = 0.0

In [48]:
# Epoch loop: one training pass, one validation pass, and a checkpoint whenever
# validation accuracy beats the best value seen so far.
for e in range(hyperP['max_epochs']):
    train(model, trainloader, optimizer, loss_f, hyperP)
    acc = valid(model, validloader, loss_f, hyperP)
    if acc > best_acc:
        best_acc = acc
        model.save()
        print('model saved')
    # `scheduler` is only created when lr_keep_rate != 1.0, hence the same guard.
    if lr_keep_rate != 1.0:
        scheduler.step()

Train: loss:2.7410809278488157	acc:0.38641291255119253
model saved:4.1762754917144775	acc:0.18628101366591482
Train: loss:2.2352020144462585	acc:0.4615755239701277
Train: loss:1.968255603313446	acc:0.498675018067935467
Train: loss:1.8727014303207397	acc:0.5225246928450976
Train: loss:1.7315370917320252	acc:0.5384244760298723
model saved:4.693239361047745	acc:0.19570120737694044
Train: loss:1.6005605101585387	acc:0.5711876656227415
model saved:4.812899440526962	acc:0.2041926495953297
Train: loss:1.477571976184845	acc:0.59359190556492411
Train: loss:1.3144090056419373	acc:0.6196097325945555
Train: loss:1.2200110316276551	acc:0.6444230305950374
Train: loss:1.1188059210777284	acc:0.6682727053721995
Train: loss:1.1110976815223694	acc:0.6793543724403758
Train: loss:1.0007394194602965	acc:0.7019995181883883
Train: loss:0.9413577616214752	acc:0.72247651168393163
Train: loss:0.865354734659195	acc:0.73909901228619618
Train: loss:0.7689890086650848	acc:0.7723440134907251
Train: loss:0.69239344596

In [49]:
# Reload model weights before decoding; presumably this restores the checkpoint
# written by model.save() at the best validation accuracy -- TODO confirm.
model.load()

### Decoding

In [50]:
# Special code-side symbols required by the decoder.
sos = special_symbols['code_sos']
eos = special_symbols['code_eos']
unk = special_symbols['code_unk']

# Generated snippets, one per test example, in testloader order.
code_list = []

# Inference only: switch to eval mode once, and disable autograd (as valid()
# does) so no computation graph is built while decoding.
model.eval()
with torch.no_grad():
    for src_seq, slot_map, code, intent in testloader:
        # Beam search; model.greedy_decode(src_seq, sos, eos, unk) is the
        # greedy alternative with the same post-processing.
        seq = model.beam_decode(src_seq, sos, eos, unk, beam_width=10)
        gen_code_tokens = code_intent_pair.idx2code(seq)
        # Put the original literals/identifiers back in place of slot placeholders.
        gen_code = sub_slotmap(gen_code_tokens, slot_map)
        code_list.append(gen_code)

        print('intent:\t'+intent)
        print('predicted_beam :\t'+gen_code)
        print('ground_truth:   \t'+code)
        print()

intent:	send a signal `signal.SIGUSR1` to the current process
predicted_greed:	os.path.urlopen('signal.SIGUSR1')
ground_truth:   	os.kill(os.getpid(), signal.SIGUSR1)

intent:	decode a hex string '4a4b4c' to UTF-8.
predicted_greed:	"""4a4b4c""".decode('hex')
ground_truth:   	bytes.fromhex('4a4b4c').decode('utf-8')

intent:	check if all elements in list `myList` are identical
predicted_greed:	print(all(word [ 0 ].for() for word in myList))
ground_truth:   	all(x == myList[0] for x in myList)

intent:	format number of spaces between strings `Python`, `:` and `Very Good` to be `20`
predicted_greed:	print(Python.format(Python).split('Very Good'))
ground_truth:   	print('%*s : %*s' % (20, 'Python', 20, 'Very Good'))

intent:	How to convert a string from CP-1251 to UTF-8?
predicted_greed:	s.format(s)
ground_truth:   	d.decode('cp1251').encode('utf8')

intent:	get rid of None values in dictionary `kwargs`
predicted_greed:	sum(map(list(kwargs.values()) , key = lambda x : x [ 1 ])
ground_truth:

intent:	Multiple each value by `2` for all keys in a dictionary `my_dict`
predicted_greed:	dict((k , v) for k , v in my_dict.items() if v [ '2' ])
ground_truth:   	my_dict.update((x, y * 2) for x, y in list(my_dict.items()))

intent:	running bash script 'sleep.sh'
predicted_greed:	os.system('sleep.sh')
ground_truth:   	subprocess.call('sleep.sh', shell=True)

intent:	Join elements of list `l` with a comma `,`
predicted_greed:	[.join(l)
ground_truth:   	""",""".join(l)

intent:	make a comma-separated string from a list `myList`
predicted_greed:	myList.join(myList)
ground_truth:   	myList = ','.join(map(str, myList))

intent:	reverse the list that contains 1 to 10
predicted_greed:	[ i for i in range(10) if i > 10 ]
ground_truth:   	list(reversed(list(range(10))))

intent:	remove substring 'bag,' from a string 'lamp, bag, mirror'
predicted_greed:	re.sub('bag,' , 'lamp, bag, mirror' , 'bag,')
ground_truth:   	print('lamp, bag, mirror'.replace('bag,', ''))

intent:	Reverse the order of word

intent:	convert a list of lists `L` to list of integers
predicted_greed:	[ map(int , sublist) for sublist in L ]
ground_truth:   	L = [int(''.join([str(y) for y in x])) for x in L]

intent:	write the elements of list `lines` concatenated by special character '\n' to file `myfile`
predicted_greed:	myfile.insert(myfile , lines)
ground_truth:   	myfile.write('\n'.join(lines))

intent:	removing an element from a list based on a predicate 'X' or 'N'
predicted_greed:	[ 'a' for x in re.compile('X') if re.startswith('N' , N) ]
ground_truth:   	[x for x in ['AAT', 'XAC', 'ANT', 'TTA'] if 'X' not in x and 'N' not in x]

intent:	Remove duplicate words from a string `text` using regex
predicted_greed:	re.sub(' ' , '\\1' , text)
ground_truth:   	text = re.sub('\\b(\\w+)( \\1\\b)+', '\\1', text)

intent:	count non zero values in each column in pandas data frame
predicted_greed:	df.apply(lambda x : x.mean() , axis = 1)
ground_truth:   	df.astype(bool).sum(axis=1)

intent:	search for string that match

intent:	combine values from column 'b' and column 'a' of dataframe `df`  into column 'c' of datafram `df`
predicted_greed:	df [ df [ 'b' ].str(df [ 'c' ] , df [ 'a' ]) ]
ground_truth:   	df['c'] = np.where(df['a'].isnull, df['b'], df['a'])

intent:	remove key 'ele' from dictionary `d`
predicted_greed:	d.pop('ele' , None)
ground_truth:   	del d['ele']

intent:	Update datetime field in `MyModel` to be the existing `timestamp` plus 100 years
predicted_greed:	{ { timestamp.MyModel.read() } }
ground_truth:   	MyModel.objects.update(timestamp=F('timestamp') + timedelta(days=36524.25))

intent:	merge list `['it']` and list `['was']` and list `['annoying']` into one list
predicted_greed:	zip([ ['it'] ] , [ 3 ])
ground_truth:   	['it'] + ['was'] + ['annoying']

intent:	increment a value with leading zeroes in a number `x`
predicted_greed:	[(int(x) for x in x)
ground_truth:   	str(int(x) + 1).zfill(len(x))

intent:	check if a pandas dataframe `df`'s index is sorted
predicted_greed:	df.values.val

intent:	extract a url from a string `myString`
predicted_greed:	return k.join(ch for ch in myString if k.row(ch)) 
 return return myString [ : : - 1 ] 
ground_truth:   	print(re.search('(?P<url>https?://[^\\s]+)', myString).group('url'))

intent:	remove all special characters, punctuation and spaces from a string `mystring` using regex
predicted_greed:	re.sub('' , '' , mystring)
ground_truth:   	re.sub('[^A-Za-z0-9]+', '', mystring)

intent:	create a DatetimeIndex containing 13 periods of the second friday of each month starting from date '2016-01-01'
predicted_greed:	print((.format(2))
ground_truth:   	pd.date_range('2016-01-01', freq='WOM-2FRI', periods=13)

intent:	Create multidimensional array `matrix` with 3 rows and 2 columns in python
predicted_greed:	matrix = np.concatenate((matrix , var_1))
ground_truth:   	matrix = [[a, b], [c, d], [e, f]]

intent:	replace spaces with underscore
predicted_greed:	for.format(0)
ground_truth:   	mystring.replace(' ', '_')

intent:	get an absolut

intent:	How to sort dictionaries by keys in Python
predicted_greed:	sorted(list(list.items()) , key = lambda x : x [ 1 ] [ 0 ] , reverse = True)
ground_truth:   	sorted(d.items())

intent:	convert string "1" into integer
predicted_greed:	int('1' , 2)
ground_truth:   	int('1')

intent:	function to convert strings into integers
predicted_greed:	int(s , 2)
ground_truth:   	int()

intent:	convert items in `T1` to integers
predicted_greed:	i = int(T1)
ground_truth:   	T2 = [map(int, x) for x in T1]

intent:	call a shell script `./test.sh` using subprocess
predicted_greed:	subprocess.call([ 'shutdown' , './test.sh' , './test.sh' , './test.sh' ])
ground_truth:   	subprocess.call(['./test.sh'])

intent:	call a shell script `notepad` using subprocess
predicted_greed:	subprocess.call(notepad , var_1 = None)
ground_truth:   	subprocess.call(['notepad'])

intent:	combine lists `l1` and `l2`  by alternating their elements
predicted_greed:	l1.sort(key = lambda x : l2.index(x [ 1 ]))
ground_truth:   

intent:	write records in dataframe `df` to table 'test' in schema 'a_schema'
predicted_greed:	df.set_index([ 'a_schema' ]).str()
ground_truth:   	df.to_sql('test', engine, schema='a_schema')

intent:	Extract brackets from string `s`
predicted_greed:	re.findall(s , s)
ground_truth:   	brackets = re.sub('[^(){}[\\]]', '', s)

intent:	remove duplicate elements from list 'L'
predicted_greed:	[ x for x in L if x not in [ L ] ]
ground_truth:   	list(dict((x[0], x) for x in L).values())

intent:	read a file `file` without newlines
predicted_greed:	open.open(file , os.path.read(file))
ground_truth:   	[line.rstrip('\n') for line in file]

intent:	get the position of item 1 in `testlist`
predicted_greed:	len(testlist)
ground_truth:   	[i for (i, x) in enumerate(testlist) if (x == 1)]

intent:	get the position of item 1 in `testlist`
predicted_greed:	len(testlist)
ground_truth:   	[i for (i, x) in enumerate(testlist) if (x == 1)]

intent:	get the position of item 1 in `testlist`
predicted_greed:

intent:	Get a new list `list2`by removing empty list from a list of lists `list1`
predicted_greed:	[ i for i , j in zip(list2 , list1) ]
ground_truth:   	list2 = [x for x in list1 if x != []]

intent:	Create `list2` to contain the lists from list `list1` excluding the empty lists from `list1`
predicted_greed:	list1 = [ [ ] for i in list1 ]
ground_truth:   	list2 = [x for x in list1 if x]

intent:	Django response with JSON `data`
predicted_greed:	data.delete(data ,(=()
ground_truth:   	return HttpResponse(data, mimetype='application/json')

intent:	get all text that is not enclosed within square brackets in string `example_str`
predicted_greed:	re.findall('example_str' , example_str)
ground_truth:   	re.findall('(.*?)\\[.*?\\]', example_str)

intent:	Use a regex to get all text in a string `example_str` that is not surrounded by square brackets
predicted_greed:	re.split('example_str' , example_str)
ground_truth:   	re.findall('(.*?)(?:\\[.*?\\]|$)', example_str)

intent:	get whatever is

intent:	pandas dataframe get first row of each group by 'id'
predicted_greed:	pd.concat(id , 'id' = 'id')
ground_truth:   	df.groupby('id').first()

intent:	split a list in first column  into multiple columns keeping other columns as well in pandas data frame
predicted_greed:	pd.concat([ df , pd , 7 , 7 , 7 , 7 ] , axis = [ True , 8 ] , axis = 1)
ground_truth:   	pd.concat([df[0].apply(pd.Series), df[1]], axis=1)

intent:	extract attributes 'src="js/([^"]*\\bjquery\\b[^"]*)"' from string `data`
predicted_greed:	data.translate('src="js/([^"]*\\bjquery\\b[^"]*)"')
ground_truth:   	re.findall('src="js/([^"]*\\bjquery\\b[^"]*)"', data)

intent:	Sum integers contained in strings in list `['', '3.4', '', '', '1.0']`
predicted_greed:	map(int , ['', '3.4', '', '', '1.0'].split())
ground_truth:   	sum(int(float(item)) for item in [_f for _f in ['', '3.4', '', '', '1.0'] if _f])

intent:	Call a subprocess with arguments `c:\\Program Files\\VMware\\VMware Server\\vmware-cmd.bat` that may contain 

intent:	determine the type of variable `v`
predicted_greed:	return v(v)
ground_truth:   	type(v)

intent:	get the type of variable `variable_name`
predicted_greed:	print(variable_name.__file__)
ground_truth:   	print(type(variable_name))

intent:	get the 5th item of a generator
predicted_greed:	np.name.2()
ground_truth:   	next(itertools.islice(range(10), 5, 5 + 1))

intent:	Print a string `word` with string format
predicted_greed:	print(word.format(word))
ground_truth:   	print('"{}"'.format(word))

intent:	join a list of strings `list` using a space ' '
predicted_greed:	list.join('')
ground_truth:   	""" """.join(list)

intent:	create list `y` containing two empty lists
predicted_greed:	y = [ [ ] for i in range(3) ]
ground_truth:   	y = [[] for n in range(2)]

intent:	read a file 'C:/name/MyDocuments/numbers' into a list `data`
predicted_greed:	with open('C:/name/MyDocuments/numbers') as f : 
      data = f.readlines() ]
ground_truth:   	data = [line.strip() for line in open('C:/name

intent:	eliminate all strings from list `lst`
predicted_greed:	"""""" = ''.join(map(i for i in lst))
ground_truth:   	[element for element in lst if isinstance(element, int)]

intent:	get all the elements except strings from the list 'lst'.
predicted_greed:	[ x for x in lst if x [ 2 ] ]
ground_truth:   	[element for element in lst if not isinstance(element, str)]

intent:	Sort a list of dictionaries `list_to_be_sorted` by the value of the dictionary key `name`
predicted_greed:	sorted(list_to_be_sorted , key = lambda x : name.index(list(x.values()) [ 0 ]))
ground_truth:   	newlist = sorted(list_to_be_sorted, key=lambda k: k['name'])

intent:	sort a list of dictionaries `l` by values in key `name` in descending order
predicted_greed:	sorted(l , key = lambda x : name.index(list(x.values()) [ 0 ] , reverse = True)
ground_truth:   	newlist = sorted(l, key=itemgetter('name'), reverse=True)

intent:	How do I sort a list of dictionaries by values of the dictionary in Python?
predicted_greed:	s

intent:	add a new axis to array `a`
predicted_greed:	a = [ row [ 0 ] for row in range(0) ]
ground_truth:   	a[:, (np.newaxis)]



In [51]:
# Persist the generated snippets collected in the decoding cell. The output
# location and format are decided inside write_answer_json (data.py).
write_answer_json(code_list)