In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the
# project modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
import sys
# Extend the import search path with two directories given relative to the
# current working directory, so the project-local imports in the next cell
# can be resolved.
sys.path.append('./preprocessing')
# NOTE(review): 'seq2seq2' may be a typo for 'seq2seq' — confirm the directory name.
sys.path.append('./seq2seq2')

In [2]:
from processor import Code_Intent_Pairs, sub_slotmap
from model import Seq2Seq
from data import get_train_loader, get_test_loader, write_answer_json

### Define Hyperparameters

In [3]:
# Single source of truth for every tunable setting used in this notebook.
# The dict is handed as-is to the data loaders, the model constructor and the
# train/valid helpers.
hyperP = dict(
    # ---- training ----
    batch_size=32,
    lr=1e-3,
    teacher_force_rate=0.90,
    max_epochs=50,
    lr_keep_rate=0.95,  # per-epoch LR multiplier; use 1.0 to keep the LR constant over time
    load_pretrain_code_embed=True,
    freeze_embed=True,

    # ---- encoder architecture ----
    encoder_layers=2,
    encoder_embed_size=128,
    encoder_hidden_size=384,
    encoder_dropout_rate=0.3,

    # ---- decoder architecture ----
    decoder_layers=2,
    decoder_embed_size=128,
    decoder_hidden_size=384,
    decoder_dropout_rate=0.3,

    # ---- attention architecture ----
    attn_hidden_size=384,

    # ---- logging: number of training batches between progress lines ----
    print_every=10,
)

### Load Data

In [4]:
# Instantiate the Code_Intent_Pairs helper imported from `processor`. The same
# object is reused by the following cells to load the dictionaries and each
# corpus split, and later to map decoded indices back to code tokens.
code_intent_pair = Code_Intent_Pairs()

In [5]:
# Load the saved dictionaries from the `vocab/` directory, then read back the
# special-symbol table and the two sizes that are later passed to Seq2Seq.
# NOTE(review): word_size / code_size are presumably the intent and code
# vocabulary sizes — confirm in processor.
path = 'vocab/'
code_intent_pair.load_dict(path)
special_symbols = code_intent_pair.get_special_symbols()
word_size = code_intent_pair.get_word_size()
code_size = code_intent_pair.get_code_size()

In [6]:
# Load the training split through the shared helper, then call pad() on it.
# NOTE(review): pad() takes no arguments and its result is not captured, so it
# presumably pads the entries just loaded in place — confirm in processor.
train_path = 'processed_corpus/train.json'
train_entries = code_intent_pair.load_entries(train_path)
code_intent_pair.pad()

In [7]:
# Build the training loader from the loaded entries. train() unpacks each
# batch it yields as (inp_seq, original_out_seq, padded_out_seq, out_lens).
trainloader = get_train_loader(train_entries, special_symbols, hyperP)

In [8]:
# Load the validation split with the same helper object used for training,
# followed by the same pad() call.
# NOTE(review): this reuses `code_intent_pair` after the training split was
# loaded; presumably load_entries replaces the helper's current entries —
# confirm that train_entries is not affected.
valid_path = 'processed_corpus/valid.json'
valid_entries = code_intent_pair.load_entries(valid_path)
code_intent_pair.pad()

In [9]:
# The validation loader is built with the same factory as the training loader
# (get_train_loader), so valid() can unpack batches in the same 4-tuple layout.
validloader = get_train_loader(valid_entries, special_symbols, hyperP)

In [10]:
# Load the test split. Unlike the train/valid cells, pad() is not called here.
test_path = 'processed_corpus/test.json'
test_entries = code_intent_pair.load_entries(test_path)

In [11]:
# Build the test loader; the decoding cell iterates it and unpacks each item
# as (src_seq, slot_map, code, intent).
testloader = get_test_loader(test_entries)

### Define Model

In [12]:
# Construct the Seq2Seq model from the two sizes reported by the vocabulary
# helper and the hyperparameter dict.
model = Seq2Seq(word_size, code_size, hyperP)

In [13]:
import torch
# Optionally initialise the first module of the decoder's `embed` container
# from a saved state dict, and optionally exclude its parameters from
# gradient updates. Both switches come from hyperP.
if hyperP['load_pretrain_code_embed']:
    code_embed = model.decoder.embed[0]
    pretrained_state = torch.load('./pretrain_code_lm/embedding-1556211835.t7')
    code_embed.load_state_dict(pretrained_state)
    if hyperP['freeze_embed']:
        # Turning off requires_grad keeps these weights fixed during training.
        for weight in code_embed.parameters():
            weight.requires_grad = False

### Training

In [14]:
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
# Cross-entropy between the model's logits and the target sequence is the
# training and validation objective.
loss_f = torch.nn.CrossEntropyLoss()
# Adam over all model parameters, starting from the configured learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=hyperP['lr'])

In [15]:
# Exponential learning-rate decay: the LR multiplier at scheduler epoch `epoch`
# is lr_keep_rate ** epoch. With a keep rate of exactly 1.0 no scheduler is
# created, and the epoch loop skips the scheduler step under the same test.
lr_keep_rate = hyperP['lr_keep_rate']
if lr_keep_rate != 1.0:
    def lr_reduce_f(epoch):
        """Return the factor applied to the base learning rate at `epoch`."""
        return lr_keep_rate ** epoch

    scheduler = LambdaLR(optimizer, lr_lambda=lr_reduce_f)

In [16]:
def train(model, trainloader, optimizer, loss_f, hyperP):
    """Run one optimisation pass over ``trainloader`` and print running stats.

    Every batch is unpacked as ``(inp_seq, original_out_seq, padded_out_seq,
    out_lens)``. The model is called with the input, the padded target and the
    target lengths; the logits it returns are scored against
    ``original_out_seq`` with ``loss_f`` and a single optimizer step is taken.

    Args:
        model: callable module; put into training mode by this function.
        trainloader: iterable yielding the 4-tuples described above.
        optimizer: optimizer whose ``zero_grad``/``step`` drive the update.
        loss_f: callable ``(logits, targets) -> loss`` supporting ``backward``.
        hyperP: settings dict; only ``'print_every'`` is read here.

    Returns:
        None. Progress is written to stdout: after each ``print_every``
        batches one line with the mean loss and the accuracy (argmax over
        dim 1 of the logits vs. the targets) of that window. Batches after the
        last complete window are trained on but not reported. A blank line is
        printed when the pass ends.
    """
    model.train()
    window_loss, window_hits, window_count = 0, 0, 0
    report_every = hyperP['print_every']

    for step, batch in enumerate(trainloader, start=1):
        inp_seq, original_out_seq, padded_out_seq, out_lens = batch

        # forward pass, loss, and one parameter update
        logits = model(inp_seq, padded_out_seq, out_lens)
        loss = loss_f(logits, original_out_seq)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # accumulate statistics for the current reporting window
        window_loss += loss.item()
        predictions = torch.max(logits, dim=1)[1]
        window_hits += (predictions == original_out_seq).sum()
        window_count += len(original_out_seq)

        if step % report_every != 0:
            continue

        # window complete: report it and start a fresh one
        print('Train: loss:{}\tacc:{}'.format(window_loss/report_every, float(window_hits)/window_count))
        window_loss, window_hits, window_count = 0, 0, 0
    print()

In [17]:
def valid(model, validloader, loss_f, hyperP):
    """Evaluate ``model`` on ``validloader`` with teacher forcing disabled.

    The model is switched to eval mode and its teacher-forcing rate is set to
    0.0 for the duration of the pass. The previous rate (the value returned by
    ``model.change_teacher_force_rate``) is always restored before this
    function exits, including when the forward pass or the loss raises, so a
    failed validation run cannot leave teacher forcing switched off for later
    training.

    Args:
        model: callable module exposing ``eval()`` and
            ``change_teacher_force_rate(rate)``.
        validloader: iterable of ``(inp_seq, original_out_seq,
            padded_out_seq, out_lens)`` batches.
        loss_f: callable ``(logits, targets) -> loss``.
        hyperP: settings dict. Not read by this function; kept so the
            signature matches ``train``.

    Returns:
        float: fraction of target positions whose argmax prediction (over
        dim 1 of the logits) equals the target.

    Raises:
        ZeroDivisionError: if ``validloader`` yields no batches. The
            teacher-forcing rate is still restored first.
    """
    model.eval()
    old_rate = model.change_teacher_force_rate(0.0)
    loss_sum = 0.0
    total_correct = 0
    size = 0
    n_batches = 0

    try:
        with torch.no_grad():
            for inp_seq, original_out_seq, padded_out_seq, out_lens in validloader:
                logits = model(inp_seq, padded_out_seq, out_lens)
                loss = loss_f(logits, original_out_seq)

                # accumulate stats as plain Python numbers
                loss_sum += loss.item()
                _, predictions = torch.max(logits, dim=1)
                total_correct += (predictions == original_out_seq).sum().item()
                size += len(original_out_seq)
                n_batches += 1

        mean_loss = loss_sum / n_batches
        accuracy = float(total_correct) / size
    finally:
        # Restore the training-time teacher-forcing rate no matter what happened.
        model.change_teacher_force_rate(old_rate)

    print('Valid: loss:{}\tacc:{}'.format(mean_loss, accuracy))
    return accuracy

In [18]:
# Best validation accuracy seen so far; the epoch loop saves the model only
# when an epoch beats this value. Kept in its own cell, so re-running just the
# epoch-loop cell does not reset it.
best_acc = 0.0

In [19]:
# Main epoch loop: one training pass, one validation pass, then checkpoint the
# model whenever validation accuracy improves on the best value seen so far.
for e in range(hyperP['max_epochs']):
    train(model, trainloader, optimizer, loss_f, hyperP)
    acc = valid(model, validloader, loss_f, hyperP)

    improved = acc > best_acc
    if improved:
        best_acc = acc
        model.save()
        print('model saved')

    # `scheduler` is only defined when decay is enabled (keep rate other than
    # 1.0), so the step is guarded by the same test.
    if not (lr_keep_rate == 1.0):
        scheduler.step()

Train: loss:4.905031514167786	acc:0.12290502793296089


KeyboardInterrupt: 

In [None]:
# Reload saved weights via the model's own load() before decoding.
# NOTE(review): presumably this restores the checkpoint written by
# model.save() in the epoch loop (the best-validation model) — confirm the
# path used by save()/load().
model.load()

### Decoding

In [63]:
# Decode every test example with beam search, substitute the slot map back
# into the generated tokens, and collect the resulting code strings in
# `code_list` for the answer file written in the next cell.
sos = special_symbols['code_sos']
eos = special_symbols['code_eos']
unk = special_symbols['code_unk']
code_list = []

# Switching to eval mode is loop-invariant, so it is done once up front.
model.eval()
# Inference only: no gradients are needed while decoding.
with torch.no_grad():
    for src_seq, slot_map, code, intent in testloader:
        # NOTE(review): an earlier version of this cell called
        # model.greedy_decode(src_seq, sos, eos, unk) here instead — swap it in
        # (and relabel the print below) to compare against a greedy baseline.
        seq = model.beam_decode(src_seq, sos, eos, unk, beam_width=13)
        gen_code_tokens = code_intent_pair.idx2code(seq)
        gen_code = sub_slotmap(gen_code_tokens, slot_map)
        code_list.append(gen_code)

        print('intent:\t'+intent)
        # The prediction comes from beam_decode, so it is labelled as such.
        print('predicted_beams:\t'+gen_code)
        print('ground_truth:   \t'+code)
        print()

intent:	send a signal `signal.SIGUSR1` to the current process
predicted_greed:	sys.encode('signal.SIGUSR1' , 1000.encode('signal.SIGUSR1'))
ground_truth:   	os.kill(os.getpid(), signal.SIGUSR1)

intent:	decode a hex string '4a4b4c' to UTF-8.
predicted_greed:	"""4a4b4c""".encode('hex')
ground_truth:   	bytes.fromhex('4a4b4c').decode('utf-8')

intent:	check if all elements in list `myList` are identical
predicted_greed:	all(isinstance(x , int) for x in myList)
ground_truth:   	all(x == myList[0] for x in myList)

intent:	format number of spaces between strings `Python`, `:` and `Very Good` to be `20`
predicted_greed:	print(Python.format(Python).split('Very Good'))
ground_truth:   	print('%*s : %*s' % (20, 'Python', 20, 'Very Good'))

intent:	How to convert a string from CP-1251 to UTF-8?
predicted_greed:	s.encode('utf-8')
ground_truth:   	d.decode('cp1251').encode('utf8')

intent:	get rid of None values in dictionary `kwargs`
predicted_greed:	next(iter(list(kwargs.values()) , key = lambd

intent:	sum all elements of nested list `L`
predicted_greed:	sum(len(x) for x in L)
ground_truth:   	sum(sum(i) if isinstance(i, list) else i for i in L)

intent:	convert hex string '470FC614' to a float number
predicted_greed:	int('470FC614' , 0)
ground_truth:   	struct.unpack('!f', '470FC614'.decode('hex'))[0]

intent:	Multiple each value by `2` for all keys in a dictionary `my_dict`
predicted_greed:	dict((k , v) for k , v in my_dict.items() if v != '2')
ground_truth:   	my_dict.update((x, y * 2) for x, y in list(my_dict.items()))

intent:	running bash script 'sleep.sh'
predicted_greed:	os.system('sleep.sh' , shell = os.os)
ground_truth:   	subprocess.call('sleep.sh', shell=True)

intent:	Join elements of list `l` with a comma `,`
predicted_greed:	l.xpath(',' , l)
ground_truth:   	""",""".join(l)

intent:	make a comma-separated string from a list `myList`
predicted_greed:	myList = [.join(myList)
ground_truth:   	myList = ','.join(map(str, myList))

intent:	reverse the list that conta

intent:	convert list of lists `L` to list of integers
predicted_greed:	[ int(i) for i in L ]
ground_truth:   	[int(''.join(str(d) for d in x)) for x in L]

intent:	combine elements of each list in list `L` into digits of a single integer
predicted_greed:	list(itertools.join(map(int , L)))
ground_truth:   	[''.join(str(d) for d in x) for x in L]

intent:	convert a list of lists `L` to list of integers
predicted_greed:	[ int(i) for i in L ]
ground_truth:   	L = [int(''.join([str(y) for y in x])) for x in L]

intent:	write the elements of list `lines` concatenated by special character '\n' to file `myfile`
predicted_greed:	lines.xpath(myfile)
ground_truth:   	myfile.write('\n'.join(lines))

intent:	removing an element from a list based on a predicate 'X' or 'N'
predicted_greed:	list(itertools.product('X' , N))
ground_truth:   	[x for x in ['AAT', 'XAC', 'ANT', 'TTA'] if 'X' not in x and 'N' not in x]

intent:	Remove duplicate words from a string `text` using regex
predicted_greed:	re.sub(

intent:	Put the curser at beginning of the file
predicted_greed:	open() , 'w').close()
ground_truth:   	file.seek(0)

intent:	combine values from column 'b' and column 'a' of dataframe `df`  into column 'c' of datafram `df`
predicted_greed:	df [ [ 'b' , 'a' ] ].df(df [ 'c' ] , axis = 'index')
ground_truth:   	df['c'] = np.where(df['a'].isnull, df['b'], df['a'])

intent:	remove key 'ele' from dictionary `d`
predicted_greed:	d.pop('ele' , None)
ground_truth:   	del d['ele']

intent:	Update datetime field in `MyModel` to be the existing `timestamp` plus 100 years
predicted_greed:	MyModel.objects.filter() = 'timestamp')
ground_truth:   	MyModel.objects.update(timestamp=F('timestamp') + timedelta(days=36524.25))

intent:	merge list `['it']` and list `['was']` and list `['annoying']` into one list
predicted_greed:	[ list(a) for a in zip([ ['it'] ] , [ ['was'] ]) ]
ground_truth:   	['it'] + ['was'] + ['annoying']

intent:	increment a value with leading zeroes in a number `x`
predicted_greed:	

intent:	extract a url from a string `myString`
predicted_greed:	""" """.join(myString [ i : i.split()) [ 0 ] for i in myString ])
ground_truth:   	print(re.findall('(https?://[^\\s]+)', myString))

intent:	extract a url from a string `myString`
predicted_greed:	""" """.join(myString [ i : i.split()) [ 0 ] for i in myString ])
ground_truth:   	print(re.search('(?P<url>https?://[^\\s]+)', myString).group('url'))

intent:	remove all special characters, punctuation and spaces from a string `mystring` using regex
predicted_greed:	re.sub('' , '\\1' , mystring)
ground_truth:   	re.sub('[^A-Za-z0-9]+', '', mystring)

intent:	create a DatetimeIndex containing 13 periods of the second friday of each month starting from date '2016-01-01'
predicted_greed:	url(x , '2016-01-01'()))
ground_truth:   	pd.date_range('2016-01-01', freq='WOM-2FRI', periods=13)

intent:	Create multidimensional array `matrix` with 3 rows and 2 columns in python
predicted_greed:	matrix = np.random(matrix ,(3 , 3))
ground_tru

intent:	get sorted list of keys of dict `d`
predicted_greed:	list(d.keys())
ground_truth:   	sorted(d)

intent:	How to sort dictionaries by keys in Python
predicted_greed:	sorted(list(var_0.items()) , key = lambda x : x [ 1 ] [ 0 ])
ground_truth:   	sorted(d.items())

intent:	convert string "1" into integer
predicted_greed:	int('1' , 2)
ground_truth:   	int('1')

intent:	function to convert strings into integers
predicted_greed:	int(s)
ground_truth:   	int()

intent:	convert items in `T1` to integers
predicted_greed:	[ int(i) for i in T1 ]
ground_truth:   	T2 = [map(int, x) for x in T1]

intent:	call a shell script `./test.sh` using subprocess
predicted_greed:	subprocess.call('./test.sh' , shell = True)
ground_truth:   	subprocess.call(['./test.sh'])

intent:	call a shell script `notepad` using subprocess
predicted_greed:	subprocess.call(notepad).T()
ground_truth:   	subprocess.call(['notepad'])

intent:	combine lists `l1` and `l2`  by alternating their elements
predicted_greed:	l1 = [

intent:	find 10 largest differences between each respective elements of list `l1` and list `l2`
predicted_greed:	[(i + j) for i , j in zip(l1 , l2) ]
ground_truth:   	heapq.nlargest(10, range(len(l1)), key=lambda i: abs(l1[i] - l2[i]))

intent:	BeautifulSoup find all 'span' elements in HTML string `soup` with class of 'starGryB sp'
predicted_greed:	soup = re.sub('span' , 'starGryB sp' , soup)
ground_truth:   	soup.find_all('span', {'class': 'starGryB sp'})

intent:	write records in dataframe `df` to table 'test' in schema 'a_schema'
predicted_greed:	df.groupby('test').fillna()
ground_truth:   	df.to_sql('test', engine, schema='a_schema')

intent:	Extract brackets from string `s`
predicted_greed:	print(re.findall(s , s))
ground_truth:   	brackets = re.sub('[^(){}[\\]]', '', s)

intent:	remove duplicate elements from list 'L'
predicted_greed:	[ set(item) for item in set([ L ]) ]
ground_truth:   	list(dict((x[0], x) for x in L).values())

intent:	read a file `file` without newlines
predic

intent:	generate all 2-element subsets of tuple `(1, 2, 3)`
predicted_greed:	print(permutations(list(range(9)) , 2))
ground_truth:   	list(itertools.combinations((1, 2, 3), 2))

intent:	get a value of datetime.today() in the UTC time zone
predicted_greed:	self.monthrange([ 0 , 0 , 0 ] , n = True)
ground_truth:   	datetime.now(pytz.utc)

intent:	Get a new list `list2`by removing empty list from a list of lists `list1`
predicted_greed:	[ y for item in set()) for item in set()) for item in value ]
ground_truth:   	list2 = [x for x in list1 if x != []]

intent:	Create `list2` to contain the lists from list `list1` excluding the empty lists from `list1`
predicted_greed:	list1 = [ list1 [ : ] for list2 in list1 ]
ground_truth:   	list2 = [x for x in list1 if x]

intent:	Django response with JSON `data`
predicted_greed:	data = requests.loads(data.dumps())
ground_truth:   	return HttpResponse(data, mimetype='application/json')

intent:	get all text that is not enclosed within square brackets i

intent:	check if path `my_path` is an absolute path
predicted_greed:	os.path.path(my_path) == os.path.exists(my_path)
ground_truth:   	os.path.isabs(my_path)

intent:	get number of keys in dictionary `yourdict`
predicted_greed:	sum(len(v) for v in list(yourdict.values())
ground_truth:   	len(list(yourdict.keys()))

intent:	count the number of keys in dictionary `yourdictfile`
predicted_greed:	sum(len(v) for v in yourdictfile.values())
ground_truth:   	len(set(open(yourdictfile).read().split()))

intent:	pandas dataframe get first row of each group by 'id'
predicted_greed:	df.groupby('id') [ 'id' ].map(lambda x : x [ 'id' ])
ground_truth:   	df.groupby('id').first()

intent:	split a list in first column  into multiple columns keeping other columns as well in pandas data frame
predicted_greed:	pd.concat([ df [ 'value' ] , pd.DataFrame(df [ 1 ] , axis = 1) , axis = 1)
ground_truth:   	pd.concat([df[0].apply(pd.Series), df[1]], axis=1)

intent:	extract attributes 'src="js/([^"]*\\bjquery\\

intent:	converting two lists `[1, 2, 3]` and `[4, 5, 6]` into a matrix
predicted_greed:	[(a , b) for a , b in zip([1, 2, 3] , [4, 5, 6]) ,([4, 5, 6] , [4, 5, 6]) ]
ground_truth:   	np.column_stack(([1, 2, 3], [4, 5, 6]))

intent:	get the type of `i`
predicted_greed:	i.i
ground_truth:   	type(i)

intent:	determine the type of variable `v`
predicted_greed:	v = os.tag()
ground_truth:   	type(v)

intent:	determine the type of variable `v`
predicted_greed:	v = os.tag()
ground_truth:   	type(v)

intent:	determine the type of variable `v`
predicted_greed:	v = os.tag()
ground_truth:   	type(v)

intent:	determine the type of variable `v`
predicted_greed:	v = os.tag()
ground_truth:   	type(v)

intent:	get the type of variable `variable_name`
predicted_greed:	variable_name = os.variable_name.dirname(variable_name)
ground_truth:   	print(type(variable_name))

intent:	get the 5th item of a generator
predicted_greed:	return = os.array()
ground_truth:   	next(itertools.islice(range(10), 5, 5 + 1))

i

intent:	replace dot characters  '.' associated with ascii letters in list `s` with space ' '
predicted_greed:	s.replace('.' , '')
ground_truth:   	[re.sub('(?<!\\d)\\.(?!\\d)', ' ', i) for i in s]

intent:	sort list `list_of_strings` based on second index of each string `s`
predicted_greed:	list_of_strings.sort(key = lambda x : s.index(x [ 1 ]))
ground_truth:   	sorted(list_of_strings, key=lambda s: s.split(',')[1])

intent:	call multiple bash function ‘vasp’ and ‘tee tee_output’ using ‘|’
predicted_greed:	pd.parser(x , y , y =(, <eos> = 'str_1' , <eos> = 'str_1')
ground_truth:   	subprocess.check_call('vasp | tee tee_output', shell=True)

intent:	eliminate all strings from list `lst`
predicted_greed:	"""""" = [ i for i in lst if i != in ]
ground_truth:   	[element for element in lst if isinstance(element, int)]

intent:	get all the elements except strings from the list 'lst'.
predicted_greed:	[ x for x in lst if x [ 2 ] ]
ground_truth:   	[element for element in lst if not isinstance(

intent:	match urls whose domain doesn't start with `t` from string `document` using regex
predicted_greed:	re.findall(document , t)
ground_truth:   	re.findall('http://[^t][^s"]+\\.html', document)

intent:	split a string `mystring` considering the spaces ' '
predicted_greed:	re.split('' , mystring)
ground_truth:   	mystring.replace(' ', '! !').split('!')

intent:	open file `path` with mode 'r'
predicted_greed:	path = open('r' , os.path | 'r')
ground_truth:   	open(path, 'r')

intent:	sum elements at the same index in list `data`
predicted_greed:	map(sum , zip(* data))
ground_truth:   	[[sum(item) for item in zip(*items)] for items in zip(*data)]

intent:	add a new axis to array `a`
predicted_greed:	a.append()
ground_truth:   	a[:, (np.newaxis)]



In [64]:
# Hand the list of generated code strings (one per test example, in loader
# order) to write_answer_json from the `data` module.
write_answer_json(code_list)

In [65]:
# Ad-hoc check of sub_slotmap on a hand-written token sequence.
# 'word' and ',' must be separate list items: without the comma between them
# Python's implicit string-literal concatenation fuses them into the single
# token 'word,'.
token_list = ['all', '.', '(', 'isinstance', '(', 'word', ',', 'myList', ')', ')', '==', '0']
sub_slotmap(token_list, {0: "0"})

'all.(isinstance(word, myList)) == 0'

In [66]:
# Save the current in-memory model through its own save() method.
# NOTE(review): if save() writes to the same location as the checkpoint made
# in the epoch loop, this overwrites it with whatever weights are loaded now —
# confirm that is intended.
model.save()