In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from preprocessing.processor import Code_Intent_Pairs, sub_slotmap
from seq2seq2.model import Seq2Seq
from seq2seq2.data import get_train_loader, get_test_loader, write_answer_json

### Define Hyperparameters

In [13]:
# Single source of truth for every tunable value used by the notebook.
# The whole mapping is handed to the data loaders and to Seq2Seq; the training
# cells read 'lr', 'lr_keep_rate', 'max_epochs' and 'print_every' directly.
hyperP = dict(
    # --- training ---
    batch_size=32,
    lr=1e-3,
    teacher_force_rate=0.90,
    max_epochs=50,
    lr_keep_rate=0.95,  # per-epoch lr multiplier; 1.0 disables lr decay entirely

    # --- encoder architecture ---
    encoder_layers=2,
    encoder_embed_size=128,
    encoder_hidden_size=384,
    encoder_dropout_rate=0.3,

    # --- decoder architecture ---
    decoder_layers=2,
    decoder_embed_size=128,
    decoder_hidden_size=384,
    decoder_dropout_rate=0.3,

    # --- attention architecture ---
    attn_hidden_size=384,

    # --- progress reporting ---
    print_every=5,  # number of training batches between progress lines
)

### Load Data

In [4]:
# Shared preprocessing object (from preprocessing.processor), created with no
# arguments. Later cells use it to load the vocab, load the corpus entries and
# map decoded indices back to code tokens.
code_intent_pair = Code_Intent_Pairs()

In [5]:
# Load the dictionaries stored under `path` (presumably the word/code
# vocabularies — confirm against Code_Intent_Pairs.load_dict) and read back the
# values the rest of the notebook needs.
# NOTE(review): 'vocab/' is a relative path, so this depends on the kernel's
# working directory.
path = 'vocab/'
code_intent_pair.load_dict(path)
special_symbols = code_intent_pair.get_special_symbols()  # indexed later with 'code_sos' / 'code_eos' / 'code_unk'
word_size = code_intent_pair.get_word_size()  # first size argument of Seq2Seq
code_size = code_intent_pair.get_code_size()  # second size argument of Seq2Seq

In [6]:
# Load the preprocessed training entries.
# pad() takes no arguments and is called right after load_entries, so it
# presumably pads the entries that were just loaded, in place — TODO confirm.
train_path = 'processed_corpus/train.json'
train_entries = code_intent_pair.load_entries(train_path)
code_intent_pair.pad()

In [7]:
# Batch iterator over the training entries; train() unpacks each batch as
# (inp_seq, original_out_seq, padded_out_seq, out_lens).
# The loader receives the full hyperP dict (presumably for 'batch_size' — confirm).
trainloader = get_train_loader(train_entries, special_symbols, hyperP)

In [8]:
# Load the preprocessed validation entries, using the same load-then-pad
# sequence as the training split.
# NOTE(review): this reuses the same code_intent_pair object as the training
# split; whether load_entries replaces or extends its internal state is not
# visible here — confirm train_entries is unaffected.
valid_path = 'processed_corpus/valid.json'
valid_entries = code_intent_pair.load_entries(valid_path)
code_intent_pair.pad()

In [9]:
# The validation loader is deliberately built with get_train_loader so that
# valid() receives the same 4-tuple batches as train().
# NOTE(review): any shuffling/batching behaviour of the training loader
# therefore applies to validation too — confirm that is intended.
validloader = get_train_loader(valid_entries, special_symbols, hyperP)

In [10]:
# Load the preprocessed test entries. Unlike the train/valid splits, pad() is
# not called here.
test_path = 'processed_corpus/test.json'
test_entries = code_intent_pair.load_entries(test_path)

In [11]:
# Iterator over the test entries; the decoding cell unpacks each item as
# (src_seq, slot_map, code, intent). Takes only the entries — no special
# symbols or hyperparameters.
testloader = get_test_loader(test_entries)

### Define Model

In [14]:
# Build the sequence-to-sequence model from the two vocabulary sizes and the
# hyperparameter dict (the architecture settings are presumably read from
# hyperP inside Seq2Seq — confirm).
model = Seq2Seq(word_size, code_size, hyperP)

### Training

In [15]:
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
# Adam over all model parameters, starting at the configured learning rate.
optimizer = optim.Adam(model.parameters(), lr=hyperP['lr'])
# Cross-entropy with default settings (no ignore_index / class weights), so
# every target position passed to it contributes to the loss.
loss_f = torch.nn.CrossEntropyLoss()

In [16]:
# Exponential learning-rate decay: after `epoch` scheduler steps the base lr is
# scaled by lr_keep_rate ** epoch. With a keep rate of exactly 1.0 no scheduler
# is created at all, so `scheduler` only exists when decay is enabled — the
# training loop guards its scheduler.step() call with the same condition.
lr_keep_rate = hyperP['lr_keep_rate']
if lr_keep_rate != 1.0:
    def lr_reduce_f(epoch):
        return lr_keep_rate ** epoch

    scheduler = LambdaLR(optimizer, lr_lambda=lr_reduce_f)

In [17]:
def train(model, trainloader, optimizer, loss_f, hyperP):
    """Run one training epoch and print running loss/accuracy.

    Parameters
    ----------
    model : callable module
        Put into training mode, then called as
        ``model(inp_seq, padded_out_seq, out_lens)`` to obtain logits.
    trainloader : iterable
        Yields ``(inp_seq, original_out_seq, padded_out_seq, out_lens)`` batches.
    optimizer : torch optimizer
        Stepped once per batch.
    loss_f : callable
        Called as ``loss_f(logits, original_out_seq)``; must return a scalar tensor.
    hyperP : dict
        Only ``'print_every'`` is read: the number of batches per progress line.

    Returns
    -------
    None

    Notes
    -----
    Predictions are taken along ``dim=1`` of the logits and compared
    element-wise with ``original_out_seq``, so the logits are assumed to be
    ``(N, num_classes)`` with ``N == len(original_out_seq)`` — TODO confirm
    against Seq2Seq.forward.

    Statistics are reset after every progress line, and batches left over after
    the last full ``print_every`` window are trained on but not reported.
    """
    model.train()
    print_every = hyperP['print_every']
    loss_sum = 0.0
    total_correct = 0
    size = 0

    for i, (inp_seq, original_out_seq, padded_out_seq, out_lens) in enumerate(trainloader):
        logits = model(inp_seq, padded_out_seq, out_lens)
        loss = loss_f(logits, original_out_seq)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate plain Python numbers so no tensors are kept alive between batches.
        loss_sum += loss.item()
        _, predictions = torch.max(logits, dim=1)
        total_correct += (predictions == original_out_seq).sum().item()
        size += len(original_out_seq)

        if (i + 1) % print_every == 0:
            # Fixed-precision formatting keeps every progress line the same
            # width. The line is redrawn in place with '\r', which does not
            # clear the previous text, so variable-width floats used to leave
            # stale trailing digits on screen.
            print('Train: loss:{:.4f}\tacc:{:.4f}'.format(loss_sum / print_every,
                                                          total_correct / size), end='\r')
            loss_sum = 0.0
            total_correct = 0
            size = 0
    # Move off the in-place progress line so the next output starts on a fresh line.
    print()

In [18]:
def valid(model, validloader, loss_f, hyperP):
    """Evaluate ``model`` on ``validloader`` with teacher forcing switched off.

    Parameters
    ----------
    model : callable module
        Put into eval mode and called as ``model(inp_seq, padded_out_seq, out_lens)``.
        Must provide ``change_teacher_force_rate(rate)``, which is expected to
        return the rate that was in effect before the call.
    validloader : iterable
        Yields ``(inp_seq, original_out_seq, padded_out_seq, out_lens)`` batches.
    loss_f : callable
        Called as ``loss_f(logits, original_out_seq)``; must return a scalar tensor.
    hyperP : dict
        Kept so the signature matches ``train``; no key is read here.

    Returns
    -------
    float
        Fraction of target positions predicted correctly (``0.0`` when the
        loader yields nothing).

    Notes
    -----
    The model is left in eval mode; the caller's next ``train()`` call switches
    it back. The previous teacher-forcing rate is restored even if evaluation
    is interrupted or raises, so an aborted validation pass cannot leave the
    model training with a rate of 0.0.
    """
    model.eval()
    old_rate = model.change_teacher_force_rate(0.0)
    loss_sum = 0.0
    total_correct = 0
    size = 0
    num_batches = 0

    try:
        with torch.no_grad():
            for inp_seq, original_out_seq, padded_out_seq, out_lens in validloader:
                logits = model(inp_seq, padded_out_seq, out_lens)
                loss = loss_f(logits, original_out_seq)

                loss_sum += loss.item()
                _, predictions = torch.max(logits, dim=1)
                total_correct += (predictions == original_out_seq).sum().item()
                size += len(original_out_seq)
                num_batches += 1
    finally:
        # Always hand the model back with its original teacher-forcing rate.
        model.change_teacher_force_rate(old_rate)

    # Guard the averages so an empty loader reports instead of dividing by zero.
    avg_loss = loss_sum / num_batches if num_batches else float('nan')
    acc = total_correct / size if size else 0.0
    print('Valid: loss:{}\tacc:{}'.format(avg_loss, acc))
    return acc

In [21]:
# Main training loop: one train() pass and one valid() pass per epoch.
# The model is checkpointed (model.save(), destination decided inside the model)
# only when validation accuracy strictly improves on the best seen so far.
# NOTE(review): best_acc restarts at 0.0 every time this cell runs, so re-running
# it after an interrupt will overwrite the saved checkpoint on the first epoch.
best_acc = 0.0
for e in range(hyperP['max_epochs']):
    train(model, trainloader, optimizer, loss_f, hyperP)
    acc = valid(model, validloader, loss_f, hyperP)
    if acc > best_acc:
        best_acc = acc
        model.save()
        print('model saved')
    # `scheduler` is only defined when lr decay is enabled, hence the same guard
    # as in the cell that creates it.
    if lr_keep_rate != 1.0:
        scheduler.step()

Train: loss:2.814048719406128	acc:0.371139240506329176
Valid: loss:4.190293937921524	acc:0.1763301048162399
model saved
Train: loss:2.4241441249847413	acc:0.43594936708860765
Valid: loss:4.367486447095871	acc:0.16890009287514926
Train: loss:2.001986193656921	acc:0.506329113924050734
Valid: loss:4.472135618329048	acc:0.18508690460395383
model saved
Train: loss:1.8112791776657104	acc:0.51848101265822797
Valid: loss:4.655263438820839	acc:0.18601565609659015
model saved
Train: loss:1.6849470376968383	acc:0.5513924050632911
Valid: loss:4.778821617364883	acc:0.17500331696961655
Train: loss:1.5022643566131593	acc:0.5787341772151898
Valid: loss:4.873151898384094	acc:0.18070850471009686
Train: loss:1.4043403387069702	acc:0.6015189873417721
Valid: loss:4.887567043304443	acc:0.20273318296404405
model saved
Train: loss:1.2382980585098267	acc:0.6450632911392405
Valid: loss:5.223802596330643	acc:0.2016717526867454
Train: loss:1.164399755001068	acc:0.64860759493670889
Valid: loss:5.3540588319301605	a

KeyboardInterrupt: 

In [23]:
# Reload saved weights (presumably the checkpoint written by model.save() in
# the training loop — confirm), replacing the in-memory weights from the last,
# interrupted epoch.
model.load()

### Decoding

In [33]:
# Greedy-decode every test intent, substitute the slot values back into the
# generated tokens, print each prediction next to its ground truth, and write
# all generated snippets to the answer file.
sos = special_symbols['code_sos']
eos = special_symbols['code_eos']
unk = special_symbols['code_unk']
code_list = []

# Switch to eval mode once, up front, rather than on every iteration.
model.eval()
# Inference only: disable autograd so no computation graph is built while decoding.
with torch.no_grad():
    for src_seq, slot_map, code, intent in testloader:
        seq = model.greedy_decode(src_seq, sos, eos, unk)
        gen_code_tokens = code_intent_pair.idx2code(seq)
        gen_code = sub_slotmap(gen_code_tokens, slot_map)
        code_list.append(gen_code)
        print('intent:\t'+intent)
        print('predicted:\t'+gen_code+'\nground_truth:\t'+code)
        print()

write_answer_json(code_list)

intent:	send a signal `signal.SIGUSR1` to the current process
predicted:	logging . info ( 'signal.SIGUSR1' , 1000 . write )
ground_truth:	os.kill(os.getpid(), signal.SIGUSR1)

intent:	decode a hex string '4a4b4c' to UTF-8.
predicted:	"""4a4b4c""" . encode ( 'utf-8' )
ground_truth:	bytes.fromhex('4a4b4c').decode('utf-8')

intent:	check if all elements in list `myList` are identical
predicted:	all ( isinstance ( word , myList ) ) == 0
ground_truth:	all(x == myList[0] for x in myList)

intent:	format number of spaces between strings `Python`, `:` and `Very Good` to be `20`
predicted:	re . sub ( 'Very Good' , ' ' , Python )
ground_truth:	print('%*s : %*s' % (20, 'Python', 20, 'Very Good'))

intent:	How to convert a string from CP-1251 to UTF-8?
predicted:	int . encode ( 'utf-8' , 'ignore' )
ground_truth:	d.decode('cp1251').encode('utf8')

intent:	get rid of None values in dictionary `kwargs`
predicted:	sum ( ( x , sum ( x ) ) for x in list ( kwargs . values ( ) ) )
ground_truth:	res = {k: 

intent:	Join elements of list `l` with a comma `,`
predicted:	[ l for l in l if ',' in l ]
ground_truth:	""",""".join(l)

intent:	make a comma-separated string from a list `myList`
predicted:	[ = '' . join ( myList . join ( myList ) for i in myList )
ground_truth:	myList = ','.join(map(str, myList))

intent:	reverse the list that contains 1 to 10
predicted:	[ ( i , 2 ) for i in range ( 10 ) ]
ground_truth:	list(reversed(list(range(10))))

intent:	remove substring 'bag,' from a string 'lamp, bag, mirror'
predicted:	re . sub ( 'lamp, bag, mirror' , 'bag,' , 'lamp, bag, mirror' )
ground_truth:	print('lamp, bag, mirror'.replace('bag,', ''))

intent:	Reverse the order of words, delimited by `.`, in string `s`
predicted:	s . find ( '.' , 1 )
ground_truth:	""".""".join(s.split('.')[::-1])

intent:	convert epoch time represented as milliseconds `s` to string using format '%Y-%m-%d %H:%M:%S.%f'
predicted:	s . strftime ( '%Y-%m-%d %H:%M:%S.%f' , '%Y-%m-%d %H:%M:%S.%f' )
ground_truth:	datetime.da

intent:	search for occurrences of regex pattern '>.*<' in xml string `line`
predicted:	re . findall ( '>.*<' , line , re . find )
ground_truth:	print(re.search('>.*<', line).group(0))

intent:	erase all the contents of a file `filename`
predicted:	open ( 'filename' , 'w' ) . close ( )
ground_truth:	open(filename, 'w').close()

intent:	convert a string into datetime using the format '%Y-%m-%d %H:%M:%S.%f'
predicted:	datetime . strptime ( '%Y-%m-%d %H:%M:%S.%f' , '%Y-%m-%d %H:%M:%S.%f' )
ground_truth:	datetime.datetime.strptime(string_date, '%Y-%m-%d %H:%M:%S.%f')

intent:	find the index of a list with the first element equal to '332' within the list of lists `thelist`
predicted:	[ x [ 0 ] for x in thelist ]
ground_truth:	[index for index, item in enumerate(thelist) if item[0] == '332']

intent:	lower a string `text` and remove non-alphanumeric characters aside from space
predicted:	re . findall ( '\\' , text )
ground_truth:	re.sub('[^\\sa-zA-Z0-9]', '', text).lower().strip()

intent:	re

intent:	decode string "\\x89\\n" into a normal string
predicted:	str . encode ( '\\x89\\n' )
ground_truth:	"""\\x89\\n""".decode('string_escape')

intent:	convert a raw string `raw_string` into a normal string
predicted:	unicodedata . join ( raw_string , raw_string ) . encode ( 0 , 'ignore' )
ground_truth:	raw_string.decode('string_escape')

intent:	convert a raw string `raw_byte_string` into a normal string
predicted:	unicodedata . join ( raw_byte_string , raw_byte_string ) . encode ( 0 , 'ignore' )
ground_truth:	raw_byte_string.decode('unicode_escape')

intent:	split a string `s` with into all strings of repeated characters
predicted:	re . findall ( '\\d+' , s )
ground_truth:	[m.group(0) for m in re.finditer('(\\d)\\1*', s)]

intent:	scatter a plot with x, y position of `np.random.randn(100)` and face color equal to none
predicted:	plot . plot ( 1 , ( ( 1 , 3 , - = ( 100 , 7 , 1 ) )
ground_truth:	plt.scatter(np.random.randn(100), np.random.randn(100), facecolors='none')

intent:	do a

intent:	reverse list `ut` based on the `count` attribute of each object
predicted:	ut = [ [ ut [ i ] for i in range ( 0 , len ( ut ) , count ) ]
ground_truth:	ut.sort(key=lambda x: x.count, reverse=True)

intent:	sort a list of objects `ut` in reverse order by their `count` property
predicted:	ut . sort ( key = lambda x : count . index ( x [ 1 ] ) )
ground_truth:	ut.sort(key=lambda x: x.count, reverse=True)

intent:	click a href button 'Send' with selenium
predicted:	driver . find_element_by_id ( 'Send' , extra . click ( ) )
ground_truth:	driver.find_element_by_partial_link_text('Send').click()

intent:	click a href button having text `Send InMail` with selenium
predicted:	driver . find_element_by_id ( 'Send InMail' , 'Send InMail' )
ground_truth:	driver.findElement(By.linkText('Send InMail')).click()

intent:	click a href button with text 'Send InMail' with selenium
predicted:	driver . find_element_by_xpath ( 'Send InMail' , extra = 'Send InMail' )
ground_truth:	driver.find_element_by

intent:	Call a base class's class method `do` from derived class `Derived`
predicted:	Derived ( ( do , 'Derived' , None = 'Derived' )
ground_truth:	super(Derived, cls).do(a)

intent:	selecting rows in Numpy ndarray 'a', where the value in the first column is 0 and value in the second column is 1
predicted:	( ( a [ 0 ] == 0 ) . 0 ( ( 1 , 0 ) )
ground_truth:	a[np.where((a[:, (0)] == 0) * (a[:, (1)] == 1))]

intent:	separate words delimited by one or more spaces into a list
predicted:	re . split ( ) , 1 )
ground_truth:	re.split(' +', 'hello world sample text')

intent:	length of longest element in list `words`
predicted:	[ ( sum ( x ) for x in words )
ground_truth:	len(max(words, key=len))

intent:	get the value associated with unicode key 'from_user' of first dictionary in list `result`
predicted:	max ( result , key = lambda d : 'from_user' [ 'from_user' ] )
ground_truth:	result[0]['from_user']

intent:	Retrieve each line from a file 'File.txt' as a list
predicted:	[ x for x in open ( 'F

intent:	encode `Decimal('3.9')` to a JSON string
predicted:	ast . encode ( 'Decimal('3.9')' )
ground_truth:	json.dumps(Decimal('3.9'))

intent:	Add key "mynewkey" to dictionary `d` with value "mynewvalue"
predicted:	d [ 'mynewvalue' ] = 'mynewvalue'
ground_truth:	d['mynewkey'] = 'mynewvalue'

intent:	Add key 'a' to dictionary `data` with value 1
predicted:	data . update ( { a } )
ground_truth:	data.update({'a': 1, })

intent:	Add key 'a' to dictionary `data` with value 1
predicted:	data . update ( { a } )
ground_truth:	data.update(dict(a=1))

intent:	Add key 'a' to dictionary `data` with value 1
predicted:	data . update ( { a } )
ground_truth:	data.update(a=1)

intent:	find maximal value in matrix `matrix`
predicted:	matrix . isnull ( matrix . 0 , axis = 0 )
ground_truth:	max([max(i) for i in matrix])

intent:	Round number `answer` to 2 precision after the decimal point
predicted:	answer . round ( round , var_1 )
ground_truth:	answer = str(round(answer, 2))

intent:	extract ip address 

intent:	split string `input` based on occurrences of regex pattern '[ ](?=[A-Z]+\\b)'
predicted:	re . findall ( '[ ](?=[A-Z]+\\b)' , input )
ground_truth:	re.split('[ ](?=[A-Z]+\\b)', input)

intent:	Split string `input` at every space followed by an upper-case letter
predicted:	re . split ( ) , input )
ground_truth:	re.split('[ ](?=[A-Z])', input)

intent:	send multipart encoded file `files` to url `url` with headers `headers` and metadata `data`
predicted:	url . files ( files , url , headers = None )
ground_truth:	r = requests.post(url, files=files, headers=headers, data=data)

intent:	write bytes `bytes_` to a file `filename` in python 3
predicted:	filename . write ( bytes_ )
ground_truth:	open('filename', 'wb').write(bytes_)

intent:	get a list from a list `lst` with values mapped into a dictionary `dct`
predicted:	[ dict ( item ) for item in lst ]
ground_truth:	[dct[k] for k in lst]

intent:	find duplicate names in column 'name' of the dataframe `x`
predicted:	x [ 'name' ] . apply

intent:	add an item with key 'third_key' and value 1 to an dictionary `my_dict`
predicted:	my_dict [ my_dict ]
ground_truth:	my_dict.update({'third_key': 1})

intent:	declare an array
predicted:	np . array ( np . arange ( 1 , 3 , - 1 ) )
ground_truth:	my_list = []

intent:	Insert item `12` to a list `my_list`
predicted:	my_list . append ( '12' )
ground_truth:	my_list.append(12)

intent:	add an entry 'wuggah' at the beginning of list `myList`
predicted:	myList = [ wuggah ]
ground_truth:	myList.insert(0, 'wuggah')

intent:	convert a hex-string representation to actual bytes
predicted:	chr ( int . format ( ) . encode ( 'utf-8' ) )
ground_truth:	"""\\xF3\\xBE\\x80\\x80""".replace('\\x', '').decode('hex')

intent:	select the last column of dataframe `df`
predicted:	df . groupby ( df . columns [ df . columns ( 0 ) . sum ( ) ] )
ground_truth:	df[df.columns[-1]]

intent:	get the first value from dataframe `df` where column 'Letters' is equal to 'C'
predicted:	df . groupby ( 'C' ) [ 'Letters' ]

intent:	download the file from url `url` and save it under file `file_name`
predicted:	with . open ( file_name , file_name . read ( ) , 'r' )
ground_truth:	urllib.request.urlretrieve(url, file_name)

intent:	split string `text` by space
predicted:	text . split ( )
ground_truth:	text.split()

intent:	split string `text` by ","
predicted:	text . split ( ',' )
ground_truth:	text.split(',')

intent:	Split string `line` into a list by whitespace
predicted:	list = map ( int , line . split ( ) )
ground_truth:	line.split()

intent:	replace dot characters  '.' associated with ascii letters in list `s` with space ' '
predicted:	s . replace ( '.' , ' ' )
ground_truth:	[re.sub('(?<!\\d)\\.(?!\\d)', ' ', i) for i in s]

intent:	sort list `list_of_strings` based on second index of each string `s`
predicted:	list_of_strings . sort ( key = lambda x : x . s )
ground_truth:	sorted(list_of_strings, key=lambda s: s.split(',')[1])

intent:	call multiple bash function ‘vasp’ and ‘tee tee_output’ using ‘|’
p

intent:	match urls whose domain doesn't start with `t` from string `document` using regex
predicted:	re . findall ( [ , document , document )
ground_truth:	re.findall('http://[^t][^s"]+\\.html', document)

intent:	split a string `mystring` considering the spaces ' '
predicted:	mystring . rstrip ( '' )
ground_truth:	mystring.replace(' ', '! !').split('!')

intent:	open file `path` with mode 'r'
predicted:	os . open ( 'r' , os . path . join ( 'r' ) )
ground_truth:	open(path, 'r')

intent:	sum elements at the same index in list `data`
predicted:	sum ( i * i for i in data )
ground_truth:	[[sum(item) for item in zip(*items)] for items in zip(*data)]

intent:	add a new axis to array `a`
predicted:	a = [ ( a [ 0 ] ) . T ( )
ground_truth:	a[:, (np.newaxis)]



In [27]:
# Writes the generated snippets held in `code_list` to the answer file.
# NOTE(review): the decoding cell already makes this exact call after its loop,
# and this cell's execution count (In [27]) is lower than the decoding cell's
# (In [33]), so on a fresh top-to-bottom run this is a redundant second write.
write_answer_json(code_list)