In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from preprocessing.processor import Code_Intent_Pairs, sub_slotmap
from seq2seq2.model import Seq2Seq
from seq2seq2.data import get_train_loader, get_test_loader, write_answer_json

### Define Hyperparameters

In [3]:
# Single mapping of every tunable setting. It is passed as-is to the data
# loader factory, the Seq2Seq constructor and the train/valid helpers.
hyperP = dict(
    # --- training schedule ---
    batch_size=32,
    lr=1e-3,
    teacher_force_rate=0.80,
    max_epochs=50,
    lr_keep_rate=0.90,  # set to 1.0 to keep the lr constant over time

    # --- encoder architecture ---
    encoder_layers=3,
    encoder_embed_size=128,
    encoder_hidden_size=512,
    encoder_dropout_rate=0.3,

    # --- decoder architecture ---
    decoder_layers=3,
    decoder_embed_size=128,
    decoder_hidden_size=512,
    decoder_dropout_rate=0.3,

    # --- attention architecture ---
    attn_hidden_size=512,

    # --- progress reporting: batches between two training status lines ---
    print_every=5,
)

### Load Data

In [4]:
# Project helper instance; later cells use it to load the vocabularies and
# corpus entries and to map decoded indices back to code tokens.
code_intent_pair = Code_Intent_Pairs()

In [5]:
# Load the stored dictionaries from `path`, then read back the values the
# loaders and the model need: the special-symbol table and the two sizes
# that are passed to the Seq2Seq constructor.
path = 'vocab/'
code_intent_pair.load_dict(path)
special_symbols = code_intent_pair.get_special_symbols()
word_size = code_intent_pair.get_word_size()
code_size = code_intent_pair.get_code_size()

In [6]:
# Load the preprocessed training split, then call pad() on the helper.
# NOTE(review): pad() presumably pads the entries that were just loaded
# (possibly in place) — confirm in preprocessing.processor.
train_path = 'processed_corpus/train.json'
train_entries = code_intent_pair.load_entries(train_path)
code_intent_pair.pad()

In [7]:
# Batch iterator over the training entries; train() unpacks each batch as
# (inp_seq, original_out_seq, padded_out_seq, out_lens).
trainloader = get_train_loader(train_entries, special_symbols, hyperP)

In [8]:
# Load the preprocessed validation split with the same helper instance that
# loaded the training split, then call pad() again.
# NOTE(review): presumably load_entries() replaces (rather than extends) the
# helper's current entries so pad() only touches this split — confirm.
valid_path = 'processed_corpus/valid.json'
valid_entries = code_intent_pair.load_entries(valid_path)
code_intent_pair.pad()

In [9]:
# The validation data goes through the *train* loader factory because valid()
# consumes the same 4-tuple batches as train().
# NOTE(review): if get_train_loader shuffles or drops batches, confirm that is
# acceptable for validation.
validloader = get_train_loader(valid_entries, special_symbols, hyperP)

In [10]:
# Load the preprocessed test split. Unlike the train/valid cells, pad() is
# not called here.
test_path = 'processed_corpus/test.json'
test_entries = code_intent_pair.load_entries(test_path)

In [11]:
# Iterator over test examples; the decoding cell unpacks each item as
# (src_seq, slot_map, code, intent).
testloader = get_test_loader(test_entries)

### Define Model

In [12]:
# Build the model from the two sizes reported by the loaded dictionaries and
# the hyperparameter mapping.
model = Seq2Seq(word_size, code_size, hyperP)

### Training

In [13]:
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
# Adam over all model parameters, starting at the configured learning rate.
optimizer = optim.Adam(model.parameters(), lr=hyperP['lr'])
# Loss applied as loss_f(logits, original_out_seq) in train() and valid().
loss_f = torch.nn.CrossEntropyLoss()

In [14]:
# Optional learning-rate decay. A keep rate of exactly 1.0 means "no decay":
# in that case neither `lr_reduce_f` nor `scheduler` is defined, and the
# training loop skips the scheduler step under the same condition.
lr_keep_rate = hyperP['lr_keep_rate']
if lr_keep_rate != 1.0:
    def lr_reduce_f(epoch):
        """Return the factor applied to the initial lr at scheduler step `epoch`."""
        return lr_keep_rate ** epoch

    scheduler = LambdaLR(optimizer, lr_lambda=lr_reduce_f)

In [15]:
def train(model, trainloader, optimizer, loss_f, hyperP):
    """Run one optimisation pass over ``trainloader`` and print running stats.

    Args:
        model: module called as ``model(inp_seq, padded_out_seq, out_lens)``;
            its output is scored along dim 1 (one score per class).
        trainloader: iterable of
            ``(inp_seq, original_out_seq, padded_out_seq, out_lens)`` batches.
        optimizer: optimizer whose gradients are reset, then stepped, once
            per batch.
        loss_f: callable ``loss_f(logits, original_out_seq)`` returning a
            scalar tensor.
        hyperP: mapping; only ``'print_every'`` (batches per status line) is
            read here.

    Every ``print_every`` batches the mean loss and the accuracy of that
    window are written to one status line that is redrawn in place. Batches
    after the last complete window are trained on but not reported.
    """
    model.train()
    print_every = hyperP['print_every']
    loss_sum = 0.0
    total_correct = 0
    size = 0
    widest_line = 0  # longest status line so far, used to blank out leftovers

    for i, (inp_seq, original_out_seq, padded_out_seq, out_lens) in enumerate(trainloader):
        logits = model(inp_seq, padded_out_seq, out_lens)
        loss = loss_f(logits, original_out_seq)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # running statistics for the current reporting window
        loss_sum += loss.item()
        _, predictions = torch.max(logits, dim=1)
        # .item() keeps the counter a plain int instead of a growing tensor
        total_correct += (predictions == original_out_seq).sum().item()
        size += len(original_out_seq)

        if (i + 1) % print_every == 0:
            # The line is redrawn with '\r'. Fixed precision plus padding to
            # the widest line so far stops digits of an earlier, longer line
            # from remaining visible after the new values.
            line = 'Train: loss:{:.4f}\tacc:{:.4f}'.format(
                loss_sum / print_every, total_correct / size)
            print(line.ljust(widest_line), end='\r')
            widest_line = max(widest_line, len(line))
            loss_sum = 0.0
            total_correct = 0
            size = 0
    print()

In [16]:
def valid(model, validloader, loss_f, hyperP):
    """Evaluate ``model`` on ``validloader`` with teacher forcing switched off.

    Args:
        model: module called as ``model(inp_seq, padded_out_seq, out_lens)``
            that also provides ``change_teacher_force_rate(rate)``, which
            returns the previously active rate.
        validloader: sized iterable of
            ``(inp_seq, original_out_seq, padded_out_seq, out_lens)`` batches.
        loss_f: callable ``loss_f(logits, original_out_seq)`` returning a
            scalar tensor.
        hyperP: accepted for signature parity with ``train``; not read here.

    Returns:
        float: fraction of targets whose highest-scoring class (dim 1 of the
        logits) equals the target, over the whole loader.

    Raises:
        ZeroDivisionError: if the loader yields no targets.

    The model is left in eval mode. The teacher-force rate is restored even
    when a batch raises, so a failed validation cannot leave later training
    running at rate 0.0.
    """
    model.eval()
    old_rate = model.change_teacher_force_rate(0.0)
    loss_sum = 0.0
    total_correct = 0
    size = 0

    try:
        with torch.no_grad():
            for inp_seq, original_out_seq, padded_out_seq, out_lens in validloader:
                logits = model(inp_seq, padded_out_seq, out_lens)
                loss = loss_f(logits, original_out_seq)

                # accumulate statistics over the whole validation set
                loss_sum += loss.item()
                _, predictions = torch.max(logits, dim=1)
                total_correct += (predictions == original_out_seq).sum().item()
                size += len(original_out_seq)
    finally:
        # always hand the model back with its original teacher-force rate
        model.change_teacher_force_rate(old_rate)

    acc = total_correct / size
    # loss is averaged per batch, accuracy per target
    print('Valid: loss:{}\tacc:{}'.format(loss_sum/len(validloader), acc))
    return acc

In [18]:
# Epoch loop: train, validate, and save the model whenever validation
# accuracy beats the best value seen so far. The scheduler only exists when
# decay is enabled, hence the guard on the keep rate.
best_acc = 0.0
use_lr_decay = lr_keep_rate != 1.0
for epoch in range(hyperP['max_epochs']):
    train(model, trainloader, optimizer, loss_f, hyperP)
    acc = valid(model, validloader, loss_f, hyperP)

    improved = acc > best_acc
    if improved:
        best_acc = acc
        model.save()
        print('model saved')

    if use_lr_decay:
        scheduler.step()

Train: loss:2.314716339111328	acc:0.452151898734177234
Valid: loss:2.3388794511556625	acc:0.4483216133740215
model saved
Train: loss:2.149797487258911	acc:0.466329113924050633
Valid: loss:2.2836208269000053	acc:0.46304895847154043
model saved
Train: loss:1.9833499670028687	acc:0.51139240506329115
Valid: loss:2.2194713950157166	acc:0.4662332493034364
model saved
Train: loss:2.1242095947265627	acc:0.47594936708860765
Valid: loss:2.1713496819138527	acc:0.48082791561629296
model saved
Train: loss:2.0213445901870726	acc:0.48708860759493673
Valid: loss:2.121318705379963	acc:0.49064614568130555
model saved
Train: loss:1.8100811004638673	acc:0.5281012658227848
Valid: loss:2.123566836118698	acc:0.49197293352792887
model saved
Train: loss:1.8103779554367065	acc:0.5255696202531646
Valid: loss:1.9890411347150803	acc:0.5231524479235771
model saved
Train: loss:1.6398745059967041	acc:0.5458227848101266
Valid: loss:2.0708600506186485	acc:0.5239485206315511
model saved
Train: loss:1.5978514909744264	ac

In [19]:
# Reload the model state via its own load() before decoding.
# NOTE(review): presumably this restores the best checkpoint written by
# model.save() in the training loop — confirm in seq2seq2.model.
model.load()

### Decoding

In [20]:
# Greedy-decode every test example, substitute the slot-map values back into
# the generated tokens, print a comparison with the reference snippet, and
# pass all predictions to write_answer_json at the end.
sos = special_symbols['code_sos']
eos = special_symbols['code_eos']
unk = special_symbols['code_unk']
code_list = []

model.eval()  # the mode does not change inside the loop, so set it once
with torch.no_grad():  # inference only: no autograd bookkeeping needed
    for src_seq, slot_map, code, intent in testloader:
        seq = model.greedy_decode(src_seq, sos, eos, unk)
        gen_code_tokens = code_intent_pair.idx2code(seq)
        gen_code = sub_slotmap(gen_code_tokens, slot_map)
        code_list.append(gen_code)
        print('intent:\t'+intent)
        print('predicted:\t'+gen_code+'\nground_truth:\t'+code)
        print()

write_answer_json(code_list)

intent:	send a signal `signal.SIGUSR1` to the current process
predicted:	parser . add_argument ( 'signal.SIGUSR1' )
ground_truth:	os.kill(os.getpid(), signal.SIGUSR1)

intent:	decode a hex string '4a4b4c' to UTF-8.
predicted:	"""4a4b4c""" . decode ( 'utf-8' )
ground_truth:	bytes.fromhex('4a4b4c').decode('utf-8')

intent:	check if all elements in list `myList` are identical
predicted:	len ( set ( myList ) ) == 0
ground_truth:	all(x == myList[0] for x in myList)

intent:	format number of spaces between strings `Python`, `:` and `Very Good` to be `20`
predicted:	print ( Python . replace ( ':' , '' ) . group ( ':' ) )
ground_truth:	print('%*s : %*s' % (20, 'Python', 20, 'Very Good'))

intent:	How to convert a string from CP-1251 to UTF-8?
predicted:	int . format ( 'utf-8' )
ground_truth:	d.decode('cp1251').encode('utf8')

intent:	get rid of None values in dictionary `kwargs`
predicted:	sum ( list ** y ( kwargs ) )
ground_truth:	res = {k: v for k, v in list(kwargs.items()) if v is not None}

intent:	Multiple each value by `2` for all keys in a dictionary `my_dict`
predicted:	[ { [ '2' ] for d in my_dict ]
ground_truth:	my_dict.update((x, y * 2) for x, y in list(my_dict.items()))

intent:	running bash script 'sleep.sh'
predicted:	os . system ( 'sleep.sh' )
ground_truth:	subprocess.call('sleep.sh', shell=True)

intent:	Join elements of list `l` with a comma `,`
predicted:	[ l [ ',' ] for i in l ]
ground_truth:	""",""".join(l)

intent:	make a comma-separated string from a list `myList`
predicted:	[ i . split ( ) for x in myList ]
ground_truth:	myList = ','.join(map(str, myList))

intent:	reverse the list that contains 1 to 10
predicted:	[ ( x + 2 ) for x in range ( 10 ) ]
ground_truth:	list(reversed(list(range(10))))

intent:	remove substring 'bag,' from a string 'lamp, bag, mirror'
predicted:	"""bag,""" . replace ( 'bag,' , '' )
ground_truth:	print('lamp, bag, mirror'.replace('bag,', ''))

intent:	Reverse the order of words, delimited by `.`, in string `s`
predicted:	s [ : :

intent:	count non zero values in each column in pandas data frame
predicted:	df . groupby ( df . columns , columns = [ ) , 'B' ] , axis = 1 )
ground_truth:	df.astype(bool).sum(axis=1)

intent:	search for string that matches regular expression pattern '(?<!Distillr)\\\\AcroTray\\.exe' in string 'C:\\SomeDir\\AcroTray.exe'
predicted:	re . findall ( '(?<!Distillr)\\AcroTray\.exe' , 'C:\SomeDir\AcroTray.exe' )
ground_truth:	re.search('(?<!Distillr)\\\\AcroTray\\.exe', 'C:\\SomeDir\\AcroTray.exe')

intent:	split string 'QH QD JC KD JS' into a list on white spaces
predicted:	re . split ( 'QH QD JC KD JS' , 'QH QD JC KD JS' )
ground_truth:	"""QH QD JC KD JS""".split()

intent:	search for occurrences of regex pattern '>.*<' in xml string `line`
predicted:	re . findall ( '>.*<' , line )
ground_truth:	print(re.search('>.*<', line).group(0))

intent:	erase all the contents of a file `filename`
predicted:	os . path ( filename )
ground_truth:	open(filename, 'w').close()

intent:	convert a string in

intent:	send the output of pprint object `dataobject` to file `logFile`
predicted:	dataobject . dataobject ( dataobject , logFile )
ground_truth:	pprint.pprint(dataobject, logFile)

intent:	get index of rows in column 'BoolCol'
predicted:	df . objects . order_by ( 'BoolCol' ) [ 'BoolCol' ]
ground_truth:	df.loc[df['BoolCol']]

intent:	Create a list containing the indexes of rows where the value of column 'BoolCol' in dataframe `df` are equal to True
predicted:	df [ 'BoolCol' ] = df [ 'BoolCol' ] . isin ( lambda x : x . sum ( x ) )
ground_truth:	df.iloc[np.flatnonzero(df['BoolCol'])]

intent:	get list of indexes of rows where column 'BoolCol' values match True
predicted:	[ x for x in BoolCol if x [ 'BoolCol' ] == 1 ]
ground_truth:	df[df['BoolCol'] == True].index.tolist()

intent:	get index of rows in dataframe `df` which column 'BoolCol' matches value True
predicted:	df . groupby ( df . columns [ 0 ] == 0 )
ground_truth:	df[df['BoolCol']].index.tolist()

intent:	change working directory 

intent:	copy all values in a column 'B' to a new column 'D' in a pandas data frame 'df'
predicted:	df [ 'B' ] . apply ( df . columns [ 'D' ] ) . sum ( )
ground_truth:	df['D'] = df['B']

intent:	find a value within nested json 'data' where the key inside another key 'B' is unknown.
predicted:	df1 . objects . filter ( [ = { 'data' : 'B' } )
ground_truth:	list(data['A']['B'].values())[0]['maindata'][0]['Info']

intent:	check characters of string `string` are true predication of function `predicate`
predicted:	print ( re . findall ( string , predicate ) )
ground_truth:	all(predicate(x) for x in string)

intent:	determine number of files on a drive with python
predicted:	sys . path . insert ( 'str_0' , 'rb' )
ground_truth:	os.statvfs('/').f_files - os.statvfs('/').f_ffree

intent:	how to get a single result from a SQLite query in python?
predicted:	root . gca ( ) )
ground_truth:	cursor.fetchone()[0]

intent:	convert string `user_input` into a list of integers `user_list`
predicted:	user_lis

intent:	replace all elements in array `A` that are not present in array `[1, 3, 4]` with zeros
predicted:	A [ np . all ( A != 0 ) for x in range ( 0 , len ( A ) , 2 ) ]
ground_truth:	np.where(np.in1d(A, [1, 3, 4]).reshape(A.shape), A, 0)

intent:	calculate mean across dimension in a 2d array `a`
predicted:	np . isnan ( a . shape )
ground_truth:	np.mean(a, axis=1)

intent:	running r script '/pathto/MyrScript.r' from python
predicted:	os . system ( '/pathto/MyrScript.r' )
ground_truth:	subprocess.call(['/usr/bin/Rscript', '--vanilla', '/pathto/MyrScript.r'])

intent:	run r script '/usr/bin/Rscript --vanilla /pathto/MyrScript.r'
predicted:	os . system ( '/usr/bin/Rscript --vanilla /pathto/MyrScript.r' )
ground_truth:	subprocess.call('/usr/bin/Rscript --vanilla /pathto/MyrScript.r', shell=True)

intent:	add a header to a csv file
predicted:	sys . path . dirname ( os . path . abspath ( __file__ ) )
ground_truth:	writer.writeheader()

intent:	replacing nan in the dataframe `df` with row aver

intent:	remove elements from list `oldlist` that have an index number mentioned in list `removelist`
predicted:	[ x for x in removelist if x not in removelist ]
ground_truth:	newlist = [v for i, v in enumerate(oldlist) if i not in removelist]

intent:	Open a file `yourfile.txt` in write mode
predicted:	exec = open ( 'yourfile.txt' , 'r' )
ground_truth:	f = open('yourfile.txt', 'w')

intent:	get attribute 'attr' from object `obj`
predicted:	obj . write ( 'attr' )
ground_truth:	getattr(obj, 'attr')

intent:	convert tuple of tuples `(('aa',), ('bb',), ('cc',))` to tuple
predicted:	list ( [ (('aa',), ('bb',), ('cc',)) ] )
ground_truth:	from functools import reduce
reduce(lambda a, b: a + b, (('aa',), ('bb',), ('cc',)))

intent:	convert tuple of tuples `(('aa',), ('bb',), ('cc',))` to list in one line
predicted:	[ ( [ (('aa',), ('bb',), ('cc',)) ] )
ground_truth:	map(lambda a: a[0], (('aa',), ('bb',), ('cc',)))

intent:	Python Pandas: How to replace a characters in a column of a dataframe?


intent:	check if object `obj` is a string
predicted:	isinstance ( obj , str )
ground_truth:	isinstance(obj, str)

intent:	check if object `o` is a string
predicted:	isinstance ( o , str )
ground_truth:	isinstance(o, str)

intent:	check if object `o` is a string
predicted:	isinstance ( o , str )
ground_truth:	(type(o) is str)

intent:	check if object `o` is a string
predicted:	isinstance ( o , str )
ground_truth:	isinstance(o, str)

intent:	check if `obj_to_test` is a string
predicted:	obj_to_test . isdigit ( )
ground_truth:	isinstance(obj_to_test, str)

intent:	append list `list1` to `list2`
predicted:	list2 = copy . copy ( list1 )
ground_truth:	list2.extend(list1)

intent:	append list `mylog` to `list1`
predicted:	list1 = copy . copy ( mylog )
ground_truth:	list1.extend(mylog)

intent:	append list `a` to `c`
predicted:	c = copy . copy ( a )
ground_truth:	c.extend(a)

intent:	append items in list `mylog` to `list1`
predicted:	list1 = [ x for x in list1 if x not in list1 ]
ground_truth:

intent:	numpy concatenate two arrays `a` and `b` along the first axis
predicted:	np . einsum ( a , b , b )
ground_truth:	np.array((a, b))

intent:	fetch address information for host 'google.com' ion port 80
predicted:	os . system ( 'google.com' , ( )
ground_truth:	print(socket.getaddrinfo('google.com', 80))

intent:	add a column 'day' with value 'sat' to dataframe `df`
predicted:	df [ 'day' ] = df [ 'day' ] . str ( )
ground_truth:	df.xs('sat', level='day', drop_level=False)

intent:	return a 401 unauthorized in django
predicted:	url ( ) , 2 )
ground_truth:	return HttpResponse('Unauthorized', status=401)

intent:	Flask set folder 'wherever' as the default template folder
predicted:	os . check_output ( 'wherever' , shell = True )
ground_truth:	Flask(__name__, template_folder='wherever')

intent:	How do I INSERT INTO t1 (SELECT * FROM t2) in SQLAlchemy?
predicted:	root . find_element_by_xpath ( ) , <eos> = 1 )
ground_truth:	session.execute('INSERT INTO t1 (SELECT * FROM t2)')

intent:	sor

intent:	write multiple strings `line1`, `line2` and `line3` in one line in a file `target`
predicted:	line2 . write ( { )
ground_truth:	target.write('%r\n%r\n%r\n' % (line1, line2, line3))

intent:	Convert list of lists `data` into a flat list
predicted:	list ( itertools . chain ( * data ) )
ground_truth:	[y for x in data for y in (x if isinstance(x, list) else [x])]

intent:	Print new line character as `\n` in a string `foo\nbar`
predicted:	"""foo
bar""" . count ( '
' )
ground_truth:	print('foo\nbar'.encode('string_escape'))

intent:	remove last comma character ',' in string `s`
predicted:	s = re . sub ( ',' , '' , s )
ground_truth:	"""""".join(s.rsplit(',', 1))

intent:	calculate the mean of each element in array `x` with the element previous to it
predicted:	np . array ( x ) . sum ( )
ground_truth:	(x[1:] + x[:-1]) / 2

intent:	get an array of the mean of each two consecutive values in numpy array `x`
predicted:	x [ np . arange ( x ) ]
ground_truth:	x[:-1] + (x[1:] - x[:-1]) / 2

in

intent:	get the last key of dictionary `dict`
predicted:	sum ( dict . values ( ) )
ground_truth:	list(dict.keys())[-1]

intent:	write line "hi there" to file `f`
predicted:	f . write ( 'hi there' )
ground_truth:	print('hi there', file=f)

intent:	write line "hi there" to file `myfile`
predicted:	myfile . write ( 'hi there' )
ground_truth:	f = open('myfile', 'w')
f.write('hi there\n')
f.close()

intent:	write line "Hello" to file `somefile.txt`
predicted:	with open ( 'Hello' , 'somefile.txt' ) as f : 
      pass 
ground_truth:	with open('somefile.txt', 'a') as the_file:
    the_file.write('Hello\n')

intent:	convert unicode string `s` to ascii
predicted:	s . decode ( 'unicode_escape' )
ground_truth:	s.encode('iso-8859-15')

intent:	Django get maximum value associated with field 'added' in model `AuthorizedEmail`
predicted:	AuthorizedEmail . update ( 'added' , 'str_1' = True )
ground_truth:	AuthorizedEmail.objects.filter(group=group).order_by('-added')[0]

intent:	Find all numbers and do

In [21]:
# NOTE(review): the decoding cell already ends with this exact call on the
# same `code_list`; this cell repeats it (presumably to re-write the answers
# without decoding again) — confirm it is still needed.
write_answer_json(code_list)