In [7]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
from preprocessing.processor import Code_Intent_Pairs, sub_slotmap
from seq2seq2.model import Seq2Seq
from seq2seq2.data import get_train_loader, get_test_loader, write_answer_json

### Define Hyperparameters

In [9]:
hyperP = {
    ## training parameters
    'batch_size' : 32,
    'lr' : 5e-4,
    'teacher_force_rate' : 0.90,
    'max_epochs' : 20,
    'lr_keep_rate' : 0.90,  # set to 1.0 to not decrease lr overtime
    
    ## encoder architecture
    'encoder_layers' : 2,
    'encoder_embed_size' : 128,
    'encoder_hidden_size' : 384,
    'encoder_dropout_rate' : 0.3,
    
    ## decoder architecture
    'decoder_layers' : 2,
    'decoder_embed_size' : 128,
    'decoder_hidden_size' : 384,
    'decoder_dropout_rate' : 0.3,
    
    ## attn architecture
    'attn_hidden_size' : 384,
    
    ## visualization
    'print_every': 5,
}

### Load Data

In [10]:
code_intent_pair = Code_Intent_Pairs()

In [11]:
path = 'vocab/'
code_intent_pair.load_dict(path)
special_symbols = code_intent_pair.get_special_symbols()
word_size = code_intent_pair.get_word_size()
code_size = code_intent_pair.get_code_size()

In [12]:
train_path = 'processed_corpus/train.json'
train_entries = code_intent_pair.load_entries(train_path)
code_intent_pair.pad()

In [13]:
trainloader = get_train_loader(train_entries, special_symbols, hyperP)

In [15]:
valid_path = 'processed_corpus/valid.json'
valid_entries = code_intent_pair.load_entries(valid_path)
code_intent_pair.pad()

In [16]:
validloader = get_train_loader(valid_entries, special_symbols, hyperP)

In [17]:
test_path = 'processed_corpus/test.json'
test_entries = code_intent_pair.load_entries(test_path)

In [18]:
testloader = get_test_loader(test_entries)

### Define Model

In [19]:
model = Seq2Seq(word_size, code_size, hyperP)

### Training

In [20]:
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
optimizer = optim.Adam(model.parameters(), lr=hyperP['lr'])
loss_f = torch.nn.CrossEntropyLoss()

In [21]:
lr_keep_rate = hyperP['lr_keep_rate']
if lr_keep_rate != 1.0:
    lr_reduce_f = lambda epoch: lr_keep_rate ** epoch
    scheduler = LambdaLR(optimizer, lr_lambda=lr_reduce_f)

In [29]:
def train(model, trainloader, optimizer, loss_f, hyperP):
    model.train()
    loss_sum = 0
    total_correct = 0
    size = 0
    print_every = hyperP['print_every']
    
    for i, (inp_seq, original_out_seq, padded_out_seq, out_lens) in enumerate(trainloader):
        logits = model(inp_seq, padded_out_seq, out_lens)
        loss = loss_f(logits, original_out_seq)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # show stats
        loss_sum += loss.item()
        _, predictions = torch.max(logits, dim=1)
        total_correct += (predictions == original_out_seq).sum()
        size += len(original_out_seq)

        if (i+1) % print_every == 0:
            print('Train: loss:{}\tacc:{}'.format(loss_sum/print_every, float(total_correct)/size), end='\r')
            loss_sum = 0
            total_correct = 0
            size = 0
    print()

In [30]:
def valid(model, validloader, loss_f, hyperP):
    model.eval()
    loss_sum = 0
    total_correct = 0
    size = 0
    print_every = hyperP['print_every']
    
    with torch.no_grad():
        for i, (inp_seq, original_out_seq, padded_out_seq, out_lens) in enumerate(validloader):
            logits = model(inp_seq, padded_out_seq, out_lens)
            loss = loss_f(logits, original_out_seq)

            # show stats
            loss_sum += loss.item()
            _, predictions = torch.max(logits, dim=1)
            total_correct += (predictions == original_out_seq).sum()
            size += len(original_out_seq)

    print('Valid: loss:{}\tacc:{}'.format(loss_sum/len(validloader), float(total_correct)/size))
    return float(total_correct)/size

In [31]:
best_acc = 0.0
for e in range(20):
    train(model, trainloader, optimizer, loss_f, hyperP)
    acc = valid(model, validloader, loss_f, hyperP)
    if acc > best_acc:
        best_acc = acc
        model.save()
        print()
        print('model saved')
    if lr_keep_rate != 1.0:
        scheduler.step()

Train: loss:2.784852647781372	acc:0.398481012658227835
Valid: loss:2.617232233285904	acc:0.3993631418336208

model saved
Train: loss:2.349023962020874	acc:0.455696202531645564
Valid: loss:2.325969561934471	acc:0.4499137587899695

model saved
Train: loss:2.2136358261108398	acc:0.47240506329113924
Valid: loss:2.087650902569294	acc:0.4905134668966432

model saved
Train: loss:2.0232731103897095	acc:0.50936708860759497
Valid: loss:2.0761683881282806	acc:0.48945203661934455
Train: loss:1.9176480770111084	acc:0.52151898734177223
Valid: loss:1.9412105903029442	acc:0.5296537083720313

model saved
Train: loss:1.753191065788269	acc:0.53873417721518994
Valid: loss:1.8944778591394424	acc:0.5341647870505506

model saved
Train: loss:1.6441272735595702	acc:0.5605063291139241
Valid: loss:1.8864103555679321	acc:0.5362876476051479

model saved
Train: loss:1.4747915506362914	acc:0.6010126582278481
Valid: loss:1.9147342815995216	acc:0.5422581929149529

model saved
Train: loss:1.4132025003433228	acc:0.60556

In [32]:
model.load()

### Decoding

In [38]:
sos = special_symbols['code_sos']
eos = special_symbols['code_eos']
unk = special_symbols['code_unk']
code_list = []
for i, (src_seq, slot_map, code, intent) in enumerate(testloader):
    model.eval()
    seq = model.greedy_decode(src_seq, sos, eos, unk)
    gen_code_tokens = code_intent_pair.idx2code(seq)
    gen_code = sub_slotmap(gen_code_tokens, slot_map)
    code_list.append(gen_code)
    print('intent:\t'+intent)
    print('predicted:\t'+gen_code+'\nground_truth:\t'+code)
    print()
    
    # if i == 50:
    #     break
write_answer_json(code_list)

intent:	send a signal `signal.SIGUSR1` to the current process
predicted:	os . system ( 'signal.SIGUSR1' )
ground_truth:	os.kill(os.getpid(), signal.SIGUSR1)

intent:	decode a hex string '4a4b4c' to UTF-8.
predicted:	. encode ( '4a4b4c' )
ground_truth:	bytes.fromhex('4a4b4c').decode('utf-8')

intent:	check if all elements in list `myList` are identical
predicted:	all ( isinstance ( myList ) == 1 for i in myList )
ground_truth:	all(x == myList[0] for x in myList)

intent:	format number of spaces between strings `Python`, `:` and `Very Good` to be `20`
predicted:	print ( . format ( ':' , 'Very Good' ) )
ground_truth:	print('%*s : %*s' % (20, 'Python', 20, 'Very Good'))

intent:	How to convert a string from CP-1251 to UTF-8?
predicted:	. format ( )
ground_truth:	d.decode('cp1251').encode('utf8')

intent:	get rid of None values in dictionary `kwargs`
predicted:	sum ( x [ 0 ] for x in kwargs )
ground_truth:	res = {k: v for k, v in list(kwargs.items()) if v is not None}

intent:	get rid of No

intent:	parse milliseconds epoch time '1236472051807' to format '%Y-%m-%d %H:%M:%S'
predicted:	. ( '1236472051807' , '%Y-%m-%d %H:%M:%S' )
ground_truth:	time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(1236472051807 / 1000.0))

intent:	get the date 7 days before the current date
predicted:	datetime . datetime . now ( )
ground_truth:	(datetime.datetime.now() - datetime.timedelta(days=7)).date()

intent:	sum elements at index `column` of each list in list `data`
predicted:	[ ( column , data ) for i in data ]
ground_truth:	print(sum(row[column] for row in data))

intent:	sum columns of a list `array`
predicted:	array [ : : - 1 ]
ground_truth:	[sum(row[i] for row in array) for i in range(len(array[0]))]

intent:	encode binary string 'your string' to base64 code
predicted:	"""str_0""" . decode ( 'your string' )
ground_truth:	base64.b64encode(bytes('your string', 'utf-8'))

intent:	combine list of dictionaries `dicts` with the same keys in each list to a single dictionary
predicted:	[ ( d [ 's

intent:	subscript text 'H20' with '2' as subscripted in matplotlib labels for arrays 'x' and 'y'.
predicted:	np . zeros ( [ H20 , 'x' , 'x' ] , ( 2 , 9 ) )
ground_truth:	plt.plot(x, y, label='H\u2082O')

intent:	subscript text 'H20' with '2' as subscripted in matplotlib labels for arrays 'x' and 'y'.
predicted:	np . zeros ( [ H20 , 'x' , 'x' ] , ( 2 , 9 ) )
ground_truth:	plt.plot(x, y, label='$H_2O$')

intent:	loop over a list `mylist` if sublists length equals 3
predicted:	mylist = [ i for i in range ( 10 ) ]
ground_truth:	[x for x in mylist if len(x) == 3]

intent:	initialize a list `lst` of 100 objects Object()
predicted:	lst . ( )
ground_truth:	lst = [Object() for _ in range(100)]

intent:	create list `lst` containing 100 instances of object `Object`
predicted:	lst = Object . append ( Object )
ground_truth:	lst = [Object() for i in range(100)]

intent:	get the content of child tag with`href` attribute whose parent has css `someclass`
predicted:	. ( )
ground_truth:	self.driver.find_

intent:	do a scatter plot with empty circles
predicted:	. ( )
ground_truth:	plt.plot(np.random.randn(100), np.random.randn(100), 'o', mfc='none')

intent:	remove a div with a id `main-content` using beautifulsoup
predicted:	. ( )
ground_truth:	soup.find('div', id='main-content').decompose()

intent:	filter rows containing key word `ball` in column `ids`
predicted:	ids [ ball ] . ids [ 'ball' ] . ids ]
ground_truth:	df[df['ids'].str.contains('ball')]

intent:	convert index at level 0 into a column in dataframe `df`
predicted:	df . groupby ( level = 1 ) . sum ( )
ground_truth:	df.reset_index(level=0, inplace=True)

intent:	Add indexes in a data frame `df` to a column `index1`
predicted:	index1 [ 'df' ] . index1 . ( )
ground_truth:	df['index1'] = df.index

intent:	convert pandas index in a dataframe to columns
predicted:	df . groupby ( df . columns ) . apply ( )
ground_truth:	df.reset_index(level=['tick', 'obs'])

intent:	Get reverse of list items from list 'b' using extended slicing
pred

intent:	prepend the line '#test firstline\n' to the contents of file 'infile' and save as the file 'outfile'
predicted:	with open ( 'infile' , 'infile' ) as f : 
      print ( ) 
 . ( 'infile' ) 
  = . ( 'infile' ) 
  = . ( 'infile' ) 
  = . ( 'infile' )
ground_truth:	open('outfile', 'w').write('#test firstline\n' + open('infile').read())

intent:	sort a list `l` by length of value in tuple
predicted:	l . sort ( key = lambda x : x [ 1 ] )
ground_truth:	l.sort(key=lambda t: len(t[1]), reverse=True)

intent:	split string `s` by words that ends with 'd'
predicted:	re . split ( , s )
ground_truth:	re.findall('\\b(\\w+)d\\b', s)

intent:	return `True` if string `foobarrrr` contains regex `ba[rzd]`
predicted:	True . replace ( 'ba[rzd]' , = True )
ground_truth:	bool(re.search('ba[rzd]', 'foobarrrr'))

intent:	Removing duplicates in list `t`
predicted:	[ ( i for i in t if i in t ]
ground_truth:	list(set(t))

intent:	Removing duplicates in list `source_list`
predicted:	[ ( i for i in source_lis

intent:	Encode a latin character in string `Sopet\xc3\xb3n` properly
predicted:	re . sub ( , , )
ground_truth:	'Sopet\xc3\xb3n'.encode('latin-1').decode('utf-8')

intent:	resized image `image` to width, height of `(x, y)` with filter of `ANTIALIAS`
predicted:	image . ( , = )
ground_truth:	image = image.resize((x, y), Image.ANTIALIAS)

intent:	regex, find "n"s only in the middle of string `s`
predicted:	re . sub ( , s , s )
ground_truth:	re.findall('n(?<=[^n]n)n+(?=[^n])(?i)', s)

intent:	display the float `1/3*100` as a percentage
predicted:	. ( '1/3*100' )
ground_truth:	print('{0:.0f}%'.format(1.0 / 3 * 100))

intent:	sort a list of dictionary `mylist` by the key `title`
predicted:	mylist . sort ( key = lambda x : title . index ( x [ 1 ] ) )
ground_truth:	mylist.sort(key=lambda x: x['title'])

intent:	sort a list `l` of dicts by dict value 'title'
predicted:	l . sort ( key = lambda x : x [ 1 ] )
ground_truth:	l.sort(key=lambda x: x['title'])

intent:	sort a list of dictionaries by the

intent:	Get a new list `list2`by removing empty list from a list of lists `list1`
predicted:	[ x for x in zip ( list2 , list1 ) ]
ground_truth:	list2 = [x for x in list1 if x != []]

intent:	Create `list2` to contain the lists from list `list1` excluding the empty lists from `list1`
predicted:	list1 = [ list1 for i in list1 ]
ground_truth:	list2 = [x for x in list1 if x]

intent:	Django response with JSON `data`
predicted:	data . ( )
ground_truth:	return HttpResponse(data, mimetype='application/json')

intent:	get all text that is not enclosed within square brackets in string `example_str`
predicted:	re . sub ( , '' , example_str )
ground_truth:	re.findall('(.*?)\\[.*?\\]', example_str)

intent:	Use a regex to get all text in a string `example_str` that is not surrounded by square brackets
predicted:	re . findall ( , example_str )
ground_truth:	re.findall('(.*?)(?:\\[.*?\\]|$)', example_str)

intent:	get whatever is between parentheses as a single match, and any char outside as an indi

intent:	reverse a priority queue `q` in python without using classes
predicted:	. ( , = )
ground_truth:	q.put((-n, n))

intent:	make a barplot of data in column `group` of dataframe `df` colour-coded according to list `color`
predicted:	df . groupby ( df ) . size ( lambda x : df . index ( color ) )
ground_truth:	df['group'].plot(kind='bar', color=['r', 'g', 'b', 'r', 'g', 'b', 'r'])

intent:	find all matches of regex pattern '([a-fA-F\\d]{32})' in string `data`
predicted:	re . sub ( , '' , data )
ground_truth:	re.findall('([a-fA-F\\d]{32})', data)

intent:	Get the length of list `my_list`
predicted:	my_list = [ ]
ground_truth:	len(my_list)

intent:	Getting the length of array `l`
predicted:	l [ : - 1 ]
ground_truth:	len(l)

intent:	Getting the length of array `s`
predicted:	s [ : - 1 ]
ground_truth:	len(s)

intent:	Getting the length of `my_tuple`
predicted:	print ( my_tuple )
ground_truth:	len(my_tuple)

intent:	Getting the length of `my_string`
predicted:	print ( my_string )
ground_t

intent:	get elements from list `myList`, that have a field `n` value 30
predicted:	myList = [ x for x in n if x not not ]
ground_truth:	[x for x in myList if x.n == 30]

intent:	converting list of strings `intstringlist` to list of integer `nums`
predicted:	[ nums for item in intstringlist if item not nums ]
ground_truth:	nums = [int(x) for x in intstringlist]

intent:	convert list of string numbers into list of integers
predicted:	[ int ( x ) for x in zip ( , 2 ) ]
ground_truth:	map(int, eval(input('Enter the unfriendly numbers: ')))

intent:	print "." without newline
predicted:	print ( . format ( '.' ) )
ground_truth:	sys.stdout.write('.')

intent:	round off the float that is the product of `2.52 * 100` and convert it to an int
predicted:	print ( int ( int ( x . ( ) ) )
ground_truth:	int(round(2.51 * 100))

intent:	Find all files in directory "/mydir" with extension ".txt"
predicted:	= open ( , 'r' )
ground_truth:	os.chdir('/mydir')
for file in glob.glob('*.txt'):
    pass

intent:	F

intent:	convert int values in list `numlist` to float
predicted:	= int ( numlist )
ground_truth:	numlist = [float(x) for x in numlist]

intent:	write dataframe `df`, excluding index, to a csv file
predicted:	df . to_csv ( 'str_0' , = False )
ground_truth:	df.to_csv(filename, index=False)

intent:	convert a urllib unquoted string `unescaped` to a json data `json_data`
predicted:	= . format ( json_data )
ground_truth:	json_data = json.loads(unescaped)

intent:	Create a list containing all ascii characters as its elements
predicted:	[ x for x in for x in range ( 10 ) ]
ground_truth:	[chr(i) for i in range(127)]

intent:	write `newFileBytes` to a binary file `newFile`
predicted:	= newFile . ( newFileBytes )
ground_truth:	newFile.write(struct.pack('5B', *newFileBytes))

intent:	python regex - check for a capital letter with a following lowercase in string `string`
predicted:	re . sub ( , , string )
ground_truth:	re.sub('^[A-Z0-9]*(?![a-z])', '', string)

intent:	get the last key of dictiona

In [34]:
write_answer_json(code_list)