In [42]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [91]:
from preprocessing.processor import Code_Intent_Pairs, sub_slotmap
from seq2seq2.model import Seq2Seq
from seq2seq2.data import get_train_loader, get_test_loader

### Define Hyperparameters

In [5]:
hyperP = {
    ## training parameters
    'batch_size' : 32,
    'lr' : 1e-4,
    'teacher_force_rate' : 0.85,
    'max_epochs' : 20,
    
    ## encoder architecture
    'encoder_layers' : 2,
    'encoder_embed_size' : 128,
    'encoder_hidden_size' : 384,
    'encoder_dropout_rate' : 0.3,
    
    ## decoder architecture
    'decoder_layers' : 2,
    'decoder_embed_size' : 128,
    'decoder_hidden_size' : 384,
    'decoder_dropout_rate' : 0.3,
    
    ## attn architecture
    'attn_hidden_size' : 384,
    
    ## visualization
    'print_every': 5,
}

### Load Data

In [35]:
code_intent_pair = Code_Intent_Pairs()

In [36]:
path = 'vocab/'
code_intent_pair.load_dict(path)
special_symbols = code_intent_pair.get_special_symbols()
word_size = code_intent_pair.get_word_size()
code_size = code_intent_pair.get_code_size()

In [37]:
train_path = 'processed_corpus/train.json'
train_entries = code_intent_pair.load_entries(train_path)
code_intent_pair.pad()

In [39]:
trainloader = get_train_loader(train_entries, special_symbols, hyperP)

In [82]:
test_path = 'processed_corpus/test.json'
test_entries = code_intent_pair.load_entries(test_path)

In [83]:
testloader = get_test_loader(test_entries)

### Define Model

In [48]:
model = Seq2Seq(word_size, code_size, hyperP)

### Training

In [49]:
import torch
import torch.optim as optim
optimizer = optim.Adam(model.parameters(), lr=hyperP['lr'])
loss_f = torch.nn.CrossEntropyLoss()

In [50]:
def train(model, trainloader, optimizer, loss_f, hyperP):
    model.train()
    loss_sum = 0
    total_correct = 0
    size = 0
    print_every = hyperP['print_every']
    
    for i, (inp_seq, original_out_seq, padded_out_seq, out_lens) in enumerate(trainloader):
        logits = model(inp_seq, padded_out_seq, out_lens)
        loss = loss_f(logits, original_out_seq)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # show stats
        loss_sum += loss.item()
        _, predictions = torch.max(logits, dim=1)
        total_correct += (predictions == original_out_seq).sum()
        size += len(original_out_seq)

        if (i+1) % print_every == 0:
            print('Train: loss:{}\tacc:{}'.format(loss_sum/print_every, float(total_correct)/size), end='\r')
            loss_sum = 0
            total_correct = 0
            size = 0

In [51]:
def valid(model, validloader, loss_f, hyperP):
    model.eval()
    loss_sum = 0
    total_correct = 0
    size = 0
    print_every = hyperP['print_every']
    
    with torch.no_grad():
        for i, (inp_seq, original_out_seq, padded_out_seq, out_lens) in enumerate(validloader):
            logits = model(inp_seq, padded_out_seq, out_lens)
            loss = loss_f(logits, original_out_seq)

            # show stats
            loss_sum += loss.item()
            _, predictions = torch.max(logits, dim=1)
            total_correct += (predictions == original_out_seq).sum()
            size += len(original_out_seq)

    print('Valid: loss:{}\tacc:{}'.format(loss_sum/len(validloader), float(total_correct)/size), end='\r')
    return float(total_correct)/size

In [32]:
best_acc = 0.0
for e in range(10):
    train(model, trainloader, optimizer, loss_f, hyperP)
    acc = valid(model, trainloader, loss_f, hyperP)
    if acc > best_acc:
        best_acc = acc
        model.save()
        print()
        print('model saved')

Valid: loss:1.8927247524261475	acc:0.51947593110554984
model saved
Valid: loss:1.8615587202707926	acc:0.52462829383188585
model saved
Valid: loss:1.8052612018585206	acc:0.53422640953923163
model saved
Valid: loss:1.7580639139811198	acc:0.5432356837921389
model saved
Valid: loss:1.713199872970581	acc:0.55489474459001912
model saved
Valid: loss:1.6467161607742309	acc:0.56620050051523634
model saved
Valid: loss:1.6186739095052083	acc:0.5725599882231709
model saved
Valid: loss:1.5654699357350668	acc:0.5809509789489182
model saved
Valid: loss:1.5506923739115397	acc:0.5880170764021787
model saved


In [52]:
model.load()

### Decoding

In [93]:
sos = special_symbols['code_sos']
eos = special_symbols['code_eos']
for src_seq, slot_map, true_code_idx, intent in testloader:
    seq = model.greedy_decode(src_seq, sos, eos)
    gen_code_tokens = code_intent_pair.idx2code(seq)
    gen_code = sub_slotmap(gen_code_tokens, slot_map)
    true_code_tokens = code_intent_pair.idx2code(true_code_idx, intent)
    true_code = sub_slotmap(true_code_tokens, slot_map)
    print('intent:'+' '.join(intent))
    print('predicted:\t'+gen_code+'\nground_truth:\t'+true_code)
    print()

intent:send a signal str_0 to the current process
predicted:	os . system ( 'signal.SIGUSR1' )
ground_truth:	os . <unk> ( os . <unk> ( ) , signal . <unk> )

intent:decode a hex string str_0 to utf-8 .
predicted:	"""str_0""" . decode ( '4a4b4c' )
ground_truth:	<unk> . <unk> ( '4a4b4c' ) . decode ( 'utf-8' )

intent:check if all elements in list var_0 are identical
predicted:	[ x for i in myList if myList in myList ]
ground_truth:	all ( x == myList [ 0 ] for x in myList )

intent:format number of spaces between strings var_0 , str_0 and str_1 to be str_2
predicted:	<unk> . <unk> ( ':' , 'Very Good' )
ground_truth:	print ( <unk> % ( 20 , 'Python' , 20 , 'Very Good' ) )

intent:how to convert a string from cp-1251 to utf-8 ?
predicted:	<unk> . format ( <unk> )
ground_truth:	d . decode ( <unk> ) . encode ( 'utf8' )

intent:get rid of none values in dictionary var_0
predicted:	sum ( iter ( kwargs . values ( ) ) )
ground_truth:	<unk> = { k : v for k , v in list ( kwargs . items ( ) ) if v is n

intent:make a comma-separated string from a list var_0
predicted:	[ i for i in myList ]
ground_truth:	myList = ',' . join ( map ( str , myList ) )

intent:reverse the list that contains 1 to 10
predicted:	[ ( [ i ] for x in range ( 3 ) ]
ground_truth:	list ( reversed ( list ( range ( 10 ) ) ) )

intent:remove substring str_0 from a string str_1
predicted:	re . sub ( 'bag,' , 'lamp, bag, mirror' )
ground_truth:	print ( 'lamp, bag, mirror' . replace ( 'bag,' , '' ) )

intent:reverse the order of words , delimited by str_0 , in string var_0
predicted:	s . replace ( '.' , '.' )
ground_truth:	"""str_0""" . join ( s . split ( '.' ) [ : : - 1 ] )

intent:convert epoch time represented as milliseconds var_0 to string using format str_0
predicted:	s . datetime . <unk> ( <unk> )
ground_truth:	datetime . datetime . <unk> ( s ) . strftime ( '%Y-%m-%d %H:%M:%S.%f' )

intent:parse milliseconds epoch time str_0 to format str_1
predicted:	datetime . datetime . now ( '1236472051807' )
ground_truth:	tim

intent:remove all non-alphanumeric characters except space from a string var_0 and lower it
predicted:	re . sub ( <unk> , text , <unk> )
ground_truth:	re . sub ( <unk> , '' , text ) . lower ( ) . strip ( )

intent:subscript text str_0 with str_1 as subscripted in matplotlib labels for arrays str_2 and str_3 .
predicted:	df . <unk> ( [ 'H20' , 'x' ] , 'x' = 1 )
ground_truth:	plt . plot ( x , y , <unk> = <unk> )

intent:subscript text str_0 with str_1 as subscripted in matplotlib labels for arrays str_2 and str_3 .
predicted:	<unk> = re . <unk> ( 'H20' , <unk> = <unk> )
ground_truth:	plt . plot ( x , y , <unk> = <unk> )

intent:loop over a list var_0 if sublists length equals 3
predicted:	mylist . <unk> ( <unk> )
ground_truth:	[ x for x in mylist if len ( x ) == 3 ]

intent:initialize a list var_0 of 100 objects object ( )
predicted:	lst = [ i ]
ground_truth:	lst = [ <unk> ( ) for _ in range ( 100 ) ]

intent:create list var_0 containing 100 instances of object var_1
predicted:	Object = 

intent:convert index at level 0 into a column in dataframe var_0
predicted:	df . groupby ( df . columns . columns ( df ) )
ground_truth:	df . reset_index ( level = 0 , inplace = True )

intent:add indexes in a data frame var_0 to a column var_1
predicted:	df . <unk> ( index1 , index1 )
ground_truth:	df [ 'index1' ] = df . index

intent:convert pandas index in a dataframe to columns
predicted:	df . groupby ( [ str_0 , <unk> ] )
ground_truth:	df . reset_index ( level = [ <unk> , <unk> ] )

intent:get reverse of list items from list str_0 using extended slicing
predicted:	[ x for x in range ( ) if x in range ( 'b' ) ]
ground_truth:	[ x [ : : - 1 ] for x in b ]

intent:join each element in array var_0 with element at the same index in array var_1 as a tuple
predicted:	a [ np ( a , b [ 0 ] , b [ 1 ] ) ]
ground_truth:	np . array ( [ zip ( x , y ) for x , y in zip ( a , b ) ] )

intent:zip two 2-d arrays var_0 and var_1
predicted:	b = np . <unk> ( a , b )
ground_truth:	np . array ( zip ( a . 

intent:removing duplicates in list var_0
predicted:	t = [ i for i in t ]
ground_truth:	list ( set ( t ) )

intent:removing duplicates in list var_0
predicted:	source_list = [ ( source_list )
ground_truth:	list ( set ( source_list ) )

intent:removing duplicates in list var_0
predicted:	abracadabra = [ ( abracadabra )
ground_truth:	list ( OrderedDict . <unk> ( 'abracadabra' ) )

intent:convert array var_0 into a list
predicted:	a = [ ( [ i for i in range ( a ) ]
ground_truth:	numpy . array ( a ) . reshape ( - 1 ) . tolist ( )

intent:convert the first row of numpy matrix var_0 to a list
predicted:	a [ 0 ]
ground_truth:	numpy . array ( a ) [ 0 ] . tolist ( )

intent:in var_0 , get the content of the sibling of the var_1 tag with text content str_0
predicted:	soup . <unk> ( )
ground_truth:	print ( soup . find ( text = 'Address:' ) . <unk> ( 'td' ) . <unk> [ 0 ] )

intent:convert elements of each tuple in list var_0 into a string separated by character str_0
predicted:	[ ( [ '@' ] for item

intent:sort a list of dictionary var_0 by the key var_1
predicted:	sorted ( mylist , key = lambda x : title . itemgetter ( x ) ]
ground_truth:	mylist . sort ( key = lambda x : x [ 'title' ] )

intent:sort a list var_0 of dicts by dict value str_0
predicted:	sorted ( l , key = lambda x : x [ 1 ] )
ground_truth:	l . sort ( key = lambda x : x [ 'title' ] )

intent:sort a list of dictionaries by the value of keys str_0 , str_1 , str_2 in ascending order .
predicted:	sorted ( var_0 , key = lambda x : x [ 'title' ] )
ground_truth:	l . sort ( key = lambda x : ( x [ 'title' ] , x [ 'title_url' ] , x [ 'id' ] ) )

intent:find 10 largest differences between each respective elements of list var_0 and list var_1
predicted:	[ ( x , j ) for x in zip ( l1 , l2 ) ]
ground_truth:	<unk> . <unk> ( 10 , range ( len ( l1 ) ) , key = lambda i : abs ( l1 [ i ] - l2 [ i ] ) )

intent:beautifulsoup find all str_0 elements in html string var_0 with class of str_1
predicted:	soup . replace ( 'span' , 'starGryB s

intent:count most frequent 100 words in column str_0 of dataframe var_0
predicted:	df . <unk> ( 'text' , <unk> = True )
ground_truth:	Counter ( ' ' . join ( df [ 'text' ] ) . split ( ) ) . <unk> ( 100 )

intent:python split a string using regex
predicted:	re . sub ( <unk> , <unk> )
ground_truth:	re . findall ( <unk> , text )

intent:generate all 2-element subsets of tuple str_0
predicted:	<unk> . <unk> ( '(1, 2, 3)' )
ground_truth:	list ( itertools . combinations ( ( (1, 2, 3) ) , 2 ) )

intent:get a value of datetime.today ( ) in the utc time zone
predicted:	<unk> . <unk> ( )
ground_truth:	datetime . now ( <unk> . utc )

intent:get a new list var_0by removing empty list from a list of lists var_1
predicted:	[ ( [ [ ] for x in zip ( list2 , list1 ) ]
ground_truth:	list2 = [ x for x in list1 if x != [ ] ]

intent:create var_0 to contain the lists from list var_1 excluding the empty lists from var_1
predicted:	list1 = list1 . <unk> ( list1 )
ground_truth:	list2 = [ x for x in list1 if x 

intent:get number of keys in dictionary var_0
predicted:	sum ( yourdict . values ( ) ) for k in yourdict ]
ground_truth:	len ( list ( yourdict . keys ( ) ) )

intent:count the number of keys in dictionary var_0
predicted:	yourdictfile . join ( yourdictfile . items ( ) )
ground_truth:	len ( set ( open ( yourdictfile ) . read ( ) . split ( ) ) )

intent:pandas dataframe get first row of each group by str_0
predicted:	df . groupby ( [ 'id' ] . <unk> ( ) . apply ( ) . <unk> ( ) )
ground_truth:	df . groupby ( 'id' ) . first ( )

intent:split a list in first column into multiple columns keeping other columns as well in pandas data frame
predicted:	df . groupby ( [ df , <unk> ] , axis = 1 , axis = 1 )
ground_truth:	pd . concat ( [ df [ 0 ] . apply ( pd . Series ) , df [ 1 ] ] , axis = 1 )

intent:extract attributes str_0 from string var_0
predicted:	data . find ( 'src="js/([^"]*\bjquery\b[^"]*)"' )
ground_truth:	re . findall ( 'src="js/([^"]*\bjquery\b[^"]*)"' , data )

intent:sum integers co

intent:create list var_0 containing two empty lists
predicted:	y = [ i ]
ground_truth:	y = [ [ ] for n in range ( 2 ) ]

intent:read a file str_0 into a list var_0
predicted:	data = os . path . <unk> ( 'C:/name/MyDocuments/numbers' )
ground_truth:	data = [ line . strip ( ) for line in open ( 'C:/name/MyDocuments/numbers' , 'r' ) ]

intent:delete all occurrences of character str_0 in string str_1
predicted:	re . sub ( 'i' , 'it is icy' )
ground_truth:	"""""" . join ( [ <unk> for <unk> in 'it is icy' if <unk> != 'i' ] )

intent:delete all instances of a character str_0 in a string str_1
predicted:	re . findall ( 'i' , 'i' )
ground_truth:	re . sub ( 'i' , '' , 'it is icy' )

intent:delete all characters str_0 in string str_1
predicted:	re . sub ( 'i' , 'it is icy' )
ground_truth:	"""str_1""" . replace ( 'i' , '' )

intent:how to delete all instances of a character in a string in python ?
predicted:	re . sub ( <unk> , <unk> )
ground_truth:	"""""" . join ( [ <unk> for <unk> in <unk> if <unk

intent:remove all strings from a list a strings var_0 where the values starts with str_0 or str_1
predicted:	sents . pop ( )
ground_truth:	[ x for x in sents if not x . startswith ( '@$	' ) and not x . startswith ( '#' ) ]

intent:django filter by hour
predicted:	<unk> . <unk> ( <unk> , <unk> )
ground_truth:	<unk> . objects . filter ( <unk> = <unk> )

intent:sort a list of dictionary var_0 first by key var_1 and then by var_2
predicted:	list = sorted ( list , key = lambda x : x [ points ] )
ground_truth:	list . sort ( key = lambda item : ( item [ 'points' ] , item [ 'time' ] ) )

intent:convert datetime object str_0 to seconds
predicted:	os . system ( '(1970, 1, 1)' )
ground_truth:	( t - datetime . datetime ( (1970, 1, 1) ) ) . <unk> ( )

intent:insert var_0 before the file extension in str_0 or replace var_1 with var_2 if it precedes the extension .
predicted:	_a . <unk> ( 'long.file.name.jpg' , _a = 'long.file.name.jpg' )
ground_truth:	re . sub ( <unk> , <unk> , 'long.file.name.jpg' 