In [4]:
from __future__ import print_function

import os
import sys
import random
import traceback
import pickle
from keras.optimizers import RMSprop, Adam
from scipy.stats import rankdata
import math
from math import log
from models import *
import argparse
from datashape.coretypes import real
random.seed(42)
import threading
import tables  
import configs
import codecs
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

from utils import cos_np, normalize,cos_np_for_normalized
from configs import get_config
from models import JointEmbeddingModel
from parsePython import CodeVisitor

Using TensorFlow backend.


In [5]:


def nan_and_inf(a):
    """Zero out every NaN and +/-inf entry of ``a`` in place and return it.

    The array passed in is modified directly; the returned object is that
    same array, not a copy.
    """
    # Both masks are computed before anything is overwritten.
    for bad_mask in (np.isnan(a), np.isinf(a)):
        a[bad_mask] = 0
    return a


class CodeSearcher:
    def __init__(self, conf=None):
        self.cv = CodeVisitor()
        self.py_path = "./data/pydata/"
        self.transfer_path = "./data/transfer"

        self.py_codebase = self.load_pickle(self.py_path + "python_qid_to_code.pickle")
        #self.py_desc = self.load_pickle(self.py_path + "python_qid_to_title.pickle")
        self.py_use_codevec = self.py_path + "use_codevecs.h5"
        self.py_use_rawcode = self.py_path + "use_rawcode.pkl"
        self.py_use_token = self.py_path + "use_token.pkl"
        self.py_use_methname = self.py_path + "use_methname.pkl"
        self.py_use_apiseq = self.py_path + "use_apiseq.pkl"
        self._pycode_reprs=None

        self.transfer_Xs_new = self.transfer_path + "transfer_Xs_new.h5"
        self.transfer_Xt_new = self.transfer_path + "transfer_Xt_new.h5"

        self.conf = dict() if conf is None else conf
        self.path = self.conf.get('workdir', '../data/github/codesearch/')
        self.train_params = conf.get('training_params', dict())
        self.data_params=conf.get('data_params',dict())
        self.model_params=conf.get('model_params',dict())
                
        self.vocab_methname = self.load_pickle(self.path+self.data_params['vocab_methname'])
        self.vocab_apiseq=self.load_pickle(self.path+self.data_params['vocab_apiseq'])
        self.vocab_tokens=self.load_pickle(self.path+self.data_params['vocab_tokens'])
        self.vocab_desc=self.load_pickle(self.path+self.data_params['vocab_desc'])
        
        self._eval_sets = None
        
        self._code_reprs=None
        self._code_base=None
        self._code_base_chunksize=2000000
        
    def load_pickle(self, filename):
        return pickle.load(open(filename, 'rb'))    

    ##### Data Set #####
    def load_training_data_chunk(self,offset,chunk_size):
        logger.debug('Loading a chunk of training data..')
        logger.debug('methname')
        chunk_methnames=self.load_hdf5(self.path+self.data_params['train_methname'],offset,chunk_size)
        logger.debug('apiseq')
        chunk_apiseqs=self.load_hdf5(self.path+self.data_params['train_apiseq'],offset,chunk_size)
        logger.debug('tokens')
        chunk_tokens=self.load_hdf5(self.path+self.data_params['train_tokens'],offset,chunk_size)
        logger.debug('desc')
        chunk_descs=self.load_hdf5(self.path+self.data_params['train_desc'],offset,chunk_size)   
        return chunk_methnames,chunk_apiseqs,chunk_tokens,chunk_descs    
    def load_valid_data_chunk(self,chunk_size):
        logger.debug('Loading a chunk of validation data..')
        logger.debug('methname')
        chunk_methnames=self.load_hdf5(self.path+self.data_params['valid_methname'],0,chunk_size)
        logger.debug('apiseq')
        chunk_apiseqs=self.load_hdf5(self.path+self.data_params['valid_apiseq'],0,chunk_size)
        logger.debug('tokens')
        chunk_tokens=self.load_hdf5(self.path+self.data_params['valid_tokens'],0,chunk_size)
        logger.debug('desc')
        chunk_descs=self.load_hdf5(self.path+self.data_params['valid_desc'],0,chunk_size)   
        return chunk_methnames,chunk_apiseqs,chunk_tokens,chunk_descs   
    def load_use_data(self):
        logger.info('Loading use data..')
        logger.info('methname')
        methnames=self.load_hdf5(self.path+self.data_params['use_methname'],0,-1)
        #print(methnames)
        logger.info('apiseq')
        apiseqs=self.load_hdf5(self.path+self.data_params['use_apiseq'],0,-1)
        logger.info('tokens')
        tokens=self.load_hdf5(self.path+self.data_params['use_tokens'],0,-1) 
        return methnames,apiseqs,tokens      
    def load_codebase(self):
        """load codebase
        codefile: h5 file that stores raw code
        """
        logger.info('Loading codebase (chunk size={})..'.format(self._code_base_chunksize))
        if self._code_base==None:
            codebase=[]
            codes=codecs.open(self.path+self.data_params['use_codebase']).readlines()
                #use codecs to read in case of encoding problem
            for i in range(0,len(codes),self._code_base_chunksize):
                codebase.append(codes[i:i+self._code_base_chunksize])            
            self._code_base=codebase
    
    ### Results Data ###
    def load_code_reprs(self):
        logger.debug('Loading code vectors (chunk size={})..'.format(self._code_base_chunksize))
        if self._code_reprs==None:            
            """reads vectors (2D numpy array) from a hdf5 file"""
            codereprs=[]
            h5f = tables.open_file(self.path+self.data_params['use_codevecs'])
            vecs= h5f.root.vecs
            for i in range(0,len(vecs),self._code_base_chunksize):
                codereprs.append(vecs[i:i+self._code_base_chunksize])
            h5f.close()
            self._code_reprs=codereprs
        return self._code_reprs
        
    def save_code_reprs(self,vecs,filename):
        """Write ``vecs`` to ``filename`` as a compressed HDF5 array named 'vecs'.

        vecs: anything ``np.array`` accepts; its dtype and shape define the
            stored array.
        filename: output path; the file is opened in 'w' mode, so an existing
            file is replaced.
        """
        npvecs=np.array(vecs)
        fvec = tables.open_file(filename, 'w')
        try:
            atom = tables.Atom.from_dtype(npvecs.dtype)
            filters = tables.Filters(complib='blosc', complevel=5)
            ds = fvec.create_carray(fvec.root, 'vecs', atom, npvecs.shape,filters=filters)
            ds[:] = npvecs
        finally:
            # close the handle even when array creation or the write fails
            fvec.close()

    
    def load_hdf5(self,vecfile,start_offset,chunk_size):
        """reads training sentences(list of int array) from a hdf5 file"""  
        table = tables.open_file(vecfile)
        data, index = (table.get_node('/phrases'),table.get_node('/indices'))
        data_len = index.shape[0]
        if chunk_size==-1:#if chunk_size is set to -1, then, load all data
            chunk_size=data_len
        start_offset = start_offset%data_len
        offset=start_offset
        logger.debug("{} entries".format(data_len))
        logger.debug("starting from offset {} to {}".format(start_offset,start_offset+chunk_size))
        sents = []
        while offset < start_offset+chunk_size:
            if offset>=data_len:   
                logger.warn('Warning: offset exceeds data length, starting from index 0..')             
                chunk_size=start_offset+chunk_size-data_len
                start_offset=0
                offset=0
            len, pos = index[offset]['length'], index[offset]['pos']
            offset += 1
            sents.append(data[pos:pos + len].astype('int32'))
        table.close()
        return sents 

    ##### Converting / reverting #####
    def convert(self, vocab, words):
        """convert words into indices"""        
        if type(words) == str:
            words = words.strip().lower().split(' ')
        return [vocab.get(w, 0) for w in words]
    def revert(self, vocab, indices):
        """revert indices into words"""
        ivocab = dict((v, k) for k, v in vocab.items())
        return [ivocab.get(i, 'UNK') for i in indices]

    ##### Padding #####
    def pad(self, data, len=None):
        from keras.preprocessing.sequence import pad_sequences
        return pad_sequences(data, maxlen=len, padding='post', truncating='post', value=0)
    
    ##### Model Loading / saving #####
    def save_model_epoch(self, model, epoch):
        if not os.path.exists(self.path+'models/'+self.model_params['model_name']+'/'):
            os.makedirs(self.path+'models/'+self.model_params['model_name']+'/')
        model.save("{}models/{}/epo{:d}_code.h5".format(self.path, self.model_params['model_name'], epoch),
                   "{}models/{}/epo{:d}_desc.h5".format(self.path, self.model_params['model_name'], epoch), overwrite=True)
        
    def load_model_epoch(self, model, epoch):
        assert os.path.exists(
            "{}models/{}/epo{:d}_code.h5".format(self.path, self.model_params['model_name'], epoch))\
            ,"Weights at epoch {:d} not found".format(epoch)
        assert os.path.exists(
            "{}models/{}/epo{:d}_desc.h5".format(self.path, self.model_params['model_name'], epoch))\
            ,"Weights at epoch {:d} not found".format(epoch)
        model.load("{}models/{}/epo{:d}_code.h5".format(self.path, self.model_params['model_name'], epoch),
                   "{}models/{}/epo{:d}_desc.h5".format(self.path, self.model_params['model_name'], epoch))

    ##### Training #####
    def train(self, model):
        if self.train_params['reload']>0:
            self.load_model_epoch(model, self.train_params['reload'])
        valid_every = self.train_params.get('valid_every', None)
        save_every = self.train_params.get('save_every', None)
        batch_size = self.train_params.get('batch_size', 128)
        nb_epoch = self.train_params.get('nb_epoch', 10)
        split = self.train_params.get('validation_split', 0)
        
        val_loss = {'loss': 1., 'epoch': 0}

        logger.info("To run " + str(nb_epoch) + " times.")
        for i in range(self.train_params['reload']+1, nb_epoch):
            print('Epoch %d :: \n' % i, end='')            
            logger.debug('loading data chunk..')
            chunk_methnames,chunk_apiseqs,chunk_tokens,chunk_descs =\
                    self.load_training_data_chunk(\
                                        (i-1)*self.train_params.get('chunk_size', 100000),\
                                        self.train_params.get('chunk_size', 100000))
            logger.debug('padding data..')
            chunk_padded_methnames = self.pad(chunk_methnames, self.data_params['methname_len'])
            chunk_padded_apiseqs = self.pad(chunk_apiseqs, self.data_params['apiseq_len'])
            chunk_padded_tokens = self.pad(chunk_tokens, self.data_params['tokens_len'])
            chunk_padded_good_descs = self.pad(chunk_descs,self.data_params['desc_len'])
            chunk_bad_descs=[desc for desc in chunk_descs]
            random.shuffle(chunk_bad_descs)
            chunk_padded_bad_descs = self.pad(chunk_bad_descs, self.data_params['desc_len'])

            hist = model.fit([chunk_padded_methnames,chunk_padded_apiseqs,chunk_padded_tokens, chunk_padded_good_descs, chunk_padded_bad_descs], epochs=1, batch_size=batch_size, validation_split=split)

            if hist.history['val_loss'][0] < val_loss['loss']:
                val_loss = {'loss': hist.history['val_loss'][0], 'epoch': i}
            print('Best: Loss = {}, Epoch = {}'.format(val_loss['loss'], val_loss['epoch']))

            if valid_every is not None and i % valid_every == 0:
                acc1,mrr = self.valid(model,1000,1)   
                #acc,mrr,map,ndcg=self.eval(model, 1000, 1)             
                        
            if save_every is not None and i % save_every == 0:
                self.save_model_epoch(model, i)
    
    def valid(self, model, poolsize,K):
        """
        quick validation in a code pool. 
        param:
            model    - object whose predict([methnames, apiseqs, tokens, descs],
                       batch_size=...) returns similarity scores
            poolsize - size of the code pool, if -1, load the whole test set.
                       Only used while self._eval_sets is still None; later
                       calls reuse the cached set.
            K        - number of leading candidate positions treated as "good"
                       (n_good below)
        return:
            (top1, mrr) as computed below, averaged over the pool
        """
        #load test dataset
        if self._eval_sets is None:
            #self._eval_sets = dict([(s, self.load(s)) for s in ['dev', 'test1', 'test2']])
            methnames,apiseqs,tokens,descs=self.load_valid_data_chunk(poolsize)
            self._eval_sets=dict()
            self._eval_sets['methnames']=methnames
            self._eval_sets['apiseqs']=apiseqs
            self._eval_sets['tokens']=tokens
            self._eval_sets['descs']=descs
           
        c_1, c_2 = 0, 0        
        data_len=len(self._eval_sets['descs'])
        for i in range(data_len):
            # Candidate descriptions: a shuffled copy of the whole pool whose
            # first slot is overwritten with the true description of sample i.
            bad_descs=[desc for desc in self._eval_sets['descs']]
            random.shuffle(bad_descs)
            descs=bad_descs 
            descs[0]=self._eval_sets['descs'][i]#good desc
            descs=self.pad(descs,self.data_params['desc_len'])
            # The code features of sample i are repeated once per candidate.
            methnames=self.pad([self._eval_sets['methnames'][i]]*data_len,self.data_params['methname_len'])
            apiseqs=self.pad([self._eval_sets['apiseqs'][i]]*data_len,self.data_params['apiseq_len'])
            tokens=self.pad([self._eval_sets['tokens'][i]]*data_len,self.data_params['tokens_len'])
            n_good = K
            
            sims = model.predict([methnames, apiseqs,tokens, descs], batch_size=data_len).flatten()
            # rank 1 = lowest similarity; ties share the highest rank of the group
            r = rankdata(sims, method='max')
            max_r = np.argmax(r)            # best-ranked candidate overall
            max_n = np.argmax(r[:n_good])   # best-ranked among the first n_good
            c_1 += 1 if max_r == max_n else 0
            # reciprocal of (rank distance to the overall best + 1)
            c_2 += 1 / float(r[max_r] - r[max_n] + 1)

        top1 = c_1 / float(data_len)
            #percentage of predicted most similar desc that is really the corresponding desc
        mrr = c_2 / float(data_len)
        logger.info('Top-1 Precision={}, MRR={}'.format(top1,mrr))
        
        return top1, mrr  
    

    ##### Evaluation in the develop set #####
    def eval(self, model, poolsize, K):
        """
        validate in a code pool. 
        param:
            poolsize - size of the code pool, if -1, load the whole test set
        """
        def ACC(real,predict):
            sum=0.0
            for val in real:
                try:
                    index=predict.index(val)
                except ValueError:
                    index=-1
                if index!=-1:
                    sum=sum+1  
            return sum/float(len(real))
        def MAP(real,predict):
            sum=0.0
            for id,val in enumerate(real):
                try:
                    index=predict.index(val)
                except ValueError:
                    index=-1
                if index!=-1:
                    sum=sum+(id+1)/float(index+1)
            return sum/float(len(real))
        def MRR(real,predict):
            sum=0.0
            for val in real:
                try:
                    index=predict.index(val)
                except ValueError:
                    index=-1
                if index!=-1:
                    sum=sum+1.0/float(index+1)
            return sum/float(len(real))
        def NDCG(real,predict):
            dcg=0.0
            idcg=IDCG(len(real))
            for i,predictItem in enumerate(predict):
                if predictItem in real:
                    itemRelevance=1
                    rank = i+1
                    dcg+=(math.pow(2,itemRelevance)-1.0)*(math.log(2)/math.log(rank+1))
            return dcg/float(idcg)
        def IDCG(n):
            idcg=0
            itemRelevance=1
            for i in range(n):
                idcg+=(math.pow(2,itemRelevance)-1.0)*(math.log(2)/math.log(i+2))
            return idcg
			
        #load valid dataset
        if self._eval_sets is None:
            methnames,apiseqs,tokens,descs=self.load_valid_data_chunk(poolsize)
            self._eval_sets=dict()
            self._eval_sets['methnames']=methnames
            self._eval_sets['apiseqs']=apiseqs
            self._eval_sets['tokens']=tokens
            self._eval_sets['descs']=descs
        acc,mrr,map,ndcg=0,0,0,0
        data_len=len(self._eval_sets['descs'])
        for i in range(data_len):
            print(i) 
            desc=self._eval_sets['descs'][i]#good desc
            descs=self.pad([desc]*data_len,self.data_params['desc_len'])
            methnames=self.pad(self._eval_sets['methnames'],self.data_params['methname_len'])
            apiseqs=self.pad(self._eval_sets['apiseqs'],self.data_params['apiseq_len'])
            tokens=self.pad(self._eval_sets['tokens'],self.data_params['tokens_len'])
            n_results = K          
            sims = model.predict([methnames, apiseqs,tokens, descs], batch_size=data_len).flatten()
            negsims=np.negative(sims)
            predict=np.argsort(negsims)#predict = np.argpartition(negsims, kth=n_results-1)
            predict = predict[:n_results]   
            predict = [int(k) for k in predict]
            real=[i]
            acc+=ACC(real,predict)
            mrr+=MRR(real,predict)
            map+=MAP(real,predict)
            ndcg+=NDCG(real,predict)                          
        acc = acc / float(data_len)
        mrr = mrr / float(data_len)
        map = map / float(data_len)
        ndcg= ndcg/ float(data_len)
        logger.info('ACC={}, MRR={}, MAP={}, nDCG={}'.format(acc,mrr,map,ndcg))
        
        return acc,mrr,map,ndcg
    
    
    ##### Compute Representation #####
    def repr_code(self,model):
        methnames,apiseqs,tokens=self.load_use_data()
        padded_methnames = self.pad(methnames, self.data_params['methname_len'])
        padded_apiseqs = self.pad(apiseqs, self.data_params['apiseq_len'])
        padded_tokens = self.pad(tokens, self.data_params['tokens_len'])
        
        vecs=model.repr_code([padded_methnames,padded_apiseqs,padded_tokens],batch_size=1000)
        vecs=vecs.astype('float32')
        self.save_code_reprs(vecs, self.path+self.data_params['use_codevecs'])
        return vecs

    def search(self,model,query,n_results=10):
        desc=[self.convert(self.vocab_desc,query)]#convert desc sentence to word indices
        padded_desc = self.pad(desc, self.data_params['desc_len'])
        desc_repr=model.repr_desc([padded_desc])
        desc_repr=desc_repr.astype('float32')
        codes=[]
        sims=[]
        threads=[]

        for i,code_reprs_chunk in enumerate(self._code_reprs):
            t = threading.Thread(target=self.search_thread, args = (codes,sims,desc_repr,code_reprs_chunk,i,n_results))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:#wait until all sub-threads finish
            t.join()
        return codes,sims
                 
    def search_thread(self,codes,sims,desc_repr,code_reprs,i,n_results):        
    #1. compute similarity
        chunk_sims=cos_np_for_normalized(normalize(desc_repr),code_reprs) 
        
    #2. choose top results
        negsims=np.negative(chunk_sims[0])
        maxinds = np.argpartition(negsims, kth=n_results-1)
        maxinds = maxinds[:n_results]
        chunk_codes=[self._code_base[i][k] for k in maxinds]
        chunk_sims=chunk_sims[0][maxinds]
        codes.extend(chunk_codes)
        sims.extend(chunk_sims)

    """
    ==================================================================
    Python Data
    """
    def load_python_codebase(self):
        """load codebase
        codefile: pickle that stores raw code
        """
        logger.info('Loading codebase (chunk size={})..'.format(self._code_base_chunksize))
        if self._code_base==None:
            codebase=[]
            codes=self.load_pickle(self.py_use_rawcode)
                #use codecs to read in case of encoding problem
            codebase.append(codes)
            self._code_base=codebase

    def load_use_python_data(self):
        logger.info('Loading use data..')
        logger.info('methname')
        methnames=self.load_pickle(self.py_use_methname)
        #print(methnames)
        logger.info('apiseq')
        apiseqs=self.load_pickle(self.py_use_apiseq)
        logger.info('tokens')
        tokens=self.load_pickle(self.py_use_token) 
        return methnames,apiseqs,tokens 
    
    def load_pycode_reprs(self):
        logger.debug('Loading code vectors (chunk size={})..'.format(self._code_base_chunksize))
        if self._pycode_reprs==None:            
            """reads vectors (2D numpy array) from a hdf5 file"""
            codereprs=[]
            h5f = tables.open_file(self.py_use_codevec)
            vecs= h5f.root.vecs
            for i in range(0,len(vecs),self._code_base_chunksize):
                codereprs.append(vecs[i:i+self._code_base_chunksize])
            h5f.close()
            self._pycode_reprs=codereprs
        return self._pycode_reprs
        
    def repr_python_code(self, model):
        methnames,apiseqs,tokens=self.load_use_python_data()
        padded_methnames = self.pad(methnames, self.data_params['methname_len'])
        padded_apiseqs = self.pad(apiseqs, self.data_params['apiseq_len'])
        padded_tokens = self.pad(tokens, self.data_params['tokens_len'])
        
        vecs=model.repr_code([padded_methnames,padded_apiseqs,padded_tokens],batch_size=1000)
        vecs=vecs.astype('float32')
        self.save_code_reprs(vecs, self.py_use_codevec)
        return vecs

    def preprocess(self, model):
        cv = self.cv
        #py_desc = self.py_desc
        py_codebase = self.py_codebase

        raw_code = open(self.py_use_rawcode,"wb")
        code_list = []
        """
        index = 2490334
        code = py_codebase.get(2490334)
        token = cv.getToken(code)
        token_vec = self.convert(self.vocab_tokens,token)
        """

        keys = self.py_codebase.keys()
        token_list = []
        methname_list = []
        apiseq_list = []

        """
        index = 2490334
        code = py_codebase.get(index)
        print(code)
        cv.printAST(code)
        methodname = cv.getMethodName(code)
        apiSequence = cv.getAPISequence(code)
        token = cv.getToken(code)

        token_vec = self.convert(self.vocab_tokens,token)
        methodname_vec = self.convert(self.vocab_methname,methodname)
        apiSequence_vec = self.convert(self.vocab_apiseq,apiSequence)

        print(methodname_vec)
        print(apiSequence_vec)
        print(token_vec)
        """
        print("==================================================================================================")
        print("Embedding...")
        #"""
        for key in keys:
            code = py_codebase.get(key)
            #desc = py_desc.get(key)
            #print(code)
            try:
                token = cv.getToken(code)
                methodname = cv.getMethodName(code)
                apiSequence = cv.getAPISequence(code)

            except:
                #print("Error")
                continue
            else:
                token_vec = self.convert(self.vocab_tokens,token)
                methodname_vec = self.convert(self.vocab_methname,methodname)
                apiSequence_vec = self.convert(self.vocab_apiseq,apiSequence)
                
                token_list.append(token_vec)
                methname_list.append(methodname_vec)
                apiseq_list.append(apiSequence_vec)
                code_list.append(code)

                #desc_vec = self.convert(self.vocab_desc,desc)
        #"""
        pickle.dump(code_list,raw_code)
        raw_code.close()

        print("Embedding completed.")
        print("Saving...")

        token_output = open(self.py_use_token,'wb')
        pickle.dump(token_list, token_output)
        token_output.close()

        methname_output = open(self.py_use_methname,'wb')
        pickle.dump(methname_list, methname_output)
        methname_output.close()

        apiseq_output = open(self.py_use_apiseq,'wb')
        pickle.dump(apiseq_list, apiseq_output)
        apiseq_output.close()
        print("Saved.")

    '''
    =======================================
    Transfer
    '''  
    def transfer(self, method="TCA"):
        Xs = []
        Xt = []
        print("Transfering Source Data ...")
        methnames, apiseqs, tokens = self.load_use_data()
        py_methnames, py_apiseqs, py_tokens = self.load_use_python_data()

        # Because the array is too large
        # If we don't cut it, there'll be memory error
        methnames = methnames[0:5000]
        apiseqs = apiseqs[0:5000]
        tokens = tokens[0:5000]

        py_methnames = py_methnames[0:2000]
        py_apiseqs = py_apiseqs[0:2000]
        py_tokens = py_tokens[0:2000]

        padded_methnames = self.pad(methnames, self.data_params['methname_len'])
        padded_apiseqs = self.pad(apiseqs, self.data_params['apiseq_len'])
        padded_tokens = self.pad(tokens, self.data_params['tokens_len'])

        padded_py_methnames = self.pad(py_methnames, self.data_params['methname_len'])
        padded_py_apiseqs = self.pad(py_apiseqs, self.data_params['apiseq_len'])
        padded_py_tokens = self.pad(py_tokens, self.data_params['tokens_len'])

        for i in range(len(padded_methnames)):
            temp = np.append(padded_methnames[i], padded_apiseqs[i])
            temp = np.append(temp, padded_tokens[i])
            Xs.append(temp)

        for j in range(len(padded_py_methnames)):
            temp = np.append(padded_py_methnames[j], padded_py_apiseqs[j])
            temp = np.append(temp, padded_py_tokens[j])
            Xt.append(temp)

        Xs = np.array(Xs)
        Xt = np.array(Xt)
        Xs_new = []
        Xt_new = []
        if method == "TCA":
            from TCA import TCA
            tca = TCA(Xs, Xt, dim=self.data_params['methname_len'] + self.data_params['apiseq_len'] +  self.data_params['tokens_len'])
            Xs_new, Xt_new = tca.fit()
        else:
            print("Unknown Transfer")
            return
        print("Transfer Completed")
        self.save_code_reprs(Xs_new, self.transfer_Xs_new)
        self.save_code_reprs(Xt_new, self.transfer_Xt_new)

def parse_args():
    """Build the command-line parser and return the parsed arguments.

    Options: --proto (config prototype), --mode (what to run, default
    'train') and --verbose.
    """
    modes = ["train","eval","repr_code","search","repr_python_code","preprocess", "search_python", "transfer"]
    mode_help = ("The mode to run. The `train` mode trains a model;"
                 " the `eval` mode evaluat models in a test set "
                 " The `repr_code/repr_desc` mode computes vectors"
                 " for a code snippet or a natural language description with a trained model.")

    cli = argparse.ArgumentParser("Train and Test Code Search(Embedding) Model")
    cli.add_argument("--proto", default="get_config", choices=["get_config"],
                     help="Prototype config to use for config")
    cli.add_argument("--mode", default='train', choices=modes, help=mode_help)
    # NOTE(review): store_true combined with default=True means the flag can
    # never turn verbosity off - confirm whether that is intended.
    cli.add_argument("--verbose", default=True, action="store_true", help="Be verbose")
    return cli.parse_args()


In [6]:
def kernel(ker, X, X2, gamma):
    """Compute a kernel matrix between the columns of ``X`` and ``X2``.

    ker: None/'' or 'primal' (returns ``X`` unchanged), 'linear', 'rbf' or
        'sam'.
    X: array whose columns are samples.
    X2: second sample array with the same number of rows, or None to use
        ``X`` on both sides.
    gamma: scale factor inside the exponent of the 'rbf' and 'sam' kernels.

    Returns ``X`` for the primal case, otherwise a matrix with one row per
    column of ``X`` and one column per column of ``X2``.
    Raises ValueError for an unknown kernel name.
    """
    if not ker or ker == 'primal':
        return X
    # Test for None explicitly: the truth value of an ndarray is ambiguous.
    other = X if X2 is None else X2
    if ker == 'linear':
        return np.dot(X.T, other)
    if ker == 'rbf':
        n1sq = np.sum(X ** 2, axis=0)
        n2sq = np.sum(other ** 2, axis=0)
        # squared distances: |x|^2 + |y|^2 - 2 x.y (cross term uses X2, not X)
        D = n1sq[:, None] + n2sq[None, :] - 2 * np.dot(X.T, other)
        return np.exp(-gamma * D)
    if ker == 'sam':
        D = np.dot(X.T, other)
        return np.exp(-gamma * np.arccos(D) ** 2)
    raise ValueError("Unknown kernel: {!r}".format(ker))

In [7]:
# Build the configuration and the searcher used by all cells below.
conf = configs.get_config()
cs = CodeSearcher(conf)

In [11]:
# Source domain: the first 5000 training samples.
methname, apiseq, token, desc = cs.load_training_data_chunk(0, 5000)

# Target domain: the first 2000 preprocessed Python samples.
py_methnames, py_apiseqs, py_tokens = (
    sequences[0:2000] for sequences in cs.load_use_python_data()
)

2018-11-16 13:55:55,958: __main__: INFO: Loading use data..
2018-11-16 13:55:55,958: __main__: INFO: methname
2018-11-16 13:55:56,035: __main__: INFO: apiseq
2018-11-16 13:55:56,053: __main__: INFO: tokens


In [12]:
# Sanity check: number of source samples that were loaded.
print(len(methname))

5000


In [13]:
# Pad every feature to a fixed width: 6 (method name), 8 (API sequence) and
# 10 (tokens). NOTE(review): these widths are hard-coded here rather than
# read from cs.data_params - confirm they are the intended values.
padded_methnames, padded_apiseqs, padded_tokens = (
    cs.pad(sequences, width)
    for sequences, width in ((methname, 6), (apiseq, 8), (token, 10))
)

padded_py_methnames, padded_py_apiseqs, padded_py_tokens = (
    cs.pad(sequences, width)
    for sequences, width in ((py_methnames, 6), (py_apiseqs, 8), (py_tokens, 10))
)

In [41]:
# One feature row per sample: [methname | apiseq | tokens].
Xs = [
    np.concatenate((meth_row, api_row, tok_row), axis=0)
    for meth_row, api_row, tok_row in zip(padded_methnames, padded_apiseqs, padded_tokens)
]

Xt = [
    np.concatenate((meth_row, api_row, tok_row), axis=0)
    for meth_row, api_row, tok_row in zip(padded_py_methnames, padded_py_apiseqs, padded_py_tokens)
]

In [42]:
# Only the last expression of a cell is displayed, so this shows Xt;
# the bare `Xs` line has no visible effect.
Xs
Xt

[array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,  645,  739, 5587,    0,    0,    0,    0, 2374,
         604, 1047]),
 array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0, 842, 455,   0,   0,   0,   0,   0]),
 array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0, 1472,  180,    0, 2014, 1262,  158,  127,    0,
           0,    0]),
 array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 1084, 3734,    0,    0,    0,    0,
           0,    0]),
 array([   0,    0,    0,    0,    0,    0, 5053,    0,    0,    0,    0,
           0,    0,    0, 1033,  955,    0,    0,    0,    0,    0,    0,
           0,    0]),
 array([   0,    0,    0,    0,    0,    0,    0,  231,    0,    0,    0,
           0,    0,    0,    0,    0, 2423,   93,  187,  177,    0,    0,
           0,    0]),
 array([  0,

In [36]:
# Turn the lists of rows into 2-D arrays (one row per sample).
Xs = np.array(Xs)
Xt = np.array(Xt)

In [37]:
# Sanity check: (samples, features) of the source and target matrices.
print(Xs.shape)
print(Xt.shape)

(5000, 24)
(2000, 24)


In [72]:
# Persist the source and target matrices; `with` closes each file even if
# pickling fails.
with open("./data/transfer/Xs.pkl",'wb') as Xs_output:
    pickle.dump(Xs, Xs_output)

with open("./data/transfer/Xt.pkl",'wb') as Xt_output:
    pickle.dump(Xt, Xt_output)

NameError: name 'xt' is not defined

In [15]:
# Inspect one source feature row.
print(Xs[1])

[ 262    0    0    0    0    0  652 2383 1105 7970    0    0    0    0
  315  178  192  457  515   37  903  460  803    0]


In [17]:
# TCA step 1: put source and target samples side by side as columns of X
# (features x samples), then standardize with sklearn's preprocessing.scale.
# NOTE(review): scale() works along axis 0 by default, i.e. per sample
# column here rather than per feature - confirm this is intended.
from sklearn import preprocessing
X = np.hstack((Xs.T, Xt.T))
X = preprocessing.scale(X)
m, n = X.shape  # m features, n = ns + nt samples
ns, nt = len(Xs), len(Xt)



In [18]:
# Column vector with 1/ns for every source sample and -1/nt for every target sample.
e = np.vstack((1 / ns * np.ones((ns, 1)), -1 / nt * np.ones((nt, 1))))

In [19]:
# M: outer product of e with itself (broadcast of an (n,1) by a (1,n) array),
# scaled to unit Frobenius norm.
M = e * e.T
M = M / np.linalg.norm(M, 'fro')
# H: centering matrix I - (1/n) * ones.
H = np.eye(n) - 1 / n * np.ones((n, n))
# With the 'primal' kernel K is simply X.
K = kernel('primal', X, None, gamma=1)
# Size of the identity added to the left-hand matrix in a later cell.
n_eye = m

In [20]:
# Make sure all three matrices are float64 before multiplying them.
K = K.astype("float64")
M = M.astype("float64")
H = H.astype("float64")

In [21]:
# Sanity check: K is (features, samples); M and H are (samples, samples).
print(K.shape)
print(M.shape)
print(H.shape)

(24, 7000)
(7000, 7000)
(7000, 7000)


In [22]:
# Inspect the matrix contents (numpy abbreviates the large arrays).
print(K)
print(M)
print(H)

[[ 0.13214598 -0.25726564  0.50636431 ... -0.29306374 -0.23296844
  -0.33319753]
 [-0.1225732  -0.41994529 -0.38246324 ... -0.45536643 -0.23296844
  -0.33319753]
 [-0.2774025  -0.41994529 -0.53346097 ... -0.45536643 -0.23296844
  -0.33319753]
 ...
 [-0.2774025  -0.13432454  2.05580693 ... -0.45536643 -0.23296844
  -0.33319753]
 [-0.2774025   0.07864919 -0.10020044 ...  3.61335145 -0.23296844
  -0.33319753]
 [-0.2774025  -0.41994529 -0.19543196 ... -0.45536643 -0.23296844
  -0.33319753]]
[[ 5.71428571e-05  5.71428571e-05  5.71428571e-05 ... -1.42857143e-04
  -1.42857143e-04 -1.42857143e-04]
 [ 5.71428571e-05  5.71428571e-05  5.71428571e-05 ... -1.42857143e-04
  -1.42857143e-04 -1.42857143e-04]
 [ 5.71428571e-05  5.71428571e-05  5.71428571e-05 ... -1.42857143e-04
  -1.42857143e-04 -1.42857143e-04]
 ...
 [-1.42857143e-04 -1.42857143e-04 -1.42857143e-04 ...  3.57142857e-04
   3.57142857e-04  3.57142857e-04]
 [-1.42857143e-04 -1.42857143e-04 -1.42857143e-04 ...  3.57142857e-04
   3.57142857

In [23]:
# Left- and right-hand matrices of the generalized eigenproblem:
# a = K M K^T + 1 * I (regularization weight fixed to 1), b = K H K^T.
a = np.linalg.multi_dot([K, M, K.T]) + 1 * np.eye(n_eye)
b =  np.linalg.multi_dot([K, H, K.T])

In [24]:
# Inspect both matrices of the eigenproblem.
print(a)
print(b)

[[ 5.45365027e+01  2.06916291e+02  6.01779842e+01 -4.41108286e+00
  -2.66335024e+01 -3.36831771e+01  1.16227874e+02  2.05477359e+02
   5.96719117e+01  9.18228555e+01  2.34019491e+01  5.63831410e+01
   2.53042033e+00  3.68161115e+01 -2.77434794e+02 -1.62326815e+02
  -1.18806114e+02 -7.68621401e+01 -5.28167947e+01 -5.21866995e+01
  -2.92840313e+01 -3.61451227e+01 -1.81192807e+01 -2.42528450e+01]
 [ 2.06916291e+02  8.00722608e+02  2.32585333e+02 -1.70486464e+01
  -1.02937347e+02 -1.30184038e+02  4.49215756e+02  7.94161198e+02
   2.30629384e+02  3.54891406e+02  9.04475317e+01  2.17918426e+02
   9.77996625e+00  1.42292695e+02 -1.07227360e+03 -6.27386192e+02
  -4.59180546e+02 -2.97068881e+02 -2.04134651e+02 -2.01699360e+02
  -1.13181528e+02 -1.39699352e+02 -7.00302443e+01 -9.37362078e+01]
 [ 6.01779842e+01  2.32585333e+02  6.86433761e+01 -4.95830063e+00
  -2.99375269e+01 -3.78617503e+01  1.30646546e+02  2.30967894e+02
   6.70745226e+01  1.03213958e+02  2.63050826e+01  6.33777628e+01
   2.844

In [26]:
# Solve the generalized eigenproblem for (a, b) and keep the eigenvectors
# belonging to the smallest eigenvalues (argsort is ascending); at most
# methname_len + apiseq_len + tokens_len of them are taken.
import scipy.linalg
w, V = scipy.linalg.eig(a, b)
ind = np.argsort(w)
A = V[:, ind[:cs.data_params['methname_len'] + cs.data_params['apiseq_len'] +  cs.data_params['tokens_len']]]
# Project all samples, standardize, and split back into source / target rows.
Z = np.dot(A.T, K)
Z = preprocessing.scale(Z)
Xs_new, Xt_new = Z[:, :ns].T, Z[:, ns:].T

In [30]:
# Display the transformed source samples.
Xs_new

array([[ 0.88883158,  0.66225456, -0.7168811 , ..., -0.08981178,
        -0.94572374, -0.19437904],
       [ 0.4187211 , -0.17222077,  0.15849414, ..., -0.01569807,
        -0.89681129, -0.12745551],
       [-0.52153002, -1.53558483,  2.25652734, ...,  0.44646615,
         0.30313843,  0.17089183],
       ...,
       [ 1.16474112,  0.07406874,  0.25459769, ...,  0.3048042 ,
        -0.08596379, -0.03959826],
       [ 2.70968835, -0.67830051, -0.41596724, ...,  0.38081367,
        -0.51832091,  0.03620954],
       [-1.1315519 ,  1.01597955,  0.86811569, ...,  0.56547862,
         0.08535975,  0.30760682]])

In [49]:
# First transformed source sample, columns 0-10 and 10-19.
# NOTE(review): the two slices overlap at column 10 - confirm the intended split.
print(Xs_new[:,0:11][0])
print(Xs_new[:,10:20][0])

[ 0.88883158  0.66225456 -0.7168811   0.17007139 -0.38466695  1.35858793
  2.17031723  2.80088255 -0.86651915  0.46400771 -0.29970038]
[-0.29970038  0.28819593 -0.94281521 -0.72226928 -0.36478785 -1.83506904
 -0.4111405  -0.57487726 -0.43444603 -0.02925631]


In [None]:
# NOTE(review): this cell looks like a method body pasted from elsewhere: it
# reads self.Xs, self.Xt, self.kernel_type, self.gamma, self.lamb and
# self.dim and ends in a `return`. With no enclosing def and mixed
# indentation it presumably cannot run as a standalone cell. It repeats the
# steps of the cells above, except that the columns of X are divided by
# their L2 norm instead of being passed through preprocessing.scale.
X = np.hstack((self.Xs.T, self.Xt.T))
        X = np.dot(X, np.diag(1 / (np.sum(X ** 2, axis=0) ** 0.5)))
        m, n = X.shape
        ns, nt = len(self.Xs), len(self.Xt)
        e = np.vstack((1 / ns * np.ones((ns, 1)), -1 / nt * np.ones((nt, 1))))
        M = e * e.T
        M = M / np.linalg.norm(M, 'fro')
        H = np.eye(n) - 1 / n * np.ones((n, n))
        K = kernel(self.kernel_type, X, None, gamma=self.gamma)
        n_eye = m if self.kernel_type == 'primal' else n
        a, b = np.linalg.multi_dot([K, M, K.T]) + self.lamb * np.eye(n_eye), np.linalg.multi_dot([K, H, K.T])
        w, V = scipy.linalg.eig(a, b)
        ind = np.argsort(w)
        A = V[:, ind[:self.dim]]
        Z = np.dot(A.T, K)
        Z = preprocessing.scale(Z)
        Xs_new, Xt_new = Z[:, :ns].T, Z[:, ns:].T
        return Xs_new, Xt_new