In [1]:
import torch
from unixcoder import UniXcoder
import datasets

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Build the UniXcoder wrapper for the "microsoft/unixcoder-base" checkpoint.
model = UniXcoder("microsoft/unixcoder-base")
# Kept as the cell's last expression so the notebook displays the module tree.
model.to(device)

UniXcoder(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(51416, 768, padding_idx=1)
      (position_embeddings): Embedding(1026, 768, padding_idx=1)
      (token_type_embeddings): Embedding(10, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,),

In [2]:
# Decoder-only mode: the prompt carries no mask token; the model simply
# continues the code from where this prefix stops.
context = """
def countWords(str):
    words = str.split()
"""

def predict_statement(input: str) -> str:
    """Continue a code prefix with UniXcoder in decoder-only mode.

    Args:
        input: Source-code prefix to continue. The name shadows the builtin
            ``input``; it is kept so existing keyword callers keep working.

    Returns:
        ``input`` followed by the first candidate (``[0][0]``) that
        ``model.decode`` returns for the single prompt, using a beam of 3.
    """
    # Inference only: nothing calls backward(), so skip autograd bookkeeping
    # for the whole tokenize -> generate -> decode pipeline.
    with torch.no_grad():
        tokens_ids = model.tokenize([input], max_length=512, mode="<decoder-only>")
        source_ids = torch.tensor(tokens_ids).to(device)
        prediction_ids = model.generate(
            source_ids, decoder_only=True, beam_size=3, max_length=128
        )
        predictions = model.decode(prediction_ids)
    # Outer index: the only prompt in the batch; inner index: first candidate.
    return input + predictions[0][0]

# Show the prompt together with the completion appended by predict_statement.
print(predict_statement(context))

  prevK = bestScoresId // numWords



def countWords(str):
    words = str.split()
    count = 0
    for word in words:
        count += 1
    return count


In [3]:
# Encoder-decoder mode: the model is asked for the code that belongs at the
# <mask0> placeholder inside this JavaScript snippet.
context = """
function countWords(str) {
    const words = str.split(" ")
    <mask0>
}
"""

def predict_statement_mask(code: str) -> str:
    """Predict the code for the ``<mask0>`` placeholder in ``code``.

    Runs UniXcoder in encoder-decoder mode with a beam of 3.

    Args:
        code: Snippet that is expected to contain a ``<mask0>`` placeholder.

    Returns:
        The first candidate (``[0][0]``) returned by ``model.decode``, with
        any ``<mask0>`` marker removed and surrounding whitespace stripped.
    """
    # Inference only: nothing calls backward(), so skip autograd bookkeeping
    # for the whole tokenize -> generate -> decode pipeline.
    with torch.no_grad():
        tokens_ids = model.tokenize([code], max_length=512, mode="<encoder-decoder>")
        source_ids = torch.tensor(tokens_ids).to(device)
        prediction_ids = model.generate(
            source_ids, decoder_only=False, beam_size=3, max_length=128
        )
        predictions = model.decode(prediction_ids)
    # Top-1 candidate; the decoded text can repeat the mask marker, so drop it.
    best = predictions[0][0]
    return best.replace("<mask0>", "").strip()

# Substitute the predicted statement for the <mask0> placeholder and print the
# filled-in snippet.
print(context.replace("<mask0>", predict_statement_mask(context)))

  prevK = bestScoresId // numWords



function countWords(str) {
    const words = str.split(" ")
    return words.length
}



In [16]:
# Encoder-decoder mode: list every beam candidate proposed for the <mask0> slot.
context = """
function countWords(str) {
    const words = str.split(" ")
    <mask0>
}
"""

# Inference only: nothing calls backward(), so skip autograd bookkeeping.
with torch.no_grad():
    tokens_ids = model.tokenize([context],max_length=512,mode="<encoder-decoder>")
    source_ids = torch.tensor(tokens_ids).to(device)
    prediction_ids = model.generate(source_ids, decoder_only=False, beam_size=3, max_length=128)
    # A single prompt was tokenized, so exactly one candidate list is expected;
    # the one-element unpacking fails loudly if that ever stops being true.
    (predictions,) = model.decode(prediction_ids)

# sep="\n" puts the header and each candidate on their own lines. Passing the
# joined text as a second positional argument made print() insert its default
# " " separator, which indented candidate 0 by one stray space.
print(
    f"Generated {len(predictions)} predictions:",
    *(f"{i}: {p.strip()}" for i, p in enumerate(predictions)),
    sep="\n",
)

Generated 3 predictions:
 0: <mask0>
    return words.length
1: <mask0>
    return words
2: 


In [12]:
# Decoder-only completion of a prompt whose function name is left blank.
# NOTE(review): the saved output under this cell begins with
# `def count_words(str):`, which is not this prompt -- re-run to refresh it.
context = """def (str)"""
# Inference only: nothing calls backward(), so skip autograd bookkeeping.
with torch.no_grad():
    tokens_ids = model.tokenize([context],max_length=512,mode="<decoder-only>")
    source_ids = torch.tensor(tokens_ids).to(device)
    prediction_ids = model.generate(source_ids, decoder_only=True, beam_size=3, max_length=128)
    predictions = model.decode(prediction_ids)
# First the prompt with the first candidate appended, then the candidate alone.
print(context+predictions[0][0])
print(predictions[0][0])

def count_words(str):
    """
    def count_words(str):
    """
    return count_words(str)

    @wraps(count_words)
    def count_words_wrapper(*args, **kwargs):
        return count_words(str, *args, **kwargs)

    return count_words_wrapper

    """
    def count_words(str):
    """
    return count_words(str)

    @wraps(count_words)
    def count_words_wrapper(*args, **kwargs):
        return count_words(str, *args, **kwargs)

    return count_words_wrapper
