In [1]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers # requires transformers==4.35.2
device = torch.device('cuda:0')

In [2]:
print(transformers.__version__)

4.43.3


In [3]:
draft_model_name = "deepseek-ai/deepseek-coder-1.3b-instruct"
draft_model = AutoModelForCausalLM.from_pretrained(draft_model_name, trust_remote_code=True, device_map="auto", torch_dtype=torch.float16, use_flash_attention_2=True)#, load_in_4bit=True)
print(draft_model.device)

Unrecognized keys in `rope_scaling` for 'rope_type'='linear': {'type'}
The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.


cuda:0


In [4]:
model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto", torch_dtype=torch.float16, use_flash_attention_2=True)#, load_in_4bit=True)#  , use_flash_attention=True)

Unrecognized keys in `rope_scaling` for 'rope_type'='linear': {'type'}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
from datasets import load_dataset

ds = load_dataset("nuprl/CanItEdit", split="test")

In [6]:
import difflib

@torch.no_grad()
def find_candidate_pred_tokens(input_ids, max_ngram_size=3, num_pred_tokens=10):
    input_length = input_ids.size(1)

    # Ensure max_ngram_size and num_pred_tokens are valid
    if max_ngram_size <= 0 or num_pred_tokens <= 0 or max_ngram_size > input_length:
        raise ValueError("Invalid max_ngram_size or num_pred_tokens")

    for ngram_size in range(max_ngram_size, 0, -1):
        # Extract the last n tokens as our search ngram
        ngram = input_ids[0, -ngram_size:].tolist()

        # Create sliding windows of size ngram_size
        windows = input_ids.unfold(dimension=1, size=ngram_size, step=1)

        # Convert ngram to a tensor for comparison
        ngram_tensor = torch.tensor(ngram, device=input_ids.device).unsqueeze(0)

        # Find where the windows match the ngram
        matches = (windows == ngram_tensor).all(dim=2)

        # Get the indices of matches
        match_indices = matches.nonzero(as_tuple=True)[1]

        # Iterate through match indices to find a valid continuation
        for idx in match_indices:
            start_idx = idx + ngram_size
            end_idx = start_idx + num_pred_tokens
            # Ensure we don't go beyond the length of input_ids and avoid self-match
            # if end_idx <= input_length and start_idx < input_length - ngram_size:
            #     return input_ids[0, start_idx:end_idx]
            if start_idx < input_length - ngram_size:
                return input_ids[0, start_idx:min(end_idx, input_length)]

    # If no match is found, return an empty tensor
    return torch.tensor([100], dtype=torch.long, device=input_ids.device)

@torch.no_grad()
def find_candidate_pred_tokens_diff(input_ids, code_ids, orig_input_len=0, ngram_size=3, num_pred_tokens=10):
    # print(input_ids, code_ids)
    
    # start_time = time.perf_counter()
    input_length = input_ids.size(1)
    code_length = len(code_ids)

    # Ensure max_ngram_size and num_pred_tokens are valid
    if ngram_size <= 0 or ngram_size > input_length:
        raise ValueError("Invalid max_ngram_size or num_pred_tokens")

    sm = difflib.SequenceMatcher(None, code_ids, input_ids[0, orig_input_len:].tolist())
    
    deleted = added = changed = same = last_deleted = 0
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'replace':
            changed += i2 - i1
        elif tag == 'delete':
            deleted += i2 - i1
            last_deleted = i2 - i1
        elif tag == 'insert':
            added += j2 - j1
        elif tag == 'equal':
            same += i2 - i1
    
    approx_tokens_original = changed + deleted + same - last_deleted

    lookback_start = max(input_length - ngram_size, orig_input_len)
    search_ngram = input_ids[0, lookback_start:].tolist()

    for ngram_start in range(max(0, approx_tokens_original - ngram_size), len(code_ids)):
        # if there is a match, return the entire rest of the tokens.
        if ngram_start + len(search_ngram) >= len(code_ids):
            break
        if search_ngram == code_ids[ngram_start:ngram_start + len(search_ngram)]:
            return torch.tensor(code_ids[ngram_start + len(search_ngram):max(ngram_start + len(search_ngram) + num_pred_tokens, len(code_ids))], dtype=torch.long, device=input_ids.device)

    # If no match is found, return what the answer would be otherwise
    # print("Diff searching took: ", time.perf_counter() - start_time)
    return find_candidate_pred_tokens(input_ids, ngram_size, num_pred_tokens)
    # return torch.tensor([], dtype=torch.long, device=input_ids.device)


In [29]:
from transformers.generation.candidate_generator import CandidateGenerator, _crop_past_key_values
from transformers.generation.stopping_criteria import StoppingCriteria
from transformers.generation.configuration_utils import GenerationConfig
from typing import Tuple, Optional
import time

class DiffPromptLookupCandidateGenerator(CandidateGenerator):
    def __init__(self, input_ids, code_ids, ngram_size=3, num_pred_tokens=10):
        self.code_ids = code_ids
        self.orig_input_len = input_ids.shape[-1]
        self.ngram_size = ngram_size
        self.num_pred_tokens = num_pred_tokens
    
    def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
        # print("Getting candidates")
        return torch.cat(
            (
                input_ids,
                find_candidate_pred_tokens_diff(input_ids, self.code_ids, self.orig_input_len, self.ngram_size, self.num_pred_tokens).unsqueeze(0)
            ),
            dim=-1
        ), None
    
    def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, num_matches: int): # Maybe use the number of matches/scores to have a threshold
        pass 

class NumRunsStoppingCriteria(StoppingCriteria):
    def __init__(self, max_num_runs=4):
        self.max_num_runs = 4
        self.num_runs = 0

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
        self.num_runs += 1
        return self.num_runs >= self.max_num_runs

class NewlineStoppingCriteria(StoppingCriteria):
    def __init__(self, tokenizer, prompt_tokens: int, newline_count=5):
        self.newline_token = tokenizer.encode("""
""")[-1]
        self.newline_count = newline_count
        self.prompt_tokens = prompt_tokens

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
        considered_tokens = tokenizer.batch_decode(input_ids[:, self.prompt_tokens:])[0]
        newline_list = "\n"*self.newline_count
        # print(newline_list, considered_tokens)
        return newline_list in considered_tokens

class ScoreStoppingCriteria(StoppingCriteria):
    def __init__(self, min_score, start_token_index):
        self.min_score = min_score
        self.start_token_index = start_token_index

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
        if not(scores):
            # print("No scores")
            return False
        else:
            print("Got scores scores stopping: ", scores[0].shape, len(scores))
        scores_tensor = torch.stack(scores, dim=0)
        softmax_scores = F.softmax(scores_tensor, 2)
        print(softmax_scores)
        return (softmax_scores.max(dim=2).values < self.min_score).any().item()

def _get_default_candidate_generator_generator(generator: CandidateGenerator):
    def _get_candidate_generator(self, **kwargs):
        return generator
    return _get_candidate_generator

class TwoLayerLookupCandidateGenerator(CandidateGenerator):
    def __init__(self, tokenizer, prompt_tokens, draft_model, input_ids, code_ids, num_runs=4, **diff_prompt_args):
        self.tokenizer = tokenizer
        self.prompt_tokens = prompt_tokens
        self.draft_model = draft_model
        self.input_ids = input_ids
        self.code_ids = code_ids
        self.candidate_generator = DiffPromptLookupCandidateGenerator(
            self.input_ids, 
            self.code_ids,
            **diff_prompt_args
        )
        self.draft_model.generation_config.pad_token_id = tokenizer.pad_token_id
        
        self.past_key_values = None
        self.num_runs = num_runs

        self.draft_model._get_candidate_generator = (_get_default_candidate_generator_generator(self.candidate_generator)).__get__(self.draft_model, type(self.draft_model))

        self.min_score = 1
        self.start_token_index = self.input_ids.shape[-1]

        self.min_score = 0
        self.scores_count = 0
    
    def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
        if self.past_key_values:
            self.past_key_values = _crop_past_key_values(self.draft_model, self.past_key_values, input_ids.shape[-1] - 1)

        starting_input_length = input_ids.shape[-1]
        # print("Getting draft candidates")
        
        generation = self.draft_model.generate(
            inputs=input_ids,
            attention_mask=torch.ones(input_ids.shape[-1], device=input_ids.device).unsqueeze(0),
            prompt_lookup_num_tokens=1,
            max_new_tokens=1000,
            stopping_criteria=[NumRunsStoppingCriteria(self.num_runs), 
                               # NewlineStoppingCriteria(self.tokenizer, self.prompt_tokens), 
                               ScoreStoppingCriteria(self.min_score, starting_input_length)
                              ],
            past_key_values=self.past_key_values,
            use_cache=True,
            # output_logits=True,
            output_scores=True,
            return_dict_in_generate=True
        )

        # print("Scores: ", generation.scores)

        self.pred_tokens_count = generation.sequences.shape[-1] - input_ids.shape[-1]
        self.past_key_values = generation.past_key_values
        self.past_top_scores = torch.stack(generation.scores, dim=1).max(dim=1)

        return generation.sequences, torch.stack(generation.scores, dim=1)

    def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, num_matches: int):
        if num_matches == self.pred_tokens_count:
            if self.scores_count == 0:
                self.min_score = 0
            else:
                self.min_score = (self.scores_count / self.scores_count + 1) * (self.min_score)
        else:
            if self.scores_count == 0:
                self.min_score = self.past_top_scores[-num_matches]
            else:
                self.min_score = (self.scores_count / (self.scores_count + 1)) * (self.min_score) + (1 / (self.scores_count + 1)) * (self.past_top_scores[-1])
        self.scores_count += 1
        pass 

In [30]:
from tqdm import tqdm
from transformers import TextStreamer

stats = {"method": [], "assisted": [], "pld": [], "regular": [], "lev_dist": [], "generated_tokens": []}

for row in tqdm(ds):
    input_text = "## Code Before:\n{code_text}\n## Change requested: {question}\n## Rewrite the code to incorporate the change.\n".format(code_text=row['before'], question=row['instruction_descriptive'])
    # inputs = tokenizer(input_text, return_tensors="pt")
    inputs = tokenizer.apply_chat_template([
        {
            "role": "user",
            "content": input_text
        }
    ], tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)

    code_tokens = tokenizer(row['before'], return_tensors="pt")
    
    max_new_tokens = code_tokens.input_ids.shape[-1] + 500
    
    two_layer_candidate_generator = TwoLayerLookupCandidateGenerator(
        tokenizer,
        inputs.shape[-1],
        draft_model,
        inputs,
        code_tokens.input_ids.tolist()[0],
        ngram_size=5,
        num_pred_tokens=50
    )
    model._get_candidate_generator = (_get_default_candidate_generator_generator(two_layer_candidate_generator)).__get__(model, type(model))
    
    # Use this technique
    start_time = time.perf_counter()
    test_out = model.generate(
        inputs=inputs,
        # attention_mask=inputs.attention_mask,
        prompt_lookup_num_tokens=1,
        max_new_tokens=max_new_tokens,
        stopping_criteria=[NewlineStoppingCriteria(tokenizer, inputs.shape[-1])],
        use_cache=True,
        streamer=TextStreamer(tokenizer)
    )
    end_time = time.perf_counter()

    print("Time: ", end_time - start_time)

  0%|                                                                                                                                                                                                               | 0/105 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


<｜begin▁of▁sentence｜>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer
### Instruction:
## Code Before:
class CSVParser:
    def __init__(self, csv: str):
        self.csv = csv

    def contents(self) -> list[list[str]]:
        lines = self.csv.split("\n")
        output = []
        for line in lines:
            output.append(line.split(","))
        return output
## Change requested: Add a function called `header` which returns the first row of a csv file as a list of strings, where
every element in the list is a column in the row.
## Rewrite the code to incorporate the change.

### Response:
Got scores scores stopping:  torch.Size([1, 32256]) 1
tensor([[[2.7692e-12, 3.0322e-06, 6.9014e-05,  ..., 8.2923e-18,
          9.1073e-18, 8.5555e-

  1%|█▉                                                                                                                                                                                                     | 1/105 [00:03<06:15,  3.61s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


Time:  3.586542649194598
<｜begin▁of▁sentence｜>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer
### Instruction:
## Code Before:
class Fib:
    def __iter__(self):
        self.prev_prev = 0
        self.prev = 1
        return self
    def __next__(self):
        output = self.prev + self.prev_prev
        self.prev_prev = self.prev
        self.prev = output
        return output
## Change requested: add a method `next_n_fibs(n: int)` which takes in an integer, and produces a list containing the next `n` integers in the fibonacci sequence
starting from what the object would return if its `__next__` method was called. The method should not mutate the state of the object. When asked 
for the next fibonacci number after this method is called, i

  2%|███▊                                                                                                                                                                                                   | 2/105 [00:15<14:12,  8.27s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


Time:  11.53588342294097
<｜begin▁of▁sentence｜>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer
### Instruction:
## Code Before:
from typing import List, Literal, Tuple
from queue import PriorityQueue

Move = Literal["up", "down", "left", "right"]
# 0 = up, 1 = down, 2 = left, 3 = right
MoveIndex = Literal[0, 1, 2, 3]
# 0 = empty, 1 = wall, 2 = start, 3 = end
Cell = Literal[0, 1, 2, 3]


class Maze:
    def __init__(self, maze: List[List[Cell]]):
        self.maze = maze
        self.rows = len(maze)
        self.cols = len(maze[0])
        self.start = self.find_start()
        self.end = self.find_end()

    def find_start(self) -> Tuple[int, int]:
        for row in range(self.rows):
            for col in range(self.cols):
                

  3%|█████▋                                                                                                                                                                                                 | 3/105 [00:24<14:31,  8.55s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


Time:  8.86834499053657
<｜begin▁of▁sentence｜>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer
### Instruction:
## Code Before:
class Matrix:

    def __init__(self, matrix: list[list[int]]):
        self.matrix = matrix

    def add(self, other):
        result = []
        for i in range(len(self.matrix)):
            row = []
            for j in range(len(self.matrix[0])):
                row.append(self.matrix[i][j] + other.matrix[i][j])
            result.append(row)
        return Matrix(result)
    
    def subtract(self, other):
        result = []
        for i in range(len(self.matrix)):
            row = []
            for j in range(len(self.matrix[0])):
                row.append(self.matrix[i][j] - other.matrix[i][j])
          

  3%|█████▋                                                                                                                                                                                                 | 3/105 [00:27<15:49,  9.31s/it]

Got scores scores stopping:  torch.Size([1, 32256]) 127
tensor([[[1.1730e-17, 3.7593e-13, 2.0761e-13,  ..., 1.8321e-13,
          1.8321e-13, 1.7758e-13]],

        [[6.9270e-27, 1.5942e-16, 4.6687e-14,  ..., 1.9017e-25,
          1.6266e-25, 1.6783e-25]],

        [[1.0742e-18, 2.0761e-13, 8.3054e-15,  ..., 4.1587e-17,
          4.2242e-17, 4.1587e-17]],

        ...,

        [[1.8621e-22, 9.5221e-15, 1.0222e-10,  ..., 2.0421e-19,
          1.9792e-19, 2.0421e-19]],

        [[2.1088e-13, 3.7751e-11, 8.4234e-12,  ..., 1.2683e-17,
          1.1915e-17, 1.3501e-17]],

        [[1.2253e-11, 2.5684e-12, 2.3350e-09,  ..., 8.5309e-17,
          8.2684e-17, 8.8017e-17]]], device='cuda:0')





KeyboardInterrupt: 

In [None]:
# from rapidfuzz.distance import Levenshtein

# stats = {"method": [], "assisted": [], "pld": [], "regular": [], "lev_dist": [], "generated_tokens": []}

# for row in tqdm(ds):
#     input_text = "## Code Before:\n{code_text}\n## Change requested: {question}\n## Rewrite the code to incorporate the change.\n".format(code_text=row['before'], question=row['instruction_descriptive'])
#     # inputs = tokenizer(input_text, return_tensors="pt")
#     inputs = tokenizer.apply_chat_template([
#         {
#             "role": "user",
#             "content": input_text
#         }
#     ], tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)

#     code_tokens = tokenizer(row['before'], return_tensors="pt")
    
#     max_new_tokens = code_tokens.input_ids.shape[-1] + 500

#     # Use HuggingFace assisted decoding
#     start_time = time.perf_counter()
#     assisted_output = model.generate(
#         input_ids=inputs,
#         max_new_tokens=max_new_tokens,
#         return_dict_in_generate=True,
#         output_score=True,
#         assistant_model=draft_model
#     )
#     end_time = time.perf_counter()
#     stats["assisted"].append(end_time - start_time)

#     # Use HuggingFace prompt lookup decoding
#     start_time = time.perf_counter()
#     pld_output = model.generate(
#         input_ids=inputs,
#         max_new_tokens=max_new_tokens,
#         return_dict_in_generate=True,
#         output_score=True,
#         prompt_lookup_num_tokens=50
#     )
#     end_time = time.perf_counter()
#     stats["pld"].append(end_time - start_time)

#     # Use regular HuggingFace text generation
#     start_time = time.perf_counter()
#     regular_outputs = model.generate(
#         input_ids=inputs,
#         max_new_tokens=max_new_tokens,
#         return_dict_in_generate=True,
#         output_scores=True
#     )
#     end_time = time.perf_counter()

#     stats["regular"].append(end_time - start_time)
#     new_text = tokenizer.batch_decode(regular_outputs.sequences)[0]

#     lev_dist = Levenshtein.distance(row['before'], new_text) 
#     stats["lev_dist"].append(lev_dist)

#     stats["generated_tokens"].append(regular_outputs.sequences.shape[-1])

# print(time_taken)    