### Meta-Llama-3-8B-Instruct

In [1]:
import transformers
import pandas as pd
import os, sys
import time
import logging
import json
from tqdm import tqdm
import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import torch
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

prompt_template = "Identify keywords from the summary and description of the bug report that can be used to detect duplicates.\n\nOutput format:\nSummary: [Selected Keywords]\nDescription: [Selected Keywords]\n\nSummary: {}\nDescription: {}\n\n"
project = 'spark'

df = pd.read_csv('../data/raw/test_{}.csv'.format(project))
flag_content_df = pd.read_csv(f'../data/ablation/test_{project}_flag_content.csv')

llama3_folder = '../data/keywords/{}/llama3/run_{}'

for run in range(1, 7):
    if not os.path.exists(llama3_folder.format(project, run)):
        os.makedirs(llama3_folder.format(project, run))

    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        bug_id = row['bug_id']
        
        if flag_content_df[flag_content_df['bug_id'] == bug_id]['run_flag'].values[0] == 0:
            continue
        
        if os.path.exists(os.path.join(llama3_folder.format(project, run), f'{bug_id}.txt')):
            continue
                
        messages = [
            # {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
            {
                "role": "user", 
                "content": prompt_template.format(row['short_desc'], row['description'])
            },
        ]

        prompt = pipeline.tokenizer.apply_chat_template(
                messages, 
                tokenize=False, 
                add_generation_prompt=True
        )

        terminators = [
            pipeline.tokenizer.eos_token_id,
            pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]
        try:
            outputs = pipeline(
                prompt,
                max_new_tokens=2048,
                # max_length=2048,
                eos_token_id=terminators,
                do_sample=False,
                top_p=1,
            )
        except Exception as e:
                print(e)
                messages = [
                    {
                        "role": "user", 
                        "content": prompt_template.format(row['short_desc'], row['description'][:2000])
                    },
                ]

                prompt = pipeline.tokenizer.apply_chat_template(
                        messages, 
                        tokenize=False, 
                        add_generation_prompt=True
                )
                
                outputs = pipeline(
                    prompt,
                    max_new_tokens=2048,
                    # max_length=2048,
                    eos_token_id=terminators,
                    do_sample=False,
                    top_p=1,
                )
                
        with open(os.path.join(llama3_folder.format(project, run), f'{bug_id}.txt'), 'w') as f:
            f.write(prompt_template.format(row['short_desc'], row['description']))
            f.write('\n\n>>>>>> Response:\n\n')
            f.write(outputs[0]["generated_text"][len(prompt):])

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 2841/2841 [00:00<00:00, 3831.15it/s]
100%|██████████| 2841/2841 [00:00<00:00, 3862.56it/s]
100%|██████████| 2841/2841 [00:00<00:00, 3784.49it/s]
100%|██████████| 2841/2841 [00:00<00:00, 3813.32it/s]
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `

The size of tensor a (16384) must match the size of tensor b (48335) at non-singleton dimension 3


  3%|▎         | 73/2841 [01:22<4:43:44,  6.15s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎         | 75/2841 [01:42<5:45:49,  7.50s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎         | 76/2841 [02:03<7:39:52,  9.98s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎         | 77/2841 [02:22<9:11:41, 11.98s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎         | 79/2841 [02:41<8:27:06, 11.02s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎         | 80/2841 [03:04<10:27:12, 13.63s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎         | 88/2841 [03:23<4:19:22,  5.65s/it] Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎         | 89/2841 [03:40<5:16:08,  6.89s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎

CUDA out of memory. Tried to allocate 16.99 GiB (GPU 0; 44.52 GiB total capacity; 34.80 GiB already allocated; 7.41 GiB free; 36.06 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


 51%|█████     | 1445/2841 [4:49:19<11:57:05, 30.82s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 51%|█████     | 1448/2841 [4:50:43<11:24:44, 29.49s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 51%|█████     | 1449/2841 [4:51:51<14:09:39, 36.62s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 51%|█████     | 1450/2841 [4:52:53<16:14:49, 42.05s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 51%|█████     | 1451/2841 [4:53:52<17:44:08, 45.93s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 51%|█████     | 1454/2841 [4:54:57<13:02:26, 33.85s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 51%|█████     | 1455/2841 [4:56:34<17:38:43, 45.83s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 51%|█████     | 1456/2841 [4:58:07<21:31:58, 55.97s/it]Setting `pad_token_id` to `eos_token_id`:

### Phi-3-mini-128k-instruct

In [None]:
import pandas as pd
from transformers import AutoTokenizer
from tqdm import tqdm

import os
os.environ["CUDA_VISIBLE_DEVICES"]="2"

import torch
torch.random.manual_seed(42)
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
prompt_template = "Identify keywords from the summary and description of the bug report that can be used to detect duplicates.\n\nOutput format:\nSummary: [Selected Keywords]\nDescription: [Selected Keywords]\n\nSummary: {}\nDescription: {}\n\n"
project = 'spark'

df = pd.read_csv('../data/raw/test_{}.csv'.format(project))
flag_content_df = pd.read_csv(f'../data/ablation/test_{project}_flag_content.csv')

phi_folder = '../data/keywords/{}/phi/run_{}'

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct", 
    device_map="cuda", 
    torch_dtype="auto", 
    trust_remote_code=True, 
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")


for run in range(1, 6):
    if not os.path.exists(phi_folder.format(project, run)):
        os.makedirs(phi_folder.format(project, run))

    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        bug_id = row['bug_id']
        
        if flag_content_df[flag_content_df['bug_id'] == bug_id]['run_flag'].values[0] == 0:
            continue
        
        if os.path.exists(os.path.join(phi_folder.format(project, run), f'{bug_id}.txt')):
            continue
                
        messages = [
            {
                "role": "user", 
                "content": prompt_template.format(row['short_desc'], row['description'])
            },
        ]

        try:
            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
            )
            
            generation_args = {
                "max_new_tokens": 2048,
                "return_full_text": False,
                "temperature": 0.0,
                "do_sample": False,
                "top_p": 1,
            }

            output = pipe(messages, **generation_args)
            # print(output[0]['generated_text'])
        
        except Exception as e:
                print(e)
                messages = [
                    {
                        "role": "user", 
                        "content": prompt_template.format(row['short_desc'], row['description'][:2000])
                    },
                ]

                pipe = pipeline(
                    "text-generation",
                    model=model,
                    tokenizer=tokenizer,
                )
                
                generation_args = {
                    "max_new_tokens": 2048,
                    "return_full_text": False,
                    "temperature": 0.0,
                    "do_sample": False,
                    "top_p": 1,
                }

                output = pipe(messages, **generation_args)
                # print(output[0]['generated_text'])
                
        with open(os.path.join(phi_folder.format(project, run), f'{bug_id}.txt'), 'w') as f:
            f.write(prompt_template.format(row['short_desc'], row['description']))
            f.write('\n\n>>>>>> Response:\n\n')
            f.write(output[0]['generated_text'])