## Setup

```
pip install openai 
pip install datasets
pip install tiktoken
```

## Basic use of CodeQueries

A single data instance looks like this:
```
{'query_name': 'Unused import',
 'code_file_path': 'rcbops/glance-buildpackage/glance/tests/unit/test_db.py',
 'context_block': {'content': '# vim: tabstop=4 shiftwidth=4 softtabstop=4\n\n# Copyright 2010-2011 OpenStack, LLC\ ...',
                    'metadata': 'root',
                    'header': "['module', '___EOS___']",
                    'index': 0},
 'answer_spans': [{'span': 'from glance.common import context',
                   'start_line': 19,
                   'start_column': 0,
                   'end_line': 19,
                   'end_column': 33}
                 ],
 'supporting_fact_spans': [],
 'example_type': 1,
 'single_hop': False,
 'subtokenized_input_sequence': ['[CLS]_', 'Un', 'used_', 'import_', '[SEP]_', 'module_', '\\u\\u\\uEOS\\u\\u\\u_', '#', ' ', 'vim', ':', ...],
 'label_sequence': [4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...],
 'relevance_label': 1
}
```

In [50]:
import datasets

# Load the test split of the "twostep" configuration of CodeQueries.
ds = datasets.load_dataset(
    "thepurpleowl/codequeries",
    "twostep",
    split=datasets.Split.TEST,
    trust_remote_code=True,
)

# Distinct query names present in the split, plus a numbered sample of five.
all_queries = set(ds['query_name'])
print(f"Total queries in CodeQueries: {len(all_queries)}")
few_queries = '\n'.join(
    f"  {rank}. {name}"
    for rank, name in enumerate(list(all_queries)[:5], start=1)
)
print(f"Few queries are :- \n{few_queries}")

Total queries in CodeQueries: 52
Few queries are :- 
  1. Unnecessary 'else' clause in loop
  2. Conflicting attributes in base classes
  3. Unnecessary delete statement in function
  4. Modification of parameter with default
  5. Constant in conditional expression or statement


In [57]:
import random

# Keep only the rows that belong to one query, then sample one of them.
query_dataset = ds.filter(lambda example: example['query_name'] == "Inconsistent equality and hashing")
# random.randint(0, n) includes n itself, which is one past the last valid row
# index; randrange(n) draws uniformly from 0..n-1 instead.
data_instance = query_dataset[random.randrange(query_dataset.shape[0])]
print(f"Query: {data_instance['query_name']}\nFile path: {data_instance['code_file_path']}")

def get_sanitized_content(ctxt_blocks):
    """Concatenate the text of several context blocks, dropping empty lines.

    Each element of ``ctxt_blocks`` must be a mapping with a ``'content'``
    string. Lines that are exactly empty are removed (whitespace-only lines
    are kept), and every block's remaining text is followed by a single
    newline. Add any further data-cleaning logic here.
    """
    cleaned_blocks = []
    for block in ctxt_blocks:
        # filter(None, ...) keeps only the non-empty strings.
        kept_lines = filter(None, block['content'].split('\n'))
        cleaned_blocks.append("\n".join(kept_lines) + '\n')
    return "".join(cleaned_blocks)

# Collect every context block of the sampled (query, file) pair.
query_file_pair_dataset = query_dataset.filter(lambda example: example["query_name"]==data_instance['query_name'] and example['code_file_path']==data_instance['code_file_path'])
print(f"Total blocks for query: '{data_instance['query_name']}' and file: '{data_instance['code_file_path']}' are - {query_file_pair_dataset.shape[0]}")

relevant_code_blocks = []
answer_spans = []
supporting_fact_spans = []
for i, datum in enumerate(query_file_pair_dataset):
    if datum['relevance_label'] == 1:
        relevant_code_blocks.append(datum['context_block'])

    if i == 0:
        # The spans are read from the first row of the pair only ...
        for span in datum['answer_spans']:
            answer_spans.append(span['span'])
        for span in datum['supporting_fact_spans']:
            supporting_fact_spans.append(span['span'])
    else:
        # ... and every later row is checked to carry no span that the first
        # row did not already list.
        for span in datum['answer_spans']:
            assert span['span'] in answer_spans
        for span in datum['supporting_fact_spans']:
            assert span['span'] in supporting_fact_spans

# Full file context (all blocks) and the subset labelled relevant to the query.
code_context = get_sanitized_content([instance['context_block'] for instance in query_file_pair_dataset])
relevant_code_context = get_sanitized_content(relevant_code_blocks)
ans_spans = '\n'.join(f"  {idx}. {span}" for idx, span in enumerate(answer_spans, start=1))
sf_spans = '\n'.join(f"  {idx}. {span}" for idx, span in enumerate(supporting_fact_spans, start=1))
print(f"Total relevant blocks: {len(relevant_code_blocks)}")
print(f"Answer spans:\n {ans_spans if ans_spans else 'No Answer Spans'}")
# Fixed copy-paste slip: the empty case used to print 'No Answer Spans' here too.
print(f"Supporting fact spans:\n {sf_spans if sf_spans else 'No Supporting Fact Spans'}")
print(f"-----------------------\nRelevant code context:\n-----------------------\n {relevant_code_context}")

Query: Inconsistent equality and hashing
File path: phaethon/scapy/scapy/fields.py


Filter:   0%|          | 0/4058 [00:00<?, ? examples/s]

Total blocks for query: 'Inconsistent equality and hashing' and file: 'phaethon/scapy/scapy/fields.py' are - 226
Total relevant blocks: 5
Answer spans:
 No Answer Spans
Supporting fact spans:
 No Answer Spans
-----------------------
Relevant code context:
-----------------------
 class Emph:
    fld = b""
    def __init__(self, fld):
        self.fld = fld
    def __getattr__(self, attr):
        return getattr(self.fld,attr)
    def __hash__(self):
        return hash(self.fld)
    def __eq__(self, other):
        return self.fld == other


## Prompt with LLM

In [58]:
from openai import OpenAI
import tiktoken


MODEL = "gpt-3.5-turbo"
def count_tokens(input_str: str, model_name: str = MODEL) -> int:
    """Return the number of tokens ``input_str`` encodes to for ``model_name``.

    The tokenizer is looked up with ``tiktoken.encoding_for_model``. When
    tiktoken cannot map the model name to an encoding it raises ``KeyError``;
    in that case the ``cl100k_base`` encoding is used instead so that newer or
    custom model names still get a usable count.
    """
    try:
        encoding = tiktoken.encoding_for_model(model_name)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(input_str))


import os

# Read the key from the environment instead of hard-coding it in the notebook.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Query descriptions and examples are available in the CodeQL query help:
# https://codeql.github.com/codeql-query-help/python/
description = "Defining equality for a class without also defining hashability (or vice-versa) violates the object model."
# Each fragment ends with a space so sentences do not run together when the
# adjacent f-strings are concatenated.
system_message = (f"You are an expert software developer. Please help identify the results of evaluating the CodeQL query titled {data_instance['query_name']} on a code snippet. "
                  f"The results should be given as code spans or fragments (if any) from the code snippet. The description of the CodeQL query { data_instance['query_name'] } is - {description} "
                  f"If there are spans that match the query description, print them out one per line. \n")

# The prompt wraps the code in a fenced block and ends with an opening fence
# that cues the model to answer with code spans.
prompt_prefix = "Code snippet\n```python\n"
prompt_suffix = "\n```\nCode span(s)\n```python"
code_prompt = prompt_prefix + relevant_code_context + prompt_suffix

# Code contexts can exceed the 4096-token budget used here for the code
# prompt. Truncate only the code itself, by tokens, so that the closing fence
# and the "Code span(s)" cue survive (slicing the finished prompt by
# characters would cut them off).
MAX_CODE_PROMPT_TOKENS = 4096
if count_tokens(code_prompt) > MAX_CODE_PROMPT_TOKENS:
    encoding = tiktoken.encoding_for_model(MODEL)
    wrapper_tokens = count_tokens(prompt_prefix + prompt_suffix)
    context_tokens = encoding.encode(relevant_code_context)
    truncated_context = encoding.decode(context_tokens[:MAX_CODE_PROMPT_TOKENS - wrapper_tokens])
    code_prompt = prompt_prefix + truncated_context + prompt_suffix

# Instructions go in the "system" role; the code to analyse is the user turn.
input_messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": code_prompt},
]

chat_completion = client.chat.completions.create(
    messages=input_messages,
    model=MODEL,
    temperature=0.8,
)
print(chat_completion.choices[0].message.content)

def __eq__(self, other):
    return self.fld == other
