In [None]:
import os

import openai
import tiktoken

def get_tokens(string):
    """Return how many tokens ``string`` encodes to under tiktoken's ``cl100k_base`` encoding."""
    return len(tiktoken.get_encoding("cl100k_base").encode(string))

# Take the API key from the environment instead of hard-coding it in the notebook.
# Assigning "" unconditionally would also overwrite a key that the openai library
# already picked up from OPENAI_API_KEY when it was imported.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

def trim_input(user_input, max_tokens=14000, max_chars=20000):
    """Shorten ``user_input`` so that it fits within ``max_tokens`` tokens.

    Args:
        user_input: Prompt text to check.
        max_tokens: Token budget (as counted by ``get_tokens``).
        max_chars: Length of the first cut applied to over-budget text.

    Returns:
        ``user_input`` unchanged when it is within budget; otherwise a prefix of
        it whose token count does not exceed ``max_tokens``.
    """
    if get_tokens(user_input) <= max_tokens:
        return user_input

    trimmed = user_input[:max_chars]
    # A character cut alone does not bound the token count (token-dense text can
    # still be over budget), so keep shrinking until the prefix really fits.
    while trimmed:
        token_count = get_tokens(trimmed)
        if token_count <= max_tokens:
            break
        # Shrink proportionally, but always by at least one character so the
        # loop is guaranteed to terminate.
        proportional = len(trimmed) * max_tokens // token_count
        trimmed = trimmed[:min(len(trimmed) - 1, proportional)]
    return trimmed

def clean_text(text):
    """
    Strip formatting characters from ``text``: tabs are dropped and every
    newline becomes a single space.
    """
    # Code point -> replacement; extend this table for further clean-ups.
    replacements = {ord("\t"): None, ord("\n"): " "}
    return text.translate(replacements)


In [None]:
# STEP 1: MID LEVEL INTERROGATE
import os
import json
from concurrent.futures import ThreadPoolExecutor

# Directory holding the per-file JSON records; this step reads each record and
# writes it back in place with the generated "qa" and "summary" added.
directory_path = "./json_output/numbers"

def gpt_call(sys_prompt, user_input, model="gpt-3.5-turbo-16k"):
    """Send one system + user message pair to the chat completion API.

    Args:
        sys_prompt: Text of the system message.
        user_input: Text of the user message.
        model: Chat model to query. Previously this argument was accepted but
            ignored in favour of a hard-coded model name.

    Returns:
        The content of the first choice's message.
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_input},
        ],
        temperature=0.8,
    )
    return response["choices"][0]["message"]["content"]


def _element_names(elements):
    """Return the ``name`` of every element, or None when there are no elements."""
    return [element['name'] for element in elements] if elements else None


# The two system prompts do not depend on the file being processed, so they are
# built once. The indentation of their continuation lines is part of the prompt
# text and is deliberately left untouched.
sys_prompt_qa = """You are code Q&A bot. Your job is to first analyze a golang backend code base of Acme Inc., cap table management software company.
            Secondly, you must produce a list questions and anwsers to them that a senior software engineer in Acme Inc. can ask and answer about the code. 
            #SAMPLE INPUT# {"File": "filename.go", "Package": "some_package", "Code": "golang code"}
            Optional fields are "Functions", "Types", "Structs", "Methods", "Interfaces". They contain names of the elements, not the full code.
            You need them to create complete Q&A in case full code is not provided due to input size limit.
            #QUESTION CONTEXT# Always ask specific questions about the file, ask questions about the file in the context of its package. Ask and asnwer 
            questions as if you are senior SWE. Do not ask about what packages are imported to the file. Don't ask about arguements or returns of the function. 
            You must produce anywhere from 10 to 25 questions and answers depending on the code lenght.
            #SAMPLE OUTPUT# Q: What external functions does DetailedExport rely on for its computations? A: The function relies on GetCompanyData
            to fetch company...
            """
sys_prompt_summary = """You are code description writer for a golang backend of Acme Inc., a cap table management software company.
            Your job is to analyze golang code file and produce a description (max 200 words) of the file, as written by a 
            senior software engineer in the company. 
            #SAMPLE INPUT# {"File": "filename.go", "Package": "some_package", "Code": "golang code"}
            Optional fields are "Functions", "Types", "Structs", "Methods", "Interfaces". They contain names of the elements, not the full code.
            #SUMMARY CONTEXT# You must describe what is the intent of the code. What is the code doing? What is the code used for?
            Always mention specific names of code elements in your description."""

if not os.path.exists(directory_path):
    print(f"Directory {directory_path} does not exist.")
else:
    for json_name in os.listdir(directory_path):
        # Decide from the name alone, before paying for a read + parse:
        # only .json records are processed and test files are skipped.
        if not json_name.endswith(".json") or 'test' in json_name:
            continue

        file_path = os.path.join(directory_path, json_name)
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        user_input_dict = {
            "File": data['filename'],
            "Package": data['package'],
            "Functions": _element_names(data['functions']),
            "Types": _element_names(data['types']),
            "Structs": _element_names(data['structs']),
            "Interfaces": _element_names(data['interfaces']),
            "Code": data['code_content'],
        }
        # Empty sections are left out of the prompt entirely.
        user_input_dict = {key: value for key, value in user_input_dict.items() if value is not None}

        # Both requests receive the same text, so tokenize/trim it only once.
        user_input = trim_input(json.dumps(user_input_dict))

        # The Q&A and the summary are independent requests; run them concurrently.
        with ThreadPoolExecutor() as executor:
            future_qa = executor.submit(gpt_call, sys_prompt_qa, user_input, "gpt-3.5-turbo-16k")
            future_summary = executor.submit(gpt_call, sys_prompt_summary, user_input, "gpt-3.5-turbo-16k")

            qa = future_qa.result()
            summary = future_summary.result()

        data["qa"] = qa
        data["summary"] = summary

        # Write the enriched record back over the file it was loaded from.
        with open(file_path, "w", encoding="utf-8") as file:
            json.dump(data, file, indent=4)

    

In [None]:
# STEP 2: HIGH LEVEL INTERROGATE
import os
import json
from concurrent.futures import ThreadPoolExecutor

# Directory of per-file JSON records read by this step.
directory_path = "./json_output/numbers"
# Used only to name the output file (f"{PACKAGE}.json").
# NOTE(review): "calcs" differs from the "numbers" directory above, and the prompts
# further down interpolate `package` (taken from the loaded files), not PACKAGE —
# confirm which package this run is meant to describe.
PACKAGE = "calcs"

def gpt_call_high(sys_prompt, user_input, model="gpt-4-32k"):
    """Send one system + user message pair to the chat completion API.

    Args:
        sys_prompt: Text of the system message.
        user_input: Text of the user message.
        model: Chat model to query. Previously this argument was accepted but
            ignored in favour of a hard-coded model name.

    Returns:
        The content of the first choice's message.
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_input},
        ],
    )
    return response["choices"][0]["message"]["content"]

# Short descriptions appended to the names of known internal packages when they
# appear in a file's import list.
package_descriptions = {
    "usecases": "Business and Workflow logic",
}

# Concatenation of one JSON object per file; sent as the user message of the
# package-level requests.
user_input_high = ""

if not os.path.exists(directory_path):
    print(f"Directory {directory_path} does not exist.")
else:
    for filename in os.listdir(directory_path):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(directory_path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        # STEP 1 skips files with 'test' in their name, so those records carry no
        # "summary"; indexing it unconditionally raised KeyError for them.
        if 'summary' not in data:
            print(f"Skipping {filename}: no summary found (run STEP 1 first).")
            continue

        functions = data['functions']
        types = data['types']
        structs = data['structs']
        methods = data['methods']
        interfaces = data['interfaces']
        imports = data['imports']
        # Read later by the package-level prompts; the last processed file wins.
        package = data['package']

        # "name (description), " for described packages, "name, " otherwise.
        internal_imports = "".join(
            f"{pck} ({package_descriptions[pck]}), " if pck in package_descriptions else f"{pck}, "
            for pck in (data['import_packages'] or [])
        )

        user_input_dict = {
            "File": data['filename'],
            "Functions": [function['name'] for function in functions] if functions else None,
            "Types": [type_['name'] for type_ in types] if types else None,
            "Structs": [struct['name'] for struct in structs] if structs else None,
            "Methods": [method['name'] for method in methods] if methods else None,
            "Interfaces": [interface['name'] for interface in interfaces] if interfaces else None,
            'Internal Imported Packages': internal_imports if internal_imports else None,
            'External Imported Packages': str(imports) if imports else None,
            "Description": data['summary']
        }
        # Empty sections are left out of the prompt entirely.
        user_input_dict = {key: value for key, value in user_input_dict.items() if value is not None}
        user_input_high += json.dumps(user_input_dict)


# System prompt for the package-level Q&A request.
# `package` is the module-level name last assigned while loading the JSON records
# (not the PACKAGE constant). NOTE(review): if no record was loaded in this cell,
# `package` is undefined or left over from an earlier cell — confirm this is intended.
high_level_sys_prompt_qa = f"""You are code Q&A bot. Your job is to first analyze a golang package "{package}" from backend code base of Acme Inc., cap table management software company.
You must produce a list questions and anwsers to them that a senior software engineer in Acme Inc. can ask and answer about this package. 
You are given a "Filename" and "Description" of each file in this package.
Optional fields are "Functions", "Types", "Structs", "Methods", "Interfaces". They contain names of the elements, you might use to create questions.
#QUESTION CONTEXT# Always ask specific questions about the package, ask about the intent of this package, what is it used for, what is it doing?
You must produce anywhere from 25-30 questions and answers depending on the size of the package.
#SAMPLE OUTPUT# Q: What external functions does DetailedExport in numbers package rely on for its computations? A: The function relies on GetCompanyData
to fetch company...
"""
# System prompt for the package-level summary request; interpolates the same
# `package` name as the Q&A prompt.
high_level_sys_prompt_summary = f"""You are code description writer for a golang package {package} from backend of Acme Inc., a cap table management software company.
Your job is to analyze the {package} package and produce a description of it (400 words), as written by a senior software engineer in the company. 
You are given a "Filename" and "Description" of each file in this package.
Optional fields are "Functions", "Types", "Structs", "Methods", "Interfaces". They contain names of the elements, you might use to create questions.
#SUMMARY CONTEXT# You must describe what is the intent of the package. What this package is doing? What is it used for?
Always mention specific names of code elements in your description."""


# Request the package-level Q&A and summary concurrently; both calls receive the
# same accumulated per-file input and differ only in the system prompt.
with ThreadPoolExecutor() as executor:
    future_qa, future_summary = [
        executor.submit(gpt_call_high, system_prompt, user_input_high, "gpt-4-32k")
        for system_prompt in (high_level_sys_prompt_qa, high_level_sys_prompt_summary)
    ]

    qa_high = future_qa.result()
    summary_high = future_summary.result()

# Store both answers in "<PACKAGE>.json" in the current working directory.
data_high = {"qa": qa_high, "summary": summary_high}

with open(f"{PACKAGE}.json", "w") as file:
    json.dump(data_high, file, indent=4)


In [None]:
# LOW LEVEL INTERROGATE (PARALLELIZED)
import time
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Manager
from concurrent.futures import ThreadPoolExecutor
from json import JSONDecodeError
# Create a shared Manager object
import json
import os
# Manager that provides the proxy object used for the flag shared by the worker threads.
manager = Manager()

# Directory of per-file JSON records to process. Left empty here: it must be set
# before running, otherwise the cell only reports that the directory does not exist.
directory_path = ""

# Create a shared variable for the rate limit flag
rate_limit_hit = manager.Value(bool, False)

# Lookup used when building function prompts: entries are indexed by function
# name and passed to clean_text, so presumably name -> source code — confirm
# against whatever generates functions.json.
with open("functions.json", "r") as f:
    functions_list = json.load(f)

def gpt_call_low(user_input, type, filename, package, model, max_retries=None):
    """Ask the chat model for element-level Q&A, retrying on transient API errors.

    Args:
        user_input: Prompt text sent as the user message.
        type: Kind of code element (e.g. ``'function'``), interpolated into the
            detailed system prompt. Shadows the builtin; the name is kept for
            compatibility with existing callers.
        filename: File name interpolated into the prompt and printed in the log.
        package: Accepted for interface compatibility; currently unused (the
            prompt text names the numbers package directly).
        model: Chat model name. Only ``"gpt-4"`` receives the detailed system
            prompt; every other model gets a short generic one.
        max_retries: Maximum number of retries after a failed attempt. ``None``
            (the default) retries indefinitely, which is the historical behaviour.

    Returns:
        The content of the first choice's message.

    Raises:
        The last retryable error once ``max_retries`` is exhausted. Errors that
        are not in the retry list propagate immediately.
    """
    detailed_prompt = f"""#INSTRUCTION# You are a code Q&A bot. Your job is to first analyze a {type}, from the {filename} file 
    from the numbers package in golang code from the codebase of Acme Inc., a cap table management software company. 
    Numbers package is responsible for bringing data and calcs together to create small chunks of useful numbers for a given entity. 
    All functions in this package will be related to calculations, hence you all the questions and answers you should ask must be
    related to the way the  {type} is used for calculations of different element of the capitalization table.
    In most cases you will be given the code of the function to call, as well as the code of all 
    function calls made in the function, use this context to produce questions and answers.
    #QUESTION CONTEXT# Always ask specific questions about the {type}, and in the context of its file. Formulate 
    questions and answers as if you are a senior SWE. Don't ask trivial questions to explain the names of the input, return of any trivial
    questions about the structure of the code. You must focus on the intent of the code. What this function is useful for in numbers package?
    What this function calculates. How is it used in the cap table etc. You must produce up to 10 questions and answers, depending on the code length.
    #SAMPLE OUTPUT FORM# Q: How fuly diluted shares are calculated in the CalculateShares function? A: The function calculates(continued..)"""

    if model == "gpt-4":
        system_msg = detailed_prompt
    else:
        system_msg = "You are bot that produces questions and answers about code by ananlyzing code of th function and code of the functions called by this function."

    # Another worker recently hit an API limit: back off once before sending.
    if rate_limit_hit.value:
        print("Rate limit hit, sleeping for 60 seconds")
        time.sleep(60)
        rate_limit_hit.value = False

    msgs = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_input},
    ]

    print(filename, model)
    print(msgs)
    print('-----------')

    failures = 0
    while True:  # Keep trying until we get a valid response
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=msgs,
                max_tokens=1100,
                temperature=0.8
            )
        except JSONDecodeError as e:
            print(f'JSONDecodeError occurred: {e}')
            last_error, delay = e, 10
        except openai.error.ServiceUnavailableError as e:
            print(f'Server error: {e}. Retrying in 10 seconds.')
            last_error, delay = e, 10
        except openai.error.RateLimitError as e:
            print('Hit rate limit. Waiting 60 sec')
            # Tell the other workers to pause before their next request.
            rate_limit_hit.value = True
            last_error, delay = e, 60
        except openai.error.APIError as e:
            print('BAD GATEWAY ERROR. Waiting 20 sec')
            rate_limit_hit.value = True
            last_error, delay = e, 20
        else:
            response_message = response["choices"][0]["message"]["content"]
            print(f"{response_message[:200]}\n")
            print("------------------")
            return response_message  # Return the response if it's valid

        failures += 1
        if max_retries is not None and failures > max_retries:
            # Give up without a final sleep; surface the most recent error.
            raise last_error
        time.sleep(delay)

def worker(input_data):
    """Run one queued low-level Q&A task.

    ``input_data`` is a 5-tuple ``(user_input, code_type, filename, package, model)``.
    The prompt is passed through ``trim_input`` and everything is forwarded to
    ``gpt_call_low``, whose result is returned.
    """
    prompt, element_kind, source_file, package_name, model_name = input_data
    trimmed_prompt = trim_input(prompt)
    return gpt_call_low(trimmed_prompt, element_kind, source_file, package_name, model_name)

def _select_model(user_input):
    """Pick the chat model for a prompt, truncating prompts that are too long.

    Returns a ``(user_input, model)`` tuple. 300 tokens are added to the prompt's
    own token count before it is compared with the limits.
    """
    needed_tokens = get_tokens(user_input) + 300
    if needed_tokens > 15500:
        # Must be tested before the 8000 limit: with the opposite order this
        # branch was unreachable and over-long prompts were never truncated.
        return user_input[:25000], "gpt-3.5-turbo-16k"
    if needed_tokens > 8000:
        return user_input, "gpt-3.5-turbo-16k"
    return user_input, "gpt-4"


def _function_prompt(item, filename):
    """Build the prompt for one function: its body plus the code of known callees."""
    user_input = f" #Function to produce Q&A about#: {clean_text(item['body'])}\n"
    try:
        user_input += "#Code of the functions called by this function. Use this code context to produce better Q&A#:\n"
        function_injected = 0
        if item['functions_called']:
            for function in item['functions_called']:
                name = function['name']
                # Only callees whose code is available in functions_list are injected.
                if name in functions_list:
                    function_injected += 1
                    user_input += f"\n #Function name {name}#: #Code#: {clean_text(functions_list[name])}\n"

        print(f"Injected {function_injected} functions")
    except KeyError:
        print(filename, "no functions called")
    return user_input


# Create a ThreadPoolExecutor; the context manager shuts it down when the run ends.
with ThreadPoolExecutor(max_workers=8) as executor:  # Adjust the number of workers as needed
    if not os.path.exists(directory_path):
        print(f"Directory {directory_path} does not exist.")
    else:
        # for each file
        for filename in os.listdir(directory_path):
            if not filename.endswith(".json"):
                continue

            file_path = os.path.join(directory_path, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                data = json.load(file)

            # Prepare the tasks for the executor. `targets` records, in the same
            # order, the element each task's answer belongs to.
            tasks = []
            targets = []
            print(filename)

            for item in data['functions'] or []:
                user_input, model = _select_model(_function_prompt(item, filename))
                tasks.append((user_input, 'function', filename, data['package'], model))
                targets.append(item)

            for key in ['types', 'structs', 'interfaces']:
                for item in data[key] or []:
                    # key[:-1] / capitalize()[:-1] drop the plural "s" ("types" -> "Type").
                    user_input = f"{key.capitalize()[:-1]} name: {item['name']} | Code: {item['body']}\n"
                    user_input, model = _select_model(user_input)
                    tasks.append((user_input, key[:-1], filename, data['package'], model))
                    targets.append(item)

            # Run the tasks using the executor; map() returns results in task order.
            results = list(executor.map(worker, tasks))

            # Assign the results back to the original data
            for item, answer in zip(targets, results):
                item['qa'] = answer
                print(item['qa'])

            with open(file_path, "w", encoding="utf-8") as file:
                json.dump(data, file, indent=4)
            print(filename, "done")
