In [None]:
import os
import openai
import pinecone
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
import time
from json import JSONDecodeError
import concurrent.futures
import traceback
import requests
from PIL import Image
import pytesseract
from io import BytesIO

# Connection settings are read from the environment so that no secret has to be
# typed into the notebook. Every lookup falls back to the empty string, which is
# the placeholder value this cell previously hard-coded.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT", "")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME", "")

pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
index = pinecone.Index(index_name=PINECONE_INDEX_NAME)

# setdefault keeps an OPENAI_API_KEY that is already exported instead of
# overwriting it with an empty string; the variable still exists afterwards.
os.environ.setdefault('OPENAI_API_KEY', "")
openai.api_key = os.environ['OPENAI_API_KEY']


In [None]:
#STEP 1: HIGH LEVEL UPSERT
PACKAGE_NAME = "calcs" #<-------------EDIT HERE

# Load the package-level JSON document named after PACKAGE_NAME.
with open(f'./{PACKAGE_NAME}.json', "r") as file:
    data = json.load(file)

summary, qa = data['summary'], data['qa']

# Both package vectors store the same metadata; only the embedded text differs.
package_metadata = {
    "summary": summary,
    "package": PACKAGE_NAME,
    "type": "package",
    "isTest": False,
    "scope": "high",
}

# Vector 1: embedded from the qa text, stored with the summary as metadata.
qa_response = openai.Embedding.create(model="text-embedding-ada-002", input=qa)
description_embeddings_1 = qa_response["data"][0]["embedding"]
index.upsert(
    [(f"{PACKAGE_NAME}_1", description_embeddings_1, dict(package_metadata))],
    namespace='vicky',
)

# Vector 2: embedded from the summary text itself.
summary_response = openai.Embedding.create(model="text-embedding-ada-002", input=summary)
description_embeddings_2 = summary_response["data"][0]["embedding"]
index.upsert(
    [(f"{PACKAGE_NAME}_2", description_embeddings_2, dict(package_metadata))],
    namespace='vicky',
)


In [None]:
#STEP 2: MID LEVEL EMBEDDING (FILES)

import json
import openai
# Keep the key configured earlier in the notebook (or exported as
# OPENAI_API_KEY) instead of resetting it to an empty string.
openai.api_key = openai.api_key or os.environ.get("OPENAI_API_KEY", "")

directory_path = "./json_output/numbers/calcs" #<-------------EDIT HERE


if not os.path.exists(directory_path):
    print(f"Directory {directory_path} does not exist.")
else:
    # One pair of vectors (qa-embedded and summary-embedded) is upserted per JSON file.
    for json_name in os.listdir(directory_path):
        if not json_name.endswith(".json"):
            continue

        # A file counts as a test file when its on-disk name contains 'test'.
        IS_TEST = 'test' in json_name
        print('test in' if IS_TEST else 'no test', json_name)

        # NOTE(review): assumes the JSON files are UTF-8 encoded — confirm.
        with open(os.path.join(directory_path, json_name), "r", encoding="utf-8") as file:
            data = json.load(file)

        # Required fields: a missing one raises KeyError.
        filename = data['filename']
        package = data['package']
        summary = data['summary']
        qa = data['qa']

        # Optional fields: a missing key or an empty/null value becomes [].
        external_imports = data.get('imports') or []
        internal_imports = data.get('import_packages') or []
        function_names = [func["name"] for func in data.get("functions") or []]

        file_metadata = {
            "summary": summary,
            "filename": filename,
            "package": package,
            "external_imports": external_imports,
            "internal_imports": internal_imports,
            "type": "file",
            "function_names": function_names,
            "isTest": IS_TEST,
            "scope": "mid",
        }

        # Same metadata for both vectors; the id suffix names the text that was embedded.
        for suffix, text in (("qa", qa), ("summary", summary)):
            embedding = openai.Embedding.create(
                input=text,
                model="text-embedding-ada-002",
            )["data"][0]["embedding"]

            index.upsert(
                [(f"{filename}_{suffix}", embedding, dict(file_metadata))],
                namespace='vicky',
            )

In [None]:
#STEP 3: LOW LEVEL EMBEDDING (CODE ELEMENTS)

import json
import openai
import os
# Keep the key configured earlier in the notebook (or exported as
# OPENAI_API_KEY) instead of resetting it to an empty string.
openai.api_key = openai.api_key or os.environ.get("OPENAI_API_KEY", "")

directory_path = "./json_output/numbers" #<-------------EDIT HERE

def clean_text(text):
    """Return `text` with every tab removed and every newline replaced by one space."""
    return text.replace("\t", "").replace("\n", " ")

def embed_and_upsert(id, embed_data, summary, filename, package, external_imports, internal_imports, type, code, embed_type, is_test, functions_called):
    """Embed `embed_data` and upsert the resulting vector under `id`.

    The text is embedded with the "text-embedding-ada-002" model and written to
    the 'vicky' namespace of the module-level `index`. The stored metadata is
    built from the remaining arguments: `code` is passed through `clean_text`
    and "scope" is always "low". `summary` is accepted but not used here.
    """
    response = openai.Embedding.create(model="text-embedding-ada-002", input=embed_data)
    vector = response["data"][0]["embedding"]

    # Echo the id of the vector that is about to be written.
    print(id)

    index.upsert(
        [
            (
                id,
                vector,
                {
                    "filename": filename,
                    "package": package,
                    "external_imports": external_imports,
                    "internal_imports": internal_imports,
                    "type": type,
                    "code": clean_text(code),
                    "embed_type": embed_type,
                    "isTest": is_test,
                    "scope": "low",
                    "functions_called": functions_called,
                },
            )
        ],
        namespace='vicky',
    )

def _called_function_names(function):
    """Collect the 'name' of each entry under function['functions_called'].

    Best effort: when the key is missing, the value is not iterable, or an
    entry has no 'name', 'no function call' is printed and the names gathered
    so far are returned.
    """
    names = []
    try:
        for called in function['functions_called']:
            names.append(called['name'])
    except (KeyError, TypeError):
        print('no function call')
    return names


def _upsert_elements(elements, kind, file_is_test, file_summary, filename, package, external_imports, internal_imports):
    """Upsert a qa-embedded and a code-embedded vector for every element in `elements`.

    `kind` is the value stored as the element type ("function", "type",
    "struct" or "interface"). Only functions carry a list of called function
    names; every other kind gets an empty list.
    """
    for element in elements or []:
        qa = element['qa']
        code = element['body']
        name = element['name']
        functions_called = _called_function_names(element) if kind == "function" else []

        # Decided per element, so one test-named element no longer marks every
        # element after it in the same file as a test.
        # NOTE(review): the substring check is case-sensitive, so a name such as
        # `TestFoo` does not match — confirm whether that is intended.
        is_test = file_is_test or 'test' in name

        for embed_type, text in (("qa", qa), ("code", code)):
            embed_and_upsert(
                f"{filename}_{name}_{embed_type}", text, file_summary, filename, package,
                external_imports, internal_imports, kind, code, embed_type, is_test, functions_called,
            )


if not os.path.exists(directory_path):
    print(f"Directory {directory_path} does not exist.")
else:

    for json_name in os.listdir(directory_path):
        if not json_name.endswith(".json"):
            continue

        # A file counts as a test file when its on-disk name contains 'test'.
        file_is_test = 'test' in json_name
        print('test in' if file_is_test else 'no test', json_name)

        # NOTE(review): assumes the JSON files are UTF-8 encoded — confirm.
        with open(os.path.join(directory_path, json_name), "r", encoding="utf-8") as file:
            data = json.load(file)

        # File-level values shared by every element of this file.
        filename = data['filename']
        package = data['package']
        # The file summary is passed as an empty string; the data['summary']
        # lookup is disabled in this step.
        file_summary = ""

        # Optional fields: a missing key or an empty/null value becomes [].
        external_imports = data.get('imports') or []
        internal_imports = data.get('import_packages') or []

        # Same processing order as before: functions, types, structs, interfaces.
        for key, kind in (
            ("functions", "function"),
            ("types", "type"),
            ("structs", "struct"),
            ("interfaces", "interface"),
        ):
            _upsert_elements(
                data.get(key), kind, file_is_test, file_summary, filename, package,
                external_imports, internal_imports,
            )


In [None]:
def embed_and_upsert(id, embed_data, summary, filename, package, external_imports, internal_imports, type, code, embed_type, is_test, functions_called):
    """Embed `embed_data` with "text-embedding-ada-002" and upsert it under `id`.

    The vector is written to the 'vicky' namespace of the module-level `index`
    together with metadata assembled from the other arguments (`code` goes
    through `clean_text`, "scope" is fixed to "low"). `summary` is accepted but
    not used here.

    NOTE(review): a function with this same name and signature is also defined
    in the STEP 3 cell; running this cell replaces that definition.
    """
    embedding_result = openai.Embedding.create(model="text-embedding-ada-002", input=embed_data)
    description_embeddings = embedding_result["data"][0]["embedding"]

    # Echo the id of the vector that is about to be written.
    print(id)

    index.upsert(
        [
            (
                id,
                description_embeddings,
                {
                    "filename": filename,
                    "package": package,
                    "external_imports": external_imports,
                    "internal_imports": internal_imports,
                    "type": type,
                    "code": clean_text(code),
                    "embed_type": embed_type,
                    "isTest": is_test,
                    "scope": "low",
                    "functions_called": functions_called,
                },
            )
        ],
        namespace='vicky',
    )
