-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathindex_py.py
105 lines (88 loc) · 3.53 KB
/
index_py.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import tree_sitter
import tree_sitter_python as tspython
from tree_sitter import Language, Parser
import requests
import json
import numpy as np
import os
from pathlib import Path
import faiss
import pickle
# Set up Tree-sitter for Python: wrap the grammar object exposed by the
# tree_sitter_python package and build a single module-level parser from it.
# (A previous variant loaded a compiled grammar from ~/.tree-sitter/python.so.)
PY_LANGUAGE = Language(tspython.language())
parser = Parser(PY_LANGUAGE)
# Location of the search index, relative to the base directory handed to
# ensure_index_dir().
INDEX_DIR = '.promptlycode/code_search_index'
def ensure_index_dir(base_dir):
    """Return the index directory under ``base_dir``, creating it if needed.

    The directory is ``base_dir`` joined with ``INDEX_DIR``. Missing parent
    directories are created as well, and an already existing directory is
    not treated as an error.
    """
    target = Path(base_dir).joinpath(INDEX_DIR)
    target.mkdir(parents=True, exist_ok=True)
    return target
def extract_functions_from_file(file_path):
    """Extract the top-level function definitions from a Python file.

    Args:
        file_path: Path of the Python source file to read and parse.

    Returns:
        A list with one dict per ``function_definition`` node found directly
        under the module root. Each dict has the keys ``name``, ``code``,
        ``file``, ``start_line``, ``end_line`` and ``full_file_content``.
        An empty list is returned when the file cannot be read or parsed;
        the error is printed instead of being raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            code = f.read()

        # Tree-sitter reports node offsets in bytes of the buffer it parsed,
        # so keep that buffer around and slice it (not the str) to get each
        # snippet; slicing the str goes wrong once the file has non-ASCII text.
        source_bytes = code.encode('utf8')
        tree = parser.parse(source_bytes)

        functions = []
        # Only direct children of the module root are inspected, so methods
        # and nested functions are not collected.
        # NOTE(review): decorated functions are presumably wrapped in another
        # node type by the grammar and would be skipped too - confirm intent.
        for node in tree.root_node.children:
            if node.type != 'function_definition':
                continue

            name_node = node.child_by_field_name('name')
            if name_node is None:
                # No usable name on this node; skip it rather than losing the
                # rest of the file's functions.
                continue

            functions.append({
                'name': name_node.text.decode('utf8'),
                'code': source_bytes[node.start_byte:node.end_byte].decode('utf8'),
                'file': str(file_path),
                # Point rows are zero-based; store 1-based line numbers.
                'start_line': node.start_point[0] + 1,
                'end_line': node.end_point[0] + 1,
                'full_file_content': code,
            })
        return functions
    except Exception as e:
        # Best effort: one unreadable or unparsable file must not abort the
        # caller, so report the problem and contribute no functions.
        print(f"Error processing {file_path}: {e}")
        return []
def embed_code(code_snippets, *, model="nomic-embed-text",
               url='http://localhost:11434/api/embeddings',
               dimension=768, timeout=None):
    """Embed code snippets through an Ollama embeddings endpoint.

    Args:
        code_snippets: Iterable of source-code strings to embed.
        model: Model name sent in the request body.
        url: Embeddings endpoint that each snippet is POSTed to.
        dimension: Width of the zero vector used as a placeholder when a
            request fails, and of the empty result for empty input.
        timeout: Passed through to ``requests.post``; ``None`` means no
            timeout.

    Returns:
        A 2-D ``float32`` array with one row per snippet, in input order.
        For empty input the shape is ``(0, dimension)``.
    """
    snippets = list(code_snippets)
    if not snippets:
        # Keep the result two-dimensional so callers can always read
        # ``shape[1]``; np.array([]) would collapse to shape (0,).
        return np.zeros((0, dimension), dtype=np.float32)

    embeddings = []
    for snippet in snippets:
        response = requests.post(
            url,
            json={
                "model": model,
                "prompt": snippet,
            },
            timeout=timeout,
        )
        if response.status_code == 200:
            embeddings.append(response.json()['embedding'])
        else:
            print(f"Error embedding snippet: {response.status_code}")
            # Placeholder row so the embeddings stay aligned with the input.
            embeddings.append([0.0] * dimension)
    return np.array(embeddings, dtype=np.float32)
def build_index(directory):
    """Index the functions found in the ``.py`` files under ``directory``.

    Walks the tree with ``os.walk``, gathers function records through
    ``extract_functions_from_file``, embeds their source with ``embed_code``
    and adds the vectors to a FAISS ``IndexFlatL2``. The index is written to
    ``code.index`` and the function records are pickled to ``metadata.pkl``,
    both inside the directory returned by ``ensure_index_dir``.

    Prints a message and stops early when no functions were found. Always
    returns None.
    """
    root_path = Path(directory)

    functions = []
    for current_dir, _subdirs, filenames in os.walk(directory):
        folder = Path(current_dir)
        for filename in filenames:
            if not filename.endswith('.py'):
                continue
            functions += extract_functions_from_file(folder / filename)

    if len(functions) == 0:
        print("No Python functions found in the directory.")
        return None

    # One embedding row per collected function, in the same order.
    vectors = embed_code([record['code'] for record in functions])

    # The index width is taken from the embeddings themselves
    # (should be 768 for nomic-embed-text).
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    # Persist the index next to the metadata describing each row.
    index_dir = ensure_index_dir(root_path)
    faiss.write_index(index, str(index_dir.joinpath('code.index')))
    with open(index_dir.joinpath('metadata.pkl'), 'wb') as metadata_file:
        pickle.dump(functions, metadata_file)

    print(f"Index built successfully at {index_dir}")
    print(f"Total functions indexed: {len(functions)}")