In [None]:
from diskcache import Cache
from sweepai.config.server import CACHE_DIRECTORY
from sweepai.core.lexical_search import get_lexical_cache_key, CustomIndex, tokenize_code

# Open the two on-disk caches: one holds the lexical index, the other the snippet data.
lexical_index_cache = Cache(f'{CACHE_DIRECTORY}/lexical_index_cache')
snippets_cache = Cache(f'{CACHE_DIRECTORY}/snippets_cache')

# Both caches are queried with the same key, computed by get_lexical_cache_key from this path.
lexical_cache_key = get_lexical_cache_key("/tmp/aurea-crm")

# Cache.get returns None on a miss; fail here with a clear message instead of an
# opaque "cannot unpack NoneType" / attribute error in a later cell.
cached_snippets = snippets_cache.get(lexical_cache_key)
if cached_snippets is None:
    raise KeyError(f"no cached snippets found for key {lexical_cache_key!r}")
snippets, file_list = cached_snippets

index = lexical_index_cache.get(lexical_cache_key)
if index is None:
    raise KeyError(f"no cached lexical index found for key {lexical_cache_key!r}")

In [None]:
from tqdm import tqdm
# Write every key of the inverted index to tmp.txt, one key per line.
with open("tmp.txt", "w") as keys_file:
    keys_file.writelines(
        f"{token}\n" for token, _ in tqdm(index.inverted_index.items())
    )

In [None]:
import os

import tantivy

# Schema with two stored fields: the joined token text and an integer document id.
schema_builder = tantivy.SchemaBuilder()
schema_builder.add_text_field("body", stored=True)
schema_builder.add_integer_field("doc_id", stored=True)
schema = schema_builder.build()

# The index is persisted under ./tantivy; create the directory first so that
# opening the index does not fail on a fresh checkout.
os.makedirs("tantivy", exist_ok=True)
tantivy_index = tantivy.Index(schema, path="tantivy")

In [None]:
from sweepai.core.lexical_search import tokenize_code
from tqdm import tqdm


# Tokenize the text of every snippet; all_tokens[i] holds the tokens of snippets[i].
all_tokens = []
for snippet in tqdm(snippets):
    snippet_text = snippet.get_snippet(False, False)
    all_tokens.append(tokenize_code(snippet_text))

In [None]:
writer = tantivy_index.writer()

# Queue one document per snippet: the body is the space-joined token list and
# doc_id is the snippet's position in all_tokens.
for doc_id, token_list in enumerate(tqdm(all_tokens)):
    document = tantivy.Document(body=" ".join(token_list), doc_id=doc_id)
    writer.add_document(document)

In [None]:
# Commit the documents queued on `writer` by the previous cell.
writer.commit()

In [None]:
# Reload the index before creating the searcher
# (presumably so the searcher sees the latest commit — TODO confirm).
tantivy_index.reload()
searcher = tantivy_index.searcher()

In [None]:
# Run the query text through the same tokenizer used for the documents,
# then parse it against the "body" field.
raw_query = "hello world"
tokenized_query = " ".join(tokenize_code(raw_query))
query = tantivy_index.parse_query(tokenized_query, ["body"])

# Print the stored body of the top 3 hits; each hit is a (score, address) pair.
for _score, address in searcher.search(query, 3).hits:
    print(searcher.doc(address)["body"])

In [None]:
# `top_doc` is not assigned anywhere else in this notebook, so derive it here:
# it is the address of the best hit for the current `query` (hits are
# (score, address) pairs). Shows None when the search returns nothing.
best_hits = searcher.search(query, 1).hits
top_doc = best_hits[0][1] if best_hits else None
searcher.doc(top_doc) if top_doc is not None else None

In [None]:
# Show the content of the snippet at a hard-coded position in `snippets`.
# NOTE(review): 11478 is presumably a doc_id taken from an earlier search hit — confirm.
snippets[11478].content

In [None]:
# print(tantivy_index)
# Print the metadata attribute of the lexical index loaded from the cache.
print(index.metadata)

In [None]:
import numpy as np

# Convert every posting list of the inverted index into a numpy array.
# NOTE(review): the tuple dtype below presumably resolves to plain uint16 — confirm.
# Values above 65535 would not fit; the max_a / max_b cells further down
# can be used to check the actual range.
pair_dtype = (np.uint16, np.uint16)

numpy_inverted_index = {}
for token, postings in tqdm(index.inverted_index.items()):
    numpy_inverted_index[token] = np.array(postings, dtype=pair_dtype)

In [None]:
import sys

# Size of the dict object itself in MiB. sys.getsizeof is shallow: it does not
# include the memory of the arrays the dict refers to.
sys.getsizeof(numpy_inverted_index) / 1024 / 1024

In [None]:
# Total number of bytes held by the array buffers (excludes dict and key overhead).
sum(posting_array.nbytes for posting_array in numpy_inverted_index.values())

In [None]:
import pickle

# Pickle the whole numpy-backed inverted index into tmp.pkl.
with open("tmp.pkl", "wb") as pickle_file:
    pickle.dump(numpy_inverted_index, pickle_file)

In [None]:
import pickle

with open("tmp.txt", "wb") as f:
    for k, arr in tqdm(numpy_inverted_index.items()):
        f.write(k.encode())
        f.write(b"\n")
        f.write(arr.tobytes())
        f.write(b"\n")

In [None]:
newly_loaded = {}

with open("tmp.txt", "rb") as f:
    while True:
        k = f.readline().strip()
        print(k)
        if not k:
            break
        arr_encoded = f.readline()
        print(arr_encoded)
        arr = np.frombuffer(arr_encoded, dtype=(np.uint16, np.uint16))
        newly_loaded[k] = arr

In [None]:
# ndarray.tofile writes raw bytes without the .npy header (dtype and shape are lost),
# so a file named tmp.npy could not be read back with np.load. np.save writes
# the real .npy format. `arr` is the last array bound by the preceding loop.
np.save("tmp.npy", arr)

In [None]:
# Largest first and second component seen across all posting pairs (floor of 0).
max_a, max_b = 0, 0

for postings in tqdm(index.inverted_index.values()):
    for first, second in postings:
        if first > max_a:
            max_a = first
        if second > max_b:
            max_b = second

In [None]:
# Print the two maxima computed by the previous cell.
print(max_a, max_b)

In [None]:
# Each value of the inverted index is a list of 2-item pairs, so indexing the
# list with [0] / [1] selects whole pairs (and fails on single-entry lists).
# Take the maximum of each pair component across all posting lists instead,
# and display both results together.
all_postings = index.inverted_index.values()
largest_first = max(
    (first for postings in all_postings for first, _ in postings), default=0
)
largest_second = max(
    (second for postings in all_postings for _, second in postings), default=0
)
largest_first, largest_second

In [None]:
from tqdm import tqdm
other_index = index.inverted_index

# Peek at the first posting list only, to see what a value looks like.
for _token, postings in tqdm(other_index.items()):
    print(postings) # store v as a 2 item numpy array
    break

In [None]:
import json

# Serialize the inverted index as JSON into tmp.json.
with open("tmp.json", "w") as json_file:
    json.dump(index.inverted_index, json_file)

In [None]:
import pyarrow as pa

# Table.from_pydict turns every dict key into a column and requires all columns
# to have the same length, which does not hold for posting lists of differing
# sizes. Store the index as two equal-length columns instead: one row per token,
# with its posting list as a nested list value.
inverted_items = list(index.inverted_index.items())
table = pa.Table.from_pydict(
    {
        "token": [token for token, _ in inverted_items],
        "postings": [
            [list(pair) for pair in postings] for _, postings in inverted_items
        ],
    }
)

In [None]:
# Write `table` to data.arrow in the Arrow IPC file format. pyarrow.ipc has no
# write_table function; tables are written through the writer returned by
# pa.ipc.new_file.
with pa.OSFile('data.arrow', 'wb') as sink:
    with pa.ipc.new_file(sink, table.schema) as ipc_writer:
        ipc_writer.write_table(table)

In [None]:
# Deserialize the table from an Arrow IPC file. pyarrow.ipc has no read_table
# function; open a file reader with pa.ipc.open_file and read all record batches.
with pa.OSFile('data.arrow', 'rb') as source:
    table = pa.ipc.open_file(source).read_all()
# Convert the table back to a dictionary (column name -> list of values)
deserialized_data = table.to_pydict()

In [None]:
# Keys that contain no underscore; the length statistics below only look at these.
plain_keys = [k for k in index.inverted_index.keys() if "_" not in k]

# Character length of each underscore-free key, and those keys longest-first.
lengths = [len(k) for k in plain_keys]
sorted_keys = sorted(plain_keys, key=len, reverse=True)

# Posting-list length for every token, then (token, size) pairs largest-first.
sizes = {token: len(postings) for token, postings in index.inverted_index.items()}
sorted_sizes = sorted(sizes.items(), key=lambda item: item[1], reverse=True)

In [None]:
# get most common terms
from itertools import accumulate

import matplotlib.pyplot as plt

# Running total of posting-list sizes in descending size order. The first
# (largest) entry is included, so cdf[i] is the total size of the top i + 1 tokens.
cdf = list(accumulate(size for _, size in sorted_sizes))
total = cdf[-1] if cdf else 0

# Plot the cumulative share covered by the 1000 largest tokens; skip the plot
# when there is nothing to normalise by.
if total:
    plt.plot([n / total for n in cdf[:1000]])
    plt.show()


In [None]:
# Position of the "div" token in the size-sorted list. Looking it up by token
# avoids hard-coding its posting count, which only matches one specific index.
# Evaluates to None when the token is absent.
next(
    (rank for rank, (token, _) in enumerate(sorted_sizes) if token == "div"),
    None,
)

In [None]:
# NOTE(review): this takes the first 700 underscore-free keys in the index's
# iteration order, not the 700 largest by posting count — confirm this is intended.
top_tokens = [k for k in index.inverted_index.keys() if "_" not in k][:700]

# Use a set for the membership test; checking against the list would rescan
# up to 700 entries for every key of the index.
top_token_set = set(top_tokens)

new_inverted_index = {
    k: postings
    for k, postings in index.inverted_index.items()
    if k not in top_token_set
}
print(len(new_inverted_index))
print(len(index.inverted_index))
# print(len(top_tokens))
# print(top_tokens[:5])

In [None]:
import sys

# Total number of postings before and after dropping the selected tokens.
print(sum(map(len, index.inverted_index.values())))
print(sum(map(len, new_inverted_index.values())))

In [None]:
# Largest frequency and largest document id across all postings (floor of 0).
max_freq = 0
max_document_index = 0

for postings in index.inverted_index.values():
    for doc_id, freq in postings:
        max_freq = max(max_freq, freq)
        max_document_index = max(max_document_index, doc_id)

In [None]:
# Display the two maxima computed by the previous cell.
max_freq, max_document_index

In [None]:
import msgpack

# Serialization: pack the reduced inverted index with msgpack and show the
# size of the resulting byte string.
data = new_inverted_index
serialized_data = msgpack.packb(data)
len(serialized_data)

In [None]:
# Persist the msgpack-encoded bytes produced by the previous cell.
with open("inverted_index.msgpack", "wb") as f:
    f.write(serialized_data)

In [None]:
# Read the packed bytes back from disk.
with open("inverted_index.msgpack", "rb") as packed_file:
    message = packed_file.read()

# Deserialization
deserialized_data = msgpack.unpackb(message)

In [None]:
import pickle

# Pickle the reduced inverted index in memory; pickle.dumps returns a bytes object.
pickled_string = pickle.dumps(new_inverted_index)

In [None]:
# Histogram of the key lengths collected in `lengths`, drawn with matplotlib:
# 20 bins over the range 0 to 100.

from matplotlib import pyplot as plt

plt.hist(lengths, bins=20, range=(0, 100))

In [None]:
# Collect and print the index keys that are exactly 20 characters long.
keys = []
for token in index.inverted_index.keys():
    if len(token) == 20:
        keys.append(token)
print(keys)