# 1. Download arXiv metadata in bulk
Download the dataset from [Kaggle](https://www.kaggle.com/datasets/Cornell-University/arxiv/data) and save it as `./arxiv_metadata/arxiv-metadata-oai-snapshot.json`.

# 2. Set up Elasticsearch (on Windows)

1. Install Docker.

2. Open a WSL shell in this directory and run:

    ```bash
    curl -fsSL https://elastic.co/start-local | sh
    ```
---

After running the script, you can access Elastic services at the following endpoints:

- Elasticsearch: http://localhost:9200
- Kibana: http://localhost:5601

The script generates a random password for the `elastic` user, and an API key, stored in the [.env file](./elastic-start-local/.env).

---

# 3. Import the downloaded arXiv metadata file

In [1]:
# Path to the arXiv metadata snapshot; `reader()` opens this file and parses it line by line.
metadata_file = './arxiv_metadata/arxiv-metadata-oai-snapshot.json'

In [2]:
import json
from rolling.pdf import clean_text

def reader():
    """Lazily yield one metadata record per line of ``metadata_file``.

    Every line is parsed as a JSON object. The ``title`` and ``abstract``
    values are passed through ``clean_text``. When present, ``categories``
    (a space-separated string) becomes a list of ``{"category": ...}``
    objects and ``authors_parsed`` becomes a list of ``{"author": ...}``
    objects.

    Yields:
        dict: the transformed record.
    """
    with open(metadata_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            record = json.loads(raw_line)

            # Same order as before: title first, then abstract.
            for text_field in ('title', 'abstract'):
                record[text_field] = clean_text(record[text_field])

            if 'categories' in record:
                category_names = record['categories'].split(' ')
                record['categories'] = [{"category": name} for name in category_names]

            if 'authors_parsed' in record:
                record['authors_parsed'] = [
                    {"author": parsed_author}
                    for parsed_author in record['authors_parsed']
                ]

            yield record


# Single shared iterator over the snapshot; consumers pull records from it.
line_iterator = reader()

In [3]:
from rolling.embedding import GTEEmbeddingModel
# Embedding model instance; its `encode` method is what the cells below call to embed texts.
model = GTEEmbeddingModel()

In [4]:
def process_embeddings(entries: list[dict]):
    """Attach an embedding to every entry, computed from its title and abstract.

    The text embedded for an entry is ``"<title>\\n<abstract>"``. The result
    of ``model.encode`` is paired positionally with ``entries`` and stored
    under the ``'embedding'`` key of each entry (the dicts are modified in
    place).

    Args:
        entries: Records that each provide ``'title'`` and ``'abstract'``.

    Returns:
        The same ``entries`` list that was passed in.

    Raises:
        KeyError: If an entry lacks ``'title'`` or ``'abstract'``.
    """
    # Nothing to embed: return early so the model is never invoked on an
    # empty batch (this happens when the input runs out on a batch boundary).
    if not entries:
        return entries

    # Double-quoted f-string so the single-quoted keys inside it also parse on
    # Python versions before 3.12 (quote reuse inside f-strings is 3.12+ only).
    texts = [f"{e['title']}\n{e['abstract']}" for e in entries]

    embeddings = model.encode(texts)
    for emb, entr in zip(embeddings, entries):
        entr['embedding'] = emb

    return entries

In [5]:
# Name of the Elasticsearch index that all documents are written to.
index_name = "arxiv"


def prepare_data_for_insert(objects: list[dict]):
    """Wrap each document in a bulk-action dict.

    Args:
        objects: Documents to index; each must provide an ``'id'`` key.

    Returns:
        A new list with one action per document, holding ``_index``
        (``index_name``), ``_id`` (the document's ``'id'``) and ``_source``
        (the document itself, not a copy).
    """
    def as_action(document: dict) -> dict:
        return {
            "_index": index_name,
            "_id": document['id'],
            "_source": document,
        }

    return [as_action(document) for document in objects]

In [6]:
def bulk_provider(batch_size: int = 32, log_every: int = 10000):
    """Yield bulk actions for every record of ``line_iterator``, batch by batch.

    Records are pulled from ``line_iterator`` in groups of ``batch_size``,
    passed through ``process_embeddings`` and ``prepare_data_for_insert``,
    and the resulting actions are yielded one at a time. A progress line is
    printed every ``log_every`` yielded actions.

    Args:
        batch_size: Number of records embedded per model call (default 32).
        log_every: Print a progress message every this many actions
            (default 10000).

    Yields:
        dict: one bulk action per input record.

    Raises:
        ValueError: If ``batch_size`` or ``log_every`` is smaller than 1.
    """
    # Function-scope import so this cell does not depend on another cell's imports.
    from itertools import islice

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if log_every < 1:
        raise ValueError("log_every must be at least 1")

    counter = 0
    while True:
        batch = list(islice(line_iterator, batch_size))

        # Input exhausted exactly on a batch boundary (or empty input):
        # stop here instead of sending an empty batch to the embedding model.
        if not batch:
            break

        actions = prepare_data_for_insert(process_embeddings(batch))

        for action in actions:
            counter += 1
            if counter % log_every == 0:
                print(f'Yielded {counter} results...')
            yield action

        # A short batch means the iterator ran dry while filling it.
        if len(batch) < batch_size:
            break


provider = bulk_provider()

In [7]:
import os
from pathlib import Path

from elasticsearch import Elasticsearch


def _load_api_key(env_var="ES_LOCAL_API_KEY", env_file="./elastic-start-local/.env"):
    """Return the Elasticsearch API key without hard-coding it in the notebook.

    The environment variable ``env_var`` takes precedence. Otherwise the
    ``KEY=VALUE`` lines of ``env_file`` (the .env file generated by the
    start-local script) are searched for the same name.

    Raises:
        RuntimeError: If the key is found in neither place.
    """
    from_environment = os.environ.get(env_var)
    if from_environment:
        return from_environment

    env_path = Path(env_file)
    if env_path.is_file():
        for line in env_path.read_text(encoding="utf-8").splitlines():
            name, separator, value = line.partition("=")
            if separator and name.strip() == env_var:
                # Tolerate optional quoting around the value.
                return value.strip().strip('"').strip("'")

    raise RuntimeError(
        f"Elasticsearch API key not found: set {env_var} or provide it in {env_file}"
    )


# Credentials are looked up at runtime so that no secret is stored in this file.
client = Elasticsearch("http://localhost:9200/", api_key=_load_api_key())  # localhost

In [8]:
# Create the index only when it does not exist yet.
# NOTE(review): no explicit mapping is passed to `create`, so the type of the `embedding`
# field used by the knn query further down is left to Elasticsearch — confirm this is intended.
if not client.indices.exists(index=index_name):
    client.indices.create(index=index_name)

In [21]:
# Delete the index if it exists.
# NOTE(review): this cell is numbered In [21], i.e. it appears to have been executed out of
# order; it sits between index creation and the bulk import, so skip it when running the
# notebook top to bottom unless a reset is intended — confirm.
if client.indices.exists(index=index_name):
    client.indices.delete(index=index_name)

In [9]:
from elasticsearch import helpers
from elasticsearch.helpers import BulkIndexError

# Feed every action produced by `provider` to Elasticsearch in bulk. A bulk
# failure is reported by printing each collected error instead of propagating.
try:
    helpers.bulk(client, provider)
except BulkIndexError as bulk_failure:
    print("Bulk indexing failed with the following errors:")
    for failed_item in bulk_failure.errors:
        print(failed_item)


Yielded 10000 results...
Yielded 20000 results...
Yielded 30000 results...
Yielded 40000 results...
Yielded 50000 results...
Yielded 60000 results...
Yielded 70000 results...
Yielded 80000 results...
Yielded 90000 results...
Yielded 100000 results...
Yielded 110000 results...
Yielded 120000 results...
Yielded 130000 results...
Yielded 140000 results...
Yielded 150000 results...
Yielded 160000 results...
Yielded 170000 results...
Yielded 180000 results...
Yielded 190000 results...
Yielded 200000 results...
Yielded 210000 results...
Yielded 220000 results...
Yielded 230000 results...
Yielded 240000 results...
Yielded 250000 results...
Yielded 260000 results...
Yielded 270000 results...
Yielded 280000 results...
Yielded 290000 results...
Yielded 300000 results...
Yielded 310000 results...
Yielded 320000 results...
Yielded 330000 results...
Yielded 340000 results...
Yielded 350000 results...
Yielded 360000 results...
Yielded 370000 results...
Yielded 380000 results...
Yielded 390000 result

---

In [10]:
# Embed a free-text query; the first element of the encoder output is kept as the query vector.
s = "AI agents working together to solve complex tasks"
emb = model.encode(s)[0]
# Notebook display: show the vector's shape and its values.
emb.shape, emb

((768,),
 array([-5.8350e-02, -1.3351e-02, -9.9976e-02,  5.4840e-02,  5.9082e-02,
        -6.1310e-02, -2.5360e-02, -1.0170e-02,  7.1350e-02,  5.5328e-02,
        -9.7412e-02, -7.7087e-02, -3.9795e-02,  1.0750e-02, -4.5929e-02,
         1.0480e-01,  1.1548e-01,  2.9221e-02,  4.2816e-02,  2.2171e-02,
         1.5480e-02,  3.1372e-02,  3.4180e-02, -7.5134e-02,  3.6987e-02,
         1.3054e-02,  9.1492e-02, -1.2398e-02,  5.1689e-03,  3.4149e-02,
        -4.2450e-02, -1.0229e-01, -2.0233e-02,  1.1528e-02,  9.7961e-02,
         6.1989e-03,  2.1973e-02,  5.8197e-02, -2.3712e-02, -6.4659e-03,
        -2.6722e-03, -5.7892e-02,  1.0223e-03,  3.0243e-02, -6.8970e-02,
         4.4708e-02,  3.7811e-02,  4.5532e-02, -6.1646e-02,  5.0323e-02,
         1.3901e-02,  2.5391e-02,  3.2562e-02,  2.0676e-02,  4.4159e-02,
        -1.2238e-02, -4.4647e-02,  4.8157e-02,  5.2032e-02, -7.6485e-03,
         8.4839e-03, -4.3907e-03,  2.1179e-02, -1.6129e-02, -5.3558e-02,
        -5.7190e-02,  2.1744e-02, -3.8385e

In [11]:
# Run a knn query against the `embedding` field of the index, using `emb` as the query vector.
response = client.search(
    index=index_name,
    query={
        "knn": {
            "field": "embedding",
            "query_vector": emb
        }
    }
)

In [12]:
# Print one line per hit: the score with five decimals, then the document title.
for hit in response['hits']['hits']:
    score = hit["_score"]
    title = hit["_source"]["title"]
    print(f'{score:.5f}: {title}')

0.87402: Large Language Model-Driven Cross-Domain Orchestration Using Multi-Agent Workflow
0.87323: Towards Collaborative Question Answering: A Preliminary Study
0.86929: AutoAgents: A Framework for Automatic Agent Generation
0.86870: Automated Task-Time Interventions to Improve Teamwork using Imitation Learning
0.86700: A Cordial Sync: Going Beyond Marginal Policies for Multi-Agent Embodied Tasks
0.86674: Autonomous Agents for Collaborative Task under Information Asymmetry
0.86560: Coordinating metaheuristic agents with swarm intelligence
0.86504: BMW Agents -- A Framework For Task Automation Through Multi-Agent Collaboration
0.86449: Iris: A Conversational Agent for Complex Tasks
0.86244: Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks
