# Import Dataset from kaggle

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("Cornell-University/arxiv")

print("Path to dataset files:", path)

Path to dataset files: /Users/leopard1566/.cache/kagglehub/datasets/Cornell-University/arxiv/versions/222


# Read into Pandas DataFrame

In [2]:
import pandas as pd
df = pd.read_json(path+'/arxiv-metadata-oai-snapshot.json', lines=True)
print(df.head())

          id           submitter  \
0  0704.0001      Pavel Nadolsky   
1  0704.0002        Louis Theran   
2  0704.0003         Hongjun Pan   
3  0704.0004        David Callan   
4  0704.0005  Alberto Torchinsky   

                                             authors  \
0  C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...   
1                    Ileana Streinu and Louis Theran   
2                                        Hongjun Pan   
3                                       David Callan   
4           Wael Abu-Shammala and Alberto Torchinsky   

                                               title  \
0  Calculation of prompt diphoton production cros...   
1           Sparsity-certifying Graph Decompositions   
2  The evolution of the Earth-Moon system based o...   
3  A determinant of Stirling cycle numbers counts...   
4  From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...   

                                  comments  \
0  37 pages, 15 figures; published version   
1    To appear in

# Check for NAN in categories

In [3]:
df = df[df['categories'].notna()]
print(df.head())

          id           submitter  \
0  0704.0001      Pavel Nadolsky   
1  0704.0002        Louis Theran   
2  0704.0003         Hongjun Pan   
3  0704.0004        David Callan   
4  0704.0005  Alberto Torchinsky   

                                             authors  \
0  C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...   
1                    Ileana Streinu and Louis Theran   
2                                        Hongjun Pan   
3                                       David Callan   
4           Wael Abu-Shammala and Alberto Torchinsky   

                                               title  \
0  Calculation of prompt diphoton production cros...   
1           Sparsity-certifying Graph Decompositions   
2  The evolution of the Earth-Moon system based o...   
3  A determinant of Stirling cycle numbers counts...   
4  From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...   

                                  comments  \
0  37 pages, 15 figures; published version   
1    To appear in

# Import Vector Database and Model

In [4]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

# Create vector database client and model client

In [20]:
model_encoder = SentenceTransformer('all-MiniLM-L6-v2')
qdrant_client = QdrantClient(":memory:")

Create collection to store 

In [28]:
qdrant_client.recreate_collection(
    collection_name = "arxiv",
    vectors_config = models.VectorParams(
        #Note that for our model, this vector size is 384
        size=model_encoder.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE
    )
)
if qdrant_client.collection_exists(collection_name="arxiv"):
    print("arxiv collection created successfully")

arxiv collection created successfully


  qdrant_client.recreate_collection(


# Insert quadrant data into collection
- Note to self: qdrant_client.upsert is used for bulk upload of points, while qdrant_client.insert is used for single point upload

- Before uploading the vectorized data, we load the dataframe into a key value pair dictionary to enable convenient iteration.

In [50]:
#Convert the WHOLE dataframe to a dictionary
#data_in_dict = df.to_dict(orient='records')

#Convert sample of the dataframe to a dictionary
data_in_dict = df.head(5000).to_dict(orient='records')

qdrant_client.upload_points(
    collection_name = "arxiv",
    points=[
        models.PointStruct(
            id = index,
            payload=doc,
            vector = model_encoder.encode(doc["abstract"]),
        )for index, doc in enumerate(data_in_dict)
    ]
)

# Query

In [None]:
user_prompt = "I want a research paper that focuses on Graphics"
limit_search_to = 10
#This will later be read from the user when the app is running on flask

query_prompt = "You are an AI agent searching for arXiv papers based on the following instructions: "+user_prompt

In [67]:
#Search based on user prompt
hits = qdrant_client.search(
    collection_name = "arxiv",
    query_vector=model_encoder.encode(query_prompt),
    limit=limit_search_to
)
#Print Queried Outputs
for hit in hits:
    print("TITLE: "+hit.payload["title"]+'\n\n', "arXivID: "+hit.payload["id"]+'\n\n', "AUTHORS: "+hit.payload["authors"]+'\n\n', "ABSTRACT: "+hit.payload["abstract"][0:100]+ '...' + '\n\n', '============break==========\n')


TITLE: Generating Unexpected Spin Echoes in Dipolar Solids with Pi Pulses

 arXivID: 0705.0620

 AUTHORS: Dale Li, A. E. Dementyev, Yanqun Dong, R. G. Ramos, and S. E. Barrett

 ABSTRACT:   This submission has been withdrawn by arXiv administrators because it is a
duplicate of 0705.0667....


TITLE: Metal and molecule cooling in simulations of structure formation

 arXivID: 0704.2186

 AUTHORS: U. Maio, K. Dolag, B. Ciardi, L. Tornatore

 ABSTRACT:   This submission has been withdrawn by arXiv administrators because it is a
duplicate of 0704.2182....


TITLE: Gravity-induced electric polarization of matter and planetary magnetic
  fields

 arXivID: 0704.0374

 AUTHORS: Boris A. Zon, Igor Yu. Kretinin

 ABSTRACT:   This paper has been withdrawn due to copyright reasons.
...


TITLE: Optimal time evolution in (non)hermitian quantum mechanics

 arXivID: 0704.3677

 AUTHORS: Pulak Ranjan Giri

 ABSTRACT:   This paper has been withdrawn by the author
...


TITLE: A new proof of the Beez-Car

  hits = qdrant_client.search(
