In [1]:
!pip -q install chromadb pandas scikit-learn

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m87.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m87.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m92.3 MB/s[0m eta [36m0:00:

In [2]:
# For setting up the vector db
import chromadb

# For working with our data files
import os
import glob

# For handling our data
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Miscellaneous
import time # to see how long code processes take to run
import textwrap # to pretty print abstracts

In [3]:
!gdown 1g3K-wlixFxklTSUQNZKpEgN4WNTFTPIZ
# https://drive.google.com/file/d/1g3K-wlixFxklTSUQNZKpEgN4WNTFTPIZ/view
# https://alex.macrocosm.so/download


Downloading...
From (original): https://drive.google.com/uc?id=1g3K-wlixFxklTSUQNZKpEgN4WNTFTPIZ
From (redirected): https://drive.google.com/uc?id=1g3K-wlixFxklTSUQNZKpEgN4WNTFTPIZ&confirm=t&uuid=28606c64-240a-4c0e-8e9a-8189041d1956
To: /content/arxiv_abstracts.zip
100% 7.55G/7.55G [01:23<00:00, 90.2MB/s]


In [4]:
!unzip arxiv_abstracts.zip

Archive:  arxiv_abstracts.zip
   creating: arxiv_abstracts/
  inflating: __MACOSX/._arxiv_abstracts  
  inflating: arxiv_abstracts/abstracts_21.parquet  
  inflating: arxiv_abstracts/abstracts_1.parquet  
  inflating: arxiv_abstracts/abstracts_8.parquet  
  inflating: arxiv_abstracts/abstracts_17.parquet  
  inflating: arxiv_abstracts/verifyResults.py  
  inflating: __MACOSX/arxiv_abstracts/._verifyResults.py  
  inflating: arxiv_abstracts/abstracts_16.parquet  
  inflating: arxiv_abstracts/abstracts_9.parquet  
  inflating: arxiv_abstracts/abstracts_20.parquet  
  inflating: arxiv_abstracts/abstracts_14.parquet  
  inflating: arxiv_abstracts/abstracts_22.parquet  
  inflating: arxiv_abstracts/abstracts_2.parquet  
  inflating: arxiv_abstracts/abstracts_23.parquet  
  inflating: arxiv_abstracts/abstracts_3.parquet  
  inflating: arxiv_abstracts/params.txt  
  inflating: __MACOSX/arxiv_abstracts/._params.txt  
  inflating: arxiv_abstracts/abstracts_15.parquet  
  inflating: arxiv_abstra

In [5]:
def load_parquet_file(file_path: str) -> pd.DataFrame:
    return pd.read_parquet(file_path)

data_df = None
i = 0
for file_path in glob.glob(os.path.join('arxiv_abstracts/', 'abstracts_*.parquet')):
    print('Loading data from {}'.format(file_path))
    file_df = load_parquet_file(file_path)

    if data_df is None:
        # initialize the dataframe
        data_df = file_df
    else:
        # append the new data to our dataframe
        data_df = pd.concat([data_df, file_df], ignore_index=True)

    # Use this to limit the amount of data
    # Each file contains 100,000 rows
    i += 1
    if i >= 5:
        break

# A little data prep
data_df = data_df.rename(columns={'embeddings': 'embedding'})
data_df['id'] = data_df.index.astype(str)

Loading data from arxiv_abstracts/abstracts_13.parquet
Loading data from arxiv_abstracts/abstracts_9.parquet
Loading data from arxiv_abstracts/abstracts_22.parquet
Loading data from arxiv_abstracts/abstracts_12.parquet
Loading data from arxiv_abstracts/abstracts_18.parquet


In [6]:
data_df.head()

Unnamed: 0,abstract,embedding,doi,id
0,In this paper we prove quantitative results ...,"[-0.011698227, 0.0066467193, -0.010206703, -0....",1911.01406,0
1,This paper considers an energy harvesting se...,"[0.03446202, 0.026488738, -0.016999641, 0.0227...",1911.01407,1
2,We determine the generic complete eigenstruc...,"[0.0042488296, -0.019051258, 0.0052831694, -0....",1911.01408,2
3,Coronary artery bypass grafts (CABG) surgery...,"[-0.020915495, 0.015672468, 0.040892176, -0.01...",1911.01409,3
4,We establish that the spin-3/2 AKLT model on...,"[-0.018369809, -0.04366875, -0.005743583, -0.0...",1911.0141,4


In [7]:
user_liked_paper = data_df[data_df['doi'] == '1911.01406'].iloc[0].to_dict()
print(f"abstract: \n{user_liked_paper['abstract']}")

abstract: 
  In this paper we prove quantitative results about geodesic approximations to
submanifolds in negatively curved spaces. Among the main tools is a new and
general Jarn\'{i}k-Besicovitch type theorem in Diophantine approximation. The
framework we develop is flexible enough to treat manifolds of variable negative
curvature, a variety of geometric targets, and logarithm laws as well as
spiraling phenomena in both measure and dimension aspect. Several of the
results are new also for manifolds of constant negative sectional curvature. We
further establish a large intersection property of Falconer in this context.



In [8]:
def brute_force_search(user_liked_paper, data_df, num_results=5):
    start_time = time.time()

    data_df['similarity'] = data_df['embedding'].apply(
		    lambda x: cosine_similarity([user_liked_paper['embedding']], [x])[0][0]
	  )

    # Make sure the results don't contain the same paper as the query
    search_results = data_df[data_df['doi'] != user_liked_paper['doi']].sort_values(
		    by='similarity',
		    ascending=False
	  ).head(num_results)

    print(f"Time taken to brute force search: {round(time.time() - start_time, 3)} seconds")

    # Select and reorder columns, reset index
    search_results = search_results[["id", "abstract", "doi"]].reset_index(
		    drop=True
	  )

    return search_results

search_results = brute_force_search(user_liked_paper, data_df, num_results=5)
display(search_results)

Time taken to brute force search: 220.534 seconds


Unnamed: 0,id,abstract,doi
0,290914,Let M be a geometrically finite pinched nega...,math/9909131
1,173296,We present examples of geometrically finite ...,1707.08264
2,156253,Let $\pi: \mathcal{X}^* \rightarrow B^*$ be ...,1706.01518
3,288238,We prove a priori bounds for the trace of th...,math/9807130
4,418572,Comparison theorems are foundational to our ...,2209.12857


In [9]:
db_client = chromadb.PersistentClient(path="chroma_db")

In [10]:
collection_name = "paper_abstracts"

if collection_name in db_client.list_collections():
    db_client.delete_collection(name=collection_name)
    print('Dropped existing collection to remake')

abstract_collection = db_client.create_collection(
    name=collection_name,
    # define the distance metric to use when comparing vectors in this collection
    metadata={"hnsw:space": "cosine"})

In [None]:
batch_size = 1000
for i in range(0, len(data_df), batch_size):
    data_batch = data_df.iloc[i:i+batch_size]
    abstract_collection.add(
        ids=data_batch['id'].tolist(),
        embeddings=data_batch['embedding'].tolist(),
        documents=data_batch['abstract'].tolist(),
        metadatas=data_batch[['doi']].to_dict(orient='records')
    )

In [12]:
def vector_db_search(user_liked_paper, collection_name, num_results):
    start_time = time.time()

    search_results = abstract_collection.query(
        query_embeddings=[user_liked_paper['embedding']],
        n_results=num_results,
        # Make sure the results don't contain the same paper as the query
        where={"doi": {"$ne": user_liked_paper['doi']}}
    )

    print(f"Time taken to search vector database: {round(time.time() - start_time, 3)} seconds")

    # Create a pandas dataframe from the search results
    search_results_df = pd.DataFrame(search_results["ids"][0], columns=["id"])
    search_results_df["document"] = search_results["documents"][0]
    search_results_df["doi"] = [meta["doi"] for meta in search_results["metadatas"][0]]
    search_results_df = search_results_df.rename(columns={"document": "abstract"})

    return search_results_df


search_results_df = vector_db_search(user_liked_paper, "article_abstracts", 5)
display(search_results_df)

Time taken to search vector database: 1.439 seconds


Unnamed: 0,id,abstract,doi
0,290914,Let M be a geometrically finite pinched nega...,math/9909131
1,173296,We present examples of geometrically finite ...,1707.08264
2,156253,Let $\pi: \mathcal{X}^* \rightarrow B^*$ be ...,1706.01518
3,288238,We prove a priori bounds for the trace of th...,math/9807130
4,241901,We start by constructing a Hilbert manifold ...,math-ph/9812002
