In [None]:
# Note: This notebook creates 256,000 vectors

In [None]:
# tiktoken is used further down to count tokens (cl100k_base encoding) for the cost estimate.
!pip install tiktoken

In [None]:
#!pip install -U weaviate-client
# Pin the client to the 3.x series: the cells below use the v3-style API
# (weaviate.Client, client.schema, client.batch, client.query).
!pip install "weaviate-client==3.*" # Version 4 of the client does not work

In [None]:
import pandas as pd
import numpy as np
import os

import json
import re

In [None]:
# Connection settings and credentials.
# Secrets are read from environment variables so they never have to be saved
# in the notebook. If a variable is not set, the original placeholder value is
# used, so editing the placeholders in place still works.
WCS_ENDPOINT = os.environ.get("WCS_ENDPOINT", "https://my-sandbox1-1486fdzz.weaviate.network/") # Weaviate
WCS_API_KEY = os.environ.get("WCS_API_KEY", "YOUR-API-KEY") # Weaviate
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", 'YOUR-API-KEY')
COHERE_API_KEY = os.environ.get("COHERE_API_KEY", 'YOUR-API-KEY')

# Name of the Weaviate class (collection) that is created and queried below.
VECTOR_DB_NAME = 'ARXIV_100SAMPLE_VDB'
# Number of objects sent to Weaviate per batch during the upload.
BATCH_SIZE = 100

#SAMPLE_SIZE = 500

In [None]:
# You can access each paper directly on ArXiv using these links:
# https://arxiv.org/abs/{id}: Page for this paper including its abstract and further links
# https://arxiv.org/pdf/{id}: Direct link to download the PDF

In [None]:
# https://arxiv.org/abs/0704.1020

In [None]:
# List the files in the arXiv dataset input directory.
# NOTE(review): this uses a relative path while the loading cell uses the
# absolute '/kaggle/input/arxiv/...' path; presumably both point to the same
# directory when run on Kaggle -- confirm.
os.listdir('../input/arxiv')

In [None]:
# All arXiv category codes, mapped to a human-readable description.
# Source: https://www.kaggle.com/code/artgor/arxiv-metadata-exploration
#
# Official taxonomy references:
# https://arxiv.org/category_taxonomy
# https://info.arxiv.org/help/api/user-manual.html#subject_classifications
#
# The value 'Not available' marks a code for which no description was found;
# get_cat_text() leaves such codes out of the generated category text.

category_map = {
# These codes were added because they created errors (missing keys) when
# mapping categories to descriptions.
'acc-phys': 'Accelerator Physics',
'adap-org': 'Not available',
'q-bio': 'Not available',
'cond-mat': 'Not available',
'chao-dyn': 'Not available',
'patt-sol': 'Not available',
'dg-ga': 'Not available',
'solv-int': 'Not available',
'bayes-an': 'Not available',
'comp-gas': 'Not available',
'alg-geom': 'Not available',
'funct-an': 'Not available',
'q-alg': 'Not available',
'ao-sci': 'Not available',
'atom-ph': 'Atomic Physics',
'chem-ph': 'Chemical Physics',
'plasm-ph': 'Plasma Physics',
'mtrl-th': 'Not available',
'cmp-lg': 'Not available',
'supr-con': 'Not available',
###

# Added: codes that were missing from the source list
'econ.GN': 'General Economics', 
'econ.TH': 'Theoretical Economics', 
'eess.SY': 'Systems and Control', 

# Codes taken from the source list
'astro-ph': 'Astrophysics',
'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
'astro-ph.EP': 'Earth and Planetary Astrophysics',
'astro-ph.GA': 'Astrophysics of Galaxies',
'astro-ph.HE': 'High Energy Astrophysical Phenomena',
'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
'astro-ph.SR': 'Solar and Stellar Astrophysics',
'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
'cond-mat.mtrl-sci': 'Materials Science',
'cond-mat.other': 'Other Condensed Matter',
'cond-mat.quant-gas': 'Quantum Gases',
'cond-mat.soft': 'Soft Condensed Matter',
'cond-mat.stat-mech': 'Statistical Mechanics',
'cond-mat.str-el': 'Strongly Correlated Electrons',
'cond-mat.supr-con': 'Superconductivity',
'cs.AI': 'Artificial Intelligence',
'cs.AR': 'Hardware Architecture',
'cs.CC': 'Computational Complexity',
'cs.CE': 'Computational Engineering, Finance, and Science',
'cs.CG': 'Computational Geometry',
'cs.CL': 'Computation and Language',
'cs.CR': 'Cryptography and Security',
'cs.CV': 'Computer Vision and Pattern Recognition',
'cs.CY': 'Computers and Society',
'cs.DB': 'Databases',
'cs.DC': 'Distributed, Parallel, and Cluster Computing',
'cs.DL': 'Digital Libraries',
'cs.DM': 'Discrete Mathematics',
'cs.DS': 'Data Structures and Algorithms',
'cs.ET': 'Emerging Technologies',
'cs.FL': 'Formal Languages and Automata Theory',
'cs.GL': 'General Literature',
'cs.GR': 'Graphics',
'cs.GT': 'Computer Science and Game Theory',
'cs.HC': 'Human-Computer Interaction',
'cs.IR': 'Information Retrieval',
'cs.IT': 'Information Theory',
'cs.LG': 'Machine Learning',
'cs.LO': 'Logic in Computer Science',
'cs.MA': 'Multiagent Systems',
'cs.MM': 'Multimedia',
'cs.MS': 'Mathematical Software',
'cs.NA': 'Numerical Analysis',
'cs.NE': 'Neural and Evolutionary Computing',
'cs.NI': 'Networking and Internet Architecture',
'cs.OH': 'Other Computer Science',
'cs.OS': 'Operating Systems',
'cs.PF': 'Performance',
'cs.PL': 'Programming Languages',
'cs.RO': 'Robotics',
'cs.SC': 'Symbolic Computation',
'cs.SD': 'Sound',
'cs.SE': 'Software Engineering',
'cs.SI': 'Social and Information Networks',
'cs.SY': 'Systems and Control',
'econ.EM': 'Econometrics',             
'eess.AS': 'Audio and Speech Processing',
'eess.IV': 'Image and Video Processing',
'eess.SP': 'Signal Processing',               
'gr-qc': 'General Relativity and Quantum Cosmology',
'hep-ex': 'High Energy Physics - Experiment',
'hep-lat': 'High Energy Physics - Lattice',
'hep-ph': 'High Energy Physics - Phenomenology',
'hep-th': 'High Energy Physics - Theory',
'math.AC': 'Commutative Algebra',
'math.AG': 'Algebraic Geometry',
'math.AP': 'Analysis of PDEs',
'math.AT': 'Algebraic Topology',
'math.CA': 'Classical Analysis and ODEs',
'math.CO': 'Combinatorics',
'math.CT': 'Category Theory',
'math.CV': 'Complex Variables',
'math.DG': 'Differential Geometry',
'math.DS': 'Dynamical Systems',
'math.FA': 'Functional Analysis',
'math.GM': 'General Mathematics',
'math.GN': 'General Topology',
'math.GR': 'Group Theory',
'math.GT': 'Geometric Topology',
'math.HO': 'History and Overview',
'math.IT': 'Information Theory',
'math.KT': 'K-Theory and Homology',
'math.LO': 'Logic',
'math.MG': 'Metric Geometry',
'math.MP': 'Mathematical Physics',
'math.NA': 'Numerical Analysis',
'math.NT': 'Number Theory',
'math.OA': 'Operator Algebras',
'math.OC': 'Optimization and Control',
'math.PR': 'Probability',
'math.QA': 'Quantum Algebra',
'math.RA': 'Rings and Algebras',
'math.RT': 'Representation Theory',
'math.SG': 'Symplectic Geometry',
'math.SP': 'Spectral Theory',
'math.ST': 'Statistics Theory',
'math-ph': 'Mathematical Physics',
'nlin.AO': 'Adaptation and Self-Organizing Systems',
'nlin.CD': 'Chaotic Dynamics',
'nlin.CG': 'Cellular Automata and Lattice Gases',
'nlin.PS': 'Pattern Formation and Solitons',
'nlin.SI': 'Exactly Solvable and Integrable Systems',
'nucl-ex': 'Nuclear Experiment',
'nucl-th': 'Nuclear Theory',
'physics.acc-ph': 'Accelerator Physics',
'physics.ao-ph': 'Atmospheric and Oceanic Physics',
'physics.app-ph': 'Applied Physics',
'physics.atm-clus': 'Atomic and Molecular Clusters',
'physics.atom-ph': 'Atomic Physics',
'physics.bio-ph': 'Biological Physics',
'physics.chem-ph': 'Chemical Physics',
'physics.class-ph': 'Classical Physics',
'physics.comp-ph': 'Computational Physics',
'physics.data-an': 'Data Analysis, Statistics and Probability',
'physics.ed-ph': 'Physics Education',
'physics.flu-dyn': 'Fluid Dynamics',
'physics.gen-ph': 'General Physics',
'physics.geo-ph': 'Geophysics',
'physics.hist-ph': 'History and Philosophy of Physics',
'physics.ins-det': 'Instrumentation and Detectors',
'physics.med-ph': 'Medical Physics',
'physics.optics': 'Optics',
'physics.plasm-ph': 'Plasma Physics',
'physics.pop-ph': 'Popular Physics',
'physics.soc-ph': 'Physics and Society',
'physics.space-ph': 'Space Physics',
'q-bio.BM': 'Biomolecules',
'q-bio.CB': 'Cell Behavior',
'q-bio.GN': 'Genomics',
'q-bio.MN': 'Molecular Networks',
'q-bio.NC': 'Neurons and Cognition',
'q-bio.OT': 'Other Quantitative Biology',
'q-bio.PE': 'Populations and Evolution',
'q-bio.QM': 'Quantitative Methods',
'q-bio.SC': 'Subcellular Processes',
'q-bio.TO': 'Tissues and Organs',
'q-fin.CP': 'Computational Finance',
'q-fin.EC': 'Economics',
'q-fin.GN': 'General Finance',
'q-fin.MF': 'Mathematical Finance',
'q-fin.PM': 'Portfolio Management',
'q-fin.PR': 'Pricing of Securities',
'q-fin.RM': 'Risk Management',
'q-fin.ST': 'Statistical Finance',
'q-fin.TR': 'Trading and Market Microstructure',
'quant-ph': 'Quantum Physics',
'stat.AP': 'Applications',
'stat.CO': 'Computation',
'stat.ME': 'Methodology',
'stat.ML': 'Machine Learning',
'stat.OT': 'Other Statistics',
'stat.TH': 'Statistics Theory'
}

In [None]:
# Category codes to keep, each paired with its description.
# Papers are later filtered against the codes listed here.

selected_cats_dict = dict([
    ('cs.AI', 'Artificial Intelligence'),
    ('cs.CV', 'Computer Vision and Pattern Recognition'),
    ('cs.ET', 'Emerging Technologies'),
    ('cs.MA', 'Multiagent Systems'),
    ('cs.LG', 'Machine Learning'),
    ('cs.RO', 'Robotics'),
    ('eess.AS', 'Audio and Speech Processing'),
    ('eess.IV', 'Image and Video Processing'),
    ('eess.SP', 'Signal Processing'),
    ('stat.ML', 'Machine Learning'),
])

# Just the codes, in the order they were declared above.
selected_cats_list = list(selected_cats_dict)

selected_cats_list

## Load the Arxiv metadata

In [None]:
# Loading approach adapted from:
# https://www.kaggle.com/code/matthewmaddock/nlp-arxiv-dataset-transformers-and-umap

# Read every record of the metadata snapshot (one JSON document per line)
# and keep only the fields listed in `cols`.

file_name = '/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json'
cols = ['id', 'title', 'abstract', 'categories']

# NOTE(review): the file is decoded as latin-1; confirm this matches the
# snapshot's actual encoding, otherwise non-ASCII characters may be garbled.
with open(file_name, encoding='latin-1') as f:
    data = [[doc[col] for col in cols] for doc in map(json.loads, f)]

df = pd.DataFrame(data=data, columns=cols)

df.head()

In [None]:
# (rows, columns) of the full metadata table loaded above.
df.shape

In [None]:
# Convert the category codes into text
# If a description was not available for a category code then 
# I left it out when converting category codes into text.

def get_cat_text(x, mapping=None):
    """Convert a space-separated string of category codes into readable text.

    Args:
        x: Category codes separated by whitespace, e.g. 'cs.AI stat.ML'.
        mapping: Optional dict of code -> description. Defaults to the
            notebook-level `category_map`.

    Returns:
        The descriptions of all codes joined with ', ', in their original
        order. Codes whose description is 'Not available', and codes that are
        missing from the mapping, are left out. Returns '' if nothing remains.
    """
    if mapping is None:
        mapping = category_map

    # Unknown codes are treated like codes with no description instead of
    # raising KeyError.
    names = [mapping.get(code, 'Not available') for code in x.split()]

    # Joining only the kept names avoids the leading ', ' that appeared when
    # the first code had no description.
    cat_text = ', '.join(name for name in names if name != 'Not available')

    # Remove leading and trailing spaces
    return cat_text.strip()
    

# Add a 'cat_text' column holding the readable category text for every paper.
df['cat_text'] = df['categories'].apply(get_cat_text)

df.head()

In [None]:
# Put the cat codes into a list

def get_cat_codes(x):
    """Split a space-separated string of category codes into a list of codes."""
    return x.split(' ')

# Add a 'code_list' column: the category codes of each paper as a Python list.
df['code_list'] = df['categories'].apply(get_cat_codes)

print(df.shape)

df.head()

In [None]:
# Flag the papers that carry at least one of the selected category codes

def filter_by_code(x_list, selected=None):
    """Report whether any code in `x_list` is one of the selected codes.

    The previous version returned inside the first loop iteration, so only the
    first code of each paper was ever checked. All codes are checked now, so
    papers whose selected code is not listed first are also kept.

    Args:
        x_list: List of category codes for one paper.
        selected: Optional iterable of codes to match against. Defaults to
            the notebook-level `selected_cats_list`.

    Returns:
        'yes' if at least one code is selected, otherwise 'no' (including
        for an empty list).
    """
    if selected is None:
        selected = selected_cats_list

    wanted = set(selected)

    return 'yes' if any(item in wanted for item in x_list) else 'no'
        
# Add a 'selected_cats' column with the filter_by_code() result for each paper.
df['selected_cats'] = df['code_list'].apply(filter_by_code)

df.head()

In [None]:
# Keep only the rows flagged 'yes' and renumber them from 0.
is_selected = df['selected_cats'] == 'yes'
df_filtered = df.loc[is_selected].reset_index(drop=True)

print(df_filtered.shape)

df_filtered.head()

In [None]:
# Show the id, title and abstract of one filtered paper (row i).
i = 0

for field in ('id', 'title', 'abstract'):
    print(df_filtered.loc[i, field])

## Clean the text

In [None]:
# Replace newline characters ('\n') with a space
# Remove leading and trailing spaces

def clean_text(x):
    """Put a text field on a single line.

    Every newline character is replaced by one space, then leading and
    trailing whitespace is removed.
    """
    single_line = " ".join(x.split("\n"))
    return single_line.strip()

# Clean both text columns in place with clean_text().
for text_col in ('title', 'abstract'):
    df_filtered[text_col] = df_filtered[text_col].apply(clean_text)

df_filtered.head()

## Prepare the text for vectorizing

Here we will prepend the title to the abstract, separated by the literal marker ` {title} `.

In [None]:
# Build the text that will be vectorized: title, then the literal marker
# ' {title} ', then the abstract.
# NOTE(review): '{title}' is inserted verbatim (this is not an f-string or a
# template), so every prepared text contains the characters "{title}".
# Presumably an intentional separator -- confirm.
df_filtered['prepared_text'] = df_filtered['title'] + ' {title} ' + df_filtered['abstract']

df_filtered.head()

In [None]:
# Inspect the prepared text of the first paper.
df_filtered.loc[0, 'prepared_text']

In [None]:
# Optionally take a small sample of the data to reduce the cost of vectorizing.
# The slice is currently commented out, so ALL filtered papers are used.
# To work on a sample, define SAMPLE_SIZE in the config cell and restore
# the [0:SAMPLE_SIZE] slice below.

df_sample = df_filtered #[0:SAMPLE_SIZE]

# reset_index returns a new DataFrame numbered from 0.
df_sample = df_sample.reset_index(drop=True)

print(df_sample.shape)

df_sample.head()

## Get the total number of tokens

In [None]:
# Ref: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb

import tiktoken

# Function to calculate the number of tokens in a string
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that `string` produces under the named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name of a tiktoken encoding, e.g. "cl100k_base".

    Returns:
        The number of tokens in the encoded text.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))

In [None]:
# Example: count the tokens of a short string with the cl100k_base encoding
num_tokens_from_string("tiktoken is great!", "cl100k_base")

In [None]:
def get_num_tokens(x):
    """Return the number of cl100k_base tokens in the text `x`."""
    return num_tokens_from_string(x, "cl100k_base")

# Add a 'num_tokens' column with the token count of each prepared text.
df_sample['num_tokens'] = df_sample['prepared_text'].apply(get_num_tokens)

print(df_sample.shape)
# Total number of tokens across all papers that will be vectorized.
print(df_sample['num_tokens'].sum())

df_sample.head()

## Estimate the cost for creating the embeddings

In [None]:
# https://openai.com/pricing
# at 22-Feb-2024

# text-embedding-3-small	$0.00002 / 1K tokens # US Dollars
# text-embedding-3-large	$0.00013 / 1K tokens # US Dollars
# ada v2	$0.00010 / 1K tokens # US Dollars

In [None]:
# Estimated embedding cost = (total tokens / 1000) * price per 1K tokens.
cost_per_1000_tokens = 0.00002 #  text-embedding-3-small
total_tokens = df_sample['num_tokens'].sum()

total_cost = (total_tokens/1000) * cost_per_1000_tokens

summary_rows = (
    ('Num papers:', len(df_sample)),
    ('Total tokens:', total_tokens),
    ('Total cost (USD):', total_cost), # US Dollars
)
for label, value in summary_rows:
    print(label, value)

In [None]:
#qqq

## Create a list of dicts

In [None]:
# Example data format for upload to the vector database:
# a list of dicts, one dict per object, with property names as keys.
# This is only an illustration and is not uploaded.
# Note: this reassigns the name `data`, which earlier held the raw metadata rows
# (already copied into `df`).

data = [
   {
      "title": "Object0",
      "foo": 99, 
      "quote_text": "The quick brown fox jumps over the lazy dog."
   },
   {
      "title": "Object1",
      "foo": 77, 
      "quote_text": "A nimble red fox leaped over the sleeping hound."
   }
]
   

In [None]:
# Build the upload payload: one dict per paper with the keys
# 'arxiv_id', 'title', 'cat_text' and 'abstract'.
# Note that 'abstract' holds the prepared text (title + marker + abstract),
# not the raw abstract column.

df_final = df_sample.copy()

# Select the four source columns, rename them to the property names, and
# convert all rows at once. This replaces a per-row `.loc` loop: it is much
# faster on a large frame and does not depend on the index being 0..n-1.
data_list = (
    df_final[['id', 'title', 'cat_text', 'prepared_text']]
    .rename(columns={'id': 'arxiv_id', 'prepared_text': 'abstract'})
    .to_dict('records')
)

len(data_list)

In [None]:
# Inspect one prepared record (the fourth) to check the payload format.
data_list[3]

## Create the vector database

In [None]:
# Connect to the Weaviate instance (v3 client API).
# The OpenAI key is sent as a request header; the class created below is
# configured with the text2vec-openai vectorizer.

import weaviate
import json

client = weaviate.Client(
    url = WCS_ENDPOINT,  # Replace with your endpoint
    auth_client_secret=weaviate.auth.AuthApiKey(api_key=WCS_API_KEY),  # Replace w/ your Weaviate instance API key
    additional_headers = {
        "X-OpenAI-Api-Key": OPENAI_API_KEY  # Replace with your inference API key
    }
)

# Check that the client is ready before continuing.
client.is_ready()

In [None]:
# Configure the database.
# Vectorizer settings reference:
# https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules/text2vec-openai#api-settings-openai

# Reset the schema. CAUTION: this deletes the existing collection and its data.
if client.schema.exists(VECTOR_DB_NAME):
    client.schema.delete_class(VECTOR_DB_NAME)

# Embedding model settings for the text2vec-openai module.
# Alternatives that were left disabled in the original configuration:
#   "model": "ada", "modelVersion": "002"
#   "model": "text-embedding-3-large", "dimensions": 3072
openai_vectorizer_config = {
    "model": "text-embedding-3-small",
    "dimensions": 1536,
    "type": "text",
}

# Create a table in the database. Weaviate calls this table a "class".
class_obj = {
    # Name of the database table
    "class": VECTOR_DB_NAME,
    # If set to "none" you must always provide vectors yourself.
    # Could also be any other "text2vec-*" module.
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": openai_vectorizer_config,
        # Ensure the `generative-openai` module is used for generative queries
        "generative-openai": {},
    },
}

client.schema.create_class(class_obj)

## Upload the data to the vector database

In [None]:
# Show the batch size that will be used for the upload.
BATCH_SIZE

In [None]:
import requests
import json
from tqdm import tqdm

# Properties copied from each prepared record into the uploaded object.
PROPERTY_KEYS = ("arxiv_id", "title", "cat_text", "abstract")

# Upload all records in batches of BATCH_SIZE.
client.batch.configure(batch_size=BATCH_SIZE)

with client.batch as batch:
    for record in tqdm(data_list, total=len(data_list)):
        batch.add_data_object(
            data_object={key: record[key] for key in PROPERTY_KEYS},
            class_name=VECTOR_DB_NAME,
        )

In [None]:
# Check the number of objects stored in the VECTOR_DB_NAME class
# (an aggregate query that requests only the meta count).

response = (
    client.query
    .aggregate(VECTOR_DB_NAME)
    .with_meta_count()
    .do()
)

print(response)

## Example: Query the database

Here we will run a vector (near-text) search, then a hybrid search, and finally a hybrid search with reranking.

In [None]:
# Connect to the database for querying.
# This client sends a Cohere API key in addition to the OpenAI key.
# NOTE(review): the Cohere key is presumably needed by the rerank query
# further down -- confirm which module consumes it.

import weaviate
import json

wcs_client = weaviate.Client(
    url = WCS_ENDPOINT,  # Replace with your endpoint
    auth_client_secret=weaviate.auth.AuthApiKey(api_key=WCS_API_KEY),  # Replace w/ your Weaviate instance API key
    additional_headers = {
        "X-OpenAI-Api-Key": OPENAI_API_KEY, 
        "X-Cohere-Api-Key": COHERE_API_KEY
    }
)

# check that the client is ready
wcs_client.is_ready()

In [None]:
#query_text = "General purpose computers"
query_text = "Quantum computing"

# Vector (near-text) search: return the 5 objects closest to the query text.
# This uses `wcs_client`, the connection opened in this query section, like
# the other queries below. The earlier `client` only exists if the
# database-creation cells were run in the same session.
response = (
    wcs_client.query
    .get(VECTOR_DB_NAME, ["arxiv_id", "abstract", "cat_text", "title"])
    .with_near_text({"concepts": [query_text]})
    .with_limit(5)
    #.with_additional(["distance", "vector, id"]) # Also return the vector, the distance and the id
    .with_additional(["distance", "id"]) # This id is the Weaviate database id
    .do()
)

print(json.dumps(response, indent=4))

In [None]:
# Run a hybrid search for the same query text.
# alpha=0.5 is passed to with_hybrid(); up to 10 results are requested.
# NOTE(review): this requests `distance` in the additional properties; for
# hybrid queries the relevance field is presumably `score` -- confirm that
# `distance` is populated here.

response = (
    wcs_client.query
    .get(VECTOR_DB_NAME, ["arxiv_id", "abstract", "cat_text", "title"])
    .with_hybrid(query=query_text, alpha=0.5)
    .with_limit(10)
    #.with_additional(["distance", "vector, id"]) # Also return the vector, the distance and the id
    .with_additional(["distance", "id"]) # This id is the Weaviate database id
    .do()
)

print(json.dumps(response, indent=4))

In [None]:
# Include reranking after the hybrid search

# Build the rerank clause. json.dumps() wraps the query in double quotes and
# escapes any quotes or backslashes inside it, so a query containing such
# characters cannot break the generated GraphQL. For a plain query such as
# "Quantum computing" the resulting clause is the same as before.
rerank_clause = f"rerank(property: \"abstract\" query: {json.dumps(query_text)}) {{ score }}"

response = (
    wcs_client.query
    .get(VECTOR_DB_NAME, ["arxiv_id", "abstract", "cat_text", "title"])
    .with_hybrid(query=query_text, alpha=0.5)
    .with_additional(rerank_clause)
    .with_limit(10)
    .do()
)

print(json.dumps(response, indent=4))