In [None]:
!pip install langchain
!pip install tiktoken
!pip install transformers
!pip install chromadb
!pip install tqdm
!pip install sentence-transformers

Collecting langchain
  Downloading langchain-0.0.335-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.2-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langsmith<0.1.0,>=0.0.63 (from langchain)
  Downloading langsmith-0.0.63-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.3/45.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)
  Downloading marshmallow-3.20.1-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langch

In [None]:
import pandas as pd
from google.colab import drive
import os

### Mount the drive

In [None]:
# Mount Google Drive at /content/gdrive, then make the dataset folder the
# working directory so the pickle files can be read by bare file name.
drive.mount('/content/gdrive')
os.chdir("/content/gdrive/MyDrive/IR_Project/Standardized Dataset")

Mounted at /content/gdrive


### Load the data


In [None]:
# Load the two dataset variants from the current working directory (file names
# indicate one has newlines removed and the other keeps them).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df_no_newline = pd.read_pickle("arxiv_dataset_removed_newline_5000.pkl")
df_with_newline = pd.read_pickle("arxiv_dataset_5000.pkl")

### Split the text

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text(df, chunk_size=400, chunk_overlap=0):
    """Split every paper in ``df`` into token-bounded LangChain documents.

    Args:
        df: Table with a ``paper`` column (the text to split) and a
            ``metadata`` column (one metadata entry per paper, passed to
            ``create_documents`` as ``metadatas``).
        chunk_size: Maximum number of tiktoken tokens per chunk. Defaults to
            400, the value this notebook was originally run with.
        chunk_overlap: Number of tokens shared between consecutive chunks.
            Defaults to 0 (no overlap).

    Returns:
        The list of documents produced by
        ``RecursiveCharacterTextSplitter.create_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # An empty tuple stops tiktoken from raising when a paper happens to
        # contain text that looks like one of its special tokens.
        disallowed_special=(),
    )
    papers = df['paper'].tolist()
    metadata = df['metadata'].tolist()

    return text_splitter.create_documents(papers, metadatas=metadata)

In [None]:
%%time
# Chunk the newline-stripped papers (recorded wall time for this cell: 10min 38s).
texts_no_newline = split_text(df_no_newline)

CPU times: user 10min 23s, sys: 2.11 s, total: 10min 25s
Wall time: 10min 38s


In [None]:
%%time
# Chunk the papers that keep their newlines (recorded wall time for this cell: 5min 53s).
texts_with_newline = split_text(df_with_newline)

CPU times: user 5min 48s, sys: 1.37 s, total: 5min 49s
Wall time: 5min 53s


### Benchmarking Function


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from tqdm import tqdm

def batch_add_documents(doc_list, batch_size, vector_store):
    """Feed ``doc_list`` to ``vector_store`` in slices of ``batch_size``.

    Each slice is passed to ``vector_store.add_documents``; afterwards a tqdm
    progress bar (sized to the whole list) is advanced by the slice length.
    """
    n_docs = len(doc_list)
    with tqdm(total=n_docs, desc="Adding documents", unit="docs") as bar:
        for start in range(0, n_docs, batch_size):
            stop = start + batch_size
            chunk = doc_list[start:stop]
            vector_store.add_documents(chunk)
            bar.update(len(chunk))

def embeddings_benchmark(
    texts,
    persist_directory,
    batch_size=5300,
    device="cuda",
    model_name="llmrails/ember-v1",
):
    """Embed ``texts`` and add them to the "cs_paper_store" Chroma collection.

    Args:
        texts: Sliceable sequence of documents; it is cut into batches and
            each batch is passed to ``vector_store.add_documents``.
        persist_directory: Directory given to Chroma as ``persist_directory``.
            A relative path is interpreted against ``/content`` because the
            working directory is switched there before the store is created.
        batch_size: Number of documents per ``add_documents`` call. Defaults
            to 5300, the value the notebook was originally run with
            (NOTE(review): presumably chosen to stay below a Chroma per-call
            limit -- confirm).
        device: Device string placed in the embedding model's
            ``model_kwargs``. Pass ``"cpu"`` if no GPU is available.
        model_name: Hugging Face model identifier used for the embeddings.

    Returns:
        None. The function is run for its side effects only.

    Side effects:
        Changes the process working directory to ``/content``. Later cells
        that use paths relative to ``/content`` rely on this.
    """
    embeddings_model = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device},
    )

    # Switch away from the Drive dataset folder so a relative
    # persist_directory ends up under /content.
    os.chdir("/content")

    vector_store = Chroma(
        "cs_paper_store",
        embeddings_model,
        persist_directory=persist_directory,
    )

    # Add documents in batches and show progress using tqdm
    batch_add_documents(texts, batch_size, vector_store)



### Create vector store with no newline


In [None]:
%%time
# Embed the newline-stripped chunks into a store under ./chroma_ember_v1_no_newline.
# NOTE(review): the saved progress bar for this cell stops at 137800/241159
# documents (57%) after 3:38:24 -- confirm the run finished before relying on
# the resulting store.
embeddings_benchmark(texts_no_newline, persist_directory="./chroma_ember_v1_no_newline")

Downloading (…)6d4e4/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Downloading (…)0e8d36d4e4/README.md:   0%|          | 0.00/66.2k [00:00<?, ?B/s]

Downloading (…)8d36d4e4/config.json:   0%|          | 0.00/778 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)6d4e4/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)0e8d36d4e4/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)d36d4e4/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Adding documents:  57%|█████▋    | 137800/241159 [3:38:24<2:44:38, 10.46docs/s]

In [None]:
# Mount Drive again before copying the store onto it.
# NOTE(review): Drive is already mounted earlier in the notebook; presumably
# this covers a session that was restarted or expired in between -- confirm.
drive.mount('/content/gdrive')

In [None]:
# Copy the persisted store to Drive. The relative source path relies on the
# working directory being /content, which embeddings_benchmark sets through
# os.chdir before it creates the store.
!cp -r ./chroma_ember_v1_no_newline /content/gdrive/MyDrive/IR_Project/

### Create vector store with newline

In [None]:
%%time
# Pass the chunked documents, not the raw DataFrame: embeddings_benchmark
# slices its first argument into batches and hands each batch to
# vector_store.add_documents.
embeddings_benchmark(texts_with_newline, persist_directory="./chroma_ember_v1_with_newline")