<a href="https://colab.research.google.com/github/tascheidt/Llama2RAG/blob/main/LlamaIndex_Tutorials_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q llama-index
!pip install -q openai
!pip install -q transformers
!pip install -q accelerate
!pip install -q optimum[exporters]
!pip install -q InstructorEmbedding
!pip install -q sentence_transformers
!pip install -q pypdf

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m854.4/854.4 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/143.4 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.3/484.3 kB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
from getpass import getpass

# Never store the key in the notebook itself. Reuse a key that is already
# exported in the environment; otherwise prompt for it without echoing the
# input, so the secret does not end up in the saved cell or its output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [3]:
from llama_index.llms import OpenAI
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from IPython.display import Markdown, display

In [5]:
# Read every file found under ./data into Document objects.
# NOTE(review): nothing in the visible notebook creates or fills `data/`;
# presumably it has to be uploaded before this cell runs -- confirm.
documents = SimpleDirectoryReader(input_dir="data").load_data()

# Embed the documents into a vector index using the library defaults.
index = VectorStoreIndex.from_documents(documents)

# Query interface over that index, used by the question cell that follows.
query_engine = index.as_query_engine()


In [7]:
# Ask one question against the index built above and print the answer text.
response = query_engine.query("What did the author do growing up? ")
print(response)

The author worked on writing and programming before college. They wrote short stories and tried programming on an IBM 1401 computer using an early version of Fortran. They later got a microcomputer and started programming more extensively, including writing simple games and a word processor.


In [8]:
from llama_index import ServiceContext, set_global_service_context


# LLM used by this service context: GPT-4 at temperature 0, with completions
# capped at 256 tokens. Customization guide:
# https://gpt-index.readthedocs.io/en/latest/core_modules/model_modules/llms/usage_custom.html
llm = OpenAI(
    model="gpt-4",
    temperature=0,
    max_tokens=256,
)

# Bundle the LLM with the chunking settings applied when documents are split.
service_context = ServiceContext.from_defaults(
    llm=llm,
    chunk_size=800,
    chunk_overlap=20,
)

# The context is passed explicitly to this one index. The author left the
# global alternative disabled: set_global_service_context(service_context)
# Note that `query_engine` from the earlier cell still points at the previous
# index; it is not recreated here.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

### Customize the Embedding Model

In [9]:
from llama_index import ServiceContext
from llama_index.embeddings import OpenAIEmbedding


In [10]:
# Create the OpenAI embedding model with its default settings and wrap it in a
# ServiceContext. This rebinds `service_context`, replacing the GPT-4 /
# chunk-size context defined in the previous section.
embed_model_openai = OpenAIEmbedding()
service_context = ServiceContext.from_defaults(embed_model=embed_model_openai)

In [11]:
import re

# The ServiceContext repr includes the embedding model's `api_key` in plain
# text, so printing it as-is writes the secret into the saved notebook output.
# Mask the value before printing; the rest of the repr is shown unchanged.
print(re.sub(r"api_key='[^']*'", "api_key='***REDACTED***'", str(service_context)))

ServiceContext(llm_predictor=LLMPredictor(system_prompt=None, query_wrapper_prompt=None), prompt_helper=PromptHelper(context_window=3946, num_output=256, chunk_overlap_ratio=0.1, chunk_size_limit=None, separator=' '), embed_model=OpenAIEmbedding(model_name='text-embedding-ada-002', embed_batch_size=10, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7e080bf5a8f0>, deployment_name=None, additional_kwargs={}, api_key='***REDACTED***', api_type='open_ai', api_base='https://api.openai.com/v1', api_version=''), node_parser=SimpleNodeParser(text_splitter=SentenceSplitter(chunk_size=1024, chunk_overlap=20, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。？！]+[,.;。？！]?', chunking_tokenizer_fn=<function split_by_sentence_tokenizer.<locals>.split at 0x7e080c0beb90>, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7e080bf5a8f0>, tokenizer=functools.partial(<bound method Encoding.enco

In [12]:
# Embed one sample sentence with the OpenAI model to inspect the result.
text_embeddings = embed_model_openai.get_text_embedding("AI is awesome!")

In [13]:
# Number of values in the embedding computed in the previous cell.
len(text_embeddings)

1536

### Local Embeddings with HuggingFace

Massive Text Embedding Benchmark (MTEB) [Leaderboard](https://huggingface.co/spaces/mteb/leaderboard):

In [15]:
from llama_index.embeddings import HuggingFaceEmbedding
# Load the BGE-small English model by its Hugging Face name; the recorded run
# below shows the model files being downloaded on first use.
embed_model_bge = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")


Downloading (…)lve/main/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [16]:
# Embed the same sample sentence with BGE-small and print the vector length,
# for comparison with the OpenAI embedding above.
text_embeddings = embed_model_bge.get_text_embedding("AI is awesome!")
print(len(text_embeddings))

384


### InstructorEmbedding

In [17]:
from llama_index.embeddings import InstructorEmbedding
# Load the Instructor base model by its Hugging Face name; the recorded run
# below shows the model files being downloaded on first use.
embed_model_inst = InstructorEmbedding(model_name="hkunlp/instructor-base")

Downloading (…)62736/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading (…)15e6562736/README.md:   0%|          | 0.00/66.2k [00:00<?, ?B/s]

Downloading (…)e6562736/config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)62736/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.43k [00:00<?, ?B/s]

Downloading (…)6562736/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


In [18]:
# Embed the same sample sentence with the Instructor model and print the
# vector length.
text_embeddings = embed_model_inst.get_text_embedding("AI is awesome!")
print(len(text_embeddings))

768


### Benchmarking


In [19]:
!curl https://www.ipcc.ch/report/ar6/wg2/downloads/report/IPCC_AR6_WGII_Chapter03.pdf --output IPCC_AR6_WGII_Chapter03.pdf


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 20.7M  100 20.7M    0     0  32.8M      0 --:--:-- --:--:-- --:--:-- 32.8M


In [20]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader

In [21]:
# Load only the downloaded PDF. This rebinds `documents`, replacing the
# documents read from `data/` earlier in the notebook.
documents = SimpleDirectoryReader(
    input_files=["IPCC_AR6_WGII_Chapter03.pdf"]
).load_data()

In [23]:
# Report how many Document objects the reader produced from the single PDF.
print(f"Number of documents: {len(documents)}")

Number of documents: 172


### OpenAI's Embeddings



In [24]:
# Recreate the OpenAI embedding model and its service context for the
# benchmark (same construction as in the customization section above).
embed_model_openai = OpenAIEmbedding()
service_context = ServiceContext.from_defaults(embed_model=embed_model_openai)

In [25]:
%%timeit -r 2 -n 2
# -r 2 -n 2 means 2 runs of 2 loops, i.e. the index is built four times; the
# recorded output shows the embeddings being regenerated on every build.
index = VectorStoreIndex.from_documents(
    documents, service_context=service_context, show_progress=True
)

Parsing documents into nodes:   0%|          | 0/172 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/428 [00:00<?, ?it/s]

Parsing documents into nodes:   0%|          | 0/172 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/428 [00:00<?, ?it/s]

Parsing documents into nodes:   0%|          | 0/172 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/428 [00:00<?, ?it/s]

Parsing documents into nodes:   0%|          | 0/172 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/428 [00:00<?, ?it/s]

19.4 s ± 78.4 ms per loop (mean ± std. dev. of 2 runs, 2 loops each)


### BGE-Small Embeddings

In [26]:
# Switch the service context to the local BGE-small embedding model.
service_context = ServiceContext.from_defaults(embed_model=embed_model_bge)

In [27]:
%%timeit -r 2 -n 2
index = VectorStoreIndex.from_documents(
    documents, service_context=service_context, show_progress=True)

Parsing documents into nodes:   0%|          | 0/172 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/428 [00:00<?, ?it/s]

Parsing documents into nodes:   0%|          | 0/172 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/428 [00:00<?, ?it/s]

Parsing documents into nodes:   0%|          | 0/172 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/428 [00:00<?, ?it/s]

Parsing documents into nodes:   0%|          | 0/172 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/428 [00:00<?, ?it/s]

8.97 s ± 95 ms per loop (mean ± std. dev. of 2 runs, 2 loops each)


### Instructor Embeddings

In [28]:
# Switch the service context to the local Instructor embedding model.
service_context = ServiceContext.from_defaults(embed_model=embed_model_inst)

In [29]:
%%timeit -r 2 -n 2
index = VectorStoreIndex.from_documents(
    documents, service_context=service_context, show_progress=True)

Parsing documents into nodes:   0%|          | 0/172 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/428 [00:00<?, ?it/s]

Parsing documents into nodes:   0%|          | 0/172 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/428 [00:00<?, ?it/s]

Parsing documents into nodes:   0%|          | 0/172 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/428 [00:00<?, ?it/s]

Parsing documents into nodes:   0%|          | 0/172 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/428 [00:00<?, ?it/s]

19 s ± 622 ms per loop (mean ± std. dev. of 2 runs, 2 loops each)


In [30]:
# Mount Google Drive when running on Colab. Outside Colab the `google.colab`
# package does not exist, so the import is guarded to let "Run All" finish on
# a local Jupyter kernel instead of failing on this last cell.
# NOTE(review): nothing in the visible notebook reads from the mounted drive;
# confirm whether this cell is still needed.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')

Mounted at /content/drive
