# OpenAI vs Local Embeddings

Performance comparison of two embedding models for retrieval QA:

- OpenAI's embeddings model
- InstructorEmbedding ([hkunlp/instructor-xl on Hugging Face](https://huggingface.co/hkunlp/instructor-xl))

#### Install required packages

In [1]:
!pip install -r requirements.txt

You should consider upgrading via the '/Users/ccugutrillague/Documents/perso/doctorado/experiments/embeddings-for-retrievalQA/embeddings/bin/python -m pip install --upgrade pip' command.


### Set up the environment variables and import libraries

In [1]:
import os
import openai
import sys
# Make modules located two directories above the notebook importable.
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
# Load the variables of the discovered .env file into the process environment.
_ = load_dotenv(find_dotenv()) # read local .env file

# The key is read from the environment instead of being hard-coded;
# a KeyError is raised here when OPENAI_API_KEY is not set.
openai.api_key  = os.environ['OPENAI_API_KEY']

In [2]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, TextLoader, AirbyteJSONLoader
from langchain.document_loaders import DirectoryLoader

In [3]:
# InstructorEmbedding 
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings
# OpenAI Embedding
from langchain.embeddings import OpenAIEmbeddings

  from tqdm.autonotebook import trange


### Load multiple JSON files from a directory

In [4]:
from langchain.document_loaders import AirbyteJSONLoader

In [5]:
# Root folder that contains the data used by this notebook. Set the
# SMP_LLM_ROOT environment variable to run on another machine; the default is
# the path that was originally hard-coded here.
root_dir = os.environ.get(
    "SMP_LLM_ROOT",
    "/Users/ccugutrillague/Documents/outreach/smp-hackthaton/smp-llm",
)

In [7]:
# Load every JSON metadata file of the MPDL collection as one text document.
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser  # kept: other cells may still use it
from langchain.document_loaders.parsers.txt import TextParser
from langchain.text_splitter import Language  # kept: used by later cells

# The inputs are JSON, not Python source, so they are read as plain text
# instead of being routed through the Python-specific LanguageParser.
# Each resulting document carries the file path in metadata["source"].
loader = GenericLoader.from_filesystem(
    root_dir + "/mpdl_collection",
    glob="**/*",
    suffixes=[".json"],
    parser=TextParser(),
)
documents = loader.load()
len(documents)

3

### Divide and split text

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# The documents are JSON text, so use the generic recursive splitter rather
# than the Python-specific separators ("\nclass ", "\ndef ", ...), which
# describe Python source code and not this data.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
# Previous variable name, kept so that existing references keep working.
python_splitter = text_splitter
texts = text_splitter.split_documents(documents)
len(texts)

14

### RetrievalQA

We need to store the documents in a way that lets us search their content semantically.

The most common approach is to embed the contents of each document, then store the embedding and the document together in a vector store.

In [10]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Index the chunks in Chroma using OpenAI embeddings.
db = Chroma.from_documents(
    documents=texts,
    embedding=OpenAIEmbeddings(disallowed_special=()),
)
# MMR retrieval returning 8 chunks per query; "similarity" is the alternative
# search type worth testing as well.
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 8})

In [13]:
# from langchain.chains import ConversationalRetrievalChain
# from langchain.chat_models import ChatOpenAI
# from langchain.memory import ConversationSummaryMemory

# llm = ChatOpenAI(model_name="gpt-4")
# memory = ConversationSummaryMemory(
#     llm=llm, memory_key="chat_history", return_messages=True
# )
# qa = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)


In [15]:
# question = "How can I initialize a ReAct agent?"
# result = qa(question)
# result["answer"]

### Get Embeddings for MPDL document(s)

In [86]:
import pickle
import faiss # similarity search library: https://faiss.ai/index.html
from langchain.vectorstores import FAISS

In [87]:
def store_embeddings(docs, embeddings, sotre_name, path):
    """Embed ``docs`` into a FAISS vector store and pickle it to disk.

    The store is written to ``{path}/faiss_{sotre_name}.pkl``. The target
    directory is created first when it is missing, so saving into a folder
    that does not exist yet no longer fails with ``FileNotFoundError``.

    Args:
        docs: Documents handed to ``FAISS.from_documents``.
        embeddings: Embedding model handed to ``FAISS.from_documents``.
        sotre_name: Name used in the pickle file name (the parameter spelling
            is kept as-is so existing keyword callers keep working).
        path: Directory in which the pickle file is written.
    """
    import os  # local import keeps the function self-contained

    vectorStore = FAISS.from_documents(docs, embeddings)

    # Create the output folder (and any parents) on first use.
    if path:
        os.makedirs(path, exist_ok=True)
    with open(f"{path}/faiss_{sotre_name}.pkl", "wb") as f:
        pickle.dump(vectorStore, f)

In [88]:
def load_embeddings(sotre_name, path):
    """Return the object unpickled from ``{path}/faiss_{sotre_name}.pkl``.

    Args:
        sotre_name: Name that was used in the pickle file name when saving.
        path: Directory that contains the pickle file.

    Returns:
        Whatever object was pickled into the file.

    Note:
        ``pickle.load`` can execute arbitrary code; only load files that
        were produced locally by a trusted process.
    """
    pickle_file = f"{path}/faiss_{sotre_name}.pkl"
    with open(pickle_file, "rb") as handle:
        return pickle.load(handle)

### HF Instructor Embeddings

In [89]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Instructor embedding model "hkunlp/instructor-xl", configured to run on the CPU.
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl",
    model_kwargs={"device": "cpu"},
)

load INSTRUCTOR_Transformer
max_seq_length  512


In [91]:
# Folder below root_dir; presumably the `path` argument intended for
# store_embeddings / load_embeddings — TODO confirm.
Embedding_store_path = "{}/Embedding_store".format(root_dir)
print(Embedding_store_path)

/Users/ccugutrillague/Documents/perso/doctorado/experiments/embeddings-for-retrievalQA/Embedding_store


In [93]:
# Embed every chunk with the instructor model and index the vectors in FAISS.
db_instructEmbedd = FAISS.from_documents(
    documents=texts,
    embedding=instructor_embeddings,
)
print(db_instructEmbedd)

<langchain.vectorstores.faiss.FAISS object at 0x16b0db010>


In [95]:
# Rebinds `retriever` (previously the Chroma MMR retriever) to the FAISS
# instructor-embedding index, with k=3 results per query.
retriever = db_instructEmbedd.as_retriever(search_kwargs={"k": 3})

In [97]:
print(retriever)

tags=['FAISS', 'HuggingFaceInstructEmbeddings'] vectorstore=<langchain.vectorstores.faiss.FAISS object at 0x16b0db010> search_kwargs={'k': 3}


In [44]:
retriever.search_type

'similarity'

In [103]:
# Sanity check: retrieve chunks for a sample question and display the first one.
docs = retriever.get_relevant_documents("Who are the authors of this software?")
docs[0]

Document(page_content='}\n    ],\n    "download_url": [\n        {\n            "result": {\n                "value": "https://github.com/MPDL/FirstAuthor/releases",\n                "type": "Url"\n            },\n            "confidence": 1,\n            "technique": "GitHub_API"\n        }\n    ],\n    "programming_languages": [\n        {\n            "result": {\n                "value": "Java",\n                "name": "Java",\n                "type": "Programming_language",\n                "size": 22899\n            },\n            "confidence": 1,\n            "technique": "GitHub_API"\n        }\n    ],\n    "readme_url": [\n        {\n            "result": {\n                "value": "https://raw.githubusercontent.com/MPDL/FirstAuthor/main/README.md",\n                "type": "Url"\n            },\n            "confidence": 1,\n            "technique": "file_exploration"\n        }\n    ],\n    "full_title": [\n        {\n            "result": {\n                "type": "Stri

In [104]:
# Question-answering chain over the instructor-embedding retriever. The
# "stuff" chain type is used and the source documents are returned alongside
# the answer so they can be cited.
qa_chain_instrucEmbed = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0.2),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

### OpenAI Embeddings

In [105]:
from langchain.embeddings import OpenAIEmbeddings

In [106]:
embeddings = OpenAIEmbeddings()

In [107]:
# Same chunks, indexed in FAISS with OpenAI embeddings; k=3 results per query
# to mirror the instructor-embedding retriever.
db_openAIEmbedd = FAISS.from_documents(documents=texts, embedding=embeddings)
retriever_openai = db_openAIEmbedd.as_retriever(search_kwargs={"k": 3})

In [108]:
# Question-answering chain over the OpenAI-embedding retriever, configured
# like the instructor-embedding chain so that only the embeddings differ.
qa_chain_openai = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0.2),
    chain_type="stuff",
    retriever=retriever_openai,
    return_source_documents=True,
)

### Testing both models

In [109]:
## Cite sources

import textwrap # text wrapping and filling
def wrap_text_preserve_newlines(text, width=79):
    """Wrap ``text`` to ``width`` columns while keeping its own line breaks.

    Each newline-separated line is wrapped independently, so existing line
    breaks (including blank lines) survive in the result.

    Args:
        text: The string to wrap.
        width: Maximum length of every output line.

    Returns:
        The wrapped text as a single newline-joined string.
    """
    wrapper = textwrap.TextWrapper(width=width)
    return "\n".join(wrapper.fill(segment) for segment in text.split("\n"))

def process_llm_response(llm_response):
    """Print the answer of a QA chain followed by the sources it cites.

    Args:
        llm_response: Mapping with a ``'result'`` string and a
            ``'source_documents'`` iterable whose items expose
            ``metadata['source']``.
    """
    answer = wrap_text_preserve_newlines(llm_response['result'])
    print(answer)
    print('\nSources:')
    for document in llm_response['source_documents']:
        origin = document.metadata['source']
        print(origin)

In [125]:
query = "Who is the creator of the github repo?"

# Ask the instructor-embedding chain and print the answer with its sources.
print("--------------Instructor Embeddings-------------\n")
llm_response = qa_chain_instrucEmbed(query)
process_llm_response(llm_response)


--------------Instructor Embeddings-------------

 The creator of the github repo is MPDL.

Sources:
Documents/MPDL_FirstAuthor_2023-11-28.json
Documents/MPDL_FirstAuthor_2023-11-28.json
Documents/MPDL_FirstAuthor_2023-11-28.json


In [124]:
query = "Who is the creator of the github repo?"

# Ask the OpenAI-embedding chain and print the answer with its sources.
print("-------------------OpenAI Embeddings------------------")
llm_response = qa_chain_openai(query)
process_llm_response(llm_response)
print("\n\n\n")

-------------------OpenAI Embeddings------------------
 The creator of the github repo is MPDL.

Sources:
Documents/MPDL_FirstAuthor_2023-11-28.json
Documents/MPDL_FirstAuthor_2023-11-28.json
Documents/MPDL_FirstAuthor_2023-11-28.json






In [122]:
query = "Who is the creator of the github repo?"

# Ask the instructor-embedding chain and print the answer with its sources.
print("-------------------Instructor Embeddings------------------\n")
llm_response = qa_chain_instrucEmbed(query)
process_llm_response(llm_response)

-------------------Instructor Embeddings------------------

 The creator of the github repo is MPDL.

Sources:
Documents/MPDL_FirstAuthor_2023-11-28.json
Documents/MPDL_FirstAuthor_2023-11-28.json
Documents/MPDL_FirstAuthor_2023-11-28.json


In [123]:
query = 'Who is the creator of the github repo?'

# Instructor-embedding chain only; the OpenAI-embedding run of this question
# lives in its own cell.
print("-------------------Instructor Embeddings------------------\n")
llm_response = qa_chain_instrucEmbed(query)
process_llm_response(llm_response)

-------------------Instructor Embeddings------------------

 The creator of the github repo is MPDL.

Sources:
Documents/MPDL_FirstAuthor_2023-11-28.json
Documents/MPDL_FirstAuthor_2023-11-28.json
Documents/MPDL_FirstAuthor_2023-11-28.json


In [121]:
query = 'what is the file format of the document?'

# Instructor-embedding chain only; the OpenAI-embedding run of this question
# lives in its own cell.
print("-------------------Instructor Embeddings------------------\n")
llm_response = qa_chain_instrucEmbed(query)
process_llm_response(llm_response)

-------------------Instructor Embeddings------------------

 The file format of the document is a string.

Sources:
Documents/MPDL_FirstAuthor_2023-11-28.json
Documents/MPDL_FirstAuthor_2023-11-28.json
Documents/MPDL_FirstAuthor_2023-11-28.json


In [120]:
query = 'what is the file format of the document?'

# Ask the OpenAI-embedding chain and print the answer with its sources.
print("-------------------OpenAI Embeddings------------------")
llm_response = qa_chain_openai(query)
process_llm_response(llm_response)
print("\n\n\n")

-------------------OpenAI Embeddings------------------
 The file format of the document is Markdown (MD).

Sources:
Documents/MPDL_FirstAuthor_2023-11-28.json
Documents/MPDL_FirstAuthor_2023-11-28.json
Documents/MPDL_FirstAuthor_2023-11-28.json






In [118]:
query = 'what license is using?'

# Instructor-embedding chain only; the OpenAI-embedding run of this question
# lives in its own cell.
print("-------------------Instructor Embeddings------------------\n")
llm_response = qa_chain_instrucEmbed(query)
process_llm_response(llm_response)

-------------------Instructor Embeddings------------------

 I don't know.

Sources:
Documents/MPDL_FirstAuthor_2023-11-28.json
Documents/MPDL_FirstAuthor_2023-11-28.json
Documents/MPDL_FirstAuthor_2023-11-28.json


In [119]:
query = 'what license is using?'

# Ask the OpenAI-embedding chain and print the answer with its sources.
print("-------------------OpenAI Embeddings------------------")
llm_response = qa_chain_openai(query)
process_llm_response(llm_response)
print("\n\n\n")

-------------------OpenAI Embeddings------------------
 I don't know.

Sources:
Documents/MPDL_FirstAuthor_2023-11-28.json
Documents/MPDL_FirstAuthor_2023-11-28.json
Documents/MPDL_FirstAuthor_2023-11-28.json






In [22]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.llms import LlamaCpp
from langchain.memory import ConversationSummaryMemory
from langchain.prompts import PromptTemplate

In [20]:
!pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.20.tar.gz (8.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Obtaining dependency information for diskcache>=5.6.1 from https://files.pythonhosted.org/packages/3f/27/4570e78fc0bf5ea0ca45eb1de3818a23787af9b390c0b0a0033a1b8236f9/diskcache-5.6.3-py3-none-any.whl.metadata
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-py

In [25]:
!pip install llama-cpp-python  --upgrade --force-reinstall --no-cache-dir

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.20.tar.gz (8.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting typing-extensions>=4.5.0 (from llama-cpp-python)
  Obtaining dependency information for typing-extensions>=4.5.0 from https://files.pythonhosted.org/packages/24/21/7d397a4b7934ff4028987914ac1044d3b7d52712f30e2ac7a2ae5bc86dd0/typing_extensions-4.8.0-py3-none-any.whl.metadata
  Downloading typing_extensions-4.8.0-py3-none-any.whl.metadata (3.0 kB)
Collecting numpy>=1.20.0 (from llama-cpp-python)
  Obtaining dependency information for numpy>=1.20.0 from https://files.pythonhosted.org/packages/2e/54/218ce51bb571a70975f223671b2a86aa951e8

In [29]:
import os

from llama_cpp import Llama

# Location of the GGUF weights. Set the LLAMA_MODEL_PATH environment variable
# to point at a local copy; the default is the path originally hard-coded here.
llama_model_path = os.environ.get(
    "LLAMA_MODEL_PATH",
    "/Users/rlm/Desktop/Code/llama/code-llama/codellama-13b-instruct.Q4_K_M.gguf",
)
if not os.path.isfile(llama_model_path):
    # Fail before the model loader runs, with a hint on how to fix the setup.
    raise ValueError(
        f"Model path does not exist: {llama_model_path}. "
        "Download a GGUF model and set the LLAMA_MODEL_PATH environment variable."
    )

callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# NOTE(review): `callback_manager` and `f16_kv` look like arguments of
# LangChain's `LlamaCpp` wrapper (imported earlier in this notebook) rather
# than of the raw `llama_cpp.Llama` binding — confirm which class is intended.
llm = Llama(
    model_path=llama_model_path,
    n_ctx=5000,
    n_gpu_layers=1,
    n_batch=512,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    callback_manager=callback_manager,
    verbose=True,
)

ValueError: Model path does not exist: ./models/7B/llama-model.gguf