# OpenAI vs Local Embeddings

Performance comparison:
- OpenAI's Embeddings Model
- InstructorEmbedding at [Huggingface](https://huggingface.co/hkunlp/instructor-xl)

#### Install required packages

In [25]:
# !pip install -r requirements.txt

UsageError: Line magic function `%` not found.


### Set up the environment variables and import libraries

In [26]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [27]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, TextLoader, AirbyteJSONLoader
from langchain.document_loaders import DirectoryLoader

In [28]:
# InstructorEmbedding 
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings
# OpenAI Embedding
from langchain.embeddings import OpenAIEmbeddings

### Load Multiple files from Directory (json)

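We load all MPDL project files with `langchain.document_loaders.generic.GenericLoader` (an earlier attempt with `DirectoryLoader` and `TextLoader` is kept commented out below). The following script iterates over the files in the data directory of this repository and loads every `.json` file (a.k.a. **documents**):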

In [29]:
# from langchain.document_loaders import AirbyteJSONLoader
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import Language

In [31]:
# Root folder of the project data. The default is the original author's local
# checkout; set the MASMP_DATA_DIR environment variable (for example in the
# `.env` file read by load_dotenv) to run the notebook on another machine
# without editing this cell.
root_dir = os.environ.get(
    "MASMP_DATA_DIR",
    "/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data",
)

In [8]:
# loader = DirectoryLoader(f'{root_dir}/mpdl_collection/', glob='**/*.json', show_progress=True, loader_cls=TextLoader)
# documents = loader.load()
# print(len(documents))

In [32]:
# Load all repos: every file under the raw MPDL collection folder that matches
# the "**/*" glob and has a ".json" suffix.
# NOTE(review): the files are JSON but are handed to the Python language
# parser -- confirm this is intentional.
loader = GenericLoader.from_filesystem(
    f"{root_dir}/mpdl_collection_raw",
    parser=LanguageParser(language=Language.PYTHON, parser_threshold=500),
    suffixes=[".json"],
    glob="**/*",
)
documents = loader.load()
len(documents)

58

### Splitting

Split each `Document` into chunks for embedding and vector storage. We use `RecursiveCharacterTextSplitter.from_language` with the language specified.

In [42]:
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1000,
#     chunk_overlap=100
# )
# texts = text_splitter.split_documents(documents)

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Language-aware splitter configured with chunk_size=2000 and chunk_overlap=200.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    chunk_overlap=200,
    chunk_size=2000,
    language=Language.PYTHON,
)
# Only the first 5 documents are split, because computing the embeddings for
# more of them takes too long.
texts = python_splitter.split_documents(documents[:5])
len(texts)

51

In [43]:
print(texts[0])

page_content='{\n    "somef_provenance": {\n        "somef_version": "0.9.4",\n        "somef_schema_version": "1.0.0",\n        "date": "2023-11-28 15:24:34"\n    },\n    "code_repository": [\n        {\n            "result": {\n                "value": "https://github.com/MPDL/screenshot-service",\n                "type": "Url"\n            },\n            "confidence": 1,\n            "technique": "GitHub_API"\n        }\n    ],\n    "owner": [\n        {\n            "result": {\n                "value": "MPDL",\n                "type": "Organization"\n            },\n            "confidence": 1,\n            "technique": "GitHub_API"\n        }\n    ],\n    "date_created": [\n        {\n            "result": {\n                "value": "2014-08-05T09:14:41Z",\n                "type": "Date"\n            },\n            "confidence": 1,\n            "technique": "GitHub_API"\n        }\n    ],\n    "date_updated": [\n        {\n            "result": {\n                "value": "201

### RetrievalQA

We need to store the documents in a way we can semantically search for their content. The most common approach is to embed the contents of each document then store the embedding and document in a vector store. When setting up the vectorstore retriever:
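- We tested max marginal relevance (MMR) retrieval with 8 documents returned (kept as a commented-out cell below)
- The retrievers actually used in the rest of this notebook run a similarity search and return 3 documents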

In [13]:
# from langchain.embeddings.openai import OpenAIEmbeddings
# from langchain.vectorstores import Chroma

# db = Chroma.from_documents(texts, OpenAIEmbeddings(disallowed_special=()))
# retriever = db.as_retriever(
#     search_type="mmr",  # Also test "similarity"
#     search_kwargs={"k": 8},
# search_)

### Get Embeddings for MPDL document(s)

In [44]:
import pickle
import faiss # for similarity search: https://faiss.ai/index.html
from langchain.vectorstores import FAISS

In [45]:
def store_embeddings(docs, embeddings, sotre_name, path):
    """Embed `docs` into a FAISS vector store and pickle it to disk.

    The store is written to ``{path}/faiss_{sotre_name}.pkl``, which is the
    file name pattern that ``load_embeddings`` reads back.

    Args:
        docs: Documents to index; passed to ``FAISS.from_documents``.
        embeddings: Embedding model; passed to ``FAISS.from_documents``.
        sotre_name: Name inserted into the pickle file name (the parameter
            spelling is kept so existing keyword calls keep working).
        path: Target directory; it is created if it does not exist yet.

    Returns:
        None. The only result is the pickle file on disk.
    """
    # Create the directory up front: previously a missing folder only surfaced
    # as FileNotFoundError after the embeddings had already been computed.
    os.makedirs(path, exist_ok=True)

    vector_store = FAISS.from_documents(docs, embeddings)

    with open(f"{path}/faiss_{sotre_name}.pkl", "wb") as f:
        pickle.dump(vector_store, f)

In [46]:
def load_embeddings(sotre_name, path):
    """Return the object unpickled from ``{path}/faiss_{sotre_name}.pkl``.

    Args:
        sotre_name: Name that was inserted into the pickle file name.
        path: Directory that contains the pickle file.

    NOTE: unpickling can execute arbitrary code, so only load files that you
    created yourself.
    """
    pickle_file = f"{path}/faiss_{sotre_name}.pkl"
    with open(pickle_file, "rb") as handle:
        return pickle.load(handle)

### HF Instructor Embeddings

In [47]:
from langchain.embeddings import HuggingFaceInstructEmbeddings
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
                                                      model_kwargs={"device": "cpu"})

load INSTRUCTOR_Transformer
max_seq_length  512


In [48]:
Embedding_store_path = f"{root_dir}/Embedding_store"
print(Embedding_store_path)

/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/Embedding_store


In [49]:
# Build the FAISS index from the chunks using the Instructor embeddings.
# This is the step that takes so long.
db_instructEmbedd = FAISS.from_documents(texts, instructor_embeddings)
print(db_instructEmbedd)

<langchain.vectorstores.faiss.FAISS object at 0x3dd2c31d0>


In [50]:
retriever = db_instructEmbedd.as_retriever(search_kwargs={"k":3})

In [51]:
print(retriever)

tags=['FAISS', 'HuggingFaceInstructEmbeddings'] vectorstore=<langchain.vectorstores.faiss.FAISS object at 0x3dd2c31d0> search_kwargs={'k': 3}


In [52]:
retriever.search_type

'similarity'

In [53]:
docs = retriever.get_relevant_documents("Who are the authors of this software?")
docs[0]

Document(page_content='"type": "Url"\n            },\n            "confidence": 1,\n            "technique": "file_exploration"\n        }\n    ],\n    "license": [\n        {\n            "result": {\n                "value": "The MIT License (MIT)\\n\\nCopyright (c) 2016-2018 Christoph Broschinski\\n\\nPermission is hereby granted, free of charge, to any person obtaining a copy of\\nthis software and associated documentation files (the \\"Software\\"), to deal in\\nthe Software without restriction, including without limitation the rights to\\nuse, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\\nof the Software, and to permit persons to whom the Software is furnished to do\\nso, subject to the following conditions:\\n\\nThe above copyright notice and this permission notice shall be included in all\\ncopies or substantial portions of the Software.\\n\\nTHE SOFTWARE IS PROVIDED \\"AS IS\\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\\nIMPLIED, INCLUDING BUT NOT 

In [54]:
# Question-answering chain on top of the Instructor-embeddings retriever.
# The retrieved source documents are returned together with the answer.
qa_chain_instrucEmbed = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0.2),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

## OpenAI's embeddings

In [55]:
# The library emits warnings regarding parallelism and stack logging;
# setting this environment variable addresses those warnings.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [56]:
from langchain.embeddings import OpenAIEmbeddings

In [57]:
embeddings = OpenAIEmbeddings()

The following lines may raise a `RateLimitError`, caused by reaching the rate limit of the OpenAI Text Embedding API (text-embedding-ada-002). This API limits the number of tokens that can be processed within a certain time frame. The solution, as suggested in the error message, is to reduce the input size: send fewer input tokens (less text) per request to stay within the rate limits of the Text Embedding API.

In [58]:
# Index the chunks with the OpenAI embeddings and expose a retriever that
# returns 3 documents per query.
db_openAIEmbedd = FAISS.from_documents(texts, embeddings)
retriever_openai = db_openAIEmbedd.as_retriever(
    search_kwargs={"k": 3},
)

In [59]:
# Question-answering chain on top of the OpenAI-embeddings retriever.
# The retrieved source documents are returned together with the answer.
qa_chain_openai = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0.2),
    chain_type="stuff",
    retriever=retriever_openai,
    return_source_documents=True,
)

### Testing both MODELS

In [60]:
## Cite sources

import textwrap # text wrapping and filling
def wrap_text_preserve_newlines(text, width=79):
    """Wrap `text` to `width` columns while keeping its existing line breaks.

    The text is split on newline characters, every resulting line is wrapped
    on its own with ``textwrap.fill``, and the wrapped pieces are joined back
    together with newline characters.
    """
    return '\n'.join(
        textwrap.fill(single_line, width) for single_line in text.split("\n")
    )

def process_llm_response(llm_response):
    """Print the wrapped answer followed by the source of each document.

    Args:
        llm_response: Mapping with a 'result' text and a 'source_documents'
            iterable whose items expose ``metadata['source']``.
    """
    answer = wrap_text_preserve_newlines(llm_response['result'])
    print(answer)
    print('\nSources:')
    for document in llm_response['source_documents']:
        origin = document.metadata['source']
        print(origin)

In [65]:
# Installation-instructions question, answered with the Instructor-embeddings chain.
query = (
    'Can you give me the installation instructions in detail for one of the software? '
    'Please indicate the project name and description'
)

print('--------------Instructor Embeddings-------------\n')
llm_response = qa_chain_instrucEmbed(query)
process_llm_response(llm_response)


--------------Instructor Embeddings-------------

 The installation instructions for the Screenshot Service can be found in the
README.md file on the project's GitHub page. The instructions are as follows:

1. Clone the service: https://github.com/MPDL/screenshot-service
2. Compile the service: In service directory, run `mvn clean install`
3. Copy html-screenshot.war to Tomcat Webapp Directory
4. Start Tomcat
5. Run Services under `http://localhost:8080/screenshot`
6. (OPTIONAL) If you want to support Webgl (only with useFirefox=true), you
need:
      * a: a server with a grafic card
      * b: run `Xvfb :2 -screen 0 1024x768x24`
      * c: run  `export DISPLAY=":2"`

Sources:
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_joai-project_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_screenshot-service_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/ma

In [66]:
# Installation-instructions question, answered with the OpenAI-embeddings chain.
query = (
    'Can you give me the installation instructions in detail for one of the software? '
    'Please indicate the project name and description'
)

print('-------------------OpenAI Embeddings------------------')
llm_response = qa_chain_openai(query)
process_llm_response(llm_response)
print('\n\n\n')

-------------------OpenAI Embeddings------------------
 I'm sorry, I don't know.

Sources:
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_screenshot-service_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_screenshot-service_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_screenshot-service_2023-11-28.json






In [68]:
# Installation-requirements question, answered with the Instructor-embeddings chain.
query = (
    'Can you give me the installation requirements of a project using python? '
    'Please provide the title name and licence'
)

print('-------------------Instructor Embeddings------------------\n')
llm_response = qa_chain_instrucEmbed(query)
process_llm_response(llm_response)

-------------------Instructor Embeddings------------------

 The project is called unibiAPC and it is licensed under The MIT License (MIT).
The installation requirements are:
1. Install Firefox [download](https://www.mozilla.org/)
2. Install Phantoms [download] (http://phantomjs.org/download.html)
3. Install Java [download]
(http://www.oracle.com/technetwork/java/javase/downloads/index.html)
4. Install Maven [download] (http://maven.apache.org/download.cgi)
5. Install Tomcat [download](http://maven.apache.org/download.cgi)

Sources:
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_unibiAPC_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_screenshot-service_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_unibiAPC_2023-11-28.json


In [69]:
# NOTE(review): this cell sends the same question to the same Instructor chain
# as the previous cell; the OpenAI comparison below is commented out. Confirm
# whether the OpenAI chain was meant to be queried here instead.
query = 'Can you give me the installation requirements of a project using python? Please provide the title name and licence'

# print('-------------------OpenAI Embeddings------------------')
# llm_response = qa_chain_openai(query)
# process_llm_response(llm_response)
# print('\n\n\n')
print('-------------------Instructor Embeddings------------------\n')
llm_response = qa_chain_instrucEmbed(query)
process_llm_response(llm_response)

-------------------Instructor Embeddings------------------

 The project is called unibiAPC and it is licensed under The MIT License (MIT).
The installation requirements are:
1. Install Firefox [download](https://www.mozilla.org/)
2. Install Phantoms [download] (http://phantomjs.org/download.html)
3. Install Java [download]
(http://www.oracle.com/technetwork/java/javase/downloads/index.html)
4. Install Maven [download] (http://maven.apache.org/download.cgi)
5. Install Tomcat [download](http://maven.apache.org/download.cgi)

Sources:
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_unibiAPC_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_screenshot-service_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_unibiAPC_2023-11-28.json


In [70]:
# Dependencies question, answered with the Instructor-embeddings chain.
# The OpenAI-embeddings chain is asked the same question in the next cell.
query = "Can you provide me the list of dependencies for a specific software package? Indicate the owner of the package and how to install it"

print('-------------------Instructor Embeddings------------------\n')
llm_response = qa_chain_instrucEmbed(query)
process_llm_response(llm_response)

-------------------Instructor Embeddings------------------

 No, I don't know the list of dependencies for a specific software package.

Sources:
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_joai-project_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_rdmo-catalog_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_unibiAPC_2023-11-28.json


In [71]:
# Dependencies question, answered with the OpenAI-embeddings chain.
query = (
    "Can you provide me the list of dependencies for a specific software package? "
    "Indicate the owner of the package and how to install it"
)

print('-------------------OpenAI Embeddings------------------')
llm_response = qa_chain_openai(query)
process_llm_response(llm_response)
print('\n\n\n')

-------------------OpenAI Embeddings------------------
 No, I don't know the answer to that question.

Sources:
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_screenshot-service_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_screenshot-service_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_screenshot-service_2023-11-28.json






In [72]:
# NOTE(review): the OpenAI chain call below is commented out, so this question
# is only answered with the Instructor embeddings in this cell.
query = "Can you identify the `software library dependencies` of a readme?"

# print('-------------------OpenAI Embeddings------------------')
# llm_response = qa_chain_openai(query)
# process_llm_response(llm_response)
# print('\n\n\n')
print('-------------------Instructor Embeddings------------------\n')
llm_response = qa_chain_instrucEmbed(query)
process_llm_response(llm_response)

-------------------Instructor Embeddings------------------

 No, I cannot identify the software library dependencies of a readme.

Sources:
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_unibiAPC_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_rdmo-catalog_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_joai-project_2023-11-28.json


In [73]:
# Documentation-links question, answered with the OpenAI-embeddings chain.
query = 'Please provide a link to the documentation for each project?'

print("-------------------OpenAI Embeddings------------------")
llm_response = qa_chain_openai(query)
process_llm_response(llm_response)
print("\n\n\n")

-------------------OpenAI Embeddings------------------
 The documentation for the joai-project can be found at
https://github.com/MPDL/joai-project/wiki and the documentation for the auth
project can be found at https://github.com/MPDL/auth/wiki.

Sources:
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_joai-project_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_auth_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_screenshot-service_2023-11-28.json






In [74]:
# Project-names question, answered with the OpenAI-embeddings chain.
query = 'Can you give me the list of names for all the project you find?'

print("-------------------OpenAI Embeddings------------------")
llm_response = qa_chain_openai(query)
process_llm_response(llm_response)
print("\n\n\n")

-------------------OpenAI Embeddings------------------
 The names of the projects are: University of Sheffield, University of
Southampton, University of St Andrews, University of Strathclyde, University of
Surrey, University of Sussex, University of the West of England, University of
Ulster, University of Warwick, and University of York.

Sources:
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_unibiAPC_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_joai-project_2023-11-28.json
/Users/ccugutrillague/Documents/outreach/smp-hackthaton/maSMP-LLM/data/mpdl_collection_raw/MPDL_auth_2023-11-28.json




