## A Storage-Focused Chatbot
Answer all your SAS, SATA, and NVMe questions

References:
- https://www.mlq.ai/gpt-4-pinecone-website-ai-assistant/
- https://github.com/rabbitmetrics/langchain-13-min/blob/main/notebooks/langchain-13-min.ipynb
- https://blog.futuresmart.ai/building-a-document-based-question-answering-system-with-langchain-pinecone-and-llms-like-gpt-4-and-chatgpt

In [None]:
# Load environment variables
import os
from dotenv import load_dotenv,find_dotenv

# Show the working directory to help diagnose a missing .env file.
print(f"current directory: {os.getcwd()}")

# Locate the .env file once and reuse the result; find_dotenv() returns an
# empty string when nothing is found, so the path doubles as the "found" flag.
dotenv_path = find_dotenv()
if dotenv_path:
    print("Found .env file")
    load_dotenv(dotenv_path)
else:
    print("No .env file found")
# print all environment variables


In [None]:
# Import the chat message schema and ChatOpenAI, used to query the chat models (GPT-3.5-turbo or GPT-4)

from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.chat_models import ChatOpenAI


In [None]:
# chat = ChatOpenAI(model_name="gpt-3.5-turbo",temperature=0.3)
# messages = [
#     SystemMessage(content="You are an expert Linux storage engineer with experience in SATA, SAS, hard drives, NVMe, and SSDs"),
#     HumanMessage(content="give a short description of your skills")
# ]
# response=chat(messages)

# print(response.content,end='\n')

In [None]:
"""create embeddings for langchain documents
Here we are extracting the text and metadata into all_the_pages"""
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter



# List everything in the sources directory (printed as-is for visibility).
files = os.listdir("sources")
print(files)

# PyMuPDFLoader is a PDF loader, so only hand it PDF files: stray entries such
# as .DS_Store or sub-directories would otherwise abort the whole run.
# Sorting makes the processing order reproducible (os.listdir order is arbitrary).
pdf_files = sorted(name for name in files if name.lower().endswith(".pdf"))

# Accumulates the Document objects (text + metadata) of every source PDF.
all_the_pages = []

for file in pdf_files:
    loader = PyMuPDFLoader(f"sources/{file}")
    pages = loader.load_and_split()
    all_the_pages.extend(pages)


print(f"len(all_the_pages) = {len(all_the_pages)}")
# Split the loaded pages into small chunks (200 characters, 10 of overlap)
# so each chunk can be embedded and retrieved individually.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=10,
)
docs = text_splitter.split_documents(all_the_pages)
print(f"len(docs) = {len(docs)}")


# The embeddings for these chunks are created in the next cell.


In [None]:
# Import and instantiate OpenAI embeddings

from langchain.embeddings import OpenAIEmbeddings

# OpenAIEmbeddings selects the embedding model through its `model` field
# (`model_name` is the keyword used by ChatOpenAI, not by the embeddings class).
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
#embeddings = OpenAIEmbeddings(model_name="gpt-4")

In [None]:

# Import and initialize Pinecone client

import os
import tiktoken
import pinecone
from langchain.vectorstores import Pinecone


# Connect to Pinecone with the credentials taken from the environment.
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment=os.getenv('PINECONE_ENV')
)

index_name = "sata-chat"
# The index is rebuilt from scratch on every run. Only delete it when it
# actually exists, otherwise the very first run fails on a missing index.
if index_name in pinecone.list_indexes():
    print(f"deleting index {index_name}")
    pinecone.delete_index(index_name)
print(f"creating index {index_name}")
# The dimension must equal the embedding vector size
# (1536 for text-embedding-ada-002).
pinecone.create_index(index_name, dimension=1536)
print(f"indexing {index_name}")
# Embed every chunk in `docs` and upload it; `index` is the resulting
# LangChain vector store used for similarity search and retrieval.
index = Pinecone.from_documents(docs, embeddings, index_name=index_name)
pinecone.list_indexes()

In [None]:

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA


def get_similiar_docs(query, k=2, score=False):
    """Return the ``k`` chunks of the vector store closest to ``query``.

    Args:
        query: Text to search for.
        k: Number of results requested from the vector store.
        score: When true, the search is done with
            ``similarity_search_with_score`` instead of ``similarity_search``.

    Returns:
        Whatever the selected search method of the module-level ``index``
        returns.
    """
    if not score:
        return index.similarity_search(query, k=k)
    return index.similarity_search_with_score(query, k=k)

# Chat model used to answer questions. Set this to "gpt-4" to use GPT-4
# instead (the previous back-to-back assignments left "gpt-4" as a dead store).
model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=model_name)

# Question-answering chain that looks up context through the vector store's
# retriever and lets the chat model write the answer.
qa_chain = RetrievalQA.from_chain_type(llm, retriever=index.as_retriever())

def get_answer(query):
    """Answer ``query`` with the retrieval QA chain.

    The closest chunks are fetched with ``get_similiar_docs`` and printed
    purely for inspection. They are not handed to the chain: ``qa_chain`` is
    a RetrievalQA chain, which only reads the ``query`` input and fetches its
    own context through its retriever, so a separate ``input_documents``
    argument would be silently ignored.

    Args:
        query: The question to answer.

    Returns:
        The answer produced by ``qa_chain``.
    """
    similar_docs = get_similiar_docs(query)
    print(similar_docs)
    return qa_chain.run(query=query)

In [None]:
# Example question: run it through get_answer() and print the result.
query = "What are the SATA speeds?"
print(get_answer(query))