In [13]:
from langchain.vectorstores import FAISS
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings


import os
import json
import openai

In [5]:
def openai_setup(secret_path):
    """
    Load OpenAI API key from the secrets file
    """
    with open(secret_path) as f:
        secrets = json.load(f)

    os.environ['OPENAI_API_KEY'] = secrets['OPENAI_API_KEY']
    openai.api_key = os.environ['OPENAI_API_KEY']

openai_setup('../secrets/openai_secret.json')


In [6]:
loader = CSVLoader("metadata/processed/Washington.csv")

In [16]:
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db = FAISS.from_documents(docs, embeddings)

In [17]:
db.save_local("metadata/vector_db")


In [18]:
db = FAISS.load_local('vector_db_loaded', embeddings=embeddings)


ValueError: The de-serialization relies loading a pickle file. Pickle files can be modified to deliver a malicious payload that results in execution of arbitrary code on your machine.You will need to set `allow_dangerous_deserialization` to `True` to enable deserialization. If you do this, make sure that you trust the source of the data. For example, if you are loading a file that you created, and no that no one else has modified the file, then this is safe to do. Do not set this to `True` if you are loading a file from an untrusted source (e.g., some random site on the internet.).

In [19]:
def generate_answer(q):
    context = db.similarity_search(q)
    # Truncate context. Shouldn't have to do this but
    # without it we often generate the empty string.
    N = 10_000
    if len(context)>N:
        context = context[:N]
    prompt = f'''[INSTRUCTIONS]:\nYou are an educator supporting students on a discussion \
    board for a course in Data Science at Harvard by answering questions like the one below. \
    When you don't have information about some aspect of course policy from the provided relevant information, \
    you MUST NOT make any unfounded assertion. \
    You should instead advise students to message the course helpline, cs109a2023@gmail.com, \
    if you lack the required information, \
    but you must not imply that the course policies or deadlines are negotiable. \
    These are final unless there is a medical or other university excused exception.\
    You should begin your responses with a brief, friendly greeting so as to not appear curt. \
    Empty answers are never acceptable; You must provide some kind of meaningful response. \
    Most importantly, you must NEVER contradict yourself or the provided relevant information as it pertains to course policy.\n\n\
    [RELEVANT INFORMATION]: {context}\n\n\
    [QUESTION]:\n{q}\n\n\
    [ANSWER]:\n'''
    result = pipe(prompt)
    answer = result[0]['generated_text'][len(prompt):].strip()
    return answer

In [20]:
db.similarity_search("What is the concern about the data?")

[Document(page_content='Everybody was so friendly and kind.: Data entered by Maria Kim 12/30/24', metadata={'source': 'metadata/processed/Washington.csv', 'row': 325}),
 Document(page_content='Everybody was so friendly and kind.: The mobile network is not good and on top of it the guest internet is really  bad even to finish the survey.', metadata={'source': 'metadata/processed/Washington.csv', 'row': 273}),
 Document(page_content='Everybody was so friendly and kind.: I am taking the survey 12hrs postpartum and thus far so not have complications', metadata={'source': 'metadata/processed/Washington.csv', 'row': 326}),
 Document(page_content='Everybody was so friendly and kind.: 1. Our son was given a flu shot instead of being given a hepatitis B shot. 2. The hearing test devise had technical issues which was giving some false readings. The woman said it has been having issues for a month. Not sure why she was using it and did not report it ASAP', metadata={'source': 'metadata/processed/