In [170]:
import os
from dotenv import load_dotenv

In [None]:
# Load variables from a local .env file into the process environment so the
# os.getenv(...) lookups in later cells can see them.
load_dotenv()

In [4]:
# OpenAI credential taken from the environment (None when the variable is unset).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [5]:
from langchain_openai.chat_models import ChatOpenAI

In [6]:
# Chat model used for document clean-up and for answering questions further down.
model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")

In [None]:
from pinecone import Pinecone

In [17]:
# Pinecone client authenticated from the PINECONE_API_KEY environment variable.
# NOTE(review): `pc_db` is not referenced again in the visible notebook; the later
# cells use a second client named `pc`.
pc_db = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [20]:
import requests
import openpyxl as xl

In [21]:
# Open the FAOLEX export workbook; its cell hyperlinks are read by download_all_files.
wb = xl.load_workbook("./data/collated/faolex_download.xlsx")

In [22]:
# Work with the first sheet of the workbook.
ws = wb[wb.sheetnames[0]]

In [23]:
def _iter_hyperlink_targets(ws):
    """Yield the target URL of every hyperlinked cell of ``ws``, row by row."""
    for row in ws.iter_rows():
        for cell in row:
            # Cells without a hyperlink (or whose hyperlink has no target) are skipped
            # explicitly instead of relying on a swallowed AttributeError.
            link = getattr(cell, "hyperlink", None)
            target = getattr(link, "target", None)
            if target:
                yield target


def _save_url(url, dest, timeout=60):
    """Fetch ``url`` and write the response body to ``dest``.

    Network, HTTP-status and file-system failures are reported with a printed
    message instead of being raised, so one bad link does not stop the run.

    Returns:
        True when the file was written, False otherwise.
    """
    try:
        res = requests.get(url, timeout=timeout)
        # Do not store an error page under a document's file name.
        res.raise_for_status()
        with open(dest, "wb") as f:
            f.write(res.content)
    except (requests.RequestException, OSError) as e:
        print(f"Failed: {url} ({e})")
        return False
    print(f"Downloaded: {url}")
    return True


def download_all_files(ws):
    """Download every FAOLEX document hyperlinked from a worksheet.

    Links under ``https://faolex.fao.org/docs/pdf/`` are saved to
    ``./data/raw/pdfs`` and links under ``https://faolex.fao.org/docs/html/`` to
    ``./data/raw/htms``; the file name is the part of the URL after that prefix.
    All PDFs are fetched first, then all HTML pages. Any other link (including the
    ``http://www.fao.org/faolex`` landing pages) is ignored, because no usable
    file name can be derived from it.

    Each link is requested once. Failures are printed and skipped. The two output
    directories are created if they do not exist yet.

    Args:
        ws: Worksheet-like object exposing ``iter_rows()`` whose cells carry an
            optional ``hyperlink`` with a ``target`` URL.

    Returns:
        None.
    """
    pdf_prefix = "https://faolex.fao.org/docs/pdf/"
    html_prefix = "https://faolex.fao.org/docs/html/"
    pdf_dir = "./data/raw/pdfs"
    htm_dir = "./data/raw/htms"
    os.makedirs(pdf_dir, exist_ok=True)
    os.makedirs(htm_dir, exist_ok=True)

    # Read the sheet once; both passes work from the same list of targets.
    targets = list(_iter_hyperlink_targets(ws))

    for target in targets:
        if target.startswith(pdf_prefix):
            _save_url(target, os.path.join(pdf_dir, target[len(pdf_prefix):]))

    # Only HTML links are handled here. PDF links were dealt with above, so they
    # are not requested a second time.
    for target in targets:
        if target.startswith(html_prefix):
            _save_url(target, os.path.join(htm_dir, target[len(html_prefix):]))

In [None]:
# Run the download over the worksheet (performs HTTP requests and writes files under ./data/raw).
download_all_files(ws)

In [118]:
from langchain_community.document_loaders import PyPDFLoader
# Load every *.pdf under ./data/raw/pdfs; each entry keeps the file stem and the
# list of per-page text strings.
pdfs = []
for file in os.listdir("./data/raw/pdfs"):
    if not file.endswith(".pdf"):
        continue
    pages = PyPDFLoader(f"./data/raw/pdfs/{file}").load()
    page_texts = [page.page_content for page in pages]
    pdfs.append({"fname": file.replace(".pdf", ""), "text": page_texts})

In [None]:
# Number of PDF documents loaded.
len(pdfs)

In [26]:
from langchain_community.document_loaders import UnstructuredHTMLLoader
# Load every file under ./data/raw/htms; each entry is the list of text strings
# returned by the loader for that file.
htms = []
for file in os.listdir("./data/raw/htms"):
    html_docs = UnstructuredHTMLLoader(f"./data/raw/htms/{file}").load()
    htms.append([html_doc.page_content for html_doc in html_docs])

In [27]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [28]:
# Character-based splitter: chunk_size and chunk_overlap are given in characters.
splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=40) # Apx. 100 words

In [29]:
# Collapse each document's list of page strings into one string.
# The PDF loading cell stores each document as {"fname": ..., "text": [pages]};
# joining that dict directly would concatenate its keys ("fnametext") instead of
# its pages, so the page list is pulled out first. Entries that are already plain
# strings or page lists (e.g. when this cell is re-run) are joined as before.
pdfs = ["".join(p["text"]) if isinstance(p, dict) else "".join(p) for p in pdfs]
htms = ["".join(h) for h in htms]

In [None]:
def gpt_format_document(chain, document):
    """Ask the LLM chain for a cleaned-up, readable version of ``document``.

    Args:
        chain: Object exposing ``invoke(prompt)``.
        document: Text to clean; it is interpolated into the prompt after the
            instructions, on a new line.

    Returns:
        Whatever ``chain.invoke`` returns for the formatting prompt.
    """
    instructions = "Format the following document into one that is cleaner by removing formatting errors and unecessary special characters. Make it readable. Here is the document:"
    return chain.invoke(f"{instructions}\n{document}")

In [None]:
from langchain_core.output_parsers import StrOutputParser

# Output parser that turns the chat model's reply into a plain string.
parser = StrOutputParser()

In [None]:
# Pipeline: the prompt goes to the chat model, whose output is passed through the string parser.
chain = model | parser

In [30]:
pdfs_clean = []
htms_clean = []
printed = 0
# Disabled LLM clean-up pass, kept for reference.
# Presumably this is how ./pdfs_clean1.txt was originally produced. TODO confirm.
# for p in pdfs:
#     try:
#         pdfs_clean.append(gpt_format_document(chain, p["text"]))
#     except Exception as e:
#         pdfs_clean.append(p["text"])
#         print(e)
#     printed += 1
#     print(f"Cleaned: {printed}")

# Load the previously cleaned documents from disk, one list entry per line of the file.
with open("./pdfs_clean1.txt") as cleaned_file:
    pdfs_clean = list(cleaned_file)

In [31]:
# Drop the first and the last character of every line read from the file (the last
# character of a line returned by readlines() is normally its trailing newline).
# NOTE(review): this rebinds pdfs_clean from itself, so re-running the cell trims
# two more characters from every entry each time.
pdfs_clean = [p[1:-1] for p in pdfs_clean]

In [None]:
# Difference between the number of loaded PDFs and the number of cleaned documents (0 when the counts match).
len(pdfs)-len(pdfs_clean)

In [32]:
import tiktoken

In [None]:
# Inspect one document (index 28) in full.
pdfs[28]

In [None]:
# Re-run the LLM clean-up for every entry of pdfs_clean that is still identical to
# an entry of pdfs, sending the text in windows of 15000 tokens and joining the
# formatted pieces with a space.
for p_index, candidate in enumerate(pdfs_clean):
    if candidate not in pdfs:
        continue
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(candidate)
    formatted_parts = [
        gpt_format_document(chain, encoding.decode(tokens[start:start + 15000]))
        for start in range(0, len(tokens), 15000)
    ]
    pdfs_clean[p_index] = " ".join(formatted_parts)
    print(f"Done with {p_index}")
        

In [33]:
# Split every cleaned document into chunks; pdfs_split[i] holds the chunk documents of pdfs_clean[i].
pdfs_split = [splitter.create_documents([p]) for p in pdfs_clean]

In [None]:
# Number of chunks produced for the first document.
len(pdfs_split[0])

In [183]:
from pinecone import Pinecone
# Read the Pinecone key from the environment (populated from .env by load_dotenv())
# instead of embedding the secret in the notebook. The key that used to be pasted
# here has been exposed and should be revoked and replaced in the Pinecone console.
# If the variable does not seem to load, check that .env is in the working
# directory and that load_dotenv() ran before this cell.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError("PINECONE_API_KEY is not set; add it to .env or export it before running this cell.")
pc = Pinecone(api_key=PINECONE_API_KEY)

In [199]:
# Handle to the Pinecone index named "law"; later cells upsert to it and query it.
index_name = "law"
index = pc.Index(index_name)

In [None]:
# Show the index statistics.
index.describe_index_stats()

In [43]:
import random
# Seed Python's built-in random number generator.
# NOTE(review): no other cell in the visible notebook calls `random`.
random.seed(42)

In [44]:
# Flatten the per-document chunk lists into one list of {"content": chunk_text} records.
# pdfs_split is iterated directly: the previous positional lookup into pdfs was never
# used and raised IndexError whenever pdfs_clean held more entries than pdfs.
total_docs = []
for docs in pdfs_split:
    for doc in docs:
        total_docs.append({"content": doc.page_content}) # May need to add other metadata later, so I have created separate arrays total_docs and total_docs_content_only

In [66]:
# Plain list of chunk texts, in the same order as total_docs.
total_docs_content_only = [d["content"] for d in total_docs]

In [55]:
from openai import OpenAI

In [56]:
# OpenAI SDK client; the cells below use it for the embeddings calls.
client = OpenAI(api_key=OPENAI_API_KEY)

In [73]:
# Collects one embeddings response per batch. The batching loop appends to this list,
# so re-run this cell first if the loop is executed again.
all_embeddings = []

In [74]:
import time

In [None]:
# Embed the chunk texts in batches of 2000 inputs, pausing a minute after each
# request to stay under the OpenAI embedding rate limit.
for start in range(0, len(total_docs_content_only), 2000):
    batch_texts = total_docs_content_only[start:start + 2000]
    batch_response = client.embeddings.create(input=batch_texts, model="text-embedding-3-small")
    all_embeddings.append(batch_response)
    time.sleep(60)

In [191]:
# Vector records queued for upload to Pinecone; filled by the next cell.
to_add = []

In [192]:
# Pair every embedding with the chunk text at the same running position and queue
# it as a record whose id is that position rendered as a string.
counter = 0
for response_batch in all_embeddings:
    for item in response_batch.data:
        record = {
            "id": str(counter),
            "values": item.embedding,
            "metadata": {"content": total_docs_content_only[counter]},
        }
        to_add.append(record)
        counter += 1

In [None]:
# Number of records queued for upload.
len(to_add)

In [203]:
# Upload the queued records to the index, 100 at a time.
for start in range(0, len(to_add), 100):
    batch = to_add[start:start + 100]
    index.upsert(batch)

In [332]:
# Example user question used to exercise retrieval and answer generation.
prompt = "If I want to spray pesticides on my farm in California but do not do so regularly, do I have to pay a fee to get my license?"
# Disabled experiment: have the chain draft a law-style answer first (by its name, presumably a HyDE-style query; TODO confirm).
# hyde = chain.invoke(f"Write a law-style response to the following question: {prompt}")
# Embed the question with the same embedding model that was used for the stored chunks.
query = client.embeddings.create(input=prompt, model="text-embedding-3-small")

In [333]:
# Embedding vector of the single query input.
query_embedding = query.data[0].embedding

In [334]:
# Ask the index for the 3 best matches for the query embedding.
results = index.query(
    vector=query_embedding,
    top_k=3
)

In [335]:
# IDs of the matched vectors, in the order the query returned them.
results_ids = [r["id"] for r in results["matches"]]

In [336]:
# Fetch the stored records for the matched IDs so their metadata can be read.
fetch_results = index.fetch(results_ids)["vectors"]

In [337]:
# Collect the chunk text stored in each match's metadata. The list follows the query
# ranking (results_ids) rather than the key order of the fetch response, so the
# best match comes first. IDs missing from the fetch response are skipped.
content = [fetch_results[r_id]["metadata"]["content"] for r_id in results_ids if r_id in fetch_results]

In [338]:
# Named newline for use inside f-string expressions, where a backslash is not allowed before Python 3.12.
newline = "\n"

In [339]:
# Ask the chain to answer the question, with the retrieved chunks appended one per line as context.
response = chain.invoke(f"A user asked the following question: {prompt}{newline}Using the following documents as context, respond to the question. Avoid providing overly-technical answers and instead focus on the main context.{newline}{newline.join(content)}")

In [None]:
# Show the generated answer as plain text.
print(response)

In [342]:
import numpy as np
from numpy.linalg import norm

In [343]:
def cosine_sim(v1, v2):
    """Return the cosine similarity of two numeric sequences.

    Computed as dot(v1, v2) / (||v1|| * ||v2||) after converting both inputs to
    NumPy arrays.
    """
    a = np.array(v1)
    b = np.array(v2)
    return np.dot(a, b) / (norm(a) * norm(b))

In [359]:
# Quick sanity check of cosine_sim on a small pair of vectors.
print(cosine_sim([1, 2], [3, 4]))

0.9838699100999074


In [386]:
# Following is no longer needed thanks to Ragas.
# def answer_relevance(response, actual_question, n=5):
#     generated_questions = json.loads(client.chat.completions.create(model="gpt-3.5-turbo", messages=[{"role": "user", "content": f"Generate {n} questions that a farmer might ask that would have the following response: {response}"}], tools=tools).choices[0].message.tool_calls[0].function.arguments)["questions"]
#     generated_embeddings = client.embeddings.create(input=generated_questions, model="text-embedding-3-small")
#     actual_embeddings = client.embeddings.create(input=actual_question, model="text-embedding-3-small")
#     mean_cosine = sum([cosine_sim(generated_embeddings.data[i].embedding, actual_embeddings.data[0].embedding) for i in range(len(generated_embeddings.data))])/len(generated_embeddings.data)
#     return mean_cosine

In [389]:
from datasets import Dataset 
from ragas.metrics import answer_relevancy, faithfulness
from ragas import evaluate

In [394]:
# Single-sample evaluation set: the question, the generated answer, and the list
# of retrieved context chunks used to produce it.
data_samples = {
    "question": [prompt],
    "answer": [response],
    "contexts": [content]
}

In [396]:
# Build a Dataset from the sample, score it with the Ragas faithfulness and
# answer-relevancy metrics, and display the scores as a DataFrame.
dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset,metrics=[faithfulness, answer_relevancy])
score.to_pandas()

Evaluating: 100%|██████████| 2/2 [00:06<00:00,  3.08s/it]


Unnamed: 0,question,answer,contexts,faithfulness,answer_relevancy
0,If I want to spray pesticides on my farm in Ca...,If you are not regularly engaged in the busine...,[the matter may proceed to hearing as though t...,1.0,0.905619
