#### Load the JSON file into a dictionary

In [9]:
from langchain_community.document_loaders import JSONLoader

import json
from pathlib import Path
from pprint import pprint


# Location of the chapter JSON, relative to the notebook's working directory.
file_path='../jsons/ch28.json'
# Decode explicitly as UTF-8: without an encoding argument read_text() falls back
# to the platform's locale encoding, which can mis-decode or fail on non-ASCII text.
json_dict = json.loads(Path(file_path).read_text(encoding="utf-8"))

#### Create langchain documents with the relevant data

In [12]:
from langchain_core.documents import Document

In [15]:
# Turn every entry of json_dict["Items"] into a Document: the searchable text
# joins the prefix, heading name and description, while the HS code and the
# source document name travel along as metadata.
items = json_dict["Items"]
source = json_dict["Document Name"]

docs = []
for entry in items:
    # Read the four fields in a fixed order so a missing key surfaces the same way.
    prefix, heading_name, hs_code, description = (
        entry[key] for key in ("Prefix", "HS Hdg Name", "HS Code", "Description")
    )
    text = "".join([
        "Prefix: ", prefix,
        " , HS Heading Name:", heading_name,
        " , Description:", description,
    ])
    docs.append(
        Document(
            page_content=text,
            metadata={"source": source, "HS Code": hs_code},
        )
    )


#### Splitting

There is no need to split the individual items; how to handle the chapter context is still to be decided.

#### Create Vectorstore

In [20]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
# Read the local .env file; the boolean result is discarded into `_`.
_ = load_dotenv(find_dotenv()) # read local .env file
# Fail fast: os.environ[...] raises KeyError when OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

In [27]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding object handed to the vector store below; built with the library defaults.
embedding = OpenAIEmbeddings()

In [28]:
from langchain.vectorstores import Chroma

# Directory passed to Chroma as its persistence location. It is a relative path,
# so it resolves against the kernel's current working directory.
persist_directory = 'vectorstore/Chroma'

In [30]:
# Build (or refresh) the persisted Chroma collection from the prepared documents.
# Without explicit ids every run of this cell adds another copy of all documents
# to the store on disk. Deterministic ids make the cell idempotent: a re-run
# writes to the same ids instead of appending duplicates.
# NOTE(review): entries already stored under other ids (e.g. from earlier
# experiments) are not removed by this — clear the directory if they are unwanted.
doc_ids = [
    f"{doc.metadata.get('source', 'doc')}-{position}"
    for position, doc in enumerate(docs)
]

vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedding,
    ids=doc_ids,
    persist_directory=persist_directory
)

In [10]:
# Sanity check: print how many entries the underlying collection reports.
# NOTE(review): `_collection` is a private attribute of the wrapper — confirm it
# is stable across library versions before relying on it.
print(vectordb._collection.count())

190


#### Testing with Similarity Search

In [32]:
# Smoke test of the vector store: fetch the 3 entries most similar to a bare keyword.
# NOTE: `question1` is assigned again in a later cell with a different question.
question1 = "chlorine"
ans = vectordb.similarity_search(question1,k=3)

In [33]:
# Show the documents returned by the similarity search.
ans

[Document(metadata={'HS Code': '2801.10', 'source': 'Ch. 28.pdf'}, page_content='Prefix:  , HS Heading Name:Flourine, chlorine, bromine and iodine , Description:Chlorine'),
 Document(metadata={'HS Code': '2801.20', 'source': 'Ch. 28.pdf'}, page_content='Prefix:  , HS Heading Name:Flourine, chlorine, bromine and iodine , Description:Iodine'),
 Document(metadata={'HS Code': '2812.11', 'source': 'Ch. 28.pdf'}, page_content='Prefix: Chlorides and chloride oxides : , HS Heading Name:Halides and halide oxides of non-metals. , Description:Carbonyl dichloride (Phosgene)')]

#### Feeding to Chat Bot

In [34]:
from langchain.chat_models import ChatOpenAI
# Chat model used by the QA chain.
# temperature=0 — presumably chosen to keep answers as repeatable as possible; confirm.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

  llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)


In [23]:
# from langchain.chains import RetrievalQA

# qa_chain = RetrievalQA.from_chain_type(
#     llm,
#     retriever=vectordb.as_retriever()
# )

In [25]:
# question = "what are the hs codes for group 7 elements"
# result = qa_chain({"query": question})
# result["result"]

"I don't have information on the HS codes for group 7 elements."

In [74]:
# RetrievalQA was previously imported only in a commented-out cell, so this cell
# raised NameError on a fresh kernel; import it where it is used.
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Build prompt
# {context} receives the retrieved documents and {question} the user query.
template = """You are a chemistry expert. The context provided is from a tariff document regarding chemicals. Use the context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Retrieval-augmented QA chain over the vector store.
# The retriever uses MMR search and returns k=20 documents per query; per the
# retriever documentation, lambda_mult closer to 0 favours diversity over similarity.
# return_source_documents=True keeps the retrieved documents in the result so
# they can be inspected afterwards.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(
        search_type = "mmr",
        search_kwargs = { 'k':20, 'lambda_mult':0.25 }
    ),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)



In [86]:
# Probe questions tried against the chain; each trailing comment records the
# outcome observed when that question was run. Only the question passed to the
# chain at the bottom of the cell is executed.
question1 = "What is the HS code for chlorine"  # works
question2 = "what is the pal sg for chlorine"  # works
question3 = "what the hs codes for rare gases"  # acceptable
question4 = "what are the hs codes for group 7 elements"  # confuses it, retrieves the wrong items
question5 = "what are the hs codes for halogens"  # but this works :)
question6 = "what is the pal sg for bromine"  # works
question7 = "what is the pal gen for iodine"  # works
question8 = "what is the hs code for sand"  # fails
question9 = "what is the hs hdg for radioactive stuff"  # works
question10 = "how many radioactive categories are there?"  # should be 9, returns 7
question11 = "list the radioactive categories"  # returns 5, but there are more in the retrieved 20 documents
question12 = "what is the hs code of hydrogen peroxide"  # works
question13 = "list all the chlorides mentioned"  # returns 6
# Got the items from the dedicated chloride category, but missed chlorides filed
# elsewhere, e.g. thionyl chloride (under halides) and hydrogen chloride (in
# another category).
question14 = "list all the chlorides mentioned with their hs codes"
# Returned argon's HS code for helium (helium-3 is listed in a different
# category); got argon and neon right (neon falls under "other"), but did not
# realise that krypton and above also fall under "other".
question15 = "list the noble gases with their hs codes"
question16 = "list hs code of helium"  # correct despite the above
question17 = "what are the details of hs code 2812.15"
question18 = "what are the details of hs code 2804.30"
question19 = "what is the hs code of oxygen"
question20 = "what are the inorganic oxygen compounds of non-metals that have Gen Duty."

# Use .invoke(): calling the chain object directly (qa_chain({...})) goes through
# the deprecated Chain.__call__ path. The returned dict is indexed the same way.
result = qa_chain.invoke({"query": question20})
result["result"]

'The inorganic oxygen compounds of non-metals that have Gen Duty are Hydrogen cyanide (hydrocyanic acid) and Hydrogen fluoride (hydrofluoric acid).'

In [87]:
# Inspect the documents that were retrieved for the query stored in `result`.
# NOTE(review): the recorded output shows entries with `seq_num` metadata and raw
# JSON as page_content, which does not match the Documents built earlier in this
# notebook — presumably leftovers in the persisted store from an earlier run; verify.
result["source_documents"]

[Document(metadata={'seq_num': 35, 'source': '/Users/rehangagamage/Desktop/pdfplumber/jsons/ch28.json'}, page_content='{"Prefix": "Other inorganic oxygen compounds of non-\\nmetals :", "HS Hdg Name": "Other inorganic acids and other\\ninorganic oxygen compounds of non-\\nmetals.", "HS Hdg": "28.11", "HS Code": "2811.21", "Description": "Carbon dioxide", "Unit": "kg", "ICL/SLSI": "", "Preferential Duty_AP": "", "Preferential Duty_AD": "", "Preferential Duty_BN": "", "Preferential Duty_GT": "", "Preferential Duty_IN": "Free", "Preferential Duty_PK": "Free", "Preferential Duty_SA": "", "Preferential Duty_SF": "5%", "Preferential Duty_SD": "4.5%", "Preferential Duty_SG": "10.0%", "Gen Duty": "15%", "VAT": "18%", "PAL_Gen": "10%", "PAL_SG": "6%", "Cess_GEN": "5%", "Cess_SG": "Free", "Excise SPD": "", "SSCL": "2.5%", "SCL": ""}'),
 Document(metadata={'seq_num': 36, 'source': '/Users/rehangagamage/Desktop/pdfplumber/jsons/ch28.json'}, page_content='{"Prefix": "Other inorganic oxygen compounds