# Loading

In [1]:
from langchain_community.document_loaders import JSONLoader

In [3]:
DATA_PATH = '../data/cleaned_data_final.json'

In [4]:
import json
from pathlib import Path
from pprint import pprint


file_path=DATA_PATH
data = json.loads(Path(file_path).read_text())

pprint(data)

[{'instructions': '9-Volt battery can be recycled at e-waste collection '
                  'points, located at ALBA E-waste recycling bins.  Remember '
                  'to tape the ends of the battery and seal leaking batteries '
                  'in a leak-proof container or bag first to prevent '
                  'short-circuit or fire incidents.',
  'item': '9-volt battery',
  'links': ['https://www.nea.gov.sg/our-services/waste-management/3r-programmes-and-resources/e-waste-management/where-to-recycle-e-waste'],
  'material': 'Others',
  'recyclable': True},
 {'instructions': 'Please empty contents before recycling to prevent fire '
                  'incidents.',
  'item': 'aerosol cans',
  'links': [],
  'material': 'Others',
  'recyclable': True},
 {'instructions': 'Can be recycled through: • Retailer 1-for-1 Take-back• '
                  'Bulky item removal service by TCs for HDB residents• '
                  'E-waste collection drive (Quarterly)• Doorstep collection '
 

In [5]:
loader = JSONLoader(
    file_path=DATA_PATH,
    jq_schema='.[] | {material, item, recyclable, instructions, links}',
    text_content=False,
    # metadata_func=lambda doc, index: {
    #     "doc": doc,
    #     "index": index
    # }
    )

docs = loader.load()

In [4]:
# # remove the irrelevant fields in the content
# def extract_relevant_fields(page_content: dict):
#     return {
#         "material": page_content.get('material', None),
#         "item": page_content.get('item', None),
#         "similar_items": page_content.get('similar_items', [])
#     }

# for i in range(len(docs)):
#     content_json = json.loads(docs[i].page_content)
#     docs[i].page_content = json.dumps(extract_relevant_fields(content_json))



In [6]:
docs[0]

Document(metadata={'source': '/home/mightymagnus/projects/blooapp-api/data/cleaned_data_final.json', 'seq_num': 1}, page_content='{"material": "Others", "item": "9-volt battery", "recyclable": true, "instructions": "9-Volt battery can be recycled at e-waste collection points, located at ALBA E-waste recycling bins.  Remember to tape the ends of the battery and seal leaking batteries in a leak-proof container or bag first to prevent short-circuit or fire incidents.", "links": ["https://www.nea.gov.sg/our-services/waste-management/3r-programmes-and-resources/e-waste-management/where-to-recycle-e-waste"]}')

# Splitting

In [50]:
# No splitting as each document has relatively small tokens

# Embedding

In [7]:
from langchain_openai import OpenAIEmbeddings

embedder = OpenAIEmbeddings(model='text-embedding-ada-002')

# Vector Store

In [8]:
from dotenv import load_dotenv
load_dotenv()



True

In [10]:
from langchain_pinecone import PineconeVectorStore
import os 

index_name = os.getenv("PINECONE_INDEX_NAME")

docsearch = PineconeVectorStore.from_documents(docs, embedder, index_name=index_name)



In [18]:
query = "styrofoam, toothpase tube"

In [19]:
# query = "The item is potato chips bag. Is it recyclable?"
docs = docsearch.similarity_search(query)

for doc in docs:
    print(doc.page_content, end='\n\n')



{"material": "Plastic", "item": "styrofoam", "recyclable": false, "instructions": "Should be disposed of as general waste.", "links": []}

{"material": "Plastic", "item": "styrofoam cup", "recyclable": false, "instructions": "Should be disposed of as general waste.", "links": []}

{"material": "Plastic", "item": "styrofoam clamshell container", "recyclable": false, "instructions": "Should be disposed of as general waste.", "links": []}

{"material": "Plastic", "item": "polystyrene foam product", "recyclable": false, "instructions": "Should be disposed of as general waste.", "links": []}



In [20]:
docs = docsearch.max_marginal_relevance_search(query)

for doc in docs:
    print(doc.page_content, end='\n\n')



{"material": "Plastic", "item": "styrofoam", "recyclable": false, "instructions": "Should be disposed of as general waste.", "links": []}

{"material": "Glass", "item": "glass tube", "recyclable": true, "instructions": "Can be recycled at e-waste collection points, located at ALBA E-waste recycling bins.", "links": ["https://www.nea.gov.sg/our-services/waste-management/3r-programmes-and-resources/e-waste-management/where-to-recycle-e-waste"]}

{"material": "Paper", "item": "paper towel tube", "recyclable": true, "instructions": "Make sure it is clean before recycling.", "links": []}

{"material": "Plastic", "item": "plastic bubble tea carrier", "recyclable": true, "instructions": "Please empty contents before recycling.", "links": []}



# Using Chroma (in memory vector db)

In [7]:
from langchain.vectorstores import Chroma

In [8]:
persist_directory = 'docs/chroma'

In [9]:
# !rm -rf ./docs/chroma  # remove old database files if any
# !rmdir docs

In [10]:
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedder,
    persist_directory=persist_directory
)

In [11]:
print(vectordb._collection.count())



322


In [16]:
question = "The item in the image is a plastic wrapper for an A4 paper ream, branded by PaperOne."

In [45]:
docs = vectordb.max_marginal_relevance_search(question, k=5)

for doc in docs:
    print(doc.metadata, end='\n\n')



{'seq_num': 218, 'source': '/home/mightymagnus/projects/blooapp-api/app/data.json'}

{'seq_num': 238, 'source': '/home/mightymagnus/projects/blooapp-api/app/data.json'}

{'seq_num': 240, 'source': '/home/mightymagnus/projects/blooapp-api/app/data.json'}

{'seq_num': 39, 'source': '/home/mightymagnus/projects/blooapp-api/app/data.json'}

{'seq_num': 291, 'source': '/home/mightymagnus/projects/blooapp-api/app/data.json'}



In [29]:
vectordb.add_documents()

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'document', 'Document'],
 'kwargs': {'page_content': '{"material": "Paper", "item": "Printed paper box", "recyclable": "YES!", "instructions": "Please flatten before recycling.", "similar_items": []}',
  'metadata': {'seq_num': 238,
   'source': '/home/mightymagnus/projects/blooapp-api/app/data.json'},
  'type': 'Document'}}

In [47]:
vectordb.

{'seq_num': 291,
 'source': '/home/mightymagnus/projects/blooapp-api/app/data.json'}

In [44]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Initialize the embedding model and vector database
embedder = OpenAIEmbeddings()
vectordb = Chroma(persist_directory='docs/chroma', embedding=embedder)

# Example document
doc_content = {
    "id": "436",
    "material": "Others",
    "item": "Clothes",
    "recyclable": "NOT IN ME!",
    "instructions": "Clothes should be donated if they are in good condition...",
    "similar_items": ["T-Shirt", "Shorts", "Pants", "Skirt", "Jacket", "Shirts", "Tie", "Blouse", "Singlet", "Camisole", "Clothes"]
}

# Function to prepare text for embedding based on specific fields
def prepare_text_for_embedding(item, similar_items):
    return item + " " + " ".join(similar_items)

# Embedding the relevant fields
embedding_text = prepare_text_for_embedding(doc_content['item'], doc_content['similar_items'])
embedding_vector = embedder.get_embedding(embedding_text)

# Adding the document to Chroma with full document as metadata
vectordb.add_document(doc_content['id'], embedding_vector, metadata=doc_content)

# Function to retrieve a full document by ID
def get_full_document_by_id(doc_id):
    result = vectordb.get_document(doc_id)
    if result:
        return result.metadata
    return None

# Example of retrieving a document
retrieved_document = get_full_document_by_id("436")
print(retrieved_document)

'{"material": "Plastic", "item": "Plastic packaging for packet drink", "recyclable": "YES!", "instructions": "Make sure it is clean before recycling.", "similar_items": ["Plastic packaging for packet drink"]}'