# Loading

In [1]:
from langchain_community.document_loaders import JSONLoader

In [2]:
import json
from pathlib import Path
from pprint import pprint


file_path='./data.json'
data = json.loads(Path(file_path).read_text())

pprint(data)

[{'instructions': 'Make sure it is clean before recycling.',
  'item': 'Printed paper (Glossy and non-glossy)',
  'material': 'Paper',
  'recyclable': True,
  'similar_items': ['Printed paper (Glossy and non-glossy)']},
 {'instructions': 'Make sure it is clean before recycling.',
  'item': 'Writing paper',
  'material': 'Paper',
  'recyclable': True,
  'similar_items': ['Writing paper']},
 {'instructions': 'Make sure it is clean before recycling.',
  'item': 'Paper',
  'material': 'Paper',
  'recyclable': True,
  'similar_items': ['Paper',
                    'Exam Papers',
                    'Notes',
                    'Notebook',
                    'Mail',
                    'Letter',
                    'Bill',
                    'Foolscap Paper',
                    'Exercise Book']},
 {'instructions': 'Clothes should be donated if they are in good condition. '
                  '<br/><br/> Click <a '
                  "href='https://www.nea.gov.sg/our-services/waste-managemen

In [3]:
item_names = set()
data_cleaned = []
for d in data:
    if d['item'] not in item_names:
        data_cleaned.append(d)
        item_names.add(d['item'])


In [5]:
# save to json file
with open('./data_cleaned.json', 'w') as file:
    json.dump(data_cleaned, file)

In [7]:
loader = JSONLoader(
    file_path='./data_cleaned.json',
    jq_schema='.[] | {material, item, similar_items, recyclable, instructions}',
    text_content=False,
    # metadata_func=lambda doc, index: {
    #     "doc": doc,
    #     "index": index
    # }
    )

docs = loader.load()

In [4]:
# # remove the irrelevant fields in the content
# def extract_relevant_fields(page_content: dict):
#     return {
#         "material": page_content.get('material', None),
#         "item": page_content.get('item', None),
#         "similar_items": page_content.get('similar_items', [])
#     }

# for i in range(len(docs)):
#     content_json = json.loads(docs[i].page_content)
#     docs[i].page_content = json.dumps(extract_relevant_fields(content_json))



# Splitting

In [50]:
# No splitting as each document has relatively small tokens

# Embedding

In [10]:
from langchain_openai import OpenAIEmbeddings

embedder = OpenAIEmbeddings(model='text-embedding-ada-002')

# Vector Store

In [11]:
from dotenv import load_dotenv
load_dotenv()



True

In [12]:
from langchain_pinecone import PineconeVectorStore
import os 

index_name = os.getenv("PINECONE_INDEX_NAME")

docsearch = PineconeVectorStore.from_documents(docs, embedder, index_name=index_name)



In [13]:
# query = "The item is potato chips bag. Is it recyclable?"
query = "The item in the image is a plastic wrapper for an A4 paper ream, branded by PaperOne."
docs = docsearch.similarity_search(query)

for doc in docs:
    print(doc.page_content, end='\n\n')



{"material": "Paper", "item": "Plastic envelope", "similar_items": ["Plastic envelope"], "recyclable": true, "instructions": "Make sure it is clean before recycling."}

{"material": "Paper", "item": "Printed paper (Glossy and non-glossy)", "similar_items": ["Printed paper (Glossy and non-glossy)"], "recyclable": true, "instructions": "Make sure it is clean before recycling."}

{"material": "Plastic", "item": "Toilet paper packaging", "similar_items": ["Toilet paper packaging"], "recyclable": true, "instructions": "Make sure it is clean before recycling."}

{"material": "Paper", "item": "Paper Packaging (printed paper box etc)", "similar_items": ["Gifts", "Presents"], "recyclable": true, "instructions": "Please flatten before recycling."}



In [66]:
# query = "The item is potato chips bag. Is it recyclable?"
query = "The item in the image is a plastic wrapper for an A4 paper ream, branded by PaperOne."
docs = docsearch.max_marginal_relevance_search(query)

for doc in docs:
    print(doc.page_content, end='\n\n')



{"material": "Paper", "item": "Printed paper (Glossy and non-glossy)", "recyclable": "YES!", "instructions": "Make sure it is clean before recycling.", "similar_items": ["Exam Papers ", "Notes", "Notebook", "Mail", "Letter", "Stamps"]}

{"material": "Paper", "item": "Beverage carton", "recyclable": "YES!", "instructions": "Please empty, rinse and flatten before recycling.", "similar_items": ["Beverage carton", "Juice", "Pokka", "Milo", "Yeo's ", "Ribena", "Green Tea", "Ice Lemon Tea", "Lychee Tea", "Marigold", "Peel Fresh", "Orange Juice", "Apple Juice", "FNN", "Seasons", "Meiji", "Oatly", "Oatside", "HL", "Fruit Juice Packet", "Hojicha Tea", "Oolong Tea ", "Vitasoy", "F&N", "Greenfield", "Farm Fresh"]}

{"material": "Others", "item": "Plastic food wrap", "recyclable": "YES!", "instructions": "Make sure it is clean before recycling.", "similar_items": ["Plastic food wrap"]}

{"material": "Paper", "item": "Paper packaging contaminated with food", "recyclable": "NO!", "instructions": "sh

  docs = docsearch.max_marginal_relevance_search(query)


In [7]:
from langchain.vectorstores import Chroma

In [8]:
persist_directory = 'docs/chroma'

In [9]:
# !rm -rf ./docs/chroma  # remove old database files if any
# !rmdir docs

In [10]:
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedder,
    persist_directory=persist_directory
)

In [11]:
print(vectordb._collection.count())



322


In [16]:
question = "The item in the image is a plastic wrapper for an A4 paper ream, branded by PaperOne."

In [45]:
docs = vectordb.max_marginal_relevance_search(question, k=5)

for doc in docs:
    print(doc.metadata, end='\n\n')



{'seq_num': 218, 'source': '/home/mightymagnus/projects/blooapp-api/app/data.json'}

{'seq_num': 238, 'source': '/home/mightymagnus/projects/blooapp-api/app/data.json'}

{'seq_num': 240, 'source': '/home/mightymagnus/projects/blooapp-api/app/data.json'}

{'seq_num': 39, 'source': '/home/mightymagnus/projects/blooapp-api/app/data.json'}

{'seq_num': 291, 'source': '/home/mightymagnus/projects/blooapp-api/app/data.json'}



In [29]:
vectordb.add_documents()

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'document', 'Document'],
 'kwargs': {'page_content': '{"material": "Paper", "item": "Printed paper box", "recyclable": "YES!", "instructions": "Please flatten before recycling.", "similar_items": []}',
  'metadata': {'seq_num': 238,
   'source': '/home/mightymagnus/projects/blooapp-api/app/data.json'},
  'type': 'Document'}}

In [47]:
vectordb.

{'seq_num': 291,
 'source': '/home/mightymagnus/projects/blooapp-api/app/data.json'}

In [44]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Initialize the embedding model and vector database
embedder = OpenAIEmbeddings()
vectordb = Chroma(persist_directory='docs/chroma', embedding=embedder)

# Example document
doc_content = {
    "id": "436",
    "material": "Others",
    "item": "Clothes",
    "recyclable": "NOT IN ME!",
    "instructions": "Clothes should be donated if they are in good condition...",
    "similar_items": ["T-Shirt", "Shorts", "Pants", "Skirt", "Jacket", "Shirts", "Tie", "Blouse", "Singlet", "Camisole", "Clothes"]
}

# Function to prepare text for embedding based on specific fields
def prepare_text_for_embedding(item, similar_items):
    return item + " " + " ".join(similar_items)

# Embedding the relevant fields
embedding_text = prepare_text_for_embedding(doc_content['item'], doc_content['similar_items'])
embedding_vector = embedder.get_embedding(embedding_text)

# Adding the document to Chroma with full document as metadata
vectordb.add_document(doc_content['id'], embedding_vector, metadata=doc_content)

# Function to retrieve a full document by ID
def get_full_document_by_id(doc_id):
    result = vectordb.get_document(doc_id)
    if result:
        return result.metadata
    return None

# Example of retrieving a document
retrieved_document = get_full_document_by_id("436")
print(retrieved_document)

'{"material": "Plastic", "item": "Plastic packaging for packet drink", "recyclable": "YES!", "instructions": "Make sure it is clean before recycling.", "similar_items": ["Plastic packaging for packet drink"]}'