In [1]:
import json

file_path = 'data/data.txt'

# Read the file and load JSON data
with open(file_path, 'r') as file:
    list_of_docs = json.load(file)

# Now 'data' contains the list of dictionaries
print(type(list_of_docs))

<class 'list'>


In [2]:
from langchain_community.vectorstores import FAISS, Chroma
from langchain.schema import Document

docs = []

for doc_obj in list_of_docs:
    doc = Document(
        page_content=doc_obj["job_description"],
        metadata={
            "id": doc_obj["id"],
            "hirer": doc_obj["hirer"],
            "title": doc_obj["title"],
            "company": doc_obj["company"],
            "company_description": doc_obj["company_description"],
            "location": doc_obj["location"],
            "job_type": doc_obj["job_type"],
            "experience_level": doc_obj["experience_level"],
            "requirement": doc_obj["requirement"],
            "skill": doc_obj["skill"],
            "salary": doc_obj["salary"]
        }
    )
    docs.append(doc)



In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [4]:
from torch import cuda

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

In [5]:
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': device})

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
persist_directory="./chroma_db"
vectordb = Chroma.from_documents(docs, embeddings, persist_directory=persist_directory)

In [7]:
vectordb.get()

{'ids': ['97fea212-d472-11ee-b1a0-38d57a06f7e2',
  '97fea213-d472-11ee-889f-38d57a06f7e2',
  '97fea214-d472-11ee-bc26-38d57a06f7e2',
  '97fea215-d472-11ee-a86b-38d57a06f7e2',
  '97fea216-d472-11ee-9f85-38d57a06f7e2',
  '97fea217-d472-11ee-83b1-38d57a06f7e2',
  '97fea218-d472-11ee-bc72-38d57a06f7e2',
  '97fea219-d472-11ee-853d-38d57a06f7e2',
  '97fea21a-d472-11ee-a4fe-38d57a06f7e2',
  '97fea21b-d472-11ee-a1c4-38d57a06f7e2',
  '97fea21c-d472-11ee-97b9-38d57a06f7e2',
  '97fea21d-d472-11ee-9bf9-38d57a06f7e2',
  '97fea21e-d472-11ee-92f0-38d57a06f7e2',
  '97fea21f-d472-11ee-b597-38d57a06f7e2',
  '97fea220-d472-11ee-93ea-38d57a06f7e2',
  '97fea221-d472-11ee-bc07-38d57a06f7e2',
  '97fea222-d472-11ee-b5cd-38d57a06f7e2',
  '97fea223-d472-11ee-86e9-38d57a06f7e2',
  '97fea224-d472-11ee-85bf-38d57a06f7e2',
  '97fea225-d472-11ee-a3bb-38d57a06f7e2',
  '97fea226-d472-11ee-be91-38d57a06f7e2',
  '97fea227-d472-11ee-a7c1-38d57a06f7e2',
  '97fea228-d472-11ee-9495-38d57a06f7e2',
  '97fea229-d472-11ee-b1b4-

In [8]:
vectordb.persist()