JSON Parsing and processing

In [1]:
import os 
import json
# Create the directory the sample JSON/JSONL files are written to.
# exist_ok=True means no error is raised if it already exists, so this cell can be re-run.
os.makedirs("data/json_files", exist_ok=True)

In [2]:
# Sample company record: two employees (each with skills and projects)
# plus summary figures for two departments.
json_data = {
    "company": "TechCorp",
    "employees": [
        {
            "id": 1,
            "name": "John Doe",
            "role": "Software Engineer",
            "skills": ["Python", "JavaScript", "React"],
            "projects": [
                {"name": "RAG System", "status": "In Progress"},
                {"name": "Data Pipeline", "status": "Completed"},
            ],
        },
        {
            "id": 2,
            "name": "Jane Smith",
            "role": "Data Scientist",
            "skills": ["Python", "Machine Learning", "SQL"],
            "projects": [
                {"name": "ML Model", "status": "In Progress"},
                {"name": "Analytics Dashboard", "status": "Planning"},
            ],
        },
    ],
    "departments": {
        "engineering": {"head": "Mike Johnson", "budget": 1000000, "team_size": 25},
        "data_science": {"head": "Sarah Williams", "budget": 750000, "team_size": 15},
    },
}

# Persist the record as pretty-printed JSON (2-space indent).
with open("data/json_files/company_data.json", "w") as f:
    f.write(json.dumps(json_data, indent=2))

In [5]:
# Three sample events for a single user.
jsonl_data = [
    {"timestamp": "2024-01-01", "event": "user_login", "user_id": 123},
    {"timestamp": "2024-01-01", "event": "page_view", "user_id": 123, "page": "/home"},
    {"timestamp": "2024-01-01", "event": "purchase", "user_id": 123, "amount": 99.99},
]

# JSON Lines layout: one serialized object per line, each terminated by a newline.
with open("data/json_files/events.jsonl", "w") as f:
    for event in jsonl_data:
        f.write(f"{json.dumps(event)}\n")

JSON processing strategies

In [8]:
from typing import List
from langchain_core.documents import Document

In [9]:
def process_json(file_path: str)-> List[Document]:
    """Build one Document per employee from a company JSON file.

    Args:
        file_path: Path to a UTF-8 JSON file whose top-level object may hold
            an ``employees`` list. Every employee must provide ``id``,
            ``name`` and ``role``; ``skills`` and ``projects`` are optional
            (missing or null is treated as empty).

    Returns:
        One Document per employee, in file order. ``page_content`` is a text
        summary of name, role, skills and projects; ``metadata`` holds the
        source path and the employee's id, name and role. The list is empty
        when the file has no ``employees`` key.

    Raises:
        KeyError: If an employee lacks ``id``, ``name`` or ``role``, or a
            project lacks ``name`` or ``status``.
    """
    # JSON text is UTF-8 by specification; without an explicit encoding the
    # platform default is used, which mis-decodes non-ASCII names elsewhere.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    docs=[]
    for emp in data.get('employees', []):
        # `or []` also covers an explicit null, not just a missing key.
        skills = emp.get('skills') or []
        projects = emp.get('projects') or []

        # The text layout (including the indentation inside the literal) is
        # unchanged, so previously generated page contents stay identical.
        content=f""" Employee Information:
             name: {emp['name']},
             role: {emp['role']},
             skills: {', '.join(skills)},

             projects:"""
        for proj in projects:
            content += f"\n - {proj['name']} (status: {proj['status']})"

        doc = Document(
            page_content = content,
            metadata = {
                'source': file_path,
                'employee_id':emp['id'],
                'employee_name': emp['name'],
                'role': emp['role']
                }
        )

        docs.append(doc)

    return docs

In [15]:
# Parse the sample company file and inspect the first resulting document.
docs = process_json("data/json_files/company_data.json")
print(f"count of docs: {len(docs)}")

# Looked up only after the count is printed, matching the original order of evaluation.
first_doc = docs[0]
print(f"content: \n {first_doc.page_content}")
print(f"metadata: \n {first_doc.metadata}")

count of docs: 2
content: 
  Employee Information:
             name: John Doe,
             role: Software Engineer,
             skills: Python, JavaScript, React,

             projects:
 - RAG System (status: In Progress)
 - Data Pipeline (status: Completed)
metadata: 
 {'source': 'data/json_files/company_data.json', 'employee_id': 1, 'employee_name': 'John Doe', 'role': 'Software Engineer'}
