In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json

def process_data(path="gitlab_handbook_data.json", chunk_size=1000, chunk_overlap=150):
    """Load scraped pages from a JSON file and split each one into text chunks.

    The JSON file must contain a list of objects, each with the keys
    ``"content"``, ``"url"`` and ``"title"``.

    Args:
        path: Location of the JSON file to read.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        A flat list with the documents produced by
        ``RecursiveCharacterTextSplitter.create_documents`` for every page, in
        file order. Each document's metadata carries the page ``url`` and
        ``title``; the splitter is configured with ``add_start_index=True``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an item lacks ``"content"``, ``"url"`` or ``"title"``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default, and release the file handle before the
    # (comparatively slow) splitting work starts.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The splitter configuration is identical for every page, so build it once
    # instead of once per item.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )

    all_chunks = []
    for item in data:
        page_metadata = {"url": item["url"], "title": item["title"]}
        all_chunks.extend(
            text_splitter.create_documents([item["content"]], metadatas=[page_metadata])
        )
    return all_chunks



In [4]:
# Read the JSON dump and split every page into overlapping text chunks.
all_chunks = process_data()

In [5]:
# Sanity check: total number of chunks produced across all pages.
print(len(all_chunks))

1795


In [7]:
import os
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Pinecone client plus a handle to the existing "test-git" index.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
index = pc.Index("test-git")

# Sentence-embedding model used to vectorise chunk text.
# NOTE(review): the index dimension must match this model's output size — confirm.
embed_model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")


def store_in_pinecone(chunks, batch_size=100):
    """Embed text chunks and upsert them into the module-level Pinecone ``index``.

    Chunks are processed ``batch_size`` at a time: one ``embed_model.encode``
    call and one ``index.upsert`` request per batch, instead of one of each per
    chunk.

    Args:
        chunks: Iterable of objects exposing ``page_content`` (str) and
            ``metadata`` (a mapping with ``"url"`` and ``"title"``, and
            optionally ``"start_index"``).
        batch_size: Maximum number of vectors sent per upsert request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a chunk's metadata lacks ``"url"`` or ``"title"``.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    chunks = list(chunks)  # accept any iterable, including generators
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]

        # Encoding a list of texts in one call lets the model batch internally.
        embeddings = embed_model.encode(
            [chunk.page_content for chunk in batch]
        ).tolist()

        # Key by id so that, if two chunks share an id, the later one wins --
        # the same outcome as upserting them one after another.
        vectors_by_id = {}
        for chunk, embedding in zip(batch, embeddings):
            chunk_id = f"{chunk.metadata['url']}_{chunk.metadata.get('start_index', 0)}"
            metadata = {
                'text': chunk.page_content,
                'url': chunk.metadata['url'],
                'title': chunk.metadata['title'],
            }
            vectors_by_id[chunk_id] = (chunk_id, embedding, metadata)

        index.upsert(list(vectors_by_id.values()))

In [9]:
# NOTE(review): this re-runs process_data() although an earlier cell already
# bound all_chunks; presumably kept so the cell works after a kernel restart.
all_chunks= process_data()
# Embed every chunk and upsert it into the Pinecone index.
store_in_pinecone(all_chunks)