<a href="https://colab.research.google.com/github/teofizzy/doc-savvy/blob/main/ingest_to_pinecone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ingesting to Pinecone

## Setting up

In [1]:
# access github repo
!git clone https://github.com/teofizzy/doc-savvy.git
%cd doc-savvy

Cloning into 'doc-savvy'...
remote: Enumerating objects: 51, done.[K
remote: Counting objects: 100% (51/51), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 51 (delta 26), reused 22 (delta 8), pack-reused 0 (from 0)[K
Receiving objects: 100% (51/51), 16.87 KiB | 1.20 MiB/s, done.
Resolving deltas: 100% (26/26), done.
/content/doc-savvy


In [2]:
# install requirements
!pip install -r requirements.txt

Collecting PyMuPDF (from -r requirements.txt (line 2))
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting python-docx (from -r requirements.txt (line 3))
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting unstructured (from -r requirements.txt (line 4))
  Downloading unstructured-0.17.2-py3-none-any.whl.metadata (24 kB)
Collecting langdetect (from -r requirements.txt (line 7))
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pypika (from -r requirements.txt (line 8))
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to 

In [3]:
# Import dependencies
import argparse
import os
import io
import tempfile
import docx
from tqdm import tqdm
import pptx
import PyPDF2
from xmindparser import xmind_to_dict
import warnings
import msal
import openpyxl
from bs4 import BeautifulSoup
import re
import zipfile
from getpass import getpass
from faster_whisper import WhisperModel
import pandas as pd
import pinecone
from dotenv import load_dotenv
from sharepoint_utils import SharePointFetcher
from pinecone_utils import SharePointLoader, ingest_files_to_pinecone
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.vectorstores import Pinecone as LangchainPinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document as LangchainDocument
from pinecone import Pinecone, ServerlessSpec


from typing import List, Optional, Dict, Any
from langchain_community.document_loaders import (
    UnstructuredFileLoader,
    UnstructuredExcelLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredPowerPointLoader,
    CSVLoader
)
from pdf2image import convert_from_bytes
import pytesseract
import warnings

In [4]:
# Authenticate the Colab user, then read every credential through
# google.colab's userdata so nothing sensitive is written into the notebook.
from google.colab import auth, userdata

auth.authenticate_user()

# Pinecone settings
PINECONE_API_KEY = userdata.get("PINECONE_API_KEY")
PINECONE_INDEX_NAME = userdata.get("PINECONE_INDEX_NAME")
PINECONE_ENVIRONMENT = userdata.get("PINECONE_ENVIRONMENT")

# SharePoint app registration
SHAREPOINT_TENANT_ID = userdata.get("SHAREPOINT_TENANT_ID")
SHAREPOINT_CLIENT_ID = userdata.get("SHAREPOINT_CLIENT_ID")
SHAREPOINT_CLIENT_SECRET = userdata.get("SHAREPOINT_CLIENT_SECRET")

In [5]:
# Get embedding model
embedding_model = "sentence-transformers/all-mpnet-base-v2"

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on a CPU-only runtime instead of failing on "cuda".
import torch

embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize embeddings
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model,
    model_kwargs={"device": embedding_device},
    encode_kwargs={"normalize_embeddings": True}
)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# # Get the embedding dimension
# dimension = embeddings.dict()
# print(f"Embedding dimension for {embedding_model}: {dimension}")

In [6]:
# Create the Pinecone client from the credentials loaded above
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)

# Index this notebook works against
index_name = PINECONE_INDEX_NAME

# Names of the indexes that already exist for this client
index_names = [existing.name for existing in pc.list_indexes()]

# Reuse the index when it is already there; otherwise create it first
if index_name in index_names:
    print(f"Using existing index: {index_name}")
else:
    pc.create_index(
        name=index_name,
        # NOTE(review): presumably the output size of `embedding_model` — confirm
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print(f"Created new index: {index_name}")

# Handle used by the later cells
index = pc.Index(index_name)

# preview (last expression, so the stats are displayed as the cell output)
index.describe_index_stats()

Using existing index: project-kenobi-index


{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'project-kenobi': {'vector_count': 3900}},
 'total_vector_count': 3900,
 'vector_type': 'dense'}

In [7]:
# Wrap the Pinecone index in a vector store scoped to a single namespace
namespace = "project-kenobi"
vector_store = PineconeVectorStore(
    index=index,
    embedding=embeddings,
    namespace=namespace,
)

In [8]:
# Build the SharePoint fetcher from the app credentials loaded above
fetcher = SharePointFetcher(
    tenant_id=SHAREPOINT_TENANT_ID,
    client_id=SHAREPOINT_CLIENT_ID,
    client_secret=SHAREPOINT_CLIENT_SECRET,
)

# Ask for the SharePoint domain, site and folder, in that order; getpass is
# used so the typed values are not echoed into the saved cell output.
domain, site, folder = (
    getpass(prompt)
    for prompt in ("Enter domain: ", "Enter site: ", "Enter folder: ")
)

Enter domain: ··········
Enter site: ··········
Enter folder: ··········


In [9]:
# fetch files from the SharePoint location entered in the previous cell
files = fetcher.fetch_all_files(
    domain=domain,
    site_name=site,
    folder_path=folder,
)

In [10]:
# Initialize document loader
# NOTE(review): the recorded output of this cell shows model files
# (config.json, vocabulary.txt, tokenizer.json, model.bin) being downloaded, so
# the constructor presumably loads a model — confirm in pinecone_utils.
loader = SharePointLoader()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


config.json:   0%|          | 0.00/2.31k [00:00<?, ?B/s]

vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/145M [00:00<?, ?B/s]

In [11]:
# DESTRUCTIVE: uncommenting the next line deletes every vector stored in `namespace`.
# index.delete(delete_all=True, namespace=namespace)

{}

## Ingest to Pinecone

In [12]:
# Push the fetched files through the loader and into the vector store
ingest_files_to_pinecone(
    vectorstore=vector_store,
    loader=loader,
    files=files,
    batch_size=50,  # Conservative batch size
)

Processing files:   4%|▍         | 4/90 [00:01<00:41,  2.07it/s]

[!] Failed to process Collab/01. SPG/07. Project Kenobi/01. Background Documents/AIM Overview.pdf: Unable to get page count. Is poppler installed and in PATH?

[!] No content extracted from Collab/01. SPG/07. Project Kenobi/01. Background Documents/AIM Overview.pdf


Processing files: 100%|██████████| 90/90 [00:52<00:00,  1.70it/s]



Starting ingestion of 3900 chunks to Pinecone


Ingesting to Pinecone: 100%|██████████| 78/78 [01:58<00:00,  1.52s/it]


Ingestion Summary:
- Successfully processed 90/90 files
- Successfully ingested 3900/3900 chunks





## RAG implementation


In [13]:
def retrieve_relevant_chunks(vectorstore, query: str, top_k: int = 10) -> list[LangchainDocument]:
    """Look up the chunks most similar to ``query`` in ``vectorstore``.

    Args:
        vectorstore: Object exposing ``similarity_search(query, k=...)``.
        query: Text to search for.
        top_k: Number of results requested from the store (passed as ``k``).

    Returns:
        The result of ``vectorstore.similarity_search``, unchanged.
    """
    return vectorstore.similarity_search(query, k=top_k)

In [17]:
# query = "Who are the participants in Project Kenobi?"
# docs = retrieve_relevant_chunks(vector_store, query)

# for i, doc in enumerate(docs):
#     print(f"\nResult {i+1}")
#     print(f"Source: {doc.metadata.get('source', 'N/A')}")
#     print(doc.page_content)
