In [None]:
from dotenv import load_dotenv
import os

DATA_DIR="./data"
load_dotenv() 

if not os.getenv("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = os.getenv("GEMINI_API_KEY")


In [None]:
# Let's first load the document
import boto3
import shutil
from langchain_community.document_loaders import PyPDFLoader, TextLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from botocore.client import Config

account_id = os.getenv('R2_ACCOUNT_ID')
access_key = os.getenv('R2_ACCESS_KEY')
secret_key = os.getenv('R2_SECRET_KEY')
tmp_dir = "./tmp/"
os.makedirs(os.path.dirname(local_path), exist_ok=True)

s3 = boto3.client(
    's3',
    endpoint_url=f'https://{account_id}.r2.cloudflarestorage.com',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    config=Config(signature_version='s3v4'),
    region_name='auto'
)

bucket_name = "yuri-data"

response = s3.list_objects_v2(Bucket=bucket_name)

for obj in response['Contents']:
    file_key = obj['Key']
    file_extension = os.path.splitext(file_key)[1].lower()
    
    local_path = f"{tmp_dir}{file_key}"
    response = s3.get_object(Bucket=bucket_name, Key=file_key)
    
    with open(local_path, "wb") as f:
        f.write(response["Body"].read())

pdf_loader = DirectoryLoader(tmp_dir, glob='**/*.pdf', loader_cls=PyPDFLoader, loader_kwargs={'mode': 'single'})
txt_loader = DirectoryLoader(tmp_dir, glob='**/*.txt', loader_cls=TextLoader)

pdf_doc = pdf_loader.load()
txt_docs = txt_loader.load()

# Joining all docs since I have more than one type
all_docs = pdf_doc + txt_docs

# It's a good practice to break the text into smaller chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
split_docs = splitter.split_documents(all_docs)

shutil.rmtree(tmp_dir)

In [38]:
# embedding text
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from uuid import uuid4
from langchain_community.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec

# Cannot use the Gemini for now, quota excceeded
# embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-exp-03-07")
# vector_store = Chroma(
#     collection_name="yuri_data_1",
#     embedding_function=embeddings,
#     persist_directory="./chroma_db",
# )

# # Is this a good idea? We should have consistent IDs
# ids = [str(uuid4()) for _ in range(len(split_docs))]

# vector_store.add_documents(documents=split_docs, ids=ids)

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L12-v2"
)
model_dimension = 384 # all-MiniLM-L12-v2
pc = Pinecone(api_key=os.getenv('PINECONE_KEY'), text_key="text")

index_name = "yuri-data"
index_names = [index['name'] for index in pc.list_indexes()]
if index_name not in index_names:
    pc.create_index(
        name=index_name,
        dimension=model_dimension,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ) 
    )

index = pc.Index(index_name)
vector_store = Pinecone(
    index=index,
    embedding_function=embeddings.embed_query,
    text_key="text",
    api_key=os.getenv('PINECONE_KEY')
)

texts = [doc.page_content for doc in split_docs]
metadatas = [doc.metadata for doc in split_docs]

vector_store.add_texts(texts, metadatas=metadatas)


AttributeError: 'Pinecone' object has no attribute 'add_texts'

In [None]:
# Query data
# retriever = vector_store.as_retriever(search_kwargs={"k": 4})
retriever = vector_store.similarity_search(k=3)

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain import hub

system_instructions = ''
with open("./llm_instructions/system_prompt.txt", "r") as llm_instructions_f:
    system_instructions = llm_instructions_f.read()

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            system_instructions,
        ),
        ("human", "{input}"),
    ]
)

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0.5,
    max_tokens=200, # Reduced a bit, it's talking too much
    max_retries=2
)

combine_docs_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=prompt
)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

answer = retrieval_chain.invoke({
    "input": "How many years of experience does he have with Python?"
})
print(answer)
