## Redis Connection

In [None]:
import os
import warnings
from urllib.parse import quote

# Silence every Python warning for the rest of this kernel session.
warnings.filterwarnings('ignore')

# Replace values below with your own if using Redis Cloud instance
REDIS_HOST = os.getenv("REDIS_HOST", "localhost") # ex: "redis-18374.c253.us-central1-1.gce.cloud.redislabs.com"
REDIS_PORT = os.getenv("REDIS_PORT", "6379")      # ex: 18374
REDIS_PASSWORD = os.getenv("REDIS_PASSWORD", "")  # ex: "1TNxTEdYRDgIDKM2gDfasupCADXXXX"

# Percent-encode the password so characters such as "@", ":", "/" or "#" are
# not mistaken for URL delimiters when the connection string is parsed.
# An empty password encodes to an empty string, so the default URL is unchanged.
# If SSL is enabled on the endpoint, use rediss:// as the URL prefix
REDIS_URL = f"redis://:{quote(REDIS_PASSWORD, safe='')}@{REDIS_HOST}:{REDIS_PORT}"

In [None]:
from redis import Redis

# Keep the Redis connection under its own name: a later cell rebinds `client`
# to the Groq SDK client, which would otherwise leave no handle to Redis.
redis_client = Redis.from_url(REDIS_URL)
client = redis_client  # backward-compatible alias for cells that still use `client`

# Round-trip to the server so a bad host or credentials fail here, not later.
redis_client.ping()

## Dataset Preparation

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

# Source PDF; the path is resolved relative to the notebook's working directory.
path = 'nke-10k-2023.pdf'
assert os.path.exists(path), f"File not found: {path}"

# Read the PDF into documents, then split them with chunk_size=2500 and no overlap.
loader = PyPDFLoader(path)
pages = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2500,
    chunk_overlap=0,
)
chunks = text_splitter.split_documents(pages)

print(f"Done preprocessing. Created {len(chunks)} chunks of the original pdf {path}")


In [None]:
# Display the first chunk to sanity-check the output of the split step.
chunks[0]

## Set Groq as the LLM

In [None]:
from groq import Groq
import json
from dotenv import load_dotenv

import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# Identifier of the Groq-hosted chat model requested by the cells below.
CHAT_MODEL = "llama3-70b-8192"

# Your free Groq API key, read from the GROQ_API_KEY environment variable.
# NOTE: this rebinds `client`, which an earlier cell assigned to the Redis connection.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

In [None]:
import tqdm
import json

def create_dense_props(chunk, max_retries=3):
    """Create dense representation of raw text chunk.

    Sends the chunk's text to the chat model behind the module-level ``client``
    and asks it to decompose the content into standalone propositions returned
    as JSON. Each raw reply is printed before parsing.

    Args:
        chunk: Object exposing the raw text as ``chunk.page_content``.
        max_retries: Number of additional requests to make when a reply cannot
            be parsed. Values below 0 are treated as 0.

    Returns:
        The value stored under the "propositions" key of the model's JSON reply
        (the prompt asks for a list of strings).

    Raises:
        ValueError: If no reply could be parsed after ``max_retries + 1`` attempts.
    """

    SYSTEM_PROMPT = """
    You are a helpful PDF extractor tool. You will be presented with segments from
    raw PDF documents composed of 10k SEC filings information about public companies.

    Decompose and summarize the raw content into clear and simple propositions,
    ensuring they are interpretable out of context. Consider the following rules:
    1. Split compound sentences into simpler dense phrases that retain existing
    meaning.
    2. Simplify technical jargon or wording if possible while retaining existing
    meaning.
    3. For any named entity that is accompanied by additional descriptive information,
    separate this information into its own distinct proposition.
    4. Decontextualize the proposition by adding necessary modifier to nouns or
    entire sentences and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that")
    with the full name of the entities they refer to.
    5. Present the results as a list of strings, formatted in JSON, under the key "propositions".
    """

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Decompose this raw content using the rules above:\n{chunk.page_content}"},
    ]

    # Bounded retry: with temperature=0 a malformed reply is likely to repeat,
    # so unbounded recursion would burn API calls until RecursionError.
    attempts = max(1, max_retries + 1)
    last_error = None
    for _ in range(attempts):
        response = client.chat.completions.create(
            model=CHAT_MODEL,
            messages=messages,
            temperature=0,
        )

        res = response.choices[0].message.content
        print(res)
        try:
            return json.loads(res)["propositions"]
        except (json.JSONDecodeError, KeyError, TypeError) as e:
            # Invalid JSON, missing "propositions" key, or a non-dict/None reply.
            last_error = e
            print("Failed to parse propositions", str(e), flush=True)

    raise ValueError(
        f"Could not parse propositions after {attempts} attempts"
    ) from last_error


## Create text propositions using Groq

In [None]:
# Load from disk to save time or regenerate as needed.
try:
    with open("propositions.json", "r", encoding="utf-8") as f:
        propositions = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    # Only a missing or unreadable cache triggers regeneration; anything else
    # (e.g. KeyboardInterrupt) propagates instead of starting a long LLM run.
    # Each chunk's propositions are joined into a single string per chunk.
    propositions = [
        " ".join(create_dense_props(chunk)) for chunk in tqdm.tqdm(chunks)
    ]

    # Report a count rather than dumping every proposition into the cell output.
    print("Created", len(propositions), "proposition strings")

    # Save to disk for faster reload..
    with open("propositions.json", "w", encoding="utf-8") as f:
        json.dump(propositions, f)