In [1]:
import fitz  # PyMuPDF
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
import streamlit as st
import asyncio

# Extract text from PDFs using fitz (PyMuPDF)
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string containing each page's ``get_text()`` output, appended
        in page order with no extra separator.
    """
    # The context manager closes the document (and its file handle) even
    # when text extraction of a page raises.
    with fitz.open(pdf_path) as doc:
        # join() builds the result in one pass instead of repeated `+=`.
        return "".join(page.get_text() for page in doc)

# Paths to the PDF files
alphabet_pdf_path = "Alphabet_10K.pdf"
tesla_pdf_path = "Tesla_10K.pdf"
uber_pdf_path = "Uber_10K.pdf"

# Extract text from the PDFs
alphabet_text = extract_text_from_pdf(alphabet_pdf_path)
tesla_text = extract_text_from_pdf(tesla_pdf_path)
uber_text = extract_text_from_pdf(uber_pdf_path)

# Split each filing into line-level chunks once, so the exact strings that
# get embedded are also kept for lookup.
# NOTE(review): blank lines are embedded as well; consider filtering them.
alphabet_chunks = alphabet_text.split('\n')
tesla_chunks = tesla_text.split('\n')
uber_chunks = uber_text.split('\n')

# all_chunks[i] is the text behind row i of the FAISS index; the order here
# must match the order used in np.vstack below.
all_chunks = alphabet_chunks + tesla_chunks + uber_chunks

# Generate embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
alphabet_embeddings = model.encode(alphabet_chunks)
tesla_embeddings = model.encode(tesla_chunks)
uber_embeddings = model.encode(uber_chunks)

# Store embeddings in Faiss
all_embeddings = np.vstack([alphabet_embeddings, tesla_embeddings, uber_embeddings])
# Take the width from the data instead of hard-coding 384, so swapping the
# embedding model cannot silently mismatch the index.
dimension = all_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(all_embeddings)

# Query function
def query_vector_store(query, k=5):
    """Look up the nearest stored embeddings for a text query.

    Relies on the module-level ``model`` exposing ``encode`` and on the
    module-level FAISS ``index``.

    Args:
        query: Query text to embed.
        k: Number of neighbours to request from the index.

    Returns:
        The id array produced by ``index.search`` (second element of its
        result); distances are discarded.
    """
    encoded_batch = model.encode([query])
    # index.search is given a one-row matrix built from the single query vector.
    query_matrix = np.array([encoded_batch[0]])
    _distances, neighbor_ids = index.search(query_matrix, k)
    return neighbor_ids

# Local LLM for insights
model_name = 'EleutherAI/gpt-neo-2.7B'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# The LLM gets its own name: rebinding `model` here would replace the
# SentenceTransformer that query_vector_store() calls `.encode` on.
llm_model = GPTNeoForCausalLM.from_pretrained(model_name)

async def generate_insights(prompt):
    """Generate a text continuation of ``prompt`` with the local GPT-Neo model.

    Declared ``async`` for the caller's benefit, but the body contains no
    ``await``; generation runs synchronously inside the coroutine.

    Args:
        prompt: Text to tokenize and feed to the model.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = llm_model.generate(
        inputs['input_ids'],
        # Pass the mask and pad id explicitly rather than letting generate()
        # infer them.
        attention_mask=inputs['attention_mask'],
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Streamlit UI
# Page title for the app.
st.title("Content Engine for 10-K Filings")
# Free-text query box; run_async_function() reads this module-level value.
query = st.text_input("Enter your query:")

# Function to run async function within Streamlit
def run_async_function():
    """Generate insights for the current query and write them to the page.

    Reads the module-level ``query``; does nothing when it is empty.
    """
    if not query:
        return
    # asyncio.run creates a fresh event loop, runs the coroutine to
    # completion and closes the loop; the earlier hand-rolled loop was
    # created on every call and never closed.
    result = asyncio.run(generate_insights(query))
    st.write(result)

# Check if in main thread or running in a script
if __name__ == '__main__':
    run_async_function()

  from tqdm.autonotebook import tqdm, trange
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

  _torch_pytree._register_pytree_node(


Downloading model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

2024-07-01 01:58:29.248 
  command:

    streamlit run C:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
