In [1]:
import pdfplumber
import PyPDF4
import re
import os
import sys
from typing import Callable, List, Tuple, Dict

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

from dotenv import load_dotenv


In [2]:
def extract_metadata_from_pdf(file_path: str) -> dict:
    """
    Reads the title, author and creation date from a PDF's document info.

    :param file_path: The path to the PDF file.
    :return: A dict with the keys "title", "author" and "creation_date". Each
        value is a stripped string, or "" when the PDF does not provide it.
    """
    # Maps the keys of the returned dict to the PDF document-info keys.
    wanted = {
        "title": "/Title",
        "author": "/Author",
        "creation_date": "/CreationDate",
    }
    with open(file_path, "rb") as pdf_file:
        reader = PyPDF4.PdfFileReader(pdf_file)
        # getDocumentInfo() returns None when the PDF has no /Info dictionary.
        info = reader.getDocumentInfo() or {}
        # `or ""` covers keys that are present but empty/None; str() keeps
        # .strip() safe if a value is not a plain string.
        return {
            name: str(info.get(pdf_key) or "").strip()
            for name, pdf_key in wanted.items()
        }


def extract_pages_from_pdf(file_path: str) -> List[Tuple[int, str]]:
    """
    Extracts the text from each page of the PDF.

    Pages whose extracted text is missing or only whitespace are skipped, so
    the returned page numbers may have gaps.

    :param file_path: The path to the PDF file.
    :return: A list of tuples containing the 1-based page number and the extracted text.
    :raises FileNotFoundError: If file_path is not an existing file.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    pages = []
    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # extract_text() can return None instead of "" for a page with no
            # text (older pdfplumber releases do); treat that as empty.
            text = page.extract_text() or ""
            if text.strip():
                pages.append((page_num, text))
    return pages


def parse_pdf(file_path: str) -> Tuple[List[Tuple[int, str]], Dict[str, str]]:
    """
    Parses a PDF into its per-page text and its document metadata.

    :param file_path: The path to the PDF file.
    :return: A tuple ``(pages, metadata)``: the result of extract_pages_from_pdf
        followed by the result of extract_metadata_from_pdf.
    :raises FileNotFoundError: If file_path is not an existing file.
    """
    if os.path.isfile(file_path):
        # The metadata is read first, then the page text.
        document_info = extract_metadata_from_pdf(file_path)
        page_texts = extract_pages_from_pdf(file_path)
        return page_texts, document_info

    raise FileNotFoundError(f"File not found: {file_path}")


def merge_hyphenated_words(text: str) -> str:
    """
    Rejoins words that were split by a hyphen at a line break.

    A hyphen immediately followed by a newline is removed when it sits between
    two word characters. Hyphens inside a line are left untouched.

    :param text: The text to clean.
    :return: The text with hyphenated line breaks merged.
    """
    # Lookarounds leave the neighbouring characters unconsumed, so consecutive
    # breaks around a one-character fragment ("a-\nb-\nc") are all merged.
    return re.sub(r"(?<=\w)-\n(?=\w)", "", text)


def fix_newlines(text: str) -> str:
    """
    Replaces every isolated newline with a single space.

    Runs of two or more newlines are left unchanged.

    :param text: The text to clean.
    :return: The text with single newlines turned into spaces.
    """
    lone_newline = re.compile(r"(?<!\n)\n(?!\n)")
    return lone_newline.sub(" ", text)


def remove_multiple_newlines(text: str) -> str:
    """
    Collapses each run of two or more newlines into one newline.

    :param text: The text to clean.
    :return: The text without consecutive newlines.
    """
    newline_run = re.compile(r"\n{2,}")
    return newline_run.sub("\n", text)


def clean_text(
    pages: List[Tuple[int, str]], cleaning_functions: List[Callable[[str], str]]
) -> List[Tuple[int, str]]:
    """
    Runs every cleaning function over the text of every page.

    :param pages: A list of (page number, text) tuples.
    :param cleaning_functions: Functions applied to each page's text in list
        order; each one receives the output of the previous one.
    :return: A new list of (page number, cleaned text) tuples in input order.
    """

    def _run_pipeline(raw: str) -> str:
        # Thread the text through the cleaners one after another.
        current = raw
        for step in cleaning_functions:
            current = step(current)
        return current

    return [(number, _run_pipeline(content)) for number, content in pages]


def text_to_docs(
    text: List[Tuple[int, str]],
    metadata: Dict[str, str],
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> List["Document"]:
    """
    Splits page texts into chunks and wraps each chunk in a Document.

    :param text: A list of (page number, page text) tuples.
    :param metadata: Extra metadata copied into every Document.
    :param chunk_size: The chunk_size passed to the text splitter.
    :param chunk_overlap: The chunk_overlap passed to the text splitter.
    :return: One Document per chunk, in page order and then chunk order. Each
        carries "page_number", "chunk" (index within the page) and "source"
        ("p<page>-<chunk>") plus the entries of ``metadata``.
    """
    # The splitter configuration does not depend on the page, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=chunk_overlap,
    )

    doc_chunks = []
    for page_num, page in text:
        for i, chunk in enumerate(text_splitter.split_text(page)):
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page_number": page_num,
                        "chunk": i,
                        "source": f"p{page_num}-{i}",
                        # Spread last: a same-named key in `metadata`
                        # overrides the three fields above.
                        **metadata,
                    },
                )
            )

    return doc_chunks

def get_file_paths_and_names(directory: str) -> Tuple[List[str], List[str]]:
    """
    Lists the regular files directly inside a directory.

    Sub-directories are skipped and no filtering by file extension is done.

    :param directory: The directory to scan (not recursive).
    :return: A tuple ``(file_paths, file_names)`` of equal length, where
        ``file_paths[i]`` is the directory joined with the file name and
        ``file_names[i]`` is that file name without its last extension.
        Entries are sorted by file name.
    """
    # List the directory once so both lists come from the same snapshot, and
    # sort because os.listdir returns entries in arbitrary order.
    candidates = [
        (os.path.join(directory, entry), entry)
        for entry in sorted(os.listdir(directory))
    ]
    files = [(path, entry) for path, entry in candidates if os.path.isfile(path)]

    file_paths = [path for path, _ in files]
    file_names = [os.path.splitext(entry)[0] for _, entry in files]
    return file_paths, file_names

In [4]:
# Load variables from a .env file into the process environment
# (presumably the API credentials for OpenAIEmbeddings — TODO confirm).
load_dotenv()

True

In [5]:
# Step 0: Collect the paths and extension-less names of every file in src/data/.
# NOTE(review): get_file_paths_and_names does not filter by extension, so any
# non-PDF file in that directory is returned as well.
file_paths, file_names = get_file_paths_and_names("src/data/")

In [10]:
# Step 1: Parse the first listed file into per-page text and document metadata
raw_pages, metadata = parse_pdf(file_paths[0])

In [11]:
# Sanity check: raw_pages is a list of (page number, text) tuples
type(raw_pages)

list

In [13]:
# Number of pages that yielded non-empty text
len(raw_pages)

20

In [15]:
# Dump every (page number, raw text) tuple; newlines from the PDF are still present here
for i in raw_pages:
    print(i)

(1, 'WisdomTree 2023 ECONOMIC & MARKET OUTLOOK 1\n2023\nECONOMIC\n& MARKET\nOUTLOOK\nKevin Flanagan Rick Harper Jeremy Schwartz Scott Welch Jeff Weniger\nHead of Fixed Income Chief Investment Officer, Fixed Global Chief Investment Chief Investment Officer, Head of Equity\nStrategy Income and Currency Officer Model Portfolios Strategy')
(2, 'WisdomTree 2023 ECONOMIC & MARKET OUTLOOK 2\nMACRO-ECONOMIC OUTLOOK\nThe economic and market landscapes continue to evolve, and we expect some significant changes as we make our way through\n2023. So, as always, we suggest focusing on key market signals, which we define as:\n+ Economic growth rates\n+ Inflation expectations\n+ Monetary policy\n+ Interest Rates\n+ Corporate earnings growth rates\nAnd we provide our thoughts for 2023 on:\n+ Equities\n+ Fixed income\n+ Real assets and alternatives\nAt the time of this writing, there are also some “known unknowns” that could significantly affect our perspective. These include:\n1. Fed-induced volatility

In [16]:
# Step 2: Clean the raw page text (the actual chunking happens later in text_to_docs).
# Order matters: merge_hyphenated_words needs the "-\n" breaks that fix_newlines
# would otherwise turn into spaces.
cleaning_functions = [
    merge_hyphenated_words,
    fix_newlines,
    remove_multiple_newlines,
    ]
cleaned_text_pdf = clean_text(raw_pages, cleaning_functions)

In [17]:
# Sanity check: cleaning keeps the list-of-tuples structure
type(cleaned_text_pdf)

list

In [18]:
# clean_text emits one entry per input page, so this should equal len(raw_pages)
len(cleaned_text_pdf)

20

In [19]:
# Dump every (page number, cleaned text) tuple to compare against the raw pages
for i in cleaned_text_pdf:
    print(i)

(1, 'WisdomTree 2023 ECONOMIC & MARKET OUTLOOK 1 2023 ECONOMIC & MARKET OUTLOOK Kevin Flanagan Rick Harper Jeremy Schwartz Scott Welch Jeff Weniger Head of Fixed Income Chief Investment Officer, Fixed Global Chief Investment Chief Investment Officer, Head of Equity Strategy Income and Currency Officer Model Portfolios Strategy')
(2, 'WisdomTree 2023 ECONOMIC & MARKET OUTLOOK 2 MACRO-ECONOMIC OUTLOOK The economic and market landscapes continue to evolve, and we expect some significant changes as we make our way through 2023. So, as always, we suggest focusing on key market signals, which we define as: + Economic growth rates + Inflation expectations + Monetary policy + Interest Rates + Corporate earnings growth rates And we provide our thoughts for 2023 on: + Equities + Fixed income + Real assets and alternatives At the time of this writing, there are also some “known unknowns” that could significantly affect our perspective. These include: 1. Fed-induced volatility 2. The Russian invas

In [20]:
# Step 3: Split the cleaned pages into chunks wrapped as Documents carrying the PDF metadata
document_chunks = text_to_docs(cleaned_text_pdf, metadata)

In [21]:
# Sanity check: text_to_docs returns a flat list of Documents
type(document_chunks)

list

In [22]:
# Total number of chunks produced across all pages of the first PDF
len(document_chunks)

63

In [23]:
# Dump every Document (page_content plus metadata) for inspection
for i in document_chunks:
    print(i)

page_content='WisdomTree 2023 ECONOMIC & MARKET OUTLOOK 1 2023 ECONOMIC & MARKET OUTLOOK Kevin Flanagan Rick Harper Jeremy Schwartz Scott Welch Jeff Weniger Head of Fixed Income Chief Investment Officer, Fixed Global Chief Investment Chief Investment Officer, Head of Equity Strategy Income and Currency Officer Model Portfolios Strategy' metadata={'page_number': 1, 'chunk': 0, 'source': 'p1-0', 'title': '', 'author': '', 'creation_date': "D:20221214121303-05'00'"}
page_content='WisdomTree 2023 ECONOMIC & MARKET OUTLOOK 2 MACRO-ECONOMIC OUTLOOK The economic and market landscapes continue to evolve, and we expect some significant changes as we make our way through 2023. So, as always, we suggest focusing on key market signals, which we define as: + Economic growth rates + Inflation expectations + Monetary policy + Interest Rates + Corporate earnings growth rates And we provide our thoughts for 2023 on: + Equities + Fixed income + Real assets and alternatives At the time of this writing, t

In [24]:
# Same parse -> clean -> chunk pipeline as above, applied to the second listed file.
# NOTE(review): this repeats the earlier cells with "2"-suffixed names; a helper
# taking the file path would remove the duplication.
# Step 1: Parse PDF
raw_pages2, metadata2 = parse_pdf(file_paths[1])
# Step 2: Clean the page text (rebinds cleaning_functions to the same three functions)
cleaning_functions = [
    merge_hyphenated_words,
    fix_newlines,
    remove_multiple_newlines,
    ]
cleaned_text_pdf2 = clean_text(raw_pages2, cleaning_functions)
# Step 3: Split the cleaned pages into chunked Documents
document_chunks2 = text_to_docs(cleaned_text_pdf2, metadata2)


In [25]:
# Nested list: one inner list of Documents per PDF (not a flat list of chunks)
chunks = [document_chunks,document_chunks2]
chunks

[[Document(page_content='WisdomTree 2023 ECONOMIC & MARKET OUTLOOK 1 2023 ECONOMIC & MARKET OUTLOOK Kevin Flanagan Rick Harper Jeremy Schwartz Scott Welch Jeff Weniger Head of Fixed Income Chief Investment Officer, Fixed Global Chief Investment Chief Investment Officer, Head of Equity Strategy Income and Currency Officer Model Portfolios Strategy', metadata={'page_number': 1, 'chunk': 0, 'source': 'p1-0', 'title': '', 'author': '', 'creation_date': "D:20221214121303-05'00'"}),
  Document(page_content='WisdomTree 2023 ECONOMIC & MARKET OUTLOOK 2 MACRO-ECONOMIC OUTLOOK The economic and market landscapes continue to evolve, and we expect some significant changes as we make our way through 2023. So, as always, we suggest focusing on key market signals, which we define as: + Economic growth rates + Inflation expectations + Monetary policy + Interest Rates + Corporate earnings growth rates And we provide our thoughts for 2023 on: + Equities + Fixed income + Real assets and alternatives At th

In [35]:
def combine_list_items(list1,list2):
    """
    Appends every item of list2 to list1 and returns the combined list.

    list1 is extended in place, so calling this again with the same arguments
    appends the items of list2 a second time. Pass ``list(list1)`` to leave
    the original untouched.

    :param list1: The list that receives the items (mutated).
    :param list2: The iterable whose items are appended, in order.
    :return: list1 itself, now holding its own items followed by those of list2.
    """
    # extend() also terminates when list2 is the same object as list1, unlike
    # appending while iterating over it.
    list1.extend(list2)
    return list1

In [32]:
# Merge the chunks of both PDFs into one list.
# NOTE(review): combine_list_items appends into document_chunks in place, so
# re-running this cell adds document_chunks2 again. Also confirm the function
# returns the merged list; if it returns nothing this name is bound to None.
combined_document_chunks = combine_list_items(document_chunks,document_chunks2)

In [33]:
# Display the merged chunks (no output was recorded for this cell)
combined_document_chunks

In [39]:
# Scratch check of the append-in-a-loop pattern used by combine_list_items:
# `a` is extended in place with the items of `b`.
a = [1,2,3]
b = [4,5,6]
for item in b:
    a.append(item)
a

[1, 2, 3, 4, 5, 6]