In [2]:
import os
from dotenv import load_dotenv
import nest_asyncio
nest_asyncio.apply()

load_dotenv('../.env')

data_folder_path = '../data/raw/'
doc_paths = os.listdir(data_folder_path)

# 1.Data Colletion & Understanding

In [7]:
import os
from llama_index.readers.file import PyMuPDFReader

loader = PyMuPDFReader()

documents = {}
for doc in doc_paths:
    doc_name = doc.replace(' ','_'). replace('.pdf', '')
    documents[doc_name] = loader.load(file_path= data_folder_path + doc)


In [8]:
documents.keys()

dict_keys(['2023_Q2_INTC', '2023_Q3_INTC', '2023_Q3_AMZN', '2023_Q2_AMZN', '2022_Q3_AAPL', '2023_Q1_AAPL', '2023_Q2_MSFT', '2023_Q3_MSFT', '2023_Q2_NVDA', '2023_Q3_NVDA', '2023_Q1_NVDA', '2022_Q3_NVDA', '2023_Q1_MSFT', '2022_Q3_MSFT', '2023_Q2_AAPL', '2023_Q3_AAPL', '2022_Q3_AMZN', '2022_Q3_INTC', '2023_Q1_AMZN', '2023_Q1_INTC'])

In [9]:
print(documents['2023_Q2_AAPL'][0].metadata)
print(documents['2023_Q2_AAPL'][1].metadata)


{'total_pages': 28, 'file_path': '../data/raw/2023 Q2 AAPL.pdf', 'source': '1'}
{'total_pages': 28, 'file_path': '../data/raw/2023 Q2 AAPL.pdf', 'source': '2'}


In [10]:
total_pages = 0
for key, doc in documents.items():
    total_pages += doc[0].metadata['total_pages']
print(total_pages)

1037


# 2.Data Cleaning & Prerpocessing

In [11]:
documents['2023_Q2_AAPL'][8].metadata

{'total_pages': 28, 'file_path': '../data/raw/2023 Q2 AAPL.pdf', 'source': '9'}

In [12]:
documents['2023_Q2_AAPL'][8].get_content()

'Apple Inc.\nNotes to Condensed Consolidated Financial Statements (Unaudited)\nNote 1 – Summary of Significant Accounting Policies\nBasis of Presentation and Preparation\nThe condensed consolidated financial statements include the accounts of Apple Inc. and its wholly owned subsidiaries (collectively “Apple” or the “Company”).\nIntercompany accounts and transactions have been eliminated. In the opinion of the Company’s management, the condensed consolidated financial statements\nreflect all adjustments, which are normal and recurring in nature, necessary for fair financial statement presentation. The preparation of these condensed\nconsolidated financial statements and accompanying notes in conformity with U.S. generally accepted accounting principles requires management to make\nestimates and assumptions that affect the amounts reported. Actual results could differ materially from those estimates. Certain prior period amounts in the\ncondensed consolidated financial statements and acc

In [13]:
sample_file = data_folder_path + '2023 Q2 AAPL.pdf'

## 2.1 PDF Parsing & Chunking & Token Count

### 2.1.1 Parsing with `pymupdf4llm`

In [14]:
import pymupdf4llm

with open('../data/interim/pymupdf4llm_2023_Q2_AAPL.md', 'w+') as f:
    f.write(pymupdf4llm.to_markdown(sample_file))

> **IMPORTANT**  
> Fails to convert tables

### 2.1.2 Parsing with LlamaParse API

In [15]:
from llama_parse import LlamaParse

LLAMA_CLOUD_API_KEY = os.getenv('LLAMA_CLOUD_API_KEY')

parser = LlamaParse(
    api_key=LLAMA_CLOUD_API_KEY, # type: ignore
    result_type='markdown', # type: ignore
)

md_text = parser.load_data(sample_file)

with open('../data/interim/llamaparse_2023_Q2_APPL.md', 'w') as f:
    f.write(md_text[0].get_content())


Started parsing the file under job_id cac11eca-5289-4430-a4fe-efa3f0794521


> **IMPORTANT**  
> Extracts tables better than `pymupdf4llm` but gets greatly confused with the table structure and sometimes even do not extract some text

### 2.1.3 Parsing with Unstructured API

In [11]:
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError
from pprint import pprint

client = UnstructuredClient(
    api_key_auth=os.getenv('UNSTRUCTURED_API_KEY') # type: ignore
)

def unstructured_api_call(
    file_path: str, 
    strategy: str='auto', 
    chunking_strategy: str|None=None, 
    multipage_sections: bool|None=None, 
    max_characters: int|None=None,
    ):
    with open(file_path, "rb") as f:
        files=shared.Files(
            content=f.read(),
            file_name=file_path,
        )

    req = shared.PartitionParameters(
        files=files, 
        strategy=strategy,
        chunking_strategy=chunking_strategy,
        multipage_sections=multipage_sections,
        max_characters=max_characters
        )

    try:
        return client.general.partition(req)
    except SDKError as e:
        print(e)

In [12]:
response = unstructured_api_call(sample_file)

In [13]:
types = {}
for element in response.elements: # type: ignore
    if element['type'] not in types:
        types[element['type']] = 0
    types[element['type']] += 1
pprint(types)

{'Footer': 13,
 'Header': 3,
 'ListItem': 1,
 'NarrativeText': 169,
 'Table': 35,
 'Title': 101,
 'UncategorizedText': 3}


**Manually type count results:**   
- Table: 30
- Footer: 22
- Header: 0
- Image: 1 *ps. Apple Logo*

In [14]:
#Create an html copy of the sample file for manual comparison
page_html = ''
for element in response.elements: # type: ignore
        try:
            if element['type'] not in ['Header', 'Footer']:
                html = element['metadata']['text_as_html']
                page_html += html
        except KeyError:
            text = element['text']
            if element['type'] == 'Title':
                page_html += f'<b>{text}</b>'
            else:
                page_html += text

with open('../data/interim/unstructured_2023_Q2_APPL.html', 'w') as f:
    f.write(page_html)

> **IMPORTANT**  
>Works great!. Yet this is an API service with [100-page cap per month](https://docs.unstructured.io/api-reference/api-services/free-api#free-unstructured-api-limitationst). Paid and self hosted alternatives available

### 2.1.4. Parsing with LangChain

In [3]:
from langchain.document_loaders import DirectoryLoader, PyMuPDFLoader

sample_files = [file for file in os.listdir(data_folder_path) if file.startswith('2023')]
sample_docs = []
for file in sample_files:
    doc_temp = PyMuPDFLoader(data_folder_path + file)
    sample_docs.append(doc_temp.load())

## 2.2 Text Cleaning

In [4]:
from typing import List
import re

def doc_cleaner(
    document, 
    skip_first: int|None=None, 
    cleaning_patterns: List|None=None, 
    remove_page_identifiers: List|None=None, 
    final_page_identifier: str|None=None
    ) -> list:
    """
    
    Args:
        document :
        skip_first (int|Optional): Number of pages to skip at the begining of the document
        cleaning_patterns (List|Optional): List of regex patters to detect and remove the corresponding text 
        remove_page_identifiers (List|Optional): List of regex patters to detect and remove whole pages. 
        final_page_identifier (str|Optional): The regex pattern to detect final page. This page and the following pages will be removed
        
    Returns
        document_page or None
    """
    
    def text_remover(pattern, string):
        match = re.findall(pattern, string)
        if match:
            return string.replace(match[0], '')
        else:
            return string
    
    final_page_num = 1e10
    cleaned_doc = []
    
    for page in document:
        
        # Skip first n pages
        if skip_first and page.metadata['page'] <= skip_first:
            continue

        # Remove texts with matching given patterns
        if cleaning_patterns:
            for pattern in cleaning_patterns:
                page.page_content = text_remover(pattern, page.page_content)
        
        # Remove pages with matching given identifiers
        if remove_page_identifiers: 
            for identifier in remove_page_identifiers:
                if re.findall(identifier, page.page_content):
                    continue
            
        # Remove pages on and after final_page_identifier match
        if final_page_identifier:
            if re.findall(final_page_identifier, page.page_content): # type: ignore
                final_page_num = page.metadata['page']

        if page.metadata['page'] >= final_page_num:
            continue
        
        cleaned_doc.append(page)
        
    return cleaned_doc



In [5]:
print(f"Sample cleaning lenght for APPL docs")
print(len(sample_docs[4]))
print(len(doc_cleaner(
    sample_docs[4], 
    skip_first=2, 
    cleaning_patterns=[r'(Apple Inc. \| Q[0-9]{1} [0-9]{4} Form 10-Q \| [0-9]+)'], 
    remove_page_identifiers=['SIGNATURE'],
    final_page_identifier='Exhibit 31.1', 
    )))

Sample cleaning lenght for APPL docs
46
40


In [6]:
print('Sample cleanning length for AMZN docs')
print(len(sample_docs[2]))
print(len(doc_cleaner(
    sample_docs[2],
    skip_first=1,
    cleaning_patterns=[r'Table of Contents'],
    final_page_identifier='PART II. OTHER INFORMATION',
)))

Sample cleanning length for AMZN docs
51
31


In [7]:
print('Sample cleanning length for INTC docs')
print(len(sample_docs[1]))
print(len(doc_cleaner(
    sample_docs[1],
    skip_first=3,
    cleaning_patterns=[r'Table of Contents'],
    final_page_identifier='Exhibit 31.1',
)))

Sample cleanning length for INTC docs
54
47


In [8]:
print('Sample cleanning length for MSFT docs')
print(len(sample_docs[5]))
print(len(doc_cleaner(
    sample_docs[5],
    skip_first=1,
    cleaning_patterns=[r'PART (I|II) Item [0-9]{1,2}'],
    final_page_identifier='Exhibit 31.1',
)))

Sample cleanning length for MSFT docs
74
67


In [9]:
print('Sample cleanning length for NVDA docs')
print(len(sample_docs[7]))
print(len(doc_cleaner(
    sample_docs[7],
    skip_first=1,
    cleaning_patterns=[r"NVIDIA CORPORATION AND SUBSIDIARIES NOTES TO CONDENSED CONSOLIDATED FINANCIAL STATEMENTS (Continued) (Unaudited)"],
    final_page_identifier='Exhibit 31.1',
)))

Sample cleanning length for NVDA docs
51
49


In [10]:
cleaned_sample_docs = []
for doc in sample_docs:
    doc_name = doc[0].metadata['source']
    if 'AAPL' in doc_name:
        cleaned_doc = doc_cleaner(
            doc, 
            skip_first=2, 
            cleaning_patterns=[r'(Apple Inc. \| Q[0-9]{1} [0-9]{4} Form 10-Q \| [0-9]+)'], 
            remove_page_identifiers=['SIGNATURE'],
            final_page_identifier='Exhibit 31.1', 
            )
    elif 'AMZN' in doc_name:
        cleaned_doc = doc_cleaner(
            doc,
            skip_first=1,
            cleaning_patterns=[r'Table of Contents'],
            final_page_identifier='PART II. OTHER INFORMATION',
        )
    elif 'INTC' in doc_name:
        cleaned_doc = doc_cleaner(
            doc,
            skip_first=3,
            cleaning_patterns=[r'Table of Contents'],
            final_page_identifier='Exhibit 31.1',
        )
    elif 'MSFT' in doc_name:
        cleaned_doc = doc_cleaner(
            doc,
            skip_first=1,
            cleaning_patterns=[r'PART (I|II) Item [0-9]{1,2}'],
            final_page_identifier='Exhibit 31.1',
        )
    elif 'NVDA' in doc_name:
        cleaned_doc = doc_cleaner(
            doc,
            skip_first=1,
            cleaning_patterns=[r"NVIDIA CORPORATION AND SUBSIDIARIES NOTES TO CONDENSED CONSOLIDATED FINANCIAL STATEMENTS (Continued) (Unaudited)"],
            final_page_identifier='Exhibit 31.1',
        )
    else:
        print(f"Unknown doc type: {doc_name}. Doc is not cleaned")
        cleaned_doc = doc
        
    cleaned_sample_docs.append(cleaned_doc)

#### 2.2.1 Chunking

In [11]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings.ollama import OllamaEmbeddings

embedding_model = OllamaEmbeddings(model='mxbai-embed-large:latest')
text_splitter = SemanticChunker(embedding_model)

chunked_docs = []
for doc in cleaned_sample_docs:
    print(f"Processing {doc[0].metadata['source']}")
    chunks = text_splitter.split_documents(doc)
    chunked_docs.append(chunks)
    print(f"Total chunks: {len(chunks)}")

Processing ../data/raw/2023 Q2 INTC.pdf
Total chunks: 84
Processing ../data/raw/2023 Q3 INTC.pdf
Total chunks: 93
Processing ../data/raw/2023 Q3 AMZN.pdf
Total chunks: 70
Processing ../data/raw/2023 Q2 AMZN.pdf
Total chunks: 71
Processing ../data/raw/2023 Q1 AAPL.pdf
Total chunks: 86
Processing ../data/raw/2023 Q2 MSFT.pdf
Total chunks: 154
Processing ../data/raw/2023 Q3 MSFT.pdf
Total chunks: 138
Processing ../data/raw/2023 Q2 NVDA.pdf
Total chunks: 103
Processing ../data/raw/2023 Q3 NVDA.pdf
Total chunks: 106
Processing ../data/raw/2023 Q1 NVDA.pdf
Total chunks: 105
Processing ../data/raw/2023 Q1 MSFT.pdf
Total chunks: 153
Processing ../data/raw/2023 Q2 AAPL.pdf
Total chunks: 46
Processing ../data/raw/2023 Q3 AAPL.pdf
Total chunks: 47
Processing ../data/raw/2023 Q1 AMZN.pdf
Total chunks: 71
Processing ../data/raw/2023 Q1 INTC.pdf
Total chunks: 75


## 2.4 Token Count Analysis

### 2.4.1 OpenAI Tokenization

In [12]:
import tiktoken

def num_tokens_from_string(string: str, encoding_model: str = 'gpt-3.5-turbo') -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model(encoding_model)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [13]:
sample_docs_token_count = sum([num_tokens_from_string(chunk.page_content)for chunk in doc for doc in chunked_docs])
sample_docs_count = len(chunked_docs)
doc_avg_token_count = sample_docs_token_count / sample_docs_count
total_docs_token_count = len(os.listdir(data_folder_path)) * doc_avg_token_count

print(f"Avg token of a doc: {doc_avg_token_count:_.0f}")
print(f"Total estimated token for all docs: {total_docs_token_count:_.0f}")

Avg token of a doc: 24_491
Total estimated token for all docs: 489_820


# 3.Initial Data Exploration

In [14]:
import pandas as pd

# DataFrame construction from docs.
# Each row is a chunk
data_ = {'doc_name':[], 'page_num': [], 'text': [], 'langchain_doc': []}
for doc in chunked_docs:
    for chunk in doc:
        data_['doc_name'].append(chunk.metadata['source'].split('/')[-1])
        data_['page_num'].append(chunk.metadata['page'])
        data_['text'].append(chunk.page_content)
        data_['langchain_doc'].append(chunk)

df_sample_docs = pd.DataFrame(data_).sort_values('doc_name')
df_sample_docs['text_length'] = df_sample_docs['text'].apply(len)
df_sample_docs['text_token_count'] = df_sample_docs['text'].apply(num_tokens_from_string)
df_sample_docs['company'] = df_sample_docs['doc_name'].apply(lambda x: x.split()[-1][:-4])

## 3.1 Text Length Analysis

In [15]:
# Text lengths of sample docs
df_sample_docs.groupby(['doc_name'])['text_length'].agg(['min', 'mean', 'max'], ).style.background_gradient(cmap='gray')

Unnamed: 0_level_0,min,mean,max
doc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023 Q1 AAPL.pdf,0,1297.697674,3895
2023 Q1 AMZN.pdf,2,1432.169014,4492
2023 Q1 INTC.pdf,22,1367.093333,4262
2023 Q1 MSFT.pdf,3,1507.535948,5279
2023 Q1 NVDA.pdf,3,1354.952381,3698
2023 Q2 AAPL.pdf,0,1215.934783,3172
2023 Q2 AMZN.pdf,2,1504.591549,4631
2023 Q2 INTC.pdf,8,1386.154762,5079
2023 Q2 MSFT.pdf,3,1497.090909,4764
2023 Q2 NVDA.pdf,3,1463.427184,3926


In [16]:
# Token count statistics of documents
df_sample_docs.groupby('doc_name')['text_token_count'].agg(['min', 'mean', 'max']).style.background_gradient(cmap='gray')

Unnamed: 0_level_0,min,mean,max
doc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023 Q1 AAPL.pdf,0,290.906977,1026
2023 Q1 AMZN.pdf,2,323.915493,1072
2023 Q1 INTC.pdf,6,325.933333,959
2023 Q1 MSFT.pdf,2,378.941176,1626
2023 Q1 NVDA.pdf,2,292.866667,704
2023 Q2 AAPL.pdf,0,311.695652,790
2023 Q2 AMZN.pdf,2,358.816901,1131
2023 Q2 INTC.pdf,5,337.416667,1031
2023 Q2 MSFT.pdf,2,363.428571,1412
2023 Q2 NVDA.pdf,2,331.368932,746


## 3.2 Word Frequency Analysis

In [17]:
import spacy
from collections import Counter

nlp = spacy.load('en_core_web_trf')

# Build spacy doc for each chunk
df_sample_docs['spacy_doc'] = (
    df_sample_docs
    .text
    .apply(lambda x: ' '.join(x.split()).replace('\xa0', ' ').replace('\n', ' ').replace('\t', '')) # Clear whitespace
    .apply(lambda x: x.lower()) # Normalization
    .apply(nlp) # Apply spacy pipeline # type: ignore
) # type: ignore

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
df_sample_docs['word_counts'] = (
    df_sample_docs
    .spacy_doc
    .apply(
        lambda doc: 
            dict(Counter([token.text for token in doc if token.is_stop != True and token.is_punct != True and token]))
            )
)

In [19]:
combined_counts = Counter()

for d in df_sample_docs['word_counts'].values:
    combined_counts.update(d)

combined_counts.most_common()[:20]

[('$', 6638),
 ('2023', 2403),
 ('2022', 1838),
 ('net', 1534),
 ('cash', 1386),
 ('income', 1384),
 ('financial', 1280),
 ('revenue', 1256),
 ('operating', 1211),
 ('billion', 1178),
 ('months', 1137),
 ('products', 1118),
 ('services', 1015),
 ('1', 996),
 ('year', 926),
 ('ended', 915),
 ('tax', 901),
 ('sales', 872),
 ('statements', 833),
 ('30', 821)]

# 4.Linguistic Analysis

## 4.1 Named Entity Recognition

In [20]:
ner_counts = Counter()

df_sample_docs['ners'] = df_sample_docs['spacy_doc'].apply(lambda x: x.ents).apply(lambda x: [i.label_ for i in x])

for _, row in df_sample_docs.iterrows():
    ner_counts.update(row['ners'])

ner_counts.most_common()[:20]

[('CARDINAL', 9272),
 ('DATE', 6086),
 ('MONEY', 4415),
 ('ORG', 2526),
 ('PERCENT', 1753),
 ('GPE', 1097),
 ('PRODUCT', 814),
 ('LAW', 392),
 ('ORDINAL', 272),
 ('PERSON', 97),
 ('LOC', 90),
 ('NORP', 76),
 ('QUANTITY', 10),
 ('EVENT', 4),
 ('TIME', 1),
 ('FAC', 1)]

> Entitiy Explanations:
>   
> PERSON:      People, including fictional.  
> NORP:        Nationalities or religious or political groups.  
> FAC:         Buildings, airports, highways, bridges, etc.  
> ORG:         Companies, agencies, institutions, etc.  
> GPE:         Countries, cities, states.  
> LOC:         Non-GPE locations, mountain ranges, bodies of water.  
> PRODUCT:     Objects, vehicles, foods, etc. (Not services.)  
> EVENT:       Named hurricanes, battles, wars, sports events, etc.  
> WORK_OF_ART: Titles of books, songs, etc.   
> LAW:         Named documents made into laws.  
> LANGUAGE:    Any named language.  
> DATE:        Absolute or relative dates or periods.  
> TIME:        Times smaller than a day.  
> PERCENT:     Percentage, including ”%“.  
> MONEY:       Monetary values, including unit.  
> QUANTITY:    Measurements, as of weight or distance.  
> ORDINAL:     “first”, “second”, etc.  
> CARDINAL:    Numerals that do not fall under another type.  
>
> ps: also can be acessed via `spacy.explain(label)`
> Resource: https://github.com/explosion/spaCy/discussions/9147

# 5.Visualizations

## 5.1 NER Visualization

In [199]:
from spacy import displacy

displacy.serve(df_sample_docs['spacy_doc'][814], style='ent', port=5001)


Using the 'ent' visualizer
Serving on http://0.0.0.0:5001 ...

Shutting down server on port 5001.


# 6.Content Analysis

## 6.2 Keyword Extraction

## 6.3 Table Extraction

## 6.4 Graph Extraction

## 6.5 Image Extraction

# 7 Embedding Model Analysis
For each model the sample cleaned data will be split with `SemanticChunker`. Then with the 

## 7.1 PacMAP Analysis


### 7.1.1 Open Source Embedding Analysis

#### 7.1.1.1 Mxbai Embed Large

In [116]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings.ollama import OllamaEmbeddings
import plotly.express as px

def df_chunk_builder(embedding_model, docs):
    
    text_splitter = SemanticChunker(embedding_model)
    
    # Chunk docs
    chunked_docs = []
    for doc in docs:
        chunks = text_splitter.split_documents(doc)
        chunked_docs.append(chunks)
    
    # Construct DataFrame for each chunk to be a row
    data_ = {'doc_name':[], 'page_num': [], 'text': [], 'langchain_doc': []}
        
    for doc in chunked_docs:
        for chunk in doc:
            data_['doc_name'].append(chunk.metadata['source'].split('/')[-1])
            data_['page_num'].append(chunk.metadata['page'])
            data_['text'].append(chunk.page_content)
            data_['langchain_doc'].append(chunk)

    df = pd.DataFrame(data_).sort_values('doc_name')
    df['text_length'] = df['text'].apply(len)
    df['text_token_count'] = df['text'].apply(num_tokens_from_string)
    df['company'] = df['doc_name'].apply(lambda x: x.split()[-1][:-4])
    
    df['embeddings'] = df['text'].apply(lambda x: np.array(embedding_model.embed_query(x), dtype=float)) # type: ignore
    
    return df

def embeding_pacmap_plot(df, embedding_name):
    
    X = np.stack(df['embeddings'].values) # type: ignore
    X.shape

    projector = pacmap.PaCMAP(n_components=3, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0,) # type: ignore
    df[['projected_x', f'projected_y', f'projected_z']] = projector.fit_transform(X, init='pca')
    
    fig = px.scatter_3d(
        df, 
        x='projected_x', 
        y='projected_y', 
        z='projected_z',
        color=df['doc_name'].apply(lambda x: x.split()[-1]),
        title=embedding_name,
        hover_data={
            'projected_x': False,
            'projected_y': False,
            'projected_z': False,
            'doc_name': True,
            'page_num': True,
            'text': True
        },
        width=1000, height=800
    )
    
    fig.show()

In [117]:
embedding_name = 'mxbai-embed-large'
embedding_model_ = OllamaEmbeddings(model=embedding_name)

embeding_pacmap_plot(df_chunk_builder(embedding_model_, cleaned_sample_docs), embedding_name)

#### 7.1.1.2 Nomic Embed

In [119]:
embedding_name = 'nomic-embed-text:latest'
embedding_model_ = OllamaEmbeddings(model=embedding_name)

embeding_pacmap_plot(df_chunk_builder(embedding_model_, cleaned_sample_docs), embedding_name)

### 7.1.2 OpenAI Embedding Analysis

In [121]:
from langchain.embeddings.openai import OpenAIEmbeddings
import openai

openai.api_key = os.getenv('OPENAI_API_KEY')

embedding_name = 'text-embedding-3-large'
openai_embeddings = OpenAIEmbeddings(model='text-embedding-3-large', dimensions=1024) # type: ignore

embeding_pacmap_plot(df_chunk_builder(openai_embeddings, cleaned_sample_docs), embedding_name)

# 10. Analysis Notes
* Pure Reader:  
  1. Paragraphs seperated by '\n\n' and '\n' is just newline. If recursive chunking method will be used the order is ` ['\n\n', (?<=\.\n[A-Z]), '.', '\n\t', '\n', ' ', ''] `.
  2. Values within the tables are seperated by `\xa0`, they need to be raplaced by either `|` or `' '`
* `pymupdf4llms` library fails to convert tables
* LlamaParse constructs some tables. However, fails greatly on some intended tables. Which documents have lots.
* Unstrucred works best within these alternatives. Yet to keep in mind that it has a limit of [100-pages cap per month](https://docs.unstructured.io/api-reference/api-services/free-api#free-unstructured-api-limitations)

## 10.1 Document Manual Controls

**APPL Notes:**
- Can skip first 3 pages
- Doesn't have header
- Has footer. Sample: `Apple Inc. | Q3 2023 Form 10-Q | 1` Final number of this is page number
-  Can remove the page starting with `SIGNATURE`
- Texts after `Exhibit 31.1` may be generic. Check and remove if so

**AMZN Notes:**
- Can skip first 2 pages
- Has header. Sample: `Table of Contents`
- Has page number. But there may not be a significant identifier for them. Check a sample content if it contains a form of line or multiple newlines.
- Texts after `PART II. OTHER INFORMATION` may be generic. Check and remove if so
- 

**INTC Notes:**
- Can skip first 4 pages
- Has header. Sample: `Table of Contents`
- Has footer and page number. But no unique identifier. Excluding them may not be possible
- Texts after `Exhibit 31.1` may be generic. Check and remove if so

**MSFT Notes:**
- Can skip first 2 pages
- Has header: Sample: `PART II Item 6`
- Doesn't have footer.
- Has page number. But no unique identifier. Exculing them may not be possible
- Texts after `Exhibit 31.1` may be generic. Check and remove if so

**NVDA Notes:**
- Can skip first 2 pages
- Has header: One type: `NVIDIA CORPORATION AND SUBSIDIARIES
NOTES TO CONDENSED CONSOLIDATED FINANCIAL STATEMENTS (Continued)
(Unaudited)`
- Doesn't have footer
- Has page number. But no unique identifier. Excluding them may not be possible
- Texts after `Exhibit 31.1` may be generic. Check and remove if so

## 10.1 Text Analysis Notes
From the sample docs max token for a chunk is $1626$. Which is enoguh for a 4k model

## 10.2 Embedding Models

OpenAI's `text-embedding-3-large`'s embeddings represent the file structures best!