In [1]:
import os
from dotenv import load_dotenv
import nest_asyncio
nest_asyncio.apply()

load_dotenv('../.env')

data_folder_path = '../data/raw/'
doc_paths = os.listdir(data_folder_path)

# 1.Data Colletion & Understanding

In [7]:
import os
from llama_index.readers.file import PyMuPDFReader

loader = PyMuPDFReader()

documents = {}
for doc in doc_paths:
    doc_name = doc.replace(' ','_'). replace('.pdf', '')
    documents[doc_name] = loader.load(file_path= data_folder_path + doc)


In [8]:
documents.keys()

dict_keys(['2023_Q2_INTC', '2023_Q3_INTC', '2023_Q3_AMZN', '2023_Q2_AMZN', '2022_Q3_AAPL', '2023_Q1_AAPL', '2023_Q2_MSFT', '2023_Q3_MSFT', '2023_Q2_NVDA', '2023_Q3_NVDA', '2023_Q1_NVDA', '2022_Q3_NVDA', '2023_Q1_MSFT', '2022_Q3_MSFT', '2023_Q2_AAPL', '2023_Q3_AAPL', '2022_Q3_AMZN', '2022_Q3_INTC', '2023_Q1_AMZN', '2023_Q1_INTC'])

In [9]:
print(documents['2023_Q2_AAPL'][0].metadata)
print(documents['2023_Q2_AAPL'][1].metadata)


{'total_pages': 28, 'file_path': '../data/raw/2023 Q2 AAPL.pdf', 'source': '1'}
{'total_pages': 28, 'file_path': '../data/raw/2023 Q2 AAPL.pdf', 'source': '2'}


In [10]:
total_pages = 0
for key, doc in documents.items():
    total_pages += doc[0].metadata['total_pages']
print(total_pages)

1037


# 2.Data Cleaning & Prerpocessing

In [11]:
documents['2023_Q2_AAPL'][8].metadata

{'total_pages': 28, 'file_path': '../data/raw/2023 Q2 AAPL.pdf', 'source': '9'}

In [12]:
documents['2023_Q2_AAPL'][8].get_content()

'Apple Inc.\nNotes to Condensed Consolidated Financial Statements (Unaudited)\nNote 1 – Summary of Significant Accounting Policies\nBasis of Presentation and Preparation\nThe condensed consolidated financial statements include the accounts of Apple Inc. and its wholly owned subsidiaries (collectively “Apple” or the “Company”).\nIntercompany accounts and transactions have been eliminated. In the opinion of the Company’s management, the condensed consolidated financial statements\nreflect all adjustments, which are normal and recurring in nature, necessary for fair financial statement presentation. The preparation of these condensed\nconsolidated financial statements and accompanying notes in conformity with U.S. generally accepted accounting principles requires management to make\nestimates and assumptions that affect the amounts reported. Actual results could differ materially from those estimates. Certain prior period amounts in the\ncondensed consolidated financial statements and acc

In [13]:
sample_file = data_folder_path + '2023 Q2 AAPL.pdf'

## 2.1 PDF Parsing & Chunking & Token Count

### 2.1.1 Parsing with `pymupdf4llm`

In [14]:
import pymupdf4llm

with open('../data/interim/pymupdf4llm_2023_Q2_AAPL.md', 'w+') as f:
    f.write(pymupdf4llm.to_markdown(sample_file))

> **IMPORTANT**  
> Fails to convert tables

### 2.1.2 Parsing with LlamaParse API

In [15]:
from llama_parse import LlamaParse

LLAMA_CLOUD_API_KEY = os.getenv('LLAMA_CLOUD_API_KEY')

parser = LlamaParse(
    api_key=LLAMA_CLOUD_API_KEY,
    result_type='markdown',
)

md_text = parser.load_data(sample_file)

with open('../data/interim/llamaparse_2023_Q2_APPL.md', 'w') as f:
    f.write(md_text[0].get_content())


Started parsing the file under job_id cac11eca-5289-4430-a4fe-efa3f0794521


> **IMPORTANT**  
> Extracts tables better than `pymupdf4llm` but gets greatly confused with the table structure and sometimes even do not extract some text

### 2.1.3 Parsing with Unstructured API

In [11]:
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError
from pprint import pprint
import json

client = UnstructuredClient(
    api_key_auth=os.getenv('UNSTRUCTURED_API_KEY')
)

def unstructured_api_call(
    file_path: str, 
    strategy: str='auto', 
    chunking_strategy: str|None=None, 
    multipage_sections: bool|None=None, 
    max_characters: int|None=None,
    ):
    with open(file_path, "rb") as f:
        files=shared.Files(
            content=f.read(),
            file_name=file_path,
        )

    req = shared.PartitionParameters(
        files=files, 
        strategy=strategy,
        chunking_strategy=chunking_strategy,
        multipage_sections=multipage_sections,
        max_characters=max_characters
        )

    try:
        return client.general.partition(req)
    except SDKError as e:
        print(e)

In [12]:
response = unstructured_api_call(sample_file)

In [13]:
types = {}
for element in response.elements:
    if element['type'] not in types:
        types[element['type']] = 0
    types[element['type']] += 1
pprint(types)

{'Footer': 13,
 'Header': 3,
 'ListItem': 1,
 'NarrativeText': 169,
 'Table': 35,
 'Title': 101,
 'UncategorizedText': 3}


**Manually type count results:**   
- Table: 30
- Footer: 22
- Header: 0
- Image: 1 *ps. Apple Logo*

In [14]:
#Create an html copy of the sample file for manual comparison
page_html = ''
for element in response.elements:
        try:
            if element['type'] not in ['Header', 'Footer']:
                html = element['metadata']['text_as_html']
                page_html += html
        except KeyError:
            text = element['text']
            if element['type'] == 'Title':
                page_html += f'<b>{text}</b>'
            else:
                page_html += text

with open('../data/interim/unstructured_2023_Q2_APPL.html', 'w') as f:
    f.write(page_html)

> **IMPORTANT**  
>Works great!. Yet this is an API service with [100-page cap per month](https://docs.unstructured.io/api-reference/api-services/free-api#free-unstructured-api-limitationst). Paid and self hosted alternatives available

### 2.1.4. Parsing with LangChain
Load one document from each type

In [2]:
from langchain.document_loaders import DirectoryLoader, PyMuPDFLoader

sample_files = [file for file in os.listdir(data_folder_path) if file.startswith('2023')]
sample_docs = []
for file in sample_files:
    doc_temp = PyMuPDFLoader(data_folder_path + file)
    sample_docs.append(doc_temp.load())

**APPL Notes:**
- Can skip first 3 pages
- Doesn't have header
- Has footer. Sample: `Apple Inc. | Q3 2023 Form 10-Q | 1` Final number of this is page number
-  Can remove the page starting with `SIGNATURE`
- Texts after `Exhibit 31.1` may be generic. Check and remove if so

In [21]:
from typing import List
import re

def doc_cleaner(
    document, 
    skip_first: int|None=None, 
    cleaning_patterns: List|None=None, 
    remove_page_identifiers: List|None=None, 
    final_page_identifier: str|None=None
    ) -> list:
    """
    
    Args:
        document :
        skip_first (int|Optional): Number of pages to skip at the begining of the document
        cleaning_patterns (List|Optional): List of regex patters to detect and remove the corresponding text 
        remove_page_identifiers (List|Optional): List of regex patters to detect and remove whole pages. 
        final_page_identifier (str|Optional): The regex pattern to detect final page. This page and the following pages will be removed
        
    Returns
        document_page or None
    """
    
    def text_remover(pattern, string):
        match = re.findall(pattern, string)
        if match:
            return string.replace(match[0], '')
        else:
            return string
    
    final_page_num = 1e10
    cleaned_doc = []
    
    for page in document:
        
        # Skip first n pages
        if skip_first and page.metadata['page'] <= skip_first:
            continue

        # Remove texts with matching given patterns
        if cleaning_patterns:
            for pattern in cleaning_patterns:
                page.page_content = text_remover(pattern, page.page_content)
        
        # Remove pages with matching given identifiers
        if remove_page_identifiers: 
            for identifier in remove_page_identifiers:
                if re.findall(identifier, page.page_content):
                    continue
            
        # Remove pages on and after final_page_identifier match
        if final_page_identifier:
            if re.findall(final_page_identifier, page.page_content): # type: ignore
                final_page_num = page.metadata['page']

        if page.metadata['page'] >= final_page_num:
            continue
        
        cleaned_doc.append(page)
        
    return cleaned_doc



In [24]:
print(f"Sample cleaning lenght for APPL docs")
print(len(sample_docs[4]))
print(len(doc_cleaner(
    sample_docs[4], 
    skip_first=2, 
    cleaning_patterns=[r'(Apple Inc. \| Q[0-9]{1} [0-9]{4} Form 10-Q \| [0-9]+)'], 
    remove_page_identifiers=['SIGNATURE'],
    final_page_identifier='Exhibit 31.1', 
    )))

Sample cleaning lenght for APPL docs
46
40


In [25]:
print('Sample cleanning length for AMZN docs')
print(len(sample_docs[2]))
print(len(doc_cleaner(
    sample_docs[2],
    skip_first=1,
    cleaning_patterns=[r'Table of Contents'],
    final_page_identifier='PART II. OTHER INFORMATION',
)))

Sample cleanning length for AMZN docs
51
31


In [28]:
print('Sample cleanning length for INTC docs')
print(len(sample_docs[1]))
print(len(doc_cleaner(
    sample_docs[1],
    skip_first=3,
    cleaning_patterns=[r'Table of Contents'],
    final_page_identifier='Exhibit 31.1',
)))

Sample cleanning length for INTC docs
54
47


In [33]:
print('Sample cleanning length for MSFT docs')
print(len(sample_docs[5]))
print(len(doc_cleaner(
    sample_docs[5],
    skip_first=1,
    cleaning_patterns=[r'PART (I|II) Item [0-9]{1,2}'],
    final_page_identifier='Exhibit 31.1',
)))

Sample cleanning length for MSFT docs
74
67


**NVDA Notes:**
- Can skip first 2 pages
- Has header: One type: `NVIDIA CORPORATION AND SUBSIDIARIES
NOTES TO CONDENSED CONSOLIDATED FINANCIAL STATEMENTS (Continued)
(Unaudited)`
- Doesn't have footer
- Has page number. But no unique identifier. Excluding them may not be possible
- Texts after `Exhibit 31.1` may be generic. Check and remove if so

In [38]:
print('Sample cleanning length for NVDA docs')
print(len(sample_docs[7]))
print(len(doc_cleaner(
    sample_docs[7],
    skip_first=1,
    cleaning_patterns=[r"NVIDIA CORPORATION AND SUBSIDIARIES NOTES TO CONDENSED CONSOLIDATED FINANCIAL STATEMENTS (Continued) (Unaudited)"],
    final_page_identifier='Exhibit 31.1',
)))

Sample cleanning length for NVDA docs
51
49


#### Chunking with LangChain

In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings.ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(model='mxbai-embed-large:latest')
text_splitter = SemanticChunker(embeddings=embeddings, breakpoint_threshold_type='percentile')

chunked_docs = all_docs.split(text_splitter)

## 2.2 Text Cleaning

## 2.4 Token Count Analysis

### 2.4.1 OpenAI Tokenization

In [23]:
import tiktoken

def num_tokens_from_string(string: str, encoding_model: str = 'gpt-3.5-turbo') -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model(encoding_model)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [25]:
print("Total number of tokens for GPT3.5:")
sum([num_tokens_from_string(doc.page_content) for doc in chunked_docs])

Total number of tokens for GPT3.5:


667132

### 2.4.2 Open Source Tokenization

# 3.Initial Data Exploration

## 3.1 Text Length Analysis

## 3.2 Word Frequency Analysis

## 3.3 Section Frequency Analysis

# 4.Linguistic Analysis

## 4.1 Named Entity Recognition

## 4.2 Sentiment Analysis

# 5.Visualizations

## 5.1 Word Clouds

## 5.2 PaCMAP Analysis

# 6.Statistical Analysis

## 6.1 Section Length Analysis

## 6.2 Correlation Analysis

# 7.Content Analysis

## 7.1 TF-IDF Analysis

## 7.2 Keyword Extraction

## 7.3 Table Extraction

## 7.4 Graph Extraction

## 7.5 Image Extraction

# 8.Information Retrieval Metrics

## 8.1 Query Analysis

## 8.2 Document Similarity

## 8.3 Precision and Recall

# 9.Advanced Text Features

## 9.1 Embedding Analysis

## 9.2 Clustering

# 10. Analysis Notes
* Pure Reader:  
  1. Paragraphs seperated by '\n\n' and '\n' is just newline. If recursive chunking method will be used the order is ` ['\n\n', (?<=\.\n[A-Z]), '.', '\n\t', '\n', ' ', ''] `.
  2. Values within the tables are seperated by `\xa0`, they need to be raplaced by either `|` or `' '`
* `pymupdf4llms` library fails to convert tables
* LlamaParse constructs some tables. However, fails greatly on some intended tables. Which documents have lots.
* Unstrucred works best within these alternatives. Yet to keep in mind that it has a limit of [100-pages cap per month](https://docs.unstructured.io/api-reference/api-services/free-api#free-unstructured-api-limitations)

## 10.1 Manual Controls

**APPL Notes:**
- Can skip first 3 pages
- Doesn't have header
- Has footer. Sample: `Apple Inc. | Q3 2023 Form 10-Q | 1` Final number of this is page number
-  Can remove the page starting with `SIGNATURE`
- Texts after `Exhibit 31.1` may be generic. Check and remove if so

**AMZN Notes:**
- Can skip first 2 pages
- Has header. Sample: `Table of Contents`
- Has page number. But there may not be a significant identifier for them. Check a sample content if it contains a form of line or multiple newlines.
- Texts after `PART II. OTHER INFORMATION` may be generic. Check and remove if so
- 

**INTC Notes:**
- Can skip first 4 pages
- Has header. Sample: `Table of Contents`
- Has footer and page number. But no unique identifier. Excluding them may not be possible
- Texts after `Exhibit 31.1` may be generic. Check and remove if so

**MSFT Notes:**
- Can skip first 2 pages
- Has header: Sample: `PART II Item 6`
- Doesn't have footer.
- Has page number. But no unique identifier. Exculing them may not be possible
- Texts after `Exhibit 31.1` may be generic. Check and remove if so

**NVDA Notes:**
- Can skip first 2 pages
- Has header: One type: `NVIDIA CORPORATION AND SUBSIDIARIES
NOTES TO CONDENSED CONSOLIDATED FINANCIAL STATEMENTS (Continued)
(Unaudited)`
- Doesn't have footer
- Has page number. But no unique identifier. Excluding them may not be possible
- Texts after `Exhibit 31.1` may be generic. Check and remove if so