# Cortex AI: Process Documents

Below is a list of the analyst-report PDFs that analyse Snowflake.

In [None]:
-- List every file in the analyst-reports stage, prefixing each directory row
-- with a scoped file URL built from its relative path.
SELECT
    BUILD_SCOPED_FILE_URL('@DOCUMENT_AI.ANALYST_REPORTS', RELATIVE_PATH),
    *
FROM DIRECTORY(@DOCUMENT_AI.ANALYST_REPORTS)

## AI_PARSE_DOCUMENT and AI_EXTRACT 

AI_PARSE_DOCUMENT extracts all of the text from each document, which helps make the documents searchable. The output below uses markdown to mark the breaks between blocks of text, which adds formatting and additional context.

In [None]:
-- Run AI_PARSE_DOCUMENT in LAYOUT mode over each file listed in the
-- analyst-reports stage and keep the raw result in EXTRACTED_DATA.
CREATE OR REPLACE TABLE DOCUMENT_AI.PARSED_ANALYST_REPORTS AS
SELECT
    RELATIVE_PATH,
    AI_PARSE_DOCUMENT(
        TO_FILE('@DOCUMENT_AI.ANALYST_REPORTS', RELATIVE_PATH),
        {'mode': 'LAYOUT'}
    ) AS EXTRACTED_DATA
FROM DIRECTORY('@DOCUMENT_AI.ANALYST_REPORTS');

-- Inspect the parsed rows together with the text content and page count
-- read out of EXTRACTED_DATA.
SELECT
    *,
    EXTRACTED_DATA:content::text,
    EXTRACTED_DATA:metadata:pageCount
FROM DOCUMENT_AI.PARSED_ANALYST_REPORTS

AI_EXTRACT answers key questions about each document, which adds structure to the unstructured content.

In [None]:
-- Ask AI_EXTRACT a fixed set of questions about each parsed report so the
-- free-text content gains structured fields. The answers are stored as
-- returned in EXTRACTED_DATA, next to the full text and the page count.
CREATE OR REPLACE TABLE DOCUMENT_AI.AI_EXTRACT_ANALYST_REPORTS AS
SELECT
    RELATIVE_PATH,
    EXTRACTED_DATA:content::text AS CONTENT,
    EXTRACTED_DATA:metadata:pageCount AS PAGE_COUNT,

    -- Extract structured fields using AI_EXTRACT from the parsed content.
    -- CONTENT is the alias defined above. Each question is a single clean
    -- literal: the prompts previously carried long runs of stray spaces, and
    -- the date question did not say which format to return.
    AI_EXTRACT(
        CONTENT,
        {
            'DATE_REPORT': 'When was the Report Created? Return the date in YYYY-MM-DD format.',
            'NAME_OF_REPORT_PROVIDER': 'What is the name of the report provider or research firm?',
            'RATING': 'What is the rating recommendation? Is it BUY, SELL, HOLD, or EQUAL-WEIGHT?',
            'CLOSE_PRICE': 'What is the Close Price Value mentioned in the report?',
            'PRICE_TARGET': 'What is the Price Target Value?',
            'GROWTH': 'What is the latest revenue Growth - YoY (Year over Year)?'
        }
    ) AS EXTRACTED_DATA

FROM DOCUMENT_AI.PARSED_ANALYST_REPORTS;

-- Inspect the extracted answers.
SELECT * FROM DOCUMENT_AI.AI_EXTRACT_ANALYST_REPORTS;

Create a structured view with flattened extracted fields + full text

In [None]:
-- Step 2: expose the AI_EXTRACT answers as ordinary columns.
-- Each answer is read from the "response" object inside EXTRACTED_DATA and
-- cast to text; the full parsed text is carried along as the last column.
CREATE OR REPLACE VIEW DOCUMENT_AI.AI_EXTRACT_REPORTS_STRUCTURED AS
SELECT
    RELATIVE_PATH,
    PAGE_COUNT,
    CAST(EXTRACTED_DATA:response:DATE_REPORT AS TEXT) AS DATE_REPORT,
    CAST(EXTRACTED_DATA:response:NAME_OF_REPORT_PROVIDER AS TEXT) AS NAME_OF_REPORT_PROVIDER,
    CAST(EXTRACTED_DATA:response:RATING AS TEXT) AS RATING,
    CAST(EXTRACTED_DATA:response:CLOSE_PRICE AS TEXT) AS CLOSE_PRICE,
    CAST(EXTRACTED_DATA:response:PRICE_TARGET AS TEXT) AS PRICE_TARGET,
    CAST(EXTRACTED_DATA:response:GROWTH AS TEXT) AS GROWTH,
    CONTENT AS FULL_TEXT
FROM DOCUMENT_AI.AI_EXTRACT_ANALYST_REPORTS;

-- Preview the flattened fields, adding the character length of the full text.
SELECT
    RELATIVE_PATH,
    DATE_REPORT,
    NAME_OF_REPORT_PROVIDER,
    RATING,
    CLOSE_PRICE,
    PRICE_TARGET,
    GROWTH,
    PAGE_COUNT,
    LENGTH(FULL_TEXT) AS TEXT_LENGTH,
    FULL_TEXT
FROM DOCUMENT_AI.AI_EXTRACT_REPORTS_STRUCTURED;

This method uses Cortex Complete (AI_COMPLETE) to extract the same attributes, this time constrained by a structured output schema.

In [None]:
-- Extract the same report attributes with AI_COMPLETE, constraining the
-- answer with a JSON schema passed as response_format. The result is stored
-- in EXTRACTED_DATA next to the full text and page count.
CREATE OR REPLACE TABLE DOCUMENT_AI.AI_EXTRACT_ANALYST_REPORTS_ADVANCED AS
SELECT
    RELATIVE_PATH,
    EXTRACTED_DATA:content::text AS CONTENT,
    EXTRACTED_DATA:metadata:pageCount AS PAGE_COUNT,

    -- Use AI_COMPLETE with structured output schema.
    -- The prompt names the date format explicitly and asks the model to omit
    -- unknown optional fields: close_price, price_target and growth are
    -- declared as numbers in the schema below, so asking for the text
    -- "NO VALUE" in them would contradict the schema.
    AI_COMPLETE(
        model => 'claude-4-sonnet',
        prompt => CONCAT(
            'Extract the following information from this analyst report:\n',
            '1. Report creation date in YYYY-MM-DD format. Assume the first of the month if the date is not given in full (or the first day of the financial year if only the year is supplied). I need an actual date in that format\n',
            '2. Name of the report provider/research firm - Apex Analytics, Sterling Partners, Veridian Capital, Pinnacle Growth Investors, Momentum Metrics, Quant-Vestor Consensus Point \n',
            '3. Stock rating (BUY, SELL, HOLD, or EQUAL-WEIGHT)\n',
            '4. Close price value as a number. Omit the field if not known\n',
            '5. Price target value as a number. Omit the field if not known\n',
            '6. Latest revenue growth YoY as a percentage number between 0 and 100. Omit the field if not known\n\n',
            'Document content:\n',
            CONTENT
        ),
        response_format => {
            'type': 'json',
            'schema': {
                'type': 'object',
                'properties': {
                    'date_report': {'type': 'string', 'description': 'Date when the report was created'},
                    'name_of_report_provider': {'type': 'string', 'description': 'Name of the research firm or report provider'},
                    'rating': {'type': 'string', 'enum': ['BUY', 'SELL', 'HOLD', 'EQUAL-WEIGHT'], 'description': 'Stock rating recommendation'},
                    'close_price': {'type': 'number', 'description': 'Close price value'},
                    'price_target': {'type': 'number', 'description': 'Price target value'},
                    'growth': {'type': 'number', 'description': 'Latest revenue growth YoY'}
                },
                -- Only these three fields must always be present.
                'required': ['date_report', 'name_of_report_provider', 'rating']
            }
        }
    ) AS EXTRACTED_DATA

FROM DOCUMENT_AI.PARSED_ANALYST_REPORTS;

-- Inspect the structured answers.
SELECT * FROM DOCUMENT_AI.AI_EXTRACT_ANALYST_REPORTS_ADVANCED

In [None]:
-- Flatten the AI_COMPLETE result stored in EXTRACTED_DATA into typed columns,
-- keeping the full parsed text alongside them.
CREATE OR REPLACE VIEW DOCUMENT_AI.AI_EXTRACT_REPORTS_STRUCTURED_ADVANCED AS
SELECT
    RELATIVE_PATH,
    PAGE_COUNT,
    CONTENT AS FULL_TEXT,

    -- Extract individual fields from the structured JSON response
    EXTRACTED_DATA:date_report::text AS DATE_REPORT,
    EXTRACTED_DATA:name_of_report_provider::text AS NAME_OF_REPORT_PROVIDER,
    EXTRACTED_DATA:rating::text AS RATING,
    EXTRACTED_DATA:close_price::float AS CLOSE_PRICE,
    EXTRACTED_DATA:price_target::float AS PRICE_TARGET,
    -- Cast growth to float like the other numeric fields: casting it to
    -- integer would round away the fractional part of a percentage.
    EXTRACTED_DATA:growth::float AS GROWTH

FROM DOCUMENT_AI.AI_EXTRACT_ANALYST_REPORTS_ADVANCED;

-- Inspect the flattened view.
SELECT * FROM DOCUMENT_AI.AI_EXTRACT_REPORTS_STRUCTURED_ADVANCED

### Using AI_AGG
You will notice that some of these reports are very long.  Let's use **AI_AGG** to summarise them

In [None]:
-- Build the consolidated analyst-report table: the extracted attributes, a
-- scoped file URL, a constant document type, the first path segment as the
-- document name, an AI_AGG summary of the text, and the full text itself.
-- FULL_TEXT is part of the grouping key, so AI_AGG is evaluated per group of
-- identical path / rating / date / provider / text.
CREATE OR REPLACE TABLE DOCUMENT_AI.ANALYST_REPORTS_ALL_DATA AS
SELECT
    RELATIVE_PATH,
    BUILD_SCOPED_FILE_URL('@DOCUMENT_AI.ANALYST_REPORTS', RELATIVE_PATH) AS FILE_URL,
    RATING,
    DATE_REPORT,
    ANY_VALUE(CLOSE_PRICE) AS CLOSE_PRICE,
    ANY_VALUE(PRICE_TARGET) AS PRICE_TARGET,
    ANY_VALUE(GROWTH) AS GROWTH,
    NAME_OF_REPORT_PROVIDER,
    'ANALYST_REPORTS' AS DOCUMENT_TYPE,
    SPLIT_PART(RELATIVE_PATH, '/', 1)::text AS DOCUMENT,
    AI_AGG(
        FULL_TEXT,
        'summarize the analyst reports in no more than 100 words'
    ) AS SUMMARY,
    FULL_TEXT  -- the complete text is kept next to its summary
FROM DOCUMENT_AI.AI_EXTRACT_REPORTS_STRUCTURED_ADVANCED
GROUP BY
    RELATIVE_PATH,
    RATING,
    DATE_REPORT,
    NAME_OF_REPORT_PROVIDER,
    FULL_TEXT;

-- Inspect the consolidated table.
SELECT * FROM DOCUMENT_AI.ANALYST_REPORTS_ALL_DATA;

### A closer look at the PDFs

The PDF viewer is built with Streamlit.  You can select a report from the drop-down list and then see the summary information that has been captured from the document.

Under the summary, you will see the original PDF side by side with the text that has been extracted.  You will notice that the formatting has been converted to markdown, which makes the text easier to read. Markdown can also be used as a way of chunking the data.

In [None]:
# Import python packages
import streamlit as st
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import functions as F
import pypdfium2 as pdfium

st.title("Equity Research Reports")
session = get_active_session()

# Get all reports from the single table
reports_table = session.table('DOCUMENT_AI.ANALYST_REPORTS_ALL_DATA')
report_list = reports_table.select('RELATIVE_PATH')

# Select a report
file_id = st.selectbox('Select Report:', report_list)

# Get the details for the selected report
doc_details = reports_table.filter(F.col('RELATIVE_PATH') == file_id).limit(1)
doc_details_pd = doc_details.to_pandas()

# Every lookup below reads row 0 of doc_details_pd. If no row matched (for
# example nothing could be selected), show a message and end this run instead
# of failing with an IndexError on .iloc[0].
if doc_details_pd.empty:
    st.warning('No analyst report is available to display.')
    st.stop()

# Display report details
st.markdown('#### Report Details')
col1, col2, col3 = st.columns(3)

with col1:
    st.markdown(f'''__Report Date:__ {doc_details_pd.DATE_REPORT.iloc[0]}''')
    st.markdown(f'''__Research Firm:__ {doc_details_pd.NAME_OF_REPORT_PROVIDER.iloc[0]}''')

with col2:
    st.markdown(f'''__Close Price:__ {doc_details_pd.CLOSE_PRICE.iloc[0]}''')
    st.markdown(f'''__Price Target:__ {doc_details_pd.PRICE_TARGET.iloc[0]}''')

with col3:
    st.markdown(f'''__Recommendation:__ {doc_details_pd.RATING.iloc[0]}''')
    st.markdown(f'''__Growth (YoY):__ {doc_details_pd.GROWTH.iloc[0]}%''')

st.markdown(f'''__Summary:__ {doc_details_pd.SUMMARY.iloc[0]}''')

# Helper functions for PDF navigation
def display_pdf_page():
    """Render the current page of the cached PDF and show it as an image.

    The document is read from st.session_state['pdf_doc'] and the page index
    from st.session_state['pdf_page'].
    """
    document = st.session_state['pdf_doc']
    page_index = st.session_state['pdf_page']
    rendered = document[page_index].render(scale=8, rotation=0)
    st.image(rendered.to_pil())

def next_pdf_page():
    """Move to the next page, going back to page index 0 after the last page."""
    upcoming = st.session_state.pdf_page + 1
    page_total = len(st.session_state['pdf_doc'])
    st.session_state.pdf_page = 0 if upcoming >= page_total else upcoming

def previous_pdf_page():
    """Move back one page; leave the index unchanged when it is not above 0."""
    if st.session_state.pdf_page <= 0:
        return
    st.session_state.pdf_page = st.session_state.pdf_page - 1

# PDF viewer section: page image in the left column, extracted text in the right
st.divider()
col1, col2 = st.columns(2)

with col1:
    st.markdown('#### RAW PDF STORED IN FILE STORE')

    if file_id:
        state = st.session_state

        # Seed the viewer state on the first run
        if 'pdf_page' not in state:
            state['pdf_page'] = 0
        if 'pdf_url' not in state:
            state['pdf_url'] = file_id

        # Load the document when none is cached yet or a different report is
        # selected; loading also resets the viewer to the first page
        needs_load = 'pdf_doc' not in state or state['pdf_url'] != file_id
        if needs_load:
            pdf_stream = session.file.get_stream(f"@DOCUMENT_AI.ANALYST_REPORTS/{file_id}")
            state['pdf_doc'] = pdfium.PdfDocument(pdf_stream)
            state['pdf_url'] = file_id
            state['pdf_page'] = 0

        # Navigation controls: previous button, page indicator, next button
        prev_slot, status_slot, next_slot = st.columns(3)
        prev_slot.button("⏮️ Previous", on_click=previous_pdf_page)
        current_page_number = state['pdf_page'] + 1
        page_total = len(state['pdf_doc'])
        status_slot.write(f"page {current_page_number} of {page_total} pages")
        next_slot.button("Next ⏭️", on_click=next_pdf_page)

        # Display the PDF page
        display_pdf_page()

with col2:
    st.markdown('#### EXTRACTED TEXT FROM PDFS')
    # Fixed-height container for the full extracted text
    text_box = st.container(height=1000)
    text_box.markdown(doc_details_pd.FULL_TEXT.iloc[0])


#### AI_AGG (summarized)

You may wish to have a summary of the data from all the reports.  **AI_AGG can summarize unstructured data across multiple rows.**  The GROUP BY sets the level of aggregation, so AISQL works in a similar way to summarizing measures in SQL.

In [None]:
-- Aggregate the full text of all reports from the same provider into a single
-- AI_AGG summary, one row per NAME_OF_REPORT_PROVIDER.
CREATE OR REPLACE TABLE DOCUMENT_AI.REPORT_PROVIDER_SUMMARY AS
SELECT
    NAME_OF_REPORT_PROVIDER,
    AI_AGG(
        FULL_TEXT,
        'summarize the analyst reports in chronological order in no more than 500 words and compare what they have said to get an impartial view on the SNOW concensious'
    ) AS SUMMARY
FROM DOCUMENT_AI.ANALYST_REPORTS_ALL_DATA
GROUP BY NAME_OF_REPORT_PROVIDER;

-- Inspect the per-provider summaries.
SELECT * FROM DOCUMENT_AI.REPORT_PROVIDER_SUMMARY

And this is how we can view the summaries in streamlit

In [None]:
# Import python packages
import streamlit as st
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import functions as F
from snowflake.snowpark import types as T
session = get_active_session()

# One row per report provider, each with its AI_AGG summary
summary_reports = session.table('DOCUMENT_AI.REPORT_PROVIDER_SUMMARY')
provider_filter = st.radio('Choose Provider', summary_reports.select('NAME_OF_REPORT_PROVIDER'))
st.markdown('#### Summarised Reports')

# Reuse the table handle created above rather than opening the table again
selected_summary = summary_reports.filter(
    F.col('NAME_OF_REPORT_PROVIDER') == provider_filter
).to_pandas()

# Reading row 0 of an empty result raises an IndexError, so handle the case
# where no summary matches the chosen provider explicitly.
if selected_summary.empty:
    st.info('No summary is available for the selected provider.')
else:
    st.write(selected_summary.SUMMARY.iloc[0])

### Using AI_EXTRACT to extract tables from Financial Reports for SNOW plus other tickers
Please note the data is synthetic

In [None]:
-- List every file in the financial-reports stage, prefixing each directory row
-- with a scoped file URL built from its relative path.
SELECT
    BUILD_SCOPED_FILE_URL('@DOCUMENT_AI.FINANCIAL_REPORTS', RELATIVE_PATH),
    *
FROM DIRECTORY(@DOCUMENT_AI.FINANCIAL_REPORTS)

In [None]:
# Import python packages
import streamlit as st
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import functions as F
import pypdfium2 as pdfium

st.title("Financial Reports Preview")
session = get_active_session()

# Directory listing of the financial-reports stage: path, scoped URL, size and
# last-modified timestamp for every file
directory_query = """
    SELECT 
        RELATIVE_PATH,
        BUILD_SCOPED_FILE_URL('@DOCUMENT_AI.FINANCIAL_REPORTS', RELATIVE_PATH) AS FILE_URL,
        SIZE,
        LAST_MODIFIED
    FROM DIRECTORY('@DOCUMENT_AI.FINANCIAL_REPORTS')
"""
financial_reports_df = session.sql(directory_query)

# Let the user pick one report by its relative path
file_list = financial_reports_df.select('RELATIVE_PATH')
file_id = st.selectbox('Select Financial Report:', file_list)

# Pull the listing row for the chosen file into pandas
selected_rows = financial_reports_df.filter(F.col('RELATIVE_PATH') == file_id)
file_details = selected_rows.to_pandas()

if not file_details.empty:
    # Details of the selected file, read from row 0 of the directory listing
    st.markdown('#### File Details')
    col1, col2 = st.columns(2)
    col1.markdown(f'''__File Name:__ {file_details.RELATIVE_PATH.iloc[0]}''')
    col1.markdown(f'''__File Size:__ {file_details.SIZE.iloc[0]:,} bytes''')
    col2.markdown(f'''__Last Modified:__ {file_details.LAST_MODIFIED.iloc[0]}''')

    # Helper functions for PDF navigation (they use the *_fin session keys)
    def display_pdf_page():
        """Render the current page of the cached financial PDF as an image."""
        document = st.session_state['pdf_doc_fin']
        page_index = st.session_state['pdf_page_fin']
        rendered = document[page_index].render(scale=8, rotation=0)
        st.image(rendered.to_pil())

    def next_pdf_page():
        """Move to the next page, going back to page index 0 after the last page."""
        upcoming = st.session_state.pdf_page_fin + 1
        page_total = len(st.session_state['pdf_doc_fin'])
        st.session_state.pdf_page_fin = 0 if upcoming >= page_total else upcoming

    def previous_pdf_page():
        """Move back one page; leave the index unchanged when it is not above 0."""
        if st.session_state.pdf_page_fin <= 0:
            return
        st.session_state.pdf_page_fin = st.session_state.pdf_page_fin - 1

    # PDF viewer
    st.divider()
    st.markdown('#### PDF VIEWER')

    if file_id:
        state = st.session_state

        # Seed the viewer state on the first run
        if 'pdf_page_fin' not in state:
            state['pdf_page_fin'] = 0
        if 'pdf_url_fin' not in state:
            state['pdf_url_fin'] = file_id

        # Load the document when none is cached yet or a different file is
        # selected; loading also resets the viewer to the first page
        needs_load = 'pdf_doc_fin' not in state or state['pdf_url_fin'] != file_id
        if needs_load:
            pdf_stream = session.file.get_stream(f"@DOCUMENT_AI.FINANCIAL_REPORTS/{file_id}")
            state['pdf_doc_fin'] = pdfium.PdfDocument(pdf_stream)
            state['pdf_url_fin'] = file_id
            state['pdf_page_fin'] = 0

        # Show total pages
        total_pages = len(state['pdf_doc_fin'])
        st.markdown(f'__Total Pages:__ {total_pages}')

        # Navigation controls: previous button, page indicator, next button
        prev_slot, status_slot, next_slot = st.columns(3)
        prev_slot.button("⏮️ Previous", on_click=previous_pdf_page, key="prev_fin")
        status_slot.write(f"page {state['pdf_page_fin'] + 1} of {total_pages}")
        next_slot.button("Next ⏭️", on_click=next_pdf_page, key="next_fin")

        # Display the PDF page
        display_pdf_page()


In [None]:
-- Run AI_PARSE_DOCUMENT in LAYOUT mode over each file listed in the
-- financial-reports stage and keep the raw result in EXTRACTED_DATA.
CREATE OR REPLACE TABLE DOCUMENT_AI.FINANCIAL_REPORTS AS
SELECT
    RELATIVE_PATH,
    AI_PARSE_DOCUMENT(
        TO_FILE('@DOCUMENT_AI.FINANCIAL_REPORTS', RELATIVE_PATH),
        {'mode': 'LAYOUT'}
    ) AS EXTRACTED_DATA
-- The listing must come from the same stage that TO_FILE reads. Listing the
-- ANALYST_REPORTS stage here paired analyst-report paths with the
-- FINANCIAL_REPORTS stage.
FROM DIRECTORY('@DOCUMENT_AI.FINANCIAL_REPORTS');

-- Inspect the table that was just created (previously this queried
-- PARSED_ANALYST_REPORTS, which belongs to the analyst-report steps).
SELECT
    *,
    EXTRACTED_DATA:content::text,
    EXTRACTED_DATA:metadata:pageCount
FROM DOCUMENT_AI.FINANCIAL_REPORTS

You have now produced metadata as well as full-text extraction with summaries to gain better insights into Snowflake stock.  Now, let's process the financial report information.  This dataset will hold key metrics extracted from tables.

- Go back to the notebooks in the project area and click on **DOCUMENTAI_FINANCIAL_REPORTS**