# Section 4 Document AI

### 4b - Snowflake Quarterly Earnings Infographic
Now let's look at the second Document AI project you created.  This project extracted the metrics from the quarterly earnings infographics, and its output is structured information.  We would like to include this, along with the other structured information, in the analysis - and at the same time review the unstructured data.

**Cortex Agents** allow you to gain insight into structured and unstructured information all in one place.  To start with, let's process these infographics and ensure this information is used when a user utilises a Cortex agent.

In [None]:
# Import python packages
# streamlit renders the tables, charts and images shown in the cells below.
import streamlit as st
import pandas as pd

# Wildcard imports: the later cells use names such as col, lit, concat,
# call_function and the *Type classes without importing them individually.
from snowflake.snowpark.functions import *
from snowflake.snowpark.types import *

# We can also use Snowpark for our analyses!
# `session` is the handle the later cells use to run SQL and read tables.
from snowflake.snowpark.context import get_active_session
session = get_active_session()


#### Here we have key metrics in images - you will utilise Document AI to extract the key metrics.  The result should be a table of metrics over time.

In [None]:
# Directory listing of the infographics stage, with a presigned URL per file
# so that Streamlit can render each image inline.
stage_listing_sql = '''SELECT *, GET_PRESIGNED_URL('@DOCUMENT_AI.INFOGRAPHICS',RELATIVE_PATH) IMAGE FROM DIRECTORY(@DOCUMENT_AI.INFOGRAPHICS)'''
infographics = (
    session.sql(stage_listing_sql)
    .select('IMAGE', 'RELATIVE_PATH')
    .order_by('RELATIVE_PATH')
)

# Show the IMAGE column through an image column config instead of as a plain URL.
preview_column = st.column_config.ImageColumn("Preview Image", help="Snowflake infographic")
st.dataframe(
    infographics,
    column_config={"IMAGE": preview_column},
    hide_index=True,
)

In [None]:
-- Run the DOCUMENT_AI.INFOGRAPHICS model (second PREDICT argument: 1) over every
-- file on the stage and keep the raw result in METADATA.
-- Nothing is re-run when the table already exists.
CREATE TABLE IF NOT EXISTS DOCUMENT_AI.INFOGRAPHICS AS
SELECT
    RELATIVE_PATH,
    '@DOCUMENT_AI.INFOGRAPHICS' AS STAGE,
    DOCUMENT_AI.INFOGRAPHICS!PREDICT(
        GET_PRESIGNED_URL('@DOCUMENT_AI.INFOGRAPHICS', RELATIVE_PATH),
        1
    ) AS METADATA
FROM DIRECTORY(@DOCUMENT_AI.INFOGRAPHICS);

-- Show the stored extraction results next to a freshly generated presigned URL.
SELECT
    GET_PRESIGNED_URL('@DOCUMENT_AI.INFOGRAPHICS', RELATIVE_PATH) AS IMAGE,
    RELATIVE_PATH,
    METADATA
FROM DOCUMENT_AI.INFOGRAPHICS;

In [None]:
# Raw Document AI output: one row per infographic, extraction result in METADATA.
earnings_data = session.sql('''SELECT GET_PRESIGNED_URL('@DOCUMENT_AI.INFOGRAPHICS',RELATIVE_PATH)IMAGE,RELATIVE_PATH,METADATA FROM DOCUMENT_AI.INFOGRAPHICS''')

# (output column name, key inside METADATA).  The key casing is mixed on
# purpose: it has to match the value names used in the Document AI output.
metadata_fields = [
    ('1M_CUSTOMERS', '1M_CUSTOMERS'),
    ('GLOBAL_2000_CUSTOMERS', 'GLOBAL_2000_CUSTOMERS'),
    ('NET_PROMOTOR', 'NET_PROMOTOR'),
    ('DRESNER_CUSTOMER_SATISFACTION', 'dresner_customer_satisfaction'),
    ('NET_REVENUE_RETENTION', 'NET_REVENUE_RETENTION'),
    ('PRODUCT REVENUE', 'PROD_REVENUE'),
    ('TOTAL_CUSTOMERS', 'TOTAL_CUSTOMERS'),
    ('DATE_OF_REPORT', 'date_of_report'),
    ('MARKETPLACE_LISTINGS', 'marketplace_listings'),
    ('QUARTER', 'quarter'),
    ('YEAR', 'year'),
]

# Flatten METADATA: every field is read from METADATA[key][0]['value'] and
# cast to a string.  Each field is added exactly once.
earnings_formatted = earnings_data
for column_name, metadata_key in metadata_fields:
    earnings_formatted = earnings_formatted.with_column(
        column_name,
        col('METADATA')[metadata_key][0]['value'].astype(StringType()),
    )
earnings_formatted = earnings_formatted.drop('METADATA')


st.dataframe(
    earnings_formatted,
    column_config={
        "IMAGE": st.column_config.ImageColumn(
            "Preview Image", help="Snowflake infographic"
        )
    },
    hide_index=True,
)

#### Use Cortex Complete for a simple date formatter
So we can now see the results in a table.  You may want to retrain with new infographics for better accuracy.  You will notice that the dates are in differing formats.  Next we will use Cortex Complete to return the dates in a consistent format.

In [None]:
def cortex_date(date_string):
    """Build a Cortex COMPLETE call asking the model to rewrite a date as YYYY-MM-DD.

    Args:
        date_string: The date text to normalise.  The calls in this notebook
            pass either a plain Python string or a Snowpark column expression;
            either way it is wrapped with ``lit`` before being concatenated.

    Returns:
        The expression produced by ``call_function`` for
        ``SNOWFLAKE.CORTEX.COMPLETE`` with the ``reka-flash`` model.  Nothing
        is sent to the model until the expression is used in a DataFrame
        that gets evaluated.
    """
    # The trailing ': ' keeps the instruction and the date apart; without it
    # the prompt ends in e.g. "...THE RESULTApril 30, 2022".
    instruction = lit('return the following which can be parsed as a date in this format YYYY-MM-DD.  ONLY RETURN THE RESULT: ')
    return call_function('SNOWFLAKE.CORTEX.COMPLETE', 'reka-flash',
                         concat(instruction, lit(date_string)))

# Quick check of cortex_date: a one-row DataFrame whose DATE column is replaced
# by the model's response for a sample date.  Left as a bare expression,
# presumably so the notebook displays it as the cell result.
session.create_dataframe([{'DATE':'test'}]).with_column('DATE',cortex_date('April 30, 2022'))

### Keep only Numeric Characters
Some of the fields which would work best as numeric fields contain other characters.  Using the **REGEXP_REPLACE** function handles this to keep all numeric fields consistent.

In [None]:
-- Scalar SQL UDF: removes every character other than the digits 0-9 from
-- input_string and returns what is left as a STRING.
-- NOTE(review): decimal points and minus signs are removed as well, so '12.5'
-- comes back as '125' - confirm that is acceptable for values that are later
-- cast to DECIMAL.
CREATE OR REPLACE FUNCTION NUMBERS(input_string STRING)
RETURNS STRING
LANGUAGE SQL
AS
$$
    REGEXP_REPLACE(input_string, '[^0-9]', '')
$$;

Let's apply this to our structured data results

In [None]:
earnings_formatted_s = session.table('DOCUMENT_AI.INFOGRAPHICS')

# Attach a presigned URL per image and cache the intermediate result before
# the metric columns are derived.
# NOTE(review): these URLs are persisted in the table written below and
# presumably expire - confirm the previews still work on later runs.
earnings_formatted_2 = earnings_formatted_s.with_column(
    'IMAGE',
    call_function('GET_PRESIGNED_URL',
                  lit('@DOCUMENT_AI.INFOGRAPHICS'),
                  col('RELATIVE_PATH')),
).cache_result()


def _metadata_value(key):
    """Return the expression for the first extracted value stored under *key*."""
    return col('METADATA')[key][0]['value']


def _metadata_digits(key, target_type):
    """Return METADATA[key] reduced to its digits by DOCUMENT_AI.NUMBERS, cast to *target_type*."""
    digits_only = call_function('DOCUMENT_AI.NUMBERS',
                                _metadata_value(key).astype(StringType()))
    return digits_only.astype(target_type)


# One expression per output column, each cleaned and cast exactly once.
# The order is deliberate: it keeps the column layout this cell has always
# written to the table (NOTE(review): confirm if anything reads by position).
# NOTE(review): DOCUMENT_AI.NUMBERS presumably strips the decimal point too,
# so check the DECIMAL(6,1) cast for PRODUCT REVENUE gives the intended value.
parsed_columns = [
    ('DRESNER_CUSTOMER_SATISFACTION', _metadata_digits('dresner_customer_satisfaction', IntegerType())),
    ('DATE_OF_REPORT', cortex_date(_metadata_value('date_of_report')).astype(DateType())),
    ('QUARTER', _metadata_value('quarter').astype(StringType())),
    ('YEAR', _metadata_value('year').astype(StringType())),
    ('PRODUCT REVENUE', _metadata_digits('PROD_REVENUE', DecimalType(6, 1))),
    ('TOTAL_CUSTOMERS', _metadata_digits('TOTAL_CUSTOMERS', IntegerType())),
    ('MARKETPLACE_LISTINGS', _metadata_digits('marketplace_listings', IntegerType())),
    ('NET_REVENUE_RETENTION', _metadata_digits('NET_REVENUE_RETENTION', IntegerType())),
    ('NET_PROMOTOR', _metadata_digits('NET_PROMOTOR', IntegerType())),
    ('GLOBAL_2000_CUSTOMERS', _metadata_digits('GLOBAL_2000_CUSTOMERS', IntegerType())),
    ('1M_CUSTOMERS', _metadata_digits('1M_CUSTOMERS', IntegerType())),
]

for column_name, expression in parsed_columns:
    earnings_formatted_2 = earnings_formatted_2.with_column(column_name, expression)
earnings_formatted_2 = earnings_formatted_2.drop('RELATIVE_PATH', 'STAGE', 'METADATA')

# Persist the parsed metrics, replacing any earlier version of the table.
earnings_formatted_2.write.mode('overwrite').save_as_table("DOCUMENT_AI.EARNINGS_INFOGRAPHIC_PARSED")

# Read back from the saved table so later cells do not re-run the extraction
# expressions (including the Cortex call) each time they are evaluated.
earnings_formatted_2 = session.table('DOCUMENT_AI.EARNINGS_INFOGRAPHIC_PARSED')

Finally we will view the Snowflake reporting information in a Streamlit app.

In [None]:
st.markdown('### Recent Earnings infographics for SNOW')

st.markdown('#### KEY METRICS')

# Pull the parsed table into pandas once; calling to_pandas() per chart
# would run the same query three times.
earnings_pd = earnings_formatted_2.to_pandas()

col1,col2,col3 = st.columns(3)
key_metrics = [
    (col1, '#### PRODUCT REVENUE', 'PRODUCT REVENUE'),
    (col2, '#### TOTAL CUSTOMERS', 'TOTAL_CUSTOMERS'),
    (col3, '#### MARKETPLACE LISTINGS', 'MARKETPLACE_LISTINGS'),
]
# One line chart per metric: quarters on the x axis, one series per year.
for container, heading, metric in key_metrics:
    with container:
        st.markdown(heading)
        st.line_chart(earnings_pd, x='QUARTER', y=metric, color='YEAR')

st.divider()
st.markdown('#### ALL EXTRACTED DATA')
st.dataframe(
    earnings_formatted_2,
    column_config={
        "IMAGE": st.column_config.ImageColumn(
            "Preview Image", help="Snowflake infographic"
        )
    },
    hide_index=True,
)


st.divider()
st.markdown('#### ORIGINAL INFOGRAPHIC')
col1, col2 = st.columns(2)


with col1:
    # Years present in the data, newest first.
    selected_year = st.selectbox('Choose Year:',earnings_formatted_2.select('YEAR').distinct().sort(col('YEAR').desc()))
with col2:
    # Only offer quarters that exist for the chosen year.
    selected_quarter = st.selectbox('Choose Quarter:',earnings_formatted_2.filter(col('YEAR')==selected_year).select('QUARTER').distinct().sort('QUARTER'))


# Look up the image for the chosen period.  The result can be empty (for
# example when the table has no rows), so check before indexing into it.
image_rows = (
    earnings_formatted_2
    .filter((col('QUARTER') == selected_quarter) & (col('YEAR') == selected_year))
    .select('IMAGE')
    .limit(1)
    .collect()
)
if image_rows:
    st.image(image_rows[0][0])
else:
    st.info('No infographic found for the selected year and quarter.')
