### Create a Service to transcribe the earnings call
This service uses a container running Whisper, which relies on PyTorch (a deep learning framework) to parse and transcribe the audio. Follow the instructions below to set up the container that I used.

- GitHub: https://github.com/michaelgorkow/scs_whisper
- Blog post: https://github.com/michaelgorkow/scs_whisper

In [None]:
-- Switch to the ACCOUNTADMIN role for the compute pool operation below.
USE ROLE ACCOUNTADMIN;
-- Suspend the GPU compute pool.
-- NOTE(review): the next cell creates a service in this same pool; confirm that
-- suspending here (rather than resuming) is what is intended at this point.
ALTER COMPUTE POOL GPU_COMPUTE_FOR_SOUND_TO_TEXT_MED SUSPEND;


In [None]:
-- Create the Whisper service (only if it does not exist yet) from the
-- 'spec.yml' specification on the @SOUND.WHISPER_APP stage, then resume it.
USE ROLE CONTAINER_RUNTIME_LAB_USER;

CREATE SERVICE IF NOT EXISTS SOUND.WHISPER_APP
    IN COMPUTE POOL GPU_COMPUTE_FOR_SOUND_TO_TEXT_MED
    FROM @SOUND.WHISPER_APP
    SPEC = 'spec.yml'
    MIN_INSTANCES = 1
    MAX_INSTANCES = 3
    EXTERNAL_ACCESS_INTEGRATIONS = (CONTAINER_ACCESS_INTEGRATION);

-- Resume the service so that it is running.
ALTER SERVICE SOUND.WHISPER_APP RESUME;


In [None]:
-- Report the current status of the Whisper service.
-- NOTE(review): Snowflake documentation appears to list SYSTEM$GET_SERVICE_STATUS
-- as deprecated in favour of SHOW SERVICE CONTAINERS IN SERVICE — confirm.
CALL SYSTEM$GET_SERVICE_STATUS('SOUND.WHISPER_APP')

In [None]:
-- Fetch the logs of instance 0 of the 'whisper-service-container' container
-- and split them on newlines so that each log line becomes one row.
USE SCHEMA SOUND;

SELECT
    value AS log_line
FROM TABLE(
    SPLIT_TO_TABLE(
        SYSTEM$GET_SERVICE_LOGS('WHISPER_APP', 0, 'whisper-service-container'),
        '\n'
    )
);

#### Create 2 Functions

##### Function 1 - Detect Language

In [None]:
-- Service function bound to the '/detect-language' path on the API endpoint
-- of the SOUND.WHISPER_APP service.
CREATE OR REPLACE FUNCTION UTILS.DETECT_LANGUAGE(
    AUDIO_FILE TEXT,
    ENCODE BOOLEAN
)
    RETURNS VARIANT
    SERVICE = SOUND.WHISPER_APP
    ENDPOINT = API
    AS '/detect-language';

##### Function 2 - Transcribe Text

In [None]:
-- Service function bound to the '/asr' path on the API endpoint of the
-- SOUND.WHISPER_APP service.
CREATE OR REPLACE FUNCTION UTILS.TRANSCRIBE(
    TASK TEXT,
    LANGUAGE TEXT,
    AUDIO_FILE TEXT,
    ENCODE BOOLEAN
)
    RETURNS VARIANT
    SERVICE = SOUND.WHISPER_APP
    ENDPOINT = API
    AS '/asr';

#### Run The Functions

In [None]:
# Import python packages
import streamlit as st
import pandas as pd

from snowflake.snowpark.functions import *
from snowflake.snowpark.types import *

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
session = get_active_session()

# One row per file listed in the stage directory table, each paired with a
# presigned URL for that file.
files = session.sql('''SELECT RELATIVE_PATH, GET_PRESIGNED_URL('@DATA.SOUND',RELATIVE_PATH) URL FROM DIRECTORY (@SNOWFLAKE_BUY_OR_SELL.DATA.SOUND)''')

# Let the user pick which call to listen to.
select_call = st.selectbox('Select Call:', files.select('RELATIVE_PATH'))

# Look up the URL for the chosen file. When nothing is selectable (empty stage)
# or the path no longer matches a row, the lookup returns no rows, so check
# before indexing instead of failing with an IndexError.
if select_call is not None:
    matching_urls = files.filter(col('RELATIVE_PATH') == select_call).select('URL').collect()
else:
    matching_urls = []

if matching_urls:
    URL = matching_urls[0][0]
    st.audio(URL, format="audio/mpeg")
else:
    st.warning('No audio file is available to play.')

### Detect the Language

In [None]:
-- Run language detection for every file listed in the stage directory table.
-- The function call is left on a single line exactly as written because it has
-- no alias and its text may determine the result column's name.
SELECT
    RELATIVE_PATH,
    UTILS.DETECT_LANGUAGE(GET_PRESIGNED_URL('@DATA.SOUND',RELATIVE_PATH),True)
FROM DIRECTORY (@SNOWFLAKE_BUY_OR_SELL.DATA.SOUND)

#### Create a table which will contain the transcript

In [None]:
-- Transcribe every file listed in the stage directory table and store the
-- service response per file. Nothing is recomputed when the table already exists.
CREATE TABLE IF NOT EXISTS DATA.EARNINGS_CALL_TRANSCRIPT AS
SELECT
    RELATIVE_PATH,
    UTILS.TRANSCRIBE(
        'transcribe',
        'english',
        GET_PRESIGNED_URL('@DATA.SOUND', RELATIVE_PATH),
        TRUE
    ) AS TRANSCRIPT
FROM DIRECTORY(@SNOWFLAKE_BUY_OR_SELL.DATA.SOUND);

-- Show the stored transcripts.
SELECT *
FROM DATA.EARNINGS_CALL_TRANSCRIPT

### Transform the transcript table

In [None]:
-- Expand each transcript into one row per element of its "segments" array,
-- keeping the transcript-level language next to each segment.
CREATE TABLE IF NOT EXISTS DATA.TRANSCRIBED_TRANSCRIPTS AS
SELECT
    RELATIVE_PATH,
    PARSE_JSON(TRANSCRIPT):language::TEXT AS LANGUAGE,
    VALUE:end::FLOAT AS TIME_SECONDS,  -- the segment's "end" field
    VALUE:text::TEXT AS TEXT           -- the segment's "text" field
FROM DATA.EARNINGS_CALL_TRANSCRIPT,
    LATERAL FLATTEN(INPUT => PARSE_JSON(TRANSCRIPT):segments);

-- Preview the first few segment rows.
SELECT *
FROM DATA.TRANSCRIBED_TRANSCRIPTS
LIMIT 5

#### Add Sentiment scores to the calls

In [None]:
-- Score the text of every transcript segment with Cortex SENTIMENT.
SELECT
    *,
    SNOWFLAKE.CORTEX.SENTIMENT(TEXT)
FROM DATA.TRANSCRIBED_TRANSCRIPTS

#### Put all together in Streamlit

In [None]:
# Import python packages
import streamlit as st
import pandas as pd

from snowflake.snowpark.functions import *
from snowflake.snowpark.types import *

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
session = get_active_session()

def sentiment(text):
    """Return the expression that calls ``snowflake.cortex.sentiment`` on ``text``."""
    return call_function('snowflake.cortex.sentiment',text)

def render_call_sentiment(container, heading, call_frame, line_color):
    """Render one call's panel: heading, sentiment line chart and average metric.

    Args:
        container: Streamlit container (here, a column) to draw into.
        heading: Markdown heading shown above the chart.
        call_frame: Data frame for a single call; must provide the
            ``SENTIMENT`` and ``TIME_SECONDS`` columns.
        line_color: Colour passed to the line chart.
    """
    with container:
        st.markdown(heading)
        st.line_chart(call_frame,
                  y='SENTIMENT',x='TIME_SECONDS',color = line_color)
        # `avg` and `round` come from the snowflake.snowpark.functions star
        # import, so the average and rounding are part of the data frame query.
        st.metric('Average Sentiment',call_frame.agg(avg('SENTIMENT').alias('SENTIMENT')).select(round('SENTIMENT',2)).collect()[0][0])

# Every transcript segment with a sentiment column derived from its text.
transcript_with_sentiment = session.table('DATA.TRANSCRIBED_TRANSCRIPTS').with_column('sentiment',sentiment(col('TEXT')))

st.markdown('#### Calls with Sentiment')


st.dataframe(transcript_with_sentiment)
col1,col2,col3 = st.columns(3)

# One filtered frame per quarterly call, selected by file name.
q1 = transcript_with_sentiment.filter(col('RELATIVE_PATH')=='EARNINGS_Q1_FY2025.mp3')
q2 = transcript_with_sentiment.filter(col('RELATIVE_PATH')=='EARNINGS_Q2_FY2025.mp3')
q3 = transcript_with_sentiment.filter(col('RELATIVE_PATH')=='EARNINGS_Q3_FY2025.mp3')

# The three panels differ only in heading, data and colour, so a single helper
# renders each of them.
render_call_sentiment(col1, '#### Q1', q1, '#29B5E8')
render_call_sentiment(col2, '#### Q2', q2, '#29B5E8')
render_call_sentiment(col3, '#### Q3', q3, '#FF9F36')

In [None]:
# Bucket every segment into the elapsed minute of its call.
# floor(TIME_SECONDS / 60) keeps counting past 59. Deriving the bucket from a
# clock time with minute() wraps back to 0 after an hour, which would merge
# minute 60+ of a long call into minute 0+ of the same call.
grouped = transcript_with_sentiment.with_column(
    'MINUTES', floor(col('TIME_SECONDS') / lit(60)).cast(IntegerType()))

# Per call and minute: the segment texts and the mean sentiment.
# WITHIN GROUP sorts the collected texts by TIME_SECONDS so that joining them
# later reads in order; array_agg on its own does not guarantee an order.
data_grouped_minutes = grouped.group_by('RELATIVE_PATH','MINUTES').agg(
    array_agg('TEXT').within_group(col('TIME_SECONDS')).alias('TEXT'),
    avg('SENTIMENT').alias('SENTIMENT'))

st.markdown('''Data Grouped to Minutes''')
data_grouped_minutes


In [None]:
st.markdown('#### Sentiment Analysis during the duration of the last 3 quarterly earnings calls')
col1,col2,col3 = st.columns(3)

# One per-minute frame per quarterly call, selected by file name.
q1 = data_grouped_minutes.filter(col('RELATIVE_PATH')=='EARNINGS_Q1_FY2025.mp3')
q2 = data_grouped_minutes.filter(col('RELATIVE_PATH')=='EARNINGS_Q2_FY2025.mp3')
q3 = data_grouped_minutes.filter(col('RELATIVE_PATH')=='EARNINGS_Q3_FY2025.mp3')

# The three panels differ only in heading, data and line colour, so they are
# rendered by one loop instead of three copies of the same code.
for panel, heading, call_minutes, line_color in (
    (col1, '#### Q1', q1, '#29B5E8'),
    (col2, '#### Q2', q2, '#29B5E8'),
    (col3, '#### Q3', q3, '#FF9F36'),
):
    with panel:
        st.markdown(heading)
        st.line_chart(call_minutes,
                  y='SENTIMENT',x='MINUTES',color = line_color)
        st.metric('Average Sentiment',call_minutes.agg(avg('SENTIMENT').alias('SENTIMENT')).select(round('SENTIMENT',2)).collect()[0][0])

def extreme_minute_text(ordering):
    """Return the joined text of the first minute under ``ordering``.

    Args:
        ordering: Sort expression applied to ``data_grouped_minutes``.

    Returns:
        The minute's ``TEXT`` array joined with an empty separator, or an
        empty string when ``data_grouped_minutes`` has no rows (indexing the
        empty result would otherwise raise IndexError).
    """
    rows = data_grouped_minutes.sort(ordering).limit(1).select(array_to_string(col('TEXT'),lit(''))).collect()
    return rows[0][0] if rows else ''

st.markdown(f'''**:bulb: Most positive minute of the year**: \
{extreme_minute_text(col('SENTIMENT').desc())}''')

st.markdown(f'''**:warning: Most negative minute of the year**: \
{extreme_minute_text(col('SENTIMENT').asc())}''')

In [None]:
# Turn the TEXT array into plain text: cast it to a string, then strip the
# double quotes and square brackets from that string.
text_as_string = cast('TEXT',StringType())
text_without_quotes = replace(text_as_string,'"','')
text_without_open_bracket = replace(text_without_quotes,'[','')
text_without_brackets = replace(text_without_open_bracket,']','')

grouped_text = data_grouped_minutes.with_column('TEXT',text_without_brackets)
grouped_text

#### Save data in a table

In [None]:
# Persist the per-minute summary, replacing any existing table contents.
summary_writer = grouped_text.write.mode("overwrite")
summary_writer.save_as_table("data.summary_text")

So you have now processed **sound transcripts** and have put the results in a table.  We are starting to get quite a lot of processed but still unstructured text fields.  Let's make sense of this information with a search service.