# Database Queries and Analysis
This notebook contains queries for various database tables, identifies potential data issues, and provides insights for further analysis.

In [2]:
# Import required libraries
import pandas as pd
import os
from utils import connect_to_db


In [4]:
# Query the `stock_metadata` table

# Display all records without truncation
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
conn = connect_to_db()
if conn:
    query = "SELECT * FROM raw.stock_metadata;"
    df = pd.read_sql_query(query, conn)
    conn.close()
    
    # Display the data
    display(df)

  df = pd.read_sql_query(query, conn)


Unnamed: 0,tic,name,sector,industry,country,market_cap,employees,description,website,exchange,currency,last_updated
0,AAPL,Apple Inc.,Technology,Consumer Electronics,United States,3791126003712,150000,"Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories worldwide. The company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose tablets; and wearables, home, and accessories comprising AirPods, Apple TV, Apple Watch, Beats products, and HomePod. It also provides AppleCare support and cloud services; and operates various platforms, including the App Store that allow customers to discover and download applications and digital content, such as books, music, video, games, and podcasts, as well as advertising services include third-party licensing arrangements and its own advertising platforms. In addition, the company offers various subscription-based services, such as Apple Arcade, a game subscription service; Apple Fitness+, a personalized fitness service; Apple Music, which offers users a curated listening experience with on-demand radio stations; Apple News+, a subscription news and magazine service; Apple TV+, which offers exclusive original content; Apple Card, a co-branded credit card; and Apple Pay, a cashless payment service, as well as licenses its intellectual property. The company serves consumers, and small and mid-sized businesses; and the education, enterprise, and government markets. It distributes third-party applications for its products through the App Store. The company also sells its products through its retail and online stores, and direct sales force; and third-party cellular network carriers, wholesalers, retailers, and resellers. Apple Inc. was founded in 1976 and is headquartered in Cupertino, California.",https://www.apple.com,NASDAQ,USD,2025-09-29 01:49:13.523554
1,TSLA,"Tesla, Inc.",Consumer Cyclical,Auto Manufacturers,United States,1464396414976,125665,"Tesla, Inc. designs, develops, manufactures, leases, and sells electric vehicles, and energy generation and storage systems in the United States, China, and internationally. The company operates in two segments, Automotive; and Energy Generation and Storage. The Automotive segment offers electric vehicles, as well as sells automotive regulatory credits; and non-warranty after-sales vehicle, used vehicles, body shop and parts, supercharging, retail merchandise, and vehicle insurance services. This segment also provides sedans and sport utility vehicles through direct and used vehicle sales, a network of Tesla Superchargers, and in-app upgrades; purchase financing and leasing services; services for electric vehicles through its company-owned service locations and Tesla mobile service technicians; and vehicle limited warranties and extended service plans. The Energy Generation and Storage segment engages in the design, manufacture, installation, sale, and leasing of solar energy generation and energy storage products, and related services to residential, commercial, and industrial customers and utilities through its website, stores, and galleries, as well as through a network of channel partners. This segment also provides services and repairs to its energy product customers, including under warranty; and various financing options to its residential customers. The company was formerly known as Tesla Motors, Inc. and changed its name to Tesla, Inc. in February 2017. Tesla, Inc. was incorporated in 2003 and is headquartered in Austin, Texas.",https://www.tesla.com,NASDAQ,USD,2025-09-29 01:49:13.523554
2,NVDA,NVIDIA Corporation,Technology,Semiconductors,United States,4338392236032,36000,"NVIDIA Corporation, a computing infrastructure company, provides graphics and compute and networking solutions in the United States, Singapore, Taiwan, China, Hong Kong, and internationally. The Compute & Networking segment includes its Data Centre accelerated computing platforms and artificial intelligence solutions and software; networking; automotive platforms and autonomous and electric vehicle solutions; Jetson for robotics and other embedded platforms; and DGX Cloud computing services. The Graphics segment offers GeForce GPUs for gaming and PCs, the GeForce NOW game streaming service and related infrastructure, and solutions for gaming platforms; Quadro/NVIDIA RTX GPUs for enterprise workstation graphics; virtual GPU or vGPU software for cloud-based visual and virtual computing; automotive platforms for infotainment systems; and Omniverse software for building and operating industrial AI and digital twin applications. It also customized agentic solutions designed in collaboration with NVIDIA to accelerate enterprise AI adoption. The company's products are used in gaming, professional visualization, data center, and automotive markets. It sells its products to original equipment manufacturers, original device manufacturers, system integrators and distributors, independent software vendors, cloud service providers, consumer internet companies, add-in board manufacturers, distributors, automotive manufacturers and tier-1 automotive suppliers, and other ecosystem participants. NVIDIA Corporation was incorporated in 1993 and is headquartered in Santa Clara, California.",https://www.nvidia.com,NASDAQ,USD,2025-09-29 01:49:13.523554


In [38]:
# count the number of tokens in the description column
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")
df['description'].apply(lambda x: len(encoding.encode(x)))


0    313
1    281
2    260
Name: description, dtype: int64

In [2]:
# Query the `stock_ohlcv` table
conn = connect_to_db()
if conn:
    query = "SELECT * FROM raw.stock_ohlcv;"
    df = pd.read_sql_query(query, conn)
    conn.close()
    
    # Display the data
    display(df)

  df = pd.read_sql_query(query, conn)


Unnamed: 0,date,tic,open,high,low,close,volume,last_updated
0,1980-12-12,AAPL,0.098485,0.098913,0.098485,0.098485,469033600,2025-09-29 01:49:20.541342
1,1980-12-15,AAPL,0.093775,0.093775,0.093347,0.093347,175884800,2025-09-29 01:49:20.541342
2,1980-12-16,AAPL,0.086924,0.086924,0.086495,0.086495,105728000,2025-09-29 01:49:20.541342
3,1980-12-17,AAPL,0.088636,0.089064,0.088636,0.088636,86441600,2025-09-29 01:49:20.541342
4,1980-12-18,AAPL,0.091206,0.091634,0.091206,0.091206,73449600,2025-09-29 01:49:20.541342
...,...,...,...,...,...,...,...,...
21832,2025-09-22,NVDA,175.300003,184.550003,174.710007,183.610001,269637000,2025-09-29 01:49:22.284170
21833,2025-09-23,NVDA,181.970001,182.419998,176.210007,178.429993,192559600,2025-09-29 01:49:22.284170
21834,2025-09-24,NVDA,179.770004,179.779999,175.399994,176.970001,143564100,2025-09-29 01:49:22.284170
21835,2025-09-25,NVDA,174.479996,180.259995,173.130005,177.690002,191586700,2025-09-29 01:49:22.284170


In [3]:
# Query the `earnings` table
conn = connect_to_db()
if conn:
    query = "SELECT * FROM raw.earnings;"
    df = pd.read_sql_query(query, conn)
    conn.close()

    # Replace NaN with None for consistency
    df = df.where(pd.notnull(df), None)

    # Display the data
    display(df)

  df = pd.read_sql_query(query, conn)


Unnamed: 0,tic,fiscal_year,fiscal_quarter,fiscal_date,earnings_date,eps,eps_estimated,session,revenue,revenue_estimated,price_before,price_after,last_updated
0,AAPL,1985,3,1985-09-30,1985-09-30,0.00112,,amc,1.918300e+09,,,,2023-05-17
1,AAPL,1989,4,1989-12-31,1989-12-31,0.00857,,amc,1.493400e+09,,,,2023-05-17
2,AAPL,1990,1,1990-03-31,1990-03-31,0.00929,,amc,1.346200e+09,,,,2023-05-17
3,AAPL,1990,2,1990-06-30,1990-06-30,0.00857,,amc,1.364800e+09,,,,2023-05-17
4,AAPL,1990,3,1990-09-30,1990-09-30,0.00748,,amc,1.354100e+09,,,,2023-05-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...
309,NVDA,2024,4,2025-01-26,2025-02-26,0.89000,0.848,amc,3.933100e+10,3.810135e+10,126.640,120.964,2025-05-25
310,NVDA,2025,1,2025-04-27,2025-05-28,0.81000,0.737,amc,4.406200e+10,4.333416e+10,135.167,139.020,2025-08-27
311,NVDA,2025,2,2025-07-27,2025-08-27,1.05000,1.010,amc,4.674300e+10,4.604892e+10,181.770,180.170,2025-09-28
312,NVDA,2025,3,2025-10-27,2025-11-19,,1.230,bmo,,5.459076e+10,,,2025-09-28


In [4]:
# Query the `earnings_transcripts_chunks` table
conn = connect_to_db()
if conn:
    query = "SELECT * FROM core.earnings_transcript_chunks;"
    df = pd.read_sql_query(query, conn)
    conn.close()

    # Replace NaN with None for consistency
    df = df.where(pd.notnull(df), None)

    # Display the data
    display(df)

  df = pd.read_sql_query(query, conn)


Unnamed: 0,tic,fiscal_year,fiscal_quarter,earnings_date,chunk_id,chunk,token_count,chunk_hash,transcript_hash,last_updated
0,AAPL,2024,4,2025-01-30,0,"Suhasini Chandramouli: Good afternoon, and wel...",256,4ba6d30e3d28cb1de70acd90415ecf7bd811f12a62c302...,abba2130960470fb89ec6dd28ea23fbeb60cff51be7654...,2025-10-07 09:16:42.127171
1,AAPL,2024,4,2025-01-30,1,"Tim Cook: Thank you, Suhasini. Good afternoon,...",489,2de23ff5c887b957ec6e815286aac7e99e3e417cc57fd4...,abba2130960470fb89ec6dd28ea23fbeb60cff51be7654...,2025-10-07 09:16:42.127171
2,AAPL,2024,4,2025-01-30,2,Tim Cook: (contd) And we were excited to recen...,511,a3b9984d9b4f1617e2c12bb6f5ef1e5ee3e5ae8321bc5d...,abba2130960470fb89ec6dd28ea23fbeb60cff51be7654...,2025-10-07 09:16:42.127171
3,AAPL,2024,4,2025-01-30,3,Tim Cook: (contd) All of this is enabled by th...,496,3e6d28e7c895d95777c8f0f4854a8f415d562a53036ed6...,abba2130960470fb89ec6dd28ea23fbeb60cff51be7654...,2025-10-07 09:16:42.127171
4,AAPL,2024,4,2025-01-30,4,Tim Cook: (contd) We have so much in store for...,496,0ea41e118ddda93364048c132a691f33a6d920035b0932...,abba2130960470fb89ec6dd28ea23fbeb60cff51be7654...,2025-10-07 09:16:42.127171
...,...,...,...,...,...,...,...,...,...,...
692,NVDA,2025,2,2025-08-27,34,Sarah: Your final question comes from Timothy ...,19,991dce35427015fade249bb0ae4696f2dcedd2151f4d73...,389f300f6683c7c63d35c577bdf7d4167259fd665c1f04...,2025-10-07 09:16:42.127171
693,NVDA,2025,2,2025-08-27,35,"Timothy Arcuri: Thanks a lot. Jensen, I wanted...",103,f1346975e7f14bb72900bc10981bb79b7bed5beedd53d5...,389f300f6683c7c63d35c577bdf7d4167259fd665c1f04...,2025-10-07 09:16:42.127171
694,NVDA,2025,2,2025-08-27,36,"Jensen Huang: Well, I think the best way to lo...",512,e9f62058552f2bfc8db7bf7cb4d138994d4311301d7010...,389f300f6683c7c63d35c577bdf7d4167259fd665c1f04...,2025-10-07 09:16:42.127171
695,NVDA,2025,2,2025-08-27,37,"Jensen Huang: (contd) Our next platform, Rubin...",339,95e4af641dacec7be23358159a768bcde65165e12848c1...,389f300f6683c7c63d35c577bdf7d4167259fd665c1f04...,2025-10-07 09:16:42.127171


In [5]:
df['token_count'].describe()

count    697.000000
mean     155.595409
std      168.150187
min        5.000000
25%       24.000000
50%       92.000000
75%      206.000000
max      521.000000
Name: token_count, dtype: float64

In [6]:
# Query the `earnings_transcripts_chunks` table
conn = connect_to_db()
if conn:
    query = "SELECT * FROM core.earnings_transcript_chunks;"
    df = pd.read_sql_query(query, conn)
    conn.close()

    # Replace NaN with None for consistency
    df = df.where(pd.notnull(df), None)

    # Display the data
    display(df)

  df = pd.read_sql_query(query, conn)


Unnamed: 0,tic,fiscal_year,fiscal_quarter,earnings_date,chunk_id,chunk,token_count,chunk_hash,transcript_hash,last_updated
0,AAPL,2024,4,2025-01-30,0,"Suhasini Chandramouli: Good afternoon, and wel...",256,4ba6d30e3d28cb1de70acd90415ecf7bd811f12a62c302...,abba2130960470fb89ec6dd28ea23fbeb60cff51be7654...,2025-10-07 09:16:42.127171
1,AAPL,2024,4,2025-01-30,1,"Tim Cook: Thank you, Suhasini. Good afternoon,...",489,2de23ff5c887b957ec6e815286aac7e99e3e417cc57fd4...,abba2130960470fb89ec6dd28ea23fbeb60cff51be7654...,2025-10-07 09:16:42.127171
2,AAPL,2024,4,2025-01-30,2,Tim Cook: (contd) And we were excited to recen...,511,a3b9984d9b4f1617e2c12bb6f5ef1e5ee3e5ae8321bc5d...,abba2130960470fb89ec6dd28ea23fbeb60cff51be7654...,2025-10-07 09:16:42.127171
3,AAPL,2024,4,2025-01-30,3,Tim Cook: (contd) All of this is enabled by th...,496,3e6d28e7c895d95777c8f0f4854a8f415d562a53036ed6...,abba2130960470fb89ec6dd28ea23fbeb60cff51be7654...,2025-10-07 09:16:42.127171
4,AAPL,2024,4,2025-01-30,4,Tim Cook: (contd) We have so much in store for...,496,0ea41e118ddda93364048c132a691f33a6d920035b0932...,abba2130960470fb89ec6dd28ea23fbeb60cff51be7654...,2025-10-07 09:16:42.127171
...,...,...,...,...,...,...,...,...,...,...
692,NVDA,2025,2,2025-08-27,34,Sarah: Your final question comes from Timothy ...,19,991dce35427015fade249bb0ae4696f2dcedd2151f4d73...,389f300f6683c7c63d35c577bdf7d4167259fd665c1f04...,2025-10-07 09:16:42.127171
693,NVDA,2025,2,2025-08-27,35,"Timothy Arcuri: Thanks a lot. Jensen, I wanted...",103,f1346975e7f14bb72900bc10981bb79b7bed5beedd53d5...,389f300f6683c7c63d35c577bdf7d4167259fd665c1f04...,2025-10-07 09:16:42.127171
694,NVDA,2025,2,2025-08-27,36,"Jensen Huang: Well, I think the best way to lo...",512,e9f62058552f2bfc8db7bf7cb4d138994d4311301d7010...,389f300f6683c7c63d35c577bdf7d4167259fd665c1f04...,2025-10-07 09:16:42.127171
695,NVDA,2025,2,2025-08-27,37,"Jensen Huang: (contd) Our next platform, Rubin...",339,95e4af641dacec7be23358159a768bcde65165e12848c1...,389f300f6683c7c63d35c577bdf7d4167259fd665c1f04...,2025-10-07 09:16:42.127171


In [9]:
# Query the `earnings_transcripts_chunks` table
conn = connect_to_db()
if conn:
    query = "SELECT * FROM core.earnings_transcript_embeddings;"
    df = pd.read_sql_query(query, conn)
    conn.close()

    # Replace NaN with None for consistency
    df = df.where(pd.notnull(df), None)

    # Display the data
    display(df)

  df = pd.read_sql_query(query, conn)


Unnamed: 0,tic,fiscal_year,fiscal_quarter,earnings_date,chunk_id,embedding,embedding_model,last_updated
0,AAPL,2024,4,2025-01-30,12,"[0.035682842,-0.00039636696,0.0022631593,0.036...",text-embedding-3-small,2025-10-07 19:12:22.993442
1,AAPL,2024,4,2025-01-30,10,"[0.06074509,-0.023144782,0.027064249,-0.002893...",text-embedding-3-small,2025-10-07 19:12:22.993442
2,AAPL,2025,1,2025-05-01,55,"[0.017443154,0.026426641,0.01774435,0.01972176...",text-embedding-3-small,2025-10-07 19:12:22.993442
3,NVDA,2024,3,2024-11-20,13,"[0.0280212,0.0048746895,0.059862804,0.01138039...",text-embedding-3-small,2025-10-07 19:12:22.993442
4,AAPL,2025,1,2025-05-01,68,"[0.019060645,0.0038684208,0.038854394,0.010021...",text-embedding-3-small,2025-10-07 19:12:22.993442
...,...,...,...,...,...,...,...,...
692,TSLA,2025,2,2025-07-23,73,"[0.031511176,-0.00019379305,0.02731332,0.04437...",text-embedding-3-small,2025-10-07 19:12:22.993442
693,AAPL,2025,2,2025-07-31,5,"[0.018208748,0.013937241,0.016918262,0.0470769...",text-embedding-3-small,2025-10-07 19:12:22.993442
694,AAPL,2024,4,2025-01-30,23,"[0.067168735,-0.0017938287,0.013038943,0.03223...",text-embedding-3-small,2025-10-07 19:12:22.993442
695,NVDA,2024,4,2025-02-26,9,"[0.06750801,-0.02043978,0.037897386,0.02805356...",text-embedding-3-small,2025-10-07 19:12:22.993442


In [11]:
type(df['embedding'].values[0])

str

In [12]:
# retrieve chunks using langchain_openai.OpenAIEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv

load_dotenv()

# Initialize the embedding model
embedding_model_name = "text-embedding-3-small"
embedding_model = OpenAIEmbeddings(model=embedding_model_name)

query_text = "What is the NVDA guidance for next quarter?"
query_vec = embedding_model.embed_query(query_text)
k = 5

conn = connect_to_db()
if conn:
    with conn.cursor() as cursor:
        sql = """
            SELECT
                c.tic,
                c.fiscal_year,
                c.fiscal_quarter,
                c.chunk_id,
                c.chunk,
                1 - (e.embedding <=> '{vec}'::vector) AS similarity
            FROM core.earnings_transcript_embeddings e
            JOIN core.earnings_transcript_chunks c
            USING (tic, fiscal_year, fiscal_quarter, chunk_id)
            WHERE c.tic = 'NVDA' AND c.fiscal_year = 2025 AND c.fiscal_quarter = 2
            ORDER BY e.embedding <=> '{vec}'::vector
            LIMIT {k};
        """.format(vec=query_vec, k=k)
        # Fetch all records from the earnings_transcript_chunks table
        cursor.execute(sql)
        records = cursor.fetchall()

        # Create a DataFrame from the fetched records
        df = pd.DataFrame(records, columns=[desc[0] for desc in cursor.description])

        # Display the DataFrame
        display(df)

Unnamed: 0,tic,fiscal_year,fiscal_quarter,chunk_id,chunk,similarity
0,NVDA,2025,2,8,Colette Kress: (contd) Our full stack drive AV...,0.489236
1,NVDA,2025,2,2,Colette Kress: We delivered another record qua...,0.466615
2,NVDA,2025,2,1,"Toshiya Hari: Good afternoon, everyone, and we...",0.464109
3,NVDA,2025,2,7,"Colette Kress: (contd) Note, our Q3 outlook do...",0.453857
4,NVDA,2025,2,6,Colette Kress: (contd) InfiniBand revenue near...,0.434921


In [None]:
# retrieve chunks using langchain_openai.OpenAIEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv

load_dotenv()

# Initialize the embedding model
embedding_model_name = "text-embedding-3-small"
embedding_model = OpenAIEmbeddings(model=embedding_model_name)

query_text = "What is the NVDA guidance for next quarter?"
query_vec = embedding_model.embed_query(query_text)
vec_str = "[" + ",".join(map(str, query_vec)) + "]"
k = 5

conn = connect_to_db()
if conn:
    with conn.cursor() as cursor:
        sql = """
            SELECT
                c.tic,
                c.fiscal_year,
                c.fiscal_quarter,
                c.chunk_id,
                c.chunk,
                1 - (e.embedding <=> '{vec}'::vector) AS similarity
            FROM core.earnings_transcript_embeddings e
            JOIN core.earnings_transcript_chunks c
            USING (tic, fiscal_year, fiscal_quarter, chunk_id)
            WHERE c.tic = 'NVDA' 
                AND c.fiscal_year = 2025 
                AND c.fiscal_quarter = 2
            ORDER BY e.embedding <=> '{vec}'::vector
            LIMIT {k};
        """.format(vec=vec_str, k=k)
        # Fetch all records from the earnings_transcript_chunks table
        cursor.execute(sql)
        records = cursor.fetchall()

        # Create a DataFrame from the fetched records
        df = pd.DataFrame(records, columns=[desc[0] for desc in cursor.description])

        # Display the DataFrame
        display(df)

Unnamed: 0,tic,fiscal_year,fiscal_quarter,chunk_id,chunk,similarity
0,NVDA,2025,2,8,Colette Kress: (contd) Our full stack drive AV...,0.491054
1,NVDA,2025,2,2,Colette Kress: We delivered another record qua...,0.467001
2,NVDA,2025,2,1,"Toshiya Hari: Good afternoon, everyone, and we...",0.463842
3,NVDA,2025,2,7,"Colette Kress: (contd) Note, our Q3 outlook do...",0.454184
4,NVDA,2025,2,6,Colette Kress: (contd) InfiniBand revenue near...,0.435345


In [24]:
def escape_sql_literal(s: str) -> str:
    """Very basic escape for SQL string literals (single quotes)."""
    return s.replace("'", "''") if s is not None else s

query_text="data center revenue and Blackwell shipments"
query_text = escape_sql_literal(query_text)
k = 5

conn = connect_to_db()
if conn:
    with conn.cursor() as cursor:
        sql = """
            SELECT
                c.tic,
                c.fiscal_year,
                c.fiscal_quarter,
                c.chunk_id,
                c.chunk,
                ts_rank_cd(
                    to_tsvector('english', c.chunk),
                    websearch_to_tsquery('english', '{query_text}')
                ) AS bm25_score
            FROM core.earnings_transcript_chunks c
            WHERE c.tic = 'NVDA' 
                AND c.fiscal_year = 2025 
                AND c.fiscal_quarter = 2
                AND to_tsvector('english', c.chunk) @@ websearch_to_tsquery('english', '{query_text}')
            ORDER BY bm25_score DESC
            LIMIT {k};
        """.format(query_text=query_text, k=k)
        # Fetch all records from the earnings_transcript_chunks table
        cursor.execute(sql)
        records = cursor.fetchall()

        # Create a DataFrame from the fetched records
        df = pd.DataFrame(records, columns=[desc[0] for desc in cursor.description])

        # Display the DataFrame
        display(df)

Unnamed: 0,tic,fiscal_year,fiscal_quarter,chunk_id,chunk,bm25_score
0,NVDA,2025,2,2,Colette Kress: We delivered another record qua...,0.006381
1,NVDA,2025,2,7,"Colette Kress: (contd) Note, our Q3 outlook do...",0.002071
2,NVDA,2025,2,3,"Colette Kress: (contd) The Vera CPU, Rubin GPU...",0.000847
3,NVDA,2025,2,6,Colette Kress: (contd) InfiniBand revenue near...,0.000328


In [30]:
def escape_sql_literal(s: str) -> str:
    """Very basic escape for SQL string literals (single quotes)."""
    return s.replace("'", "''") if s is not None else s

query_text="data center revenue and Blackwell shipments"
query_text = escape_sql_literal(query_text)
k = 5

conn = connect_to_db()
if conn:
    with conn.cursor() as cursor:
        sql = """
            SELECT
                c.tic,
                c.fiscal_year,
                c.fiscal_quarter,
                c.chunk_id,
                c.chunk,
                ts_rank_cd(
                    to_tsvector('english', c.chunk),
                    websearch_to_tsquery('english', '{query_text}'),
                    32
                ) AS bm25_score
            FROM core.earnings_transcript_chunks c
            WHERE c.tic = 'NVDA' 
                AND c.fiscal_year = 2025 
                AND c.fiscal_quarter = 2
                AND to_tsvector('english', c.chunk) @@ websearch_to_tsquery('english', '{query_text}')
            ORDER BY bm25_score DESC
            LIMIT {k};
        """.format(query_text=query_text, k=k)
        # Fetch all records from the earnings_transcript_chunks table
        cursor.execute(sql)
        records = cursor.fetchall()

        # Create a DataFrame from the fetched records
        df = pd.DataFrame(records, columns=[desc[0] for desc in cursor.description])

        # Display the DataFrame
        display(df)

Unnamed: 0,tic,fiscal_year,fiscal_quarter,chunk_id,chunk,bm25_score
0,NVDA,2025,2,2,Colette Kress: We delivered another record qua...,0.00634
1,NVDA,2025,2,7,"Colette Kress: (contd) Note, our Q3 outlook do...",0.002067
2,NVDA,2025,2,3,"Colette Kress: (contd) The Vera CPU, Rubin GPU...",0.000847
3,NVDA,2025,2,6,Colette Kress: (contd) InfiniBand revenue near...,0.000328


In [20]:
df['chunk'].values[0]

"Colette Kress: We delivered another record quarter while navigating what continues to be a dynamic external environment. Total revenue was $46.7 billion, exceeding our outlook as we grew sequentially across all market platforms. Data center revenue grew 56% year over year. Data center revenue also grew sequentially despite the $4 billion decline in H20 revenue. NVIDIA Corporation's Blackwell platform reached record levels, growing sequentially by 17%. We began production shipments of GB300 in Q2. Our full stack AI solutions for cloud service providers, Neo Clouds, enterprises, and sovereigns are all contributing to our growth. We are at the beginning of an industrial revolution that will transform every industry. We see $3 to $4 trillion in AI infrastructure spend by the end of the decade. The scale and scope of these build-outs present significant long-term growth opportunities for NVIDIA Corporation. The GB200 NBL system is seeing widespread adoption with deployments at CSPs and con

In [5]:
# retrieve chunks using langchain_openai.OpenAIEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv

load_dotenv()

# Initialize the embedding model
embedding_model_name = "text-embedding-3-small"
embedding_model = OpenAIEmbeddings(model=embedding_model_name)

query_text = "What is the NVDA guidance for next quarter?"
query_vec = embedding_model.embed_query(query_text)
vec_str = "[" + ",".join(map(str, query_vec)) + "]"
k = 5

conn = connect_to_db()
if conn:
    with conn.cursor() as cursor:
        sql = """
            SELECT
                c.tic,
                c.fiscal_year,
                c.fiscal_quarter,
                c.chunk_id,
                c.chunk,
                1 - (e.embedding <=> '{vec}'::vector) AS similarity
            FROM core.earnings_transcript_embeddings e
            JOIN core.earnings_transcript_chunks c
            USING (tic, fiscal_year, fiscal_quarter, chunk_id)
            WHERE c.tic = 'AAPL' 
                AND c.fiscal_year = 2025 
                AND c.fiscal_quarter = 2
            ORDER BY e.embedding <=> '{vec}'::vector
            LIMIT {k};
        """.format(vec=vec_str, k=k)
        # Fetch all records from the earnings_transcript_chunks table
        cursor.execute(sql)
        records = cursor.fetchall()

        # Create a DataFrame from the fetched records
        df = pd.DataFrame(records, columns=[desc[0] for desc in cursor.description])

        # Display the DataFrame
        display(df)

Unnamed: 0,tic,fiscal_year,fiscal_quarter,chunk_id,chunk,similarity
0,AAPL,2025,2,57,Aaron Christopher Rakers: I've got two as well...,0.423685
1,AAPL,2025,2,28,"Kevan Parekh: Yes, Ben, this is Kevan. Thanks ...",0.396541
2,AAPL,2025,2,16,"Michael Ng: Just on the CapEx, it's up notably...",0.394495
3,AAPL,2025,2,48,Kevan Parekh: And then I think you also asked ...,0.386233
4,AAPL,2025,2,58,"Kevan Parekh: Great, Aaron. Thanks for the que...",0.385461


In [6]:
sql

"\n            SELECT\n                c.tic,\n                c.fiscal_year,\n                c.fiscal_quarter,\n                c.chunk_id,\n                c.chunk,\n                1 - (e.embedding <=> '[0.01662296955223377,0.027754540938077873,0.0825061930804267,0.00846026634843954,-0.02284474625746261,0.003645150617380783,-0.012328589091473665,0.02964812644846586,-0.014039578570168024,0.018624757979847667,0.01686643030694357,0.010894874826860076,-0.061839065223900716,-0.005589456187010754,0.031190046424273025,0.022181992912075163,-0.016338931073749785,0.003239382460033814,-0.028457873869884624,0.01677175019323387,-0.03278606692737059,0.021248725754365046,-0.008176227869955553,0.00989398061305313,-0.01934161557781574,-0.009603179801488481,-0.02375096222020497,0.05166781219073449,0.043606552365053175,0.0010989553776416582,0.05007178982499181,-0.02991863839814341,-0.02648313477459337,-0.010516158097311502,-0.00846026634843954,0.00908920709710113,-0.003841271780897009,0.0067087000772