# Analyze and Fix `stock_metadata` Table
This notebook queries the `stock_metadata` table, identifies data issues, and applies fixes.

In [2]:
# Import required libraries
from psycopg import connect
import pandas as pd
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# Database credentials
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")

# Connect to PostgreSQL
def connect_to_db():
    try:
        conn = connect(
            dbname=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD,
            host=DB_HOST,
            port=DB_PORT
        )
        return conn
    except Exception as e:
        print(f'Error connecting to the database: {e}')
        return None

In [3]:
# Query the `stock_metadata` table
conn = connect_to_db()
if conn:
    query = "SELECT * FROM stock_metadata;"
    df = pd.read_sql_query(query, conn)
    conn.close()
    
    # Display the data
    display(df)

  df = pd.read_sql_query(query, conn)


Unnamed: 0,tic,name,sector,industry,country,market_cap,employees,description,website,exchange,currency,last_updated
0,AAPL,Apple Inc.,Technology,Consumer Electronics,United States,3791126003712,150000,"Apple Inc. designs, manufactures, and markets ...",https://www.apple.com,NASDAQ,USD,2025-09-27 00:42:13.231448
1,TSLA,"Tesla, Inc.",Consumer Cyclical,Auto Manufacturers,United States,1464396414976,125665,"Tesla, Inc. designs, develops, manufactures, l...",https://www.tesla.com,NASDAQ,USD,2025-09-27 00:42:13.231448
2,NVDA,NVIDIA Corporation,Technology,Semiconductors,United States,4338392236032,36000,"NVIDIA Corporation, a computing infrastructure...",https://www.nvidia.com,NASDAQ,USD,2025-09-27 00:42:13.231448


In [4]:
df['description'][1]

'Tesla, Inc. designs, develops, manufactures, leases, and sells electric vehicles, and energy generation and storage systems in the United States, China, and internationally. The company operates in two segments, Automotive; and Energy Generation and Storage. The Automotive segment offers electric vehicles, as well as sells automotive regulatory credits; and non-warranty after-sales vehicle, used vehicles, body shop and parts, supercharging, retail merchandise, and vehicle insurance services. This segment also provides sedans and sport utility vehicles through direct and used vehicle sales, a network of Tesla Superchargers, and in-app upgrades; purchase financing and leasing services; services for electric vehicles through its company-owned service locations and Tesla mobile service technicians; and vehicle limited warranties and extended service plans. The Energy Generation and Storage segment engages in the design, manufacture, installation, sale, and leasing of solar energy generati

In [5]:
# Query the `historical_earnings` table
conn = connect_to_db()
if conn:
    query = "SELECT * FROM historical_earnings;"
    df = pd.read_sql_query(query, conn)
    conn.close()

    # Replace NaN with None for consistency
    df = df.where(pd.notnull(df), None)

    # Display the data
    display(df)

  df = pd.read_sql_query(query, conn)


Unnamed: 0,tic,fiscal_year,fiscal_quarter,fiscal_date_ending,earnings_date,eps,eps_estimated,session,revenue,revenue_estimated,price_before,price_after,last_updated
0,AAPL,1985,3,1985-09-30,1985-09-30,0.00112,,amc,1.918300e+09,,,,2023-05-17
1,AAPL,1989,4,1989-12-31,1989-12-31,0.00857,,amc,1.493400e+09,,,,2023-05-17
2,AAPL,1990,1,1990-03-31,1990-03-31,0.00929,,amc,1.346200e+09,,,,2023-05-17
3,AAPL,1990,2,1990-06-30,1990-06-30,0.00857,,amc,1.364800e+09,,,,2023-05-17
4,AAPL,1990,3,1990-09-30,1990-09-30,0.00748,,amc,1.354100e+09,,,,2023-05-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...
309,NVDA,2024,4,2024-12-31,2025-02-26,0.89000,0.848,amc,3.933100e+10,3.810135e+10,126.640,120.964,2025-05-25
310,NVDA,2025,1,2025-03-31,2025-05-28,0.81000,0.737,amc,4.406200e+10,4.333416e+10,135.167,139.020,2025-08-27
311,NVDA,2025,2,2025-06-30,2025-08-27,1.05000,1.010,amc,4.674300e+10,4.604892e+10,181.770,180.170,2025-09-26
312,NVDA,2025,3,2025-09-30,2025-11-19,,1.230,bmo,,5.459076e+10,,,2025-09-26


In [6]:
df.iloc[0]['eps_estimated']

np.float64(nan)

In [7]:
# check if any duplication from ticker, fiscal_year, fiscal_quarter
df_duplicates = df[df.duplicated(subset=['tic', 'fiscal_year', 'fiscal_quarter'], keep=False)]
if not df_duplicates.empty:
    print("Duplicate entries found:")
    display(df_duplicates)
else:
    print("No duplicate entries found.")

No duplicate entries found.


In [8]:
df[(df['tic']=='NVDA' ) & (df['fiscal_year']==2010) ]

Unnamed: 0,tic,fiscal_year,fiscal_quarter,fiscal_date_ending,earnings_date,eps,eps_estimated,session,revenue,revenue_estimated,price_before,price_after,last_updated
258,NVDA,2010,1,2010-03-31,2010-05-13,0.06,0.05,amc,1001813000.0,914687000.0,14.68,12.96,2023-05-17
259,NVDA,2010,2,2010-06-30,2010-08-12,-0.06153,-0.03577,amc,811208000.0,2974400000.0,8.88,9.39,2023-05-17
260,NVDA,2010,3,2010-09-30,2010-11-11,0.04,0.03773,amc,843912000.0,787640000.0,12.74,13.26,2023-05-17
261,NVDA,2010,4,2010-12-31,2011-02-16,0.06,0.04128,amc,886376000.0,616626100.0,22.55,25.68,2023-05-17


In [9]:
df[(df['tic']=='TSLA' )]

Unnamed: 0,tic,fiscal_year,fiscal_quarter,fiscal_date_ending,earnings_date,eps,eps_estimated,session,revenue,revenue_estimated,price_before,price_after,last_updated
144,TSLA,2007,4,2007-12-31,2008-01-30,-0.01012,,amc,3.685500e+06,,,,2023-05-17
145,TSLA,2008,1,2008-03-31,2008-03-30,-0.01012,,amc,3.685500e+06,,,,2023-05-17
146,TSLA,2008,4,2008-12-31,2008-12-31,-0.02180,,amc,1.416200e+07,,,,2023-05-17
147,TSLA,2009,1,2009-03-31,2009-03-31,-0.00763,,amc,2.088600e+07,,,,2023-05-17
148,TSLA,2009,2,2009-06-30,2009-06-30,-0.00518,,amc,2.694500e+07,,,,2023-05-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,TSLA,2024,4,2024-12-31,2025-01-29,0.73000,0.7740,amc,2.570700e+10,2.725892e+10,397.85,400.07,2025-04-28
210,TSLA,2025,1,2025-03-31,2025-04-22,0.27000,0.4136,amc,1.933500e+10,2.126950e+10,227.17,252.11,2025-07-21
211,TSLA,2025,2,2025-06-30,2025-07-23,0.40000,0.3972,amc,2.249600e+10,2.227968e+10,332.11,305.30,2025-09-22
212,TSLA,2025,3,2025-09-30,2025-10-22,,0.4800,bmo,,2.498468e+10,,,2025-09-22
