In [1]:
import pandas as pd
import numpy as np
import os
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# SETTING UP THE DB INFO
# The path is relative to the notebook's working directory.
DB_PATH = os.path.join('data', 'weather.db')

# sqlite3.connect() silently creates a brand-new, empty database when the file
# is missing, which would only surface later as "no such table". Fail fast with
# a clear message instead of leaving a stray empty weather.db behind.
if not os.path.isfile(DB_PATH):
    raise FileNotFoundError(
        f"SQLite database not found at {os.path.abspath(DB_PATH)!r}; "
        "check the working directory or DB_PATH."
    )

# Single shared connection used by the query cells below.
conn = sqlite3.connect(DB_PATH)

In [3]:
# Load every column of weather_main as-is for a quick look at the raw table;
# tail() displays the last five rows returned by the query.
main_5 = pd.read_sql_query("SELECT * FROM weather_main", conn)
main_5.tail()

Unnamed: 0,DATA_SOURCE,STATION,NAME,DATE,YEAR,TEMP,PRCP,GUST_MAX,GUST_MIN,WDSP,FRSHTT,DEWP,SEASON
28332,current,72422093820,"LEXINGTON BLUEGRASS AIRPORT, KY US",2025-07-27 00:00:00,2025,82.4,0.0,91.9,75.0,7.2,0,72.4,Summer
28333,current,72422093820,"LEXINGTON BLUEGRASS AIRPORT, KY US",2025-07-28 00:00:00,2025,81.5,0.0,93.0,72.0,3.4,0,72.3,Summer
28334,current,72422093820,"LEXINGTON BLUEGRASS AIRPORT, KY US",2025-07-29 00:00:00,2025,78.1,0.21,93.0,71.1,4.8,10010,72.0,Summer
28335,current,72422093820,"LEXINGTON BLUEGRASS AIRPORT, KY US",2025-07-30 00:00:00,2025,79.4,0.0,90.0,70.0,3.0,0,71.5,Summer
28336,current,72422093820,"LEXINGTON BLUEGRASS AIRPORT, KY US",2025-07-31 00:00:00,2025,78.8,0.0,82.9,75.0,1.0,0,73.0,Summer


In [4]:
# Load weather_main joined to per-day aggregates of weather_dailies (matched on DATE) #
def fetch_data(conn: sqlite3.Connection) -> pd.DataFrame:
    """
    Load weather_main enriched with daily temperature aggregates.

    weather_dailies can hold several rows per DATE, so it is first collapsed
    to one row per DATE: the highest TMAX, the lowest TMIN and the mean TAVG.
    That result is LEFT JOINed onto weather_main on DATE, so every
    weather_main row is returned and TMAX/TMIN/TAVG are null for dates with
    no weather_dailies entry.

    Note: SQLite gives DECIMAL(10, 2) NUMERIC affinity, so the CASTs convert
    the stored values to numbers but do not round them to two decimals.

    Parameters
    ----------
    conn : sqlite3.Connection
        Open connection to a database containing both tables.

    Returns
    -------
    pd.DataFrame
        Columns DATE, SEASON, YEAR, DATA_SOURCE, STATION, NAME, TEMP, PRCP,
        DEWP, TMAX, TMIN, TAVG, ordered by DATE, with DATE parsed as datetime.
    """
    sql = """
    WITH d AS (
        SELECT 
            DATE,
            MAX(CAST(TMAX AS DECIMAL(10, 2))) AS TMAX,
            MIN(CAST(TMIN AS DECIMAL(10, 2))) AS TMIN,
            AVG(CAST(TAVG AS DECIMAL(10, 2))) AS TAVG
        FROM weather_dailies
        GROUP BY DATE
    )
    SELECT 
        m.DATE,
        m.SEASON,
        m.YEAR,
        m.DATA_SOURCE,
        m.STATION,
        m.NAME,
        m.TEMP,
        m.PRCP,
        m.DEWP,
        d.TMAX,
        d.TMIN,
        d.TAVG
    FROM weather_main m
    LEFT JOIN d
    ON m.DATE = d.DATE
    ORDER BY m.DATE;
    """

    # Let pandas convert the DATE text column to datetime64 while reading.
    joined = pd.read_sql_query(sql, conn, parse_dates=["DATE"])
    return joined

# Run the joined query once over the shared connection and keep the result as df_main.
df_main = fetch_data(conn)

In [5]:
# Show the last rows of the joined frame. Because fetch_data uses a LEFT JOIN,
# dates with no weather_dailies entry appear with NaN in TMAX/TMIN/TAVG.
df_main.tail()

Unnamed: 0,DATE,SEASON,YEAR,DATA_SOURCE,STATION,NAME,TEMP,PRCP,DEWP,TMAX,TMIN,TAVG
28332,2025-07-27,Summer,2025,current,72422093820,"LEXINGTON BLUEGRASS AIRPORT, KY US",82.4,0.0,72.4,92.0,72.0,82.0
28333,2025-07-28,Summer,2025,current,72422093820,"LEXINGTON BLUEGRASS AIRPORT, KY US",81.5,0.0,72.3,93.0,72.0,81.5
28334,2025-07-29,Summer,2025,current,72422093820,"LEXINGTON BLUEGRASS AIRPORT, KY US",78.1,0.21,72.0,89.0,71.0,79.0
28335,2025-07-30,Summer,2025,current,72422093820,"LEXINGTON BLUEGRASS AIRPORT, KY US",79.4,0.0,71.5,,,
28336,2025-07-31,Summer,2025,current,72422093820,"LEXINGTON BLUEGRASS AIRPORT, KY US",78.8,0.0,73.0,,,
