In [1]:
import pandas as pd
import numpy as np
import os
import sqlite3
import matplotlib.pyplot as plt


In [2]:
# Path to the SQLite database, relative to the notebook's working directory.
DB_PATH = os.path.join('data', 'weather.db')

# sqlite3.connect() silently creates a new, empty database file when the path
# does not exist, which would only surface later as "no such table" errors.
# Fail early with a clear message instead.
if not os.path.isfile(DB_PATH):
    raise FileNotFoundError(f"SQLite database not found at {DB_PATH!r}")

# Single shared connection used by the query cells in this notebook.
conn = sqlite3.connect(DB_PATH)

In [3]:
# List the name of every table in the database by querying SQLite's
# sqlite_master schema catalog, then print the resulting dataframe.
query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = pd.read_sql_query(query, conn)
print(tables)

              name
0     weather_main
1  weather_dailies


In [4]:
# Load the entire weather_dailies table and preview its last rows.
# NOTE(review): the name `main_5` suggests weather_main, but the query reads
# weather_dailies — confirm which table was intended.
# DATE is not parsed here (no parse_dates), so it is whatever type SQLite returns.
main_5 = pd.read_sql_query("SELECT * FROM weather_dailies", conn)
main_5.tail()

Unnamed: 0,DATA_SOURCE,STATION,NAME,DATE,YEAR,PRCP,TAVG,TMAX,TMIN
17367,dailies,USW00093820,"LEXINGTON BLUEGRASS AIRPORT, KY US",2025-07-27 00:00:00,2025,0.0,82.0,92.0,74.0
17368,dailies,USW00093820,"LEXINGTON BLUEGRASS AIRPORT, KY US",2025-07-28 00:00:00,2025,0.21,81.0,93.0,72.0
17369,dailies,USW00063838,"VERSAILLES 3 NNW, KY US",2025-07-28 00:00:00,2025,0.5,82.0,92.0,72.0
17370,dailies,USW00063838,"VERSAILLES 3 NNW, KY US",2025-07-29 00:00:00,2025,0.0,80.0,88.0,72.0
17371,dailies,USW00093820,"LEXINGTON BLUEGRASS AIRPORT, KY US",2025-07-29 00:00:00,2025,0.0,78.0,89.0,71.0


In [5]:
def add_categories(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add derived category columns to a weather dataframe.

    The new columns are written onto ``df`` itself (in place) and the same
    object is returned, so both ``add_categories(df)`` and
    ``df = add_categories(df)`` work.

    Requires the columns DATE, TMAX and TMIN. DATE may already be a datetime
    column or hold date strings that ``pd.to_datetime`` can parse.

    Columns added:

    SEASON: Winter (Dec-Feb), Spring (Mar-May), Summer (Jun-Aug) or
        Fall (Sep-Nov), taken from the month of DATE.

    TEMP_RANGE: TMAX - TMIN for the row.

    TEMP_CATEGORY: label derived from TMAX
        Freezing: TMAX <= 32
        Cool:     32 < TMAX <= 55
        Warm:     55 < TMAX <= 70
        Hot:      TMAX > 70
        The outer bins are open-ended, so sub-zero days and days above 88
        degrees are labelled instead of being left as NaN.
        Rows whose TMAX is missing still get a missing category.
    """

    month_to_season = {
        12: "Winter", 1: "Winter", 2: "Winter",
        3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer",
        9: "Fall", 10: "Fall", 11: "Fall",
    }

    # Seasons based off of the DATE column. Going through pd.to_datetime keeps
    # datetime columns unchanged and also accepts string dates (e.g. a frame
    # read from SQLite without parse_dates) without modifying the DATE column.
    months = pd.to_datetime(df["DATE"]).dt.month
    df["SEASON"] = months.map(month_to_season)

    # Temperature Range
    df["TEMP_RANGE"] = df["TMAX"] - df["TMIN"]

    # Temperature Categories. Bins are right-closed, so exactly 32 is
    # "Freezing", exactly 55 is "Cool" and exactly 70 is "Warm".
    labels = ["Freezing", "Cool", "Warm", "Hot"]
    df["TEMP_CATEGORY"] = pd.cut(
        df["TMAX"],
        bins=[float("-inf"), 32, 55, 70, float("inf")],
        labels=labels,
    )

    return df

In [11]:
def fetch_data(conn: sqlite3.Connection) -> pd.DataFrame:
    """
    Load weather_main enriched with per-day temperatures from weather_dailies.

    weather_dailies can hold several rows for one DATE, so it is first
    collapsed to one row per DATE:
        TMAX -> highest TMAX of the day
        TMIN -> lowest TMIN of the day
        TAVG -> mean of the day's TAVG values

    That per-day summary is LEFT JOINed onto weather_main by DATE, which
    keeps every weather_main row; TMAX/TMIN/TAVG are empty for dates that
    have no match in weather_dailies. Rows come back ordered by DATE and the
    DATE column is parsed into datetimes.

    NOTE(review): the join compares the stored DATE values as-is, so it
    presumably relies on both tables storing DATE in the same format — confirm.
    """

    joined_sql =  """
    WITH d AS (
        SELECT 
            DATE,
            MAX(CAST(TMAX AS FLOAT)) AS TMAX,
            MIN(CAST(TMIN AS FLOAT)) AS TMIN,
            AVG(CAST(TAVG AS FLOAT)) AS TAVG
        FROM weather_dailies
        GROUP BY DATE
    )
    SELECT 
        m.DATE,
        m.YEAR,
        m.DATA_SOURCE,
        m.STATION,
        m.NAME,
        m.TEMP,
        m.PRCP,
        m.DEWP,
        d.TMAX,
        d.TMIN,
        d.TAVG
    FROM weather_main m
    LEFT JOIN d
    ON m.DATE = d.DATE
    ORDER BY m.DATE;
    """

    frame = pd.read_sql_query(joined_sql, conn, parse_dates=["DATE"])
    return frame

# Run the join once against the notebook's shared connection and keep the result.
df_main = fetch_data(conn)

In [12]:
# Preview the first rows of the joined dataframe.
# NOTE(review): in this preview every PRCP is 99.99, which looks like a
# missing-value sentinel, and TMAX/TMIN/TAVG are empty for these 1948 rows
# (no matching weather_dailies date) — confirm before aggregating.
df_main.head()

Unnamed: 0,DATE,YEAR,DATA_SOURCE,STATION,NAME,TEMP,PRCP,DEWP,TMAX,TMIN,TAVG
0,1948-01-01,1948,historical,99999993820,"LEXINGTON BLUEGRASS AIRPORT, KY US",53.5,99.99,47.3,,,
1,1948-01-02,1948,historical,99999993820,"LEXINGTON BLUEGRASS AIRPORT, KY US",29.1,99.99,27.8,,,
2,1948-01-03,1948,historical,99999993820,"LEXINGTON BLUEGRASS AIRPORT, KY US",29.4,99.99,28.6,,,
3,1948-01-04,1948,historical,99999993820,"LEXINGTON BLUEGRASS AIRPORT, KY US",32.4,99.99,29.6,,,
4,1948-01-05,1948,historical,99999993820,"LEXINGTON BLUEGRASS AIRPORT, KY US",30.9,99.99,27.8,,,
