# Analyzing and visualizing data from a SQL database

In this module, we explore techniques for analyzing and visualizing data stored in a SQL database. You will learn about:

* Connecting to a SQL database
* Querying a database
* Computing summary statistics
* Visualizing data


In [None]:
import numpy as np
import pandas as pd

import sqlite3

In [None]:
def tweak_alta(df):
    """Return a cleaned copy of the raw Alta weather frame.

    DATE is parsed and localized to America/Denver, the frame is narrowed
    to the station and measurement columns, and MONTH, YEAR and SEASON are
    derived from DATE. SEASON is '<Y-1>-<Y> Season' for months before May,
    '<Y>-<Y+1> Season' for months after October, and 'Off Season' otherwise.
    The input frame is not modified.
    """
    wanted = ['DATE', 'STATION', 'NAME', 'LATITUDE', 'LONGITUDE',
              'PRCP', 'SNOW', 'SNWD', 'TMIN', 'TMAX', 'TOBS']

    def ski_season(df_):
        # January-April rows belong to the season that started the previous
        # year; November-December rows open the season ending next year.
        year_text = (df_.YEAR).astype(str)
        spring_label = (df_.YEAR - 1).astype(str) + '-' + year_text + ' Season'
        fall_label = year_text + '-' + (df_.YEAR + 1).astype(str) + ' Season'
        return np.select([df_.MONTH < 5, df_.MONTH > 10],
                         [spring_label, fall_label],
                         default='Off Season')

    denver_dates = pd.to_datetime(df.DATE).dt.tz_localize('America/Denver')
    trimmed = df.assign(DATE=denver_dates).loc[:, wanted]
    with_parts = trimmed.assign(MONTH=trimmed.DATE.dt.month,
                                YEAR=trimmed.DATE.dt.year)
    return with_parts.assign(SEASON=ski_season)
        
# Load the raw Alta CSV using pyarrow for both parsing (engine) and column
# storage (dtype_backend), then derive the cleaned `alta` frame.
df = pd.read_csv('data/snow-alta-1990-2017.csv', dtype_backend='pyarrow',
                 engine='pyarrow')
alta = tweak_alta(df)

In [None]:
# Display the tweaked frame (a notebook cell renders its last expression).
alta

In [None]:
# Inspect the column dtypes of the tweaked frame before it is written to SQL.
alta.dtypes

## Connect to Database

In [None]:
!pip install sqlalchemy

In [None]:
# use SQLite connection
# (standard-library DB-API connection to the file data/alta.db)
conn = sqlite3.connect('data/alta.db')

In [None]:
# use SQLAlchemy connection
# see https://docs.sqlalchemy.org/en/20/core/engines.html
# NOTE: this rebinds `conn`; the sqlite3 connection opened in the previous
# cell is not closed first.
from sqlalchemy import create_engine
conn = create_engine('sqlite:///data/alta.db')

## Create Table

In [None]:
# Write the tweaked frame to the `alta` table, replacing any existing table
# and leaving the DataFrame index out of the schema.
# try/finally guarantees the connection is closed even if the write fails.
conn = sqlite3.connect('data/alta.db')
try:
    alta.to_sql('alta', conn, if_exists='replace', index=False)
finally:
    conn.close()

## Query Database

In [None]:
# use SQLAlchemy connection
# Read the whole `alta` table back into a DataFrame through the engine,
# using pandas' default dtype backend, and display it.
from sqlalchemy import create_engine
conn = create_engine('sqlite:///data/alta.db')
alta = pd.read_sql('select * from alta', conn)
alta

In [None]:
# Check which dtypes came back from the SQL round trip.
alta.dtypes

In [None]:
# Re-read the table through a plain sqlite3 connection, this time requesting
# pyarrow-backed dtypes, and display the result.
conn = sqlite3.connect('data/alta.db')
alta = pd.read_sql('select * from alta', conn, dtype_backend='pyarrow')
alta

In [None]:
# Note: the DATE column did not come back as a datetime dtype
# (see the dtypes below); it has to be re-parsed after reading from SQL.
alta.dtypes

In [None]:
# Ask SQLite for the schema of the `alta` table (one row per column).
pd.read_sql('PRAGMA table_info(alta)', conn)

## Fancy Query

In [None]:
# Push the filter into SQL so only the rows with YEAR 1990 are loaded.
# SQLite accepts `==` for equality; standard SQL spells it `=`.
alta90 = pd.read_sql('select * from alta where YEAR == 1990', conn,
                     dtype_backend='pyarrow')
alta90

## Stats on SQL Data

In [None]:
# Baseline: pull the whole table into pandas and let describe() compute the
# summary statistics client-side.
conn = sqlite3.connect('data/alta.db')
alta = pd.read_sql('select * from alta', conn, dtype_backend='pyarrow')
alta.describe()

In [None]:
# Compute describe()-style statistics for SNWD inside SQLite.
# COUNT/AVG/MIN/MAX are built-in aggregates. Each percentile subquery sorts
# the non-NULL SNWD values and takes the single row at offset
# ROUND(p * COUNT(SNWD)) - 1, so no interpolation between neighbouring
# values is performed here.
query = '''SELECT 
    COUNT(SNWD) as count, 
    AVG(SNWD) as mean, 
    MIN(SNWD) as min, 
    MAX(SNWD) as max, 
    (
        SELECT 
            SNWD
        FROM 
            alta
        WHERE SNWD IS NOT NULL                        
        ORDER BY 
            SNWD
        LIMIT 
            1 OFFSET (ROUND(0.25 * (SELECT COUNT(SNWD) FROM alta))-1)
    ) as "25%",
    (
        SELECT 
            SNWD
        FROM 
            alta
        WHERE SNWD IS NOT NULL            
        ORDER BY 
            SNWD
        LIMIT 
            1 OFFSET (ROUND(0.50 * (SELECT COUNT(SNWD) FROM alta))-1)
    ) as "50%",
    (
        SELECT 
            SNWD
        FROM 
            alta
        WHERE SNWD IS NOT NULL
        ORDER BY 
            SNWD
        LIMIT 
            1 OFFSET (ROUND(0.75 * (SELECT COUNT(SNWD) FROM alta))-1)
    ) as "75%"
FROM 
    alta;
'''
# Run the query on the open connection; the result is a one-row frame.
depth_desc = pd.read_sql_query(query, conn)
depth_desc

In [None]:
import math

class STD:
    """Streaming sample standard deviation, usable as a SQLite aggregate.

    Values are fed one at a time through ``step``; ``finalize`` returns the
    standard deviation with an ``n - 1`` denominator. The running mean and
    the running sum of squared deviations are updated incrementally, so the
    values themselves are never stored.
    """

    def __init__(self):
        self.count = 0
        self.mean = 0.0
        self.squared_deviation_sum = 0.0

    def step(self, value):
        """Fold one value into the running state; ``None`` is ignored."""
        if value is not None:
            self.count += 1
            # Distance from the mean *before* this value was included.
            delta = value - self.mean
            self.mean += delta / self.count
            # Pair the old-mean delta with the new-mean delta.
            self.squared_deviation_sum += delta * (value - self.mean)

    def finalize(self):
        """Return the sample standard deviation, or ``None`` for < 2 values."""
        if self.count > 1:
            return math.sqrt(self.squared_deviation_sum / (self.count - 1))
        return None

# Register STD on this connection as a one-argument SQL aggregate named
# "stdev", then call it from a query.
conn.create_aggregate("stdev", 1, STD)

pd.read_sql('select stdev(SNWD) from alta', conn)

## Visualize Data

In [None]:
# Fetch only the two columns needed for the temperature plot.
tobs = pd.read_sql('select DATE, TOBS from alta', conn)

In [None]:
# Display the DATE/TOBS frame.
tobs

In [None]:
# Expect an error (for demonstration purposes)
# TypeError: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'Index'
# Presumably DATE was read back from SQLite as text rather than as a
# datetime, so it cannot be grouped by a time frequency yet.
# NOTE(review): pandas 2.2+ deprecates the 'M' alias in favour of 'ME' —
# confirm the target pandas version before changing it.

(tobs
 .groupby(pd.Grouper(key='DATE', freq='M'))
 .mean()
 .plot()
)

In [None]:
# Expect an error (for demonstration purposes)
# ValueError: time data "1990-01-01T00:00:00-07:00" doesn't match format "%Y-%m-%d %H:%M:%S%z",
# This call parses without utc=True; compare with the utc=True calls in the
# cells below.

pd.to_datetime(tobs.DATE, format='%Y-%m-%d %H:%M:%S%z').iloc[0]

In [None]:
# Same parse with utc=True: the result is timezone-aware and expressed in UTC.
pd.to_datetime(tobs.DATE, format='%Y-%m-%d %H:%M:%S%z', utc=True).iloc[0]

In [None]:
# Parse as UTC, then convert the timestamps to America/Denver local time.
(pd.to_datetime(tobs.DATE, format='%Y-%m-%d %H:%M:%S%z', utc=True)
 .dt.tz_convert('America/Denver')
)

In [None]:
# Full pipeline: replace DATE with parsed America/Denver timestamps, average
# TOBS per month, keep 2010 through 2012, and plot the result.
# NOTE(review): pandas 2.2+ deprecates the 'M' alias in favour of 'ME' —
# confirm the target pandas version before changing it.
(tobs
 .assign(DATE=(pd.to_datetime(tobs.DATE, format='%Y-%m-%d %H:%M:%S%z', utc=True)
     .dt.tz_convert('America/Denver')))
 .groupby(pd.Grouper(key='DATE', freq='M'))
 .mean()
 .loc['2010':'2012']
 .plot()
)