In [None]:
# ignore this cell - it makes the emphasized text red and uses the full width of the screen
# HTML is imported from IPython.display, the public display API
from IPython.display import HTML
HTML('<style>em { color: red; }</style> <style>.container {width:100% !important; }</style>')

In [None]:
import sqlite3
import pandas as pd

In [None]:
# Open a connection to the SQLite database file "worksheet.db" (the path is relative
# to the notebook's working directory). The qry helper in this notebook reads this
# global `conn`.
# NOTE: sqlite3.connect does not raise if the file is missing - it creates an empty
# database, so a wrong path only shows up later as "no such table" errors.
conn = sqlite3.connect("worksheet.db")

In [None]:
# this function gives us a shortcut to making queries
# instead of typing all that code over and over again, we just call qry with our SQL
# it assumes we have access to a connection object, conn

def qry(QUERY, params=None):
    '''Run a SQL query on the notebook's database and return the result as a DataFrame.

    QUERY  -- a string containing SQL; it may contain ? placeholders
    params -- optional list/tuple of values bound to the placeholders in QUERY;
              binding values this way is safer than pasting them into the SQL
              text with str.format. Leave it out for queries with no placeholders.

    conn is a global connection variable and must already be defined.
    '''
    return pd.read_sql(QUERY, conn, params=params)

In [None]:
# show every row of the sqlite_master table, using the qry shortcut defined above
qry("SELECT * from sqlite_master")

In [None]:
# Remember that one database can hold several tables
# run one "SELECT * FROM <table>" query per table and unpack the three
# resulting DataFrames into their own variables, in the same order
hydrants, trees, species = (
    qry("SELECT * FROM " + table) for table in ("hydrants", "trees", "species")
)

In [None]:
# this is made-up data, but is inspired by an actual City of Madison database!
trees

https://data-cityofmadison.opendata.arcgis.com/datasets/b700541a20e446839b18d62426c266a3/explore?location=43.072110%2C-89.405159%2C18.00

In [None]:
# Databases typically split up data into manageable pieces
# It may be more efficient to keep the species codes separate, since they are rarely updated
species

In [None]:
# The City of Madison keeps data on fire hydrants!
hydrants

https://data-cityofmadison.opendata.arcgis.com/datasets/54c4877f16084409849ebd5385e2ee27_6/explore?location=43.071084%2C-89.403280%2C17.00

### 1a. *Without* running this cell - *predict* the output of the following statement

In [None]:
#trees[trees["priority"] > 90]  

In [None]:
# DataFrame with Boolean Indexing      # show only the columns in this list
#trees[trees["priority"] > 90]           [["x", "y"]]    

### 1b. *Convert* the statement to an equivalent *SQL* query.

In [None]:
trees

In [None]:
# your answer here

----
### 2a. *Predict* the output of the following *SQL* query

In [None]:
trees

In [None]:
#qry("SELECT x+y FROM trees WHERE species = 'm'")

### 2b. *Convert* the query into an equivalent *pandas* statement.

In [None]:
# Series
trees["x"]

In [None]:
# Series              with Boolean indexing applied
#trees["x"]            [trees["species"] == 'm']

In [None]:
# Do the same for y 

In [None]:
# because the two Series have matching indices, we can add them
# this answer is acceptable on a quiz/exam
#trees["x"][trees["species"] == 'm'] + trees["y"][trees["species"] == 'm']

In [None]:
# if you want to get fancy, you can turn a Series into a DataFrame and add column names
#result2 = pd.DataFrame(trees["x"][trees["species"] == 'm'] + trees["y"][trees["species"] == 'm'])
#result2.columns = ["x+y"]
#result2

----
### 3a. *Predict* the output of the following *pandas* statements

In [None]:
species

In [None]:
# species["code"]    [species["species"]=="maple"]       .iloc[0]


In [None]:
# this is a Series
#species["code"]

In [None]:
# Series              with Boolean indexing applied
# species["code"]       [species["species"]=="maple"]

In [None]:
# Series              with Boolean indexing applied     get the value at integer location 0
# species["code"]       [species["species"]=="maple"]     .iloc[0]

In [None]:
# .loc takes the Boolean row filter and the column name in one step;
# .iloc[0] then pulls out the first matching value
cd = species.loc[species["species"] == "maple", "code"].iloc[0]
cd

In [None]:
# DataFrame     with Boolean Indexing


In [None]:
# DataFrame     with Boolean Indexing         with column selection
# trees           [trees["species"] == cd]     ['tree']

### 3b. *Convert* the statements into an equivalent *SQL* query.

In [None]:
qry("select code from species where species = 'maple' ") 

In [None]:
# DataFrame                                                 with column selection
#qry("select code from species where species = 'maple' ")    ['code']

In [None]:
# DataFrame                                                 with column selection   get the value at iloc 0
#cd = qry("select code from species where species = 'maple' ")    ['code']           .iloc[0]
#cd



In [None]:
# hard coding 
#qry("select tree from trees where species = 'm'" ) 

In [None]:
# not hard coding
#qry("select tree from trees where species = '{}'".format(cd))

----
### 4a. *Predict* the output of the following query

In [None]:
#qry("SELECT species FROM trees ORDER BY priority DESC")

### 4b. *Convert* the query code to *Pandas*

In [None]:
# DataFrame sorted by priority                    # with column selection


----
### 5a. *Predict* the output of the following code

In [None]:
trees

In [None]:
# list(qry("SELECT tree, priority FROM trees ORDER BY priority DESC LIMIT 1").iloc[0])

In [None]:
# qry("SELECT tree, priority FROM trees ORDER BY priority DESC LIMIT 1")

In [None]:
#qry("SELECT tree, priority FROM trees ORDER BY priority DESC LIMIT 1").iloc[0]

In [None]:
# list gets the values only
#list(qry("SELECT tree, priority FROM trees ORDER BY priority DESC LIMIT 1").iloc[0])

### 5b. *Convert* the above code to *Pandas*

In [None]:
trees.sort_values("priority", ascending=False)

In [None]:
# DataFrame                                      # grab the first row


In [None]:
# DataFrame                                      # grab the first row  #slicing by certain indices


In [None]:
# list gets the values only


----
### 6a. *Predict* the output of the following code

In [None]:
# qry("""SELECT COUNT(SPECIES) AS c1,
# COUNT(DISTINCT SPECIES) as c2
# FROM trees""")

In [None]:
#qry("""SELECT COUNT(SPECIES) AS c1,
#    COUNT(DISTINCT SPECIES) as c2
#  FROM trees""")

### 6b. *Convert* the above code to *Pandas*

In [None]:
# get the 5
c1 = None
c1

In [None]:
# get the 2


In [None]:
c2 = None
#c2

In [None]:
# this answer is acceptable
[c1, c2]

In [None]:
# A dataframe can be made from a dict of lists
d = {"c1":[c1], "c2":[c2]}
pd.DataFrame(d)

----
### 7a. *Predict* the output of the following code

In [None]:
# qry("""SELECT species, COUNT(SPECIES) AS count,
# AVG(diameter) AS size
# FROM trees
# GROUP BY species ORDER BY count DESC""")

### 7b. *Convert* the above code to *Pandas*

In [None]:
# part 1: species list
# value_counts() puts the most common species first, which matches
# ORDER BY count DESC in the SQL query; its index holds the species codes
species_list = list(trees["species"].value_counts().index)
species_list

In [None]:
# trees.groupby("species").mean()

In [None]:
# part 2: size
# pick the "diameter" column before calling .mean() so only that column is averaged
# (recent pandas raises if .mean() is asked to average a non-numeric column)
# groupby sorts by species code, so reindex into value_counts() order
# (most common species first) to match ORDER BY count DESC
size_list = list(
    trees.groupby("species")["diameter"].mean().reindex(trees["species"].value_counts().index)
)
size_list

In [None]:
# part 3: counts
# value_counts() counts the rows for each species, largest count first
count_list = list(trees['species'].value_counts())
count_list

In [None]:
# part 4: build the DataFrame from the three lists, one keyword per column;
# each list becomes a column and the keyword becomes the column name
pd.DataFrame(dict(species=species_list, count=count_list, size=size_list))

----
# Additional Exercises: 
### *Predict* the output of the following statements

In [None]:

hydrants

In [None]:
#qry("SELECT color, year FROM hydrants WHERE color = 'blue' ")

In [None]:
#df = qry("SELECT color, year FROM hydrants")
#df[df.color == "blue"]

In [None]:
#qry("SELECT year FROM hydrants WHERE owner='private' AND active")

In [None]:
#df = qry("SELECT year, style, active FROM hydrants")
#df[df.active == 1]["style"]

In [None]:
hydrants

In [None]:
#hydrants["color"].value_counts()

In [None]:
#qry("""SELECT color, COUNT(*) FROM hydrants WHERE active GROUP BY color""")

In [None]:
#qry("""SELECT color, COUNT(*) AS count FROM hydrants GROUP BY color HAVING count > 1""")

In [None]:
# every line of the multi-line query is commented out, so the cell runs cleanly
# until the statement is uncommented as a whole
#qry("""SELECT color, COUNT(*) AS count
#    FROM hydrants WHERE year >= 2000
#    GROUP BY color HAVING count < 2""")