In [1]:
# ignore this cell - it makes the emphasized text red and uses the full width of the screen
# HTML is imported from IPython.display, the public module for display classes
from IPython.display import HTML
HTML('<style>em { color: red; }</style> <style>.container {width:100% !important; }</style>')

In [2]:
import sqlite3
import pandas as pd

In [3]:
# open the notebook-wide connection that qry() uses for every query below
# NOTE(review): sqlite3.connect() creates a new, empty database when the file is missing,
# so a wrong working directory would show up later as "no such table" - confirm worksheet.db sits next to this notebook
conn = sqlite3.connect("worksheet.db")

In [4]:
# this function gives us a shortcut to making queries
# instead of typing all that code over and over again, we just call qry with our SQL
# it assumes we have access to a connection object, conn

def qry(QUERY, params=None, connection=None):
    '''Run a SQL query and return the result as a pandas DataFrame.

    QUERY is a string containing SQL.
    params is an optional sequence (or dict) of values to bind to the
        placeholders in QUERY (? for sqlite3); it is handed to pd.read_sql as-is.
    connection is an optional database connection; when it is left out,
        the global connection variable conn is used, exactly as before.
    '''
    if connection is None:
        # no explicit connection given: fall back to the notebook-wide one
        connection = conn
    return pd.read_sql(QUERY, connection, params=params)

In [5]:
# list what is stored in the database, using the qry shortcut defined above
qry("SELECT * from sqlite_master")

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,hydrants,hydrants,2,"CREATE TABLE ""hydrants"" (\n""year"" INTEGER,\n ..."
1,table,trees,trees,3,"CREATE TABLE ""trees"" (\n""tree"" TEXT,\n ""x"" IN..."
2,table,species,species,4,"CREATE TABLE ""species"" (\n""code"" TEXT,\n ""spe..."


In [6]:
# Remember that one database can hold several tables
hydrants = qry("SELECT * FROM hydrants")
trees = qry("SELECT * FROM trees")
species = qry("SELECT * FROM species")

In [7]:
# this is made-up data, but is inspired by an actual City of Madison database!
trees

Unnamed: 0,tree,x,y,species,diameter,priority
0,A,10,4,m,8,71
1,B,20,4,m,10,100
2,C,30,4,p,6,30
3,D,40,4,p,8,40
4,E,50,4,m,12,99


https://data-cityofmadison.opendata.arcgis.com/datasets/b700541a20e446839b18d62426c266a3/explore?location=43.072110%2C-89.405159%2C18.00

In [8]:
# Databases typically split up data into manageable pieces
# It may be more efficient to keep the species codes separate, since they are rarely updated
species

Unnamed: 0,code,species
0,m,maple
1,p,pine


In [9]:
# The City of Madison keeps data on fire hydrants!
hydrants

Unnamed: 0,year,color,style,owner,alt,active
0,1999,red,K-81,private,1179,0
1,2000,red,M-3,public,1065,0
2,2001,green,Pacer,private,1058,1
3,2010,blue,Pacer,public,1081,1
4,2014,blue,Pacer,public,1052,1
5,2018,blue,Pacer,public,1109,1


https://data-cityofmadison.opendata.arcgis.com/datasets/54c4877f16084409849ebd5385e2ee27_6/explore?location=43.071084%2C-89.403280%2C17.00

### 1a. *Without* running this cell - *predict* the output of the following statement

In [10]:
trees[trees["priority"] > 90]  

Unnamed: 0,tree,x,y,species,diameter,priority
1,B,20,4,m,10,100
4,E,50,4,m,12,99


In [11]:
trees[trees["priority"] > 90]   [["x", "y"]]    # show only the columns in this list

Unnamed: 0,x,y
1,20,4
4,50,4


### 1b. *Convert* the statement to an equivalent *SQL* query.

In [12]:
trees

Unnamed: 0,tree,x,y,species,diameter,priority
0,A,10,4,m,8,71
1,B,20,4,m,10,100
2,C,30,4,p,6,30
3,D,40,4,p,8,40
4,E,50,4,m,12,99


In [13]:
qry("select x,y from trees where priority > 90") 

Unnamed: 0,x,y
0,20,4
1,50,4


----
### 2a. *Predict* the output of the following *SQL* query

In [14]:
trees

Unnamed: 0,tree,x,y,species,diameter,priority
0,A,10,4,m,8,71
1,B,20,4,m,10,100
2,C,30,4,p,6,30
3,D,40,4,p,8,40
4,E,50,4,m,12,99


In [15]:
qry("SELECT x+y FROM trees WHERE species = 'm'")

Unnamed: 0,x+y
0,14
1,24
2,54


### 2b. *Convert* the query into an equivalent *pandas* statement.

In [16]:
# Series
trees["x"]

0    10
1    20
2    30
3    40
4    50
Name: x, dtype: int64

In [17]:
# Series              with Boolean indexing applied
trees["x"]            [trees["species"] == 'm']

0    10
1    20
4    50
Name: x, dtype: int64

In [18]:
trees["y"][trees["species"] == 'm']

0    4
1    4
4    4
Name: y, dtype: int64

In [19]:
# keep only the maple rows, then add their x and y columns
# both columns come from the same rows, so their indices match and they add row by row
# this answer is acceptable on a quiz/exam
maples = trees[trees["species"] == 'm']
maples["x"] + maples["y"]

0    14
1    24
4    54
dtype: int64

In [20]:
# if you want to get fancy, to_frame turns a Series into a one-column DataFrame with the column name you give it
maples = trees[trees["species"] == 'm']
result2 = (maples["x"] + maples["y"]).to_frame(name="x+y")
result2

Unnamed: 0,x+y
0,14
1,24
4,54


----
### 3a. *Predict* the output of the following *pandas* statements

In [21]:
species

Unnamed: 0,code,species
0,m,maple
1,p,pine


In [22]:
# species["code"]    [species["species"]=="maple"]       .iloc[0]


In [23]:
# this is a Series
species["code"]

0    m
1    p
Name: code, dtype: object

In [24]:
# Series              with Boolean indexing applied
species["code"]       [species["species"]=="maple"]

0    m
Name: code, dtype: object

In [25]:
# Series              with Boolean indexing applied     get the value at integer location 0
species["code"]       [species["species"]=="maple"]     .iloc[0]

'm'

In [26]:
cd = species["code"][species["species"]=="maple"].iloc[0]
cd

'm'

In [27]:
# DataFrame     with Boolean Indexing
trees           [trees["species"] == cd]

Unnamed: 0,tree,x,y,species,diameter,priority
0,A,10,4,m,8,71
1,B,20,4,m,10,100
4,E,50,4,m,12,99


In [28]:
# DataFrame     with Boolean Indexing         with column selection
trees           [trees["species"] == cd]     ['tree']

0    A
1    B
4    E
Name: tree, dtype: object

### 3b. *Convert* the statements into an equivalent *SQL* query.

In [29]:
qry("select code from species where species = 'maple' ") 

Unnamed: 0,code
0,m


In [30]:
# DataFrame                                                 with column selection
qry("select code from species where species = 'maple' ")    ['code']

0    m
Name: code, dtype: object

In [31]:
# DataFrame                                                 with column selection   get the value at iloc 0
cd = qry("select code from species where species = 'maple' ")    ['code']           .iloc[0]
cd



'm'

In [32]:
# hard coding 
qry("select tree from trees where species = 'm'" ) 

Unnamed: 0,tree
0,A
1,B
2,E


In [33]:
# not hard coding: the value of cd is bound to the ? placeholder instead of being pasted into the SQL text,
# so a value that contains a quote character cannot break (or change the meaning of) the query
pd.read_sql("select tree from trees where species = ?", conn, params=(cd,))

Unnamed: 0,tree
0,A
1,B
2,E


----
### 4a. *Predict* the output of the following query

In [34]:
qry("SELECT species FROM trees ORDER BY priority DESC")

Unnamed: 0,species
0,m
1,m
2,m
3,p
4,p


### 4b. *Convert* the query code to *Pandas*

In [35]:
# DataFrame sorted by priority                    # with column selection
trees.sort_values("priority", ascending = False)  ["species"]

1    m
4    m
0    m
3    p
2    p
Name: species, dtype: object

----
### 5a. *Predict* the output of the following code

In [36]:
trees

Unnamed: 0,tree,x,y,species,diameter,priority
0,A,10,4,m,8,71
1,B,20,4,m,10,100
2,C,30,4,p,6,30
3,D,40,4,p,8,40
4,E,50,4,m,12,99


In [37]:
# list(qry("SELECT tree, priority FROM trees ORDER BY priority DESC LIMIT 1").iloc[0])

In [38]:
qry("SELECT tree, priority FROM trees ORDER BY priority DESC LIMIT 1")

Unnamed: 0,tree,priority
0,B,100


In [39]:
qry("SELECT tree, priority FROM trees ORDER BY priority DESC LIMIT 1").iloc[0]

tree          B
priority    100
Name: 0, dtype: object

In [40]:
# list gets the values only
list(qry("SELECT tree, priority FROM trees ORDER BY priority DESC LIMIT 1").iloc[0])

['B', 100]

### 5b. *Convert* the above code to *Pandas*

In [41]:
trees.sort_values("priority", ascending=False)

Unnamed: 0,tree,x,y,species,diameter,priority
1,B,20,4,m,10,100
4,E,50,4,m,12,99
0,A,10,4,m,8,71
3,D,40,4,p,8,40
2,C,30,4,p,6,30


In [42]:
# DataFrame                                      # grab the first row
trees.sort_values("priority", ascending=False)    .iloc[0]

tree          B
x            20
y             4
species       m
diameter     10
priority    100
Name: 1, dtype: object

In [43]:
# DataFrame                                      # grab the first row  #slicing by certain indices
trees.sort_values("priority", ascending=False)   .iloc[0]              [['tree', 'priority']]

tree          B
priority    100
Name: 1, dtype: object

In [44]:
# list gets the values only
list(trees.sort_values("priority", ascending=False).iloc[0]   [['tree', 'priority']]        )

['B', 100]

----
### 6a. *Predict* the output of the following code

In [45]:
# qry("""SELECT COUNT(SPECIES) AS c1,
# COUNT(DISTINCT SPECIES) as c2
# FROM trees""")

In [46]:
qry("""SELECT COUNT(SPECIES) AS c1,
    COUNT(DISTINCT SPECIES) as c2
    FROM trees""")

Unnamed: 0,c1,c2
0,5,2


### 6b. *Convert* the above code to *Pandas*

In [47]:
# COUNT(SPECIES) counts only the rows whose species is not NULL,
# so count the non-missing values of that column (len(trees) would also count rows with a missing species)
# int(...) keeps c1 a plain Python integer
c1 = int(trees["species"].count())
c1

5

In [48]:
trees["species"].value_counts()

m    3
p    2
Name: species, dtype: int64

In [49]:
# COUNT(DISTINCT SPECIES): the number of different species values (missing values are not counted)
c2 = trees["species"].nunique()
c2

2

In [50]:
# this answer is acceptable
[c1, c2]

[5, 2]

In [51]:
# A dataframe can be made from a dict of lists
d = {"c1":[c1], "c2":[c2]}
pd.DataFrame(d)

Unnamed: 0,c1,c2
0,5,2


----
### 7a. *Predict* the output of the following code

In [52]:
# qry("""SELECT species, COUNT(SPECIES) AS count,
# AVG(diameter) AS size
# FROM trees
# GROUP BY species ORDER BY count DESC""")

In [53]:
qry("""SELECT species, COUNT(SPECIES) AS count,
    AVG(diameter) AS size
    FROM trees
    GROUP BY species ORDER BY count DESC""")

Unnamed: 0,species,count,size
0,m,3,10.0
1,p,2,7.0


### 7b. *Convert* the above code to *Pandas*

In [54]:
# part 1: species
species_list = list(pd.unique(trees['species']))
species_list

['m', 'p']

In [55]:
# numeric_only=True averages just the number columns; without it, recent pandas versions
# raise a TypeError when they try to average the text column "tree"
trees.groupby("species").mean(numeric_only=True)

Unnamed: 0_level_0,x,y,diameter,priority
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
m,26.666667,4.0,10.0,90.0
p,35.0,4.0,7.0,35.0


In [56]:
# part 2: size
# average only the diameter column (the text column "tree" cannot be averaged),
# then look the averages up in species_list order: groupby orders its groups by key,
# which is not guaranteed to be the order of species_list
size_list = list(trees.groupby("species")["diameter"].mean().loc[species_list])
size_list

[10.0, 7.0]

In [57]:
# part 3: counts
# value_counts orders its result from most to least frequent, which is not guaranteed
# to be the order of species_list, so look the counts up in species_list order
count_list = list(trees['species'].value_counts().loc[species_list])
count_list

[3, 2]

In [58]:
# part 4: make a DataFrame from a dict of lists
# then apply the query's ORDER BY count DESC and renumber the rows starting from 0
(pd.DataFrame({"species": species_list,
               "count": count_list,
               "size": size_list})
   .sort_values("count", ascending=False)
   .reset_index(drop=True))

Unnamed: 0,species,count,size
0,m,3,10.0
1,p,2,7.0


----
# Additional Exercises: 
### *Predict* the output of the following statements

In [59]:

hydrants

Unnamed: 0,year,color,style,owner,alt,active
0,1999,red,K-81,private,1179,0
1,2000,red,M-3,public,1065,0
2,2001,green,Pacer,private,1058,1
3,2010,blue,Pacer,public,1081,1
4,2014,blue,Pacer,public,1052,1
5,2018,blue,Pacer,public,1109,1


In [60]:
qry("SELECT color, year FROM hydrants WHERE color = 'blue' ")

Unnamed: 0,color,year
0,blue,2010
1,blue,2014
2,blue,2018


In [61]:
df = qry("SELECT color, year FROM hydrants")
df[df.color == "blue"]

Unnamed: 0,color,year
3,blue,2010
4,blue,2014
5,blue,2018


In [62]:
qry("SELECT year FROM hydrants WHERE owner='private' AND active")

Unnamed: 0,year
0,2001


In [63]:
df = qry("SELECT year, style, active FROM hydrants")
df[df.active == 1]["style"]

2    Pacer
3    Pacer
4    Pacer
5    Pacer
Name: style, dtype: object

In [64]:
hydrants

Unnamed: 0,year,color,style,owner,alt,active
0,1999,red,K-81,private,1179,0
1,2000,red,M-3,public,1065,0
2,2001,green,Pacer,private,1058,1
3,2010,blue,Pacer,public,1081,1
4,2014,blue,Pacer,public,1052,1
5,2018,blue,Pacer,public,1109,1


In [65]:
hydrants["color"].value_counts()

blue     3
red      2
green    1
Name: color, dtype: int64

In [66]:
qry("""SELECT color, COUNT(*) FROM hydrants WHERE active GROUP BY color""")

Unnamed: 0,color,COUNT(*)
0,blue,3
1,green,1


In [67]:
qry("""SELECT color, COUNT(*) AS count FROM hydrants GROUP BY color HAVING count > 1""")

Unnamed: 0,color,count
0,blue,3
1,red,2


In [68]:
qry("""SELECT color, COUNT(*) AS count
    FROM hydrants WHERE year >= 2000
    GROUP BY color HAVING count < 2""")

Unnamed: 0,color,count
0,green,1
1,red,1
