# Pandas, SQL, and the Grammar of Data

In [1]:
%matplotlib inline

import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

## Populating the Database
Lets start with Relational Databases, so called because they contain "relations" (tables), which are SETS of "tuples" (rows) which map "attributes" to atomic values.

The available attributes are constrained by a "header" tuple of attributes which set the type. We do this below here.

In [2]:
ourschema="""
DROP TABLE IF EXISTS "candidates";
DROP TABLE IF EXISTS "contributors";
CREATE TABLE "candidates" (
    "id" INTEGER PRIMARY KEY  NOT NULL ,
    "first_name" VARCHAR,
    "last_name" VARCHAR,
    "middle_name" VARCHAR,
    "party" VARCHAR NOT NULL
);
CREATE TABLE "contributors" (
    "id" INTEGER PRIMARY KEY  AUTOINCREMENT  NOT NULL,
    "last_name" VARCHAR,
    "first_name" VARCHAR,
    "middle_name" VARCHAR,
    "street_1" VARCHAR,
    "street_2" VARCHAR,
    "city" VARCHAR,
    "state" VARCHAR,
    "zip" VARCHAR,
    "amount" INTEGER,
    "date" DATETIME,
    "candidate_id" INTEGER NOT NULL,
    FOREIGN KEY(candidate_id) REFERENCES candidates(id)
);
"""

## SQLite

We use sqlite here (and recommend Postgres for production purposes). Still sqlite is great for on-disk large databases which wont fit into memory.

Its also built into Python, but to use the [command line tool](https://www.sqlite.org/cli.html), I recommend you install it: https://www.sqlite.org/download.html. I also recommend you download and install the sqlite browser: http://sqlitebrowser.org .

Python implements a standard database API over all databases. Its called [DBAPI2](http://cewing.github.io/training.codefellows/lectures/day21/intro_to_dbapi2.html). It works across many SQL databases.

There is an even higher level API available, called [SQLAlchemy](http://www.sqlalchemy.org). While we wont use it here, I thoroughly recommend it, either in its direct relational form, or ORM form. Many things in Pandas use it to interface with databases. Here we'll get away with things by using SQLITE.

In [3]:
# Connect and get a DBAPI2 connection
from sqlite3 import dbapi2 as sq3
import os
PATHSTART = '.'
def get_db(dbfile):
    sqlite_db = sq3.connect(os.path.join(PATHSTART, dbfile))
    return sqlite_db

In [4]:
# Drop tables if they exist and create them.
def init_db(dbfile, schema):
    # Creates the database tables
    db = get_db(dbfile)
    db.cursor().executescript(schema)
    
    # Explicit commit after entire operation completed
    db.commit()
    return db

In [5]:
# Use Pandas to read in the data
dfcand = pd.read_csv("./Resources/candidates.txt", sep='|')
dfcand

Unnamed: 0,id,first_name,last_name,middle_name,party
0,33,Joseph,Biden,,D
1,36,Samuel,Brownback,,R
2,34,Hillary,Clinton,R.,D
3,39,Christopher,Dodd,J.,D
4,26,John,Edwards,,D
5,22,Rudolph,Giuliani,,R
6,24,Mike,Gravel,,D
7,16,Mike,Huckabee,,R
8,30,Duncan,Hunter,,R
9,31,Dennis,Kucinich,,D


In [6]:
dfcwci = pd.read_csv("./Resources/contributors_with_candidate_id.txt", sep="|")
dfcwci.head()

Unnamed: 0,id,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16
1,,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,250.0,2007-05-16,16
2,,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,50.0,2007-06-18,16
3,,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,100.0,2007-06-21,16
4,,Akin,Charles,,10187 Sugar Creek Road,,Bentonville,AR,72712,100.0,2007-06-16,16


In [7]:
# ID is defined to auto increment, so:
del dfcwci['id']
dfcwci.head()

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16
1,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,250.0,2007-05-16,16
2,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,50.0,2007-06-18,16
3,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,100.0,2007-06-21,16
4,Akin,Charles,,10187 Sugar Creek Road,,Bentonville,AR,72712,100.0,2007-06-16,16


In [8]:
# Initialise a Database
db = init_db("./Resources/cancont.db", ourschema)

### Populating w/ Pandas

In [9]:
# Append both dataframes as tables in db
dfcand.to_sql("candidates", db, if_exists="append", index=False)
dfcwci.to_sql("contributors", db, if_exists="append", index=False)

In [10]:
# Get the entire table
sel = """
SELECT * FROM candidates;
"""
c = db.cursor().execute(sel)

In [11]:
c.fetchall()

[(16, u'Mike', u'Huckabee', None, u'R'),
 (20, u'Barack', u'Obama', None, u'D'),
 (22, u'Rudolph', u'Giuliani', None, u'R'),
 (24, u'Mike', u'Gravel', None, u'D'),
 (26, u'John', u'Edwards', None, u'D'),
 (29, u'Bill', u'Richardson', None, u'D'),
 (30, u'Duncan', u'Hunter', None, u'R'),
 (31, u'Dennis', u'Kucinich', None, u'D'),
 (32, u'Ron', u'Paul', None, u'R'),
 (33, u'Joseph', u'Biden', None, u'D'),
 (34, u'Hillary', u'Clinton', u'R.', u'D'),
 (35, u'Mitt', u'Romney', None, u'R'),
 (36, u'Samuel', u'Brownback', None, u'R'),
 (37, u'John', u'McCain', None, u'R'),
 (38, u'Tom', u'Tancredo', None, u'R'),
 (39, u'Christopher', u'Dodd', u'J.', u'D'),
 (41, u'Fred', u'Thompson', u'D.', u'R')]

In [12]:
# Delete the table
rem = """
DELETE FROM candidates;
"""
c = db.cursor().execute(rem)
db.commit()
# Delete, unlike Select, actually mutates the table. So, we commit

In [13]:
c.fetchall()

[]

### Populate w/ SQL Insert 
When data > memory

In [14]:
# Create a tuple consisting of each row's data to be added to table

ins = """
INSERT INTO candidates (id, first_name, last_name, middle_name, party) \
    VALUES (?,?,?,?,?);
"""

with open("./Resources/candidates.txt") as fd:
    slines =[l.strip().split('|') for l in fd.readlines()]
    for line in slines[1:]:
        theid, first_name, last_name, middle_name, party = line
        print theid, first_name, last_name, middle_name, party
        valstoinsert = (int(theid), first_name, last_name, middle_name, party)
        print ins, valstoinsert
        db.cursor().execute(ins, valstoinsert)

33 Joseph Biden  D

INSERT INTO candidates (id, first_name, last_name, middle_name, party)     VALUES (?,?,?,?,?);
(33, 'Joseph', 'Biden', '', 'D')
36 Samuel Brownback  R

INSERT INTO candidates (id, first_name, last_name, middle_name, party)     VALUES (?,?,?,?,?);
(36, 'Samuel', 'Brownback', '', 'R')
34 Hillary Clinton R. D

INSERT INTO candidates (id, first_name, last_name, middle_name, party)     VALUES (?,?,?,?,?);
(34, 'Hillary', 'Clinton', 'R.', 'D')
39 Christopher Dodd J. D

INSERT INTO candidates (id, first_name, last_name, middle_name, party)     VALUES (?,?,?,?,?);
(39, 'Christopher', 'Dodd', 'J.', 'D')
26 John Edwards  D

INSERT INTO candidates (id, first_name, last_name, middle_name, party)     VALUES (?,?,?,?,?);
(26, 'John', 'Edwards', '', 'D')
22 Rudolph Giuliani  R

INSERT INTO candidates (id, first_name, last_name, middle_name, party)     VALUES (?,?,?,?,?);
(22, 'Rudolph', 'Giuliani', '', 'R')
24 Mike Gravel  D

INSERT INTO candidates (id, first_name, last_name, midd

In [15]:
def make_query(sel):
    c = db.cursor().execute(sel)
    return c.fetchall()

In [16]:
make_query("SELECT * FROM candidates;")

[(16, u'Mike', u'Huckabee', u'', u'R'),
 (20, u'Barack', u'Obama', u'', u'D'),
 (22, u'Rudolph', u'Giuliani', u'', u'R'),
 (24, u'Mike', u'Gravel', u'', u'D'),
 (26, u'John', u'Edwards', u'', u'D'),
 (29, u'Bill', u'Richardson', u'', u'D'),
 (30, u'Duncan', u'Hunter', u'', u'R'),
 (31, u'Dennis', u'Kucinich', u'', u'D'),
 (32, u'Ron', u'Paul', u'', u'R'),
 (33, u'Joseph', u'Biden', u'', u'D'),
 (34, u'Hillary', u'Clinton', u'R.', u'D'),
 (35, u'Mitt', u'Romney', u'', u'R'),
 (36, u'Samuel', u'Brownback', u'', u'R'),
 (37, u'John', u'McCain', u'', u'R'),
 (38, u'Tom', u'Tancredo', u'', u'R'),
 (39, u'Christopher', u'Dodd', u'J.', u'D'),
 (41, u'Fred', u'Thompson', u'D.', u'R')]

## Single Table Verbs
Let us now focus on core data manipulation commands. The reason to do this is that they are ***universal across systems, and by identifying them, we can quickly ask how to do these*** when we encounter a new system.

See https://gist.github.com/TomAugspurger/6e052140eaa5fdb6e8c0/ which has a comparison of r/dplyr and pandas. I stole and modified this table from there:

``dplyr`` has a small set of nicely defined verbs. I've listed their closest pandas verbs.


<table>
  <tr>
    <th><b>VERB</b></th>
    <th><b>dplyr</b></th>
    <th><b>pandas</b></th>
    <th><b>SQL</b></th>
  </tr>
  <tr>
    <td>QUERY/SELECTION</td>
    <td>filter() (and slice())</td>
    <td>query() (and loc[], iloc[])</td>
    <td>SELECT WHERE</td>
  </tr>
  <tr>
    <td>SORT</td>
    <td>arrange()</td>
    <td>sort()</td>
    <td>ORDER BY</td>
  </tr>
  <tr>
    <td>SELECT-COLUMNS/PROJECTION</td>
    <td>select() (and rename())</td>
    <td>[](__getitem__) (and rename())</td>
    <td>SELECT COLUMN</td>
  </tr>
  <tr>
    <td>SELECT-DISTINCT</td>
    <td>distinct()</td>
    <td>unique(),drop_duplicates()</td>
    <td>SELECT DISTINCT COLUMN</td>
  </tr>
  <tr>
    <td>ASSIGN</td>
    <td>mutate() (and transmute())</td>
    <td>assign</td>
    <td>ALTER/UPDATE</td>
  </tr>
  <tr>
    <td>AGGREGATE</td>
    <td>summarise()</td>
    <td>describe(), mean(), max()</td>
    <td>None, AVG(),MAX()</td>
  </tr>
  <tr>
    <td>SAMPLE</td>
    <td>sample_n() and sample_frac()</td>
    <td>sample()</td>
    <td>implementation dep, use RAND()</td>
  </tr>
  <tr>
    <td>GROUP-AGG</td>
    <td>group_by/summarize</td>
    <td>groupby/agg, count, mean</td>
    <td>GROUP BY</td>
  </tr>
  <tr>
    <td>DELETE</td>
    <td>?</td>
    <td>drop/masking</td>
    <td>DELETE/WHERE</td>
  </tr>
</table>

In [19]:
dfcwci.head()

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16
1,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,250.0,2007-05-16,16
2,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,50.0,2007-06-18,16
3,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,100.0,2007-06-21,16
4,Akin,Charles,,10187 Sugar Creek Road,,Bentonville,AR,72712,100.0,2007-06-16,16


### Query

In [20]:
# Pandas
dfcwci.query("state=='VA' & amount < 400")

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
27,Buckheit,Bruce,,8904 KAREN DR,,FAIRFAX,VA,220312731,100.0,2007-09-19,20
77,Ranganath,Anoop,,2507 Willard Drive,,Charlottesville,VA,22903,-100.0,2008-04-21,32
88,Perreault,Louise,,503 Brockridge Hunt Drive,,Hampton,VA,23666,-34.08,2008-04-21,32
145,ABDELLA,THOMAS,M.,4231 MONUMENT WALL WAY #340,,FAIRFAX,VA,220308440,50.0,2007-09-30,35


In [21]:
# Pandas Masking
dfcwci[(dfcwci.state=='VA') & (dfcwci.amount < 400)]

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
27,Buckheit,Bruce,,8904 KAREN DR,,FAIRFAX,VA,220312731,100.0,2007-09-19,20
77,Ranganath,Anoop,,2507 Willard Drive,,Charlottesville,VA,22903,-100.0,2008-04-21,32
88,Perreault,Louise,,503 Brockridge Hunt Drive,,Hampton,VA,23666,-34.08,2008-04-21,32
145,ABDELLA,THOMAS,M.,4231 MONUMENT WALL WAY #340,,FAIRFAX,VA,220308440,50.0,2007-09-30,35


In [22]:
# Formatting SQL results (tuples) to nice dataframe style
cont_cols = [e[1] for e in make_query("PRAGMA table_info(contributors);")]
def make_frame(list_of_tuples, legend=cont_cols):
    framelist=[]
    for i, cname in enumerate(legend):
        framelist.append((cname,[e[i] for e in list_of_tuples]))
    return pd.DataFrame.from_items(framelist)

In [23]:
# Query in SQL
out = make_query("SELECT * FROM contributors WHERE state='VA' AND amount < 400;")
print out
make_frame(out)

[(28, u'Buckheit', u'Bruce', None, u'8904 KAREN DR', None, u'FAIRFAX', u'VA', u'220312731', 100, u'2007-09-19', 20), (78, u'Ranganath', u'Anoop', None, u'2507 Willard Drive', None, u'Charlottesville', u'VA', u'22903', -100, u'2008-04-21', 32), (89, u'Perreault', u'Louise', None, u'503 Brockridge Hunt Drive', None, u'Hampton', u'VA', u'23666', -34.08, u'2008-04-21', 32), (146, u'ABDELLA', u'THOMAS', u'M.', u'4231 MONUMENT WALL WAY #340', None, u'FAIRFAX', u'VA', u'220308440', 50, u'2007-09-30', 35)]


Unnamed: 0,id,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,28,Buckheit,Bruce,,8904 KAREN DR,,FAIRFAX,VA,220312731,100.0,2007-09-19,20
1,78,Ranganath,Anoop,,2507 Willard Drive,,Charlottesville,VA,22903,-100.0,2008-04-21,32
2,89,Perreault,Louise,,503 Brockridge Hunt Drive,,Hampton,VA,23666,-34.08,2008-04-21,32
3,146,ABDELLA,THOMAS,M.,4231 MONUMENT WALL WAY #340,,FAIRFAX,VA,220308440,50.0,2007-09-30,35


In [24]:
out = make_query("SELECT * FROM contributors WHERE state IS NULL;")
make_frame(out)

Unnamed: 0,id,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,126,BOURNE,TRAVIS,,LAGE KAART 77,,BRASSCHATT,,2930,-500,2008-11-20,35


In [25]:
dfcwci[dfcwci.state.isnull()]

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
125,BOURNE,TRAVIS,,LAGE KAART 77,,BRASSCHATT,,2930,-500.0,2008-11-20,35


In [26]:
out = make_query("SELECT * FROM contributors WHERE state IS NOT NULL;")
make_frame(out).shape

(174, 12)

In [27]:
dfcwci[dfcwci.state.notnull()].shape

(174, 11)

In [37]:
out = make_query("SELECT * FROM contributors WHERE state IN ('VA','WA');")
make_frame(out).head()

Unnamed: 0,id,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,1,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16
1,28,Buckheit,Bruce,,8904 KAREN DR,,FAIRFAX,VA,220312731,100.0,2007-09-19,20
2,63,BURKE,SUZANNE,M.,3401 EVANSTON,,SEATTLE,WA,981038677,-700.0,2008-03-05,22
3,78,Ranganath,Anoop,,2507 Willard Drive,,Charlottesville,VA,22903,-100.0,2008-04-21,32
4,89,Perreault,Louise,,503 Brockridge Hunt Drive,,Hampton,VA,23666,-34.08,2008-04-21,32


In [38]:
dfcwci[dfcwci.state.isin(['VA','WA'])].head()

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16
27,Buckheit,Bruce,,8904 KAREN DR,,FAIRFAX,VA,220312731,100.0,2007-09-19,20
62,BURKE,SUZANNE,M.,3401 EVANSTON,,SEATTLE,WA,981038677,-700.0,2008-03-05,22
77,Ranganath,Anoop,,2507 Willard Drive,,Charlottesville,VA,22903,-100.0,2008-04-21,32
88,Perreault,Louise,,503 Brockridge Hunt Drive,,Hampton,VA,23666,-34.08,2008-04-21,32


In [39]:
# Note that bounds are inclusive in SQL
out = make_query("SELECT * FROM contributors WHERE amount BETWEEN 10 AND 50;")
make_frame(out).head()

Unnamed: 0,id,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,3,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,50,2007-06-18,16
1,19,Ardle,William,,412 Dakota Avenue,,Springfield,OH,45504,50,2007-06-28,16
2,26,Buckler,Steve,,24351 Armada Dr,,Dana Point,CA,926291306,50,2007-07-30,20
3,27,Buckler,Steve,,24351 Armada Dr,,Dana Point,CA,926291306,25,2007-08-16,20
4,35,Buck,Barbara,,1780 NE 138th St,,North Miami,FL,331811316,50,2007-09-13,20


In [40]:
dfcwci.query("10 <= amount <= 50").head()

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
2,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,50.0,2007-06-18,16
18,Ardle,William,,412 Dakota Avenue,,Springfield,OH,45504,50.0,2007-06-28,16
25,Buckler,Steve,,24351 Armada Dr,,Dana Point,CA,926291306,50.0,2007-07-30,20
26,Buckler,Steve,,24351 Armada Dr,,Dana Point,CA,926291306,25.0,2007-08-16,20
34,Buck,Barbara,,1780 NE 138th St,,North Miami,FL,331811316,50.0,2007-09-13,20


### Sort

In [41]:
dfcwci.sort_values("amount").head()

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
90,Kazor,Christopher,M,707 Spindletree ave,,Naperville,IL,60565,-2592.0,2008-04-21,32
72,BRUNO,JOHN,,10136 WINDERMERE CHASE BLVD.,,GOTHA,FL,347344707,-2300.0,2008-03-06,22
64,BURKE,DONALD,J.,12 LOMPOC,,RANCHO SANTA MARGA,CA,926881817,-2300.0,2008-03-11,22
73,BRUNO,IRENE,,10136 WINDERMERE CHASE BLVD.,,GOTHA,FL,347344707,-2300.0,2008-03-06,22
74,BROWN,TIMOTHY,J.,26826 MARLOWE COURT,,STEVENSON RANCH,CA,913811020,-2300.0,2008-03-06,22


In [42]:
dfcwci.sort_values("amount", ascending=False).head()

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
30,Buckel,Linda,,PO Box 683130,,Park City,UT,840683130,4600.0,2007-08-14,20
159,ABATE,MARIA,ELENA,1291 NIGHTINGALE AVENUE,,MIAMI SPRINGS,FL,331663832,2600.0,2008-01-25,37
15,Anthony,John,,211 Long Island Drive,,Hot Springs,AR,71913,2300.0,2007-06-12,16
33,Buck,Blaine,M,45 Eaton Ave,,Camden,ME,48431752,2300.0,2007-09-30,20
28,Buckel,Linda,,PO Box 683130,,Park City,UT,840683130,2300.0,2007-08-14,20


**NOTE:** In pandas, sorted result is just in the ouput, original table is not mutated.  
Can do `inplace=True`, but not recommended

In [43]:
# SQL
out = make_query("SELECT * FROM contributors ORDER BY amount;")
make_frame(out).head()

Unnamed: 0,id,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,91,Kazor,Christopher,M,707 Spindletree ave,,Naperville,IL,60565,-2592.0,2008-04-21,32
1,30,Buckel,Linda,,PO Box 683130,,Park City,UT,840683130,-2300.0,2007-08-14,20
2,52,BYINGTON,MARGARET,E.,2633 MIDDLEBORO LANE N.E.,,GRAND RAPIDS,MI,495061254,-2300.0,2008-03-03,22
3,53,BYERS,BOB,A.,13170 TELFAIR AVENUE,,SYLMAR,CA,913423573,-2300.0,2008-03-07,22
4,55,BUSH,KRYSTIE,,P.O. BOX 61046,,DENVER,CO,802061046,-2300.0,2008-03-06,22


In [44]:
out = make_query("SELECT * FROM contributors ORDER BY amount DESC;")
make_frame(out).head()

Unnamed: 0,id,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,31,Buckel,Linda,,PO Box 683130,,Park City,UT,840683130,4600.0,2007-08-14,20
1,160,ABATE,MARIA,ELENA,1291 NIGHTINGALE AVENUE,,MIAMI SPRINGS,FL,331663832,2600.0,2008-01-25,37
2,14,Altes,R.D.,,8600 Moody Road,,Fort Smith,AR,72903,2300.0,2007-06-21,16
3,16,Anthony,John,,211 Long Island Drive,,Hot Springs,AR,71913,2300.0,2007-06-12,16
4,22,Baker,David,,2550 Adamsbrooke Drive,,Conway,AR,72034,2300.0,2007-04-11,16


### Select Columns

In [36]:
dfcwci[['first_name', 'amount']].head()

Unnamed: 0,first_name,amount
0,Steven,500.0
1,Don,250.0
2,Don,50.0
3,Don,100.0
4,Charles,100.0


In [45]:
out = make_query("SELECT first_name, amount FROM contributors;")
make_frame(out,['first_name', 'amount']).head()

Unnamed: 0,first_name,amount
0,Steven,500.0
1,Don,250.0
2,Don,50.0
3,Don,100.0
4,Charles,100.0


### Select Distinct/Unique

In [46]:
dfcwci[['last_name','first_name']].count()

last_name     175
first_name    175
dtype: int64

In [47]:
dfcwci[['last_name','first_name']].drop_duplicates().count()

last_name     126
first_name    126
dtype: int64

In [49]:
dfcwci[['last_name','first_name']].drop_duplicates().head()

Unnamed: 0,last_name,first_name
0,Agee,Steven
1,Ahrens,Don
4,Akin,Charles
5,Akin,Mike
6,Akin,Rebecca


In [50]:
out = make_query("SELECT DISTINCT last_name, first_name FROM contributors;")
make_frame(out,['last_name', 'first_name']).head()

Unnamed: 0,last_name,first_name
0,Agee,Steven
1,Ahrens,Don
2,Akin,Charles
3,Akin,Mike
4,Akin,Rebecca


### Assign: Adding/Mutating Columns

In [51]:
# Add a new Columns
# Pandas, we need to assign to itself
dfcwci['name'] = dfcwci['last_name'] + ", " + dfcwci['first_name']
dfcwci.head()

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id,name
0,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16,"Agee, Steven"
1,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,250.0,2007-05-16,16,"Ahrens, Don"
2,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,50.0,2007-06-18,16,"Ahrens, Don"
3,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,100.0,2007-06-21,16,"Ahrens, Don"
4,Akin,Charles,,10187 Sugar Creek Road,,Bentonville,AR,72712,100.0,2007-06-16,16,"Akin, Charles"


In [54]:
dfcwci.assign(ucname = dfcwci.last_name + ":" + dfcwci.first_name).head()

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id,name,ucname
0,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16,"Agee, Steven",Agee:Steven
1,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,250.0,2007-05-16,16,"Ahrens, Don",Ahrens:Don
2,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,50.0,2007-06-18,16,"Ahrens, Don",Ahrens:Don
3,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,100.0,2007-06-21,16,"Ahrens, Don",Ahrens:Don
4,Akin,Charles,,10187 Sugar Creek Road,,Bentonville,AR,72712,100.0,2007-06-16,16,"Akin, Charles",Akin:Charles


In [57]:
# Modify data in exisiting column

In [58]:
dfcwci[dfcwci.state == 'VA']

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id,name
0,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16,"Agee, Steven"
27,Buckheit,Bruce,,8904 KAREN DR,,FAIRFAX,VA,220312731,100.0,2007-09-19,20,"Buckheit, Bruce"
77,Ranganath,Anoop,,2507 Willard Drive,,Charlottesville,VA,22903,-100.0,2008-04-21,32,"Ranganath, Anoop"
88,Perreault,Louise,,503 Brockridge Hunt Drive,,Hampton,VA,23666,-34.08,2008-04-21,32,"Perreault, Louise"
145,ABDELLA,THOMAS,M.,4231 MONUMENT WALL WAY #340,,FAIRFAX,VA,220308440,50.0,2007-09-30,35,"ABDELLA, THOMAS"


In [59]:
dfcwci.loc[dfcwci.state == 'VA', 'name']

0           Agee, Steven
27       Buckheit, Bruce
77      Ranganath, Anoop
88     Perreault, Louise
145      ABDELLA, THOMAS
Name: name, dtype: object

In [60]:
# Mutate actual df, not just view:
dfcwci.loc[dfcwci.state == 'VA', 'name'] = "Junk"

In [61]:
dfcwci.query("state == 'VA'")

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id,name
0,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16,Junk
27,Buckheit,Bruce,,8904 KAREN DR,,FAIRFAX,VA,220312731,100.0,2007-09-19,20,Junk
77,Ranganath,Anoop,,2507 Willard Drive,,Charlottesville,VA,22903,-100.0,2008-04-21,32,Junk
88,Perreault,Louise,,503 Brockridge Hunt Drive,,Hampton,VA,23666,-34.08,2008-04-21,32,Junk
145,ABDELLA,THOMAS,M.,4231 MONUMENT WALL WAY #340,,FAIRFAX,VA,220308440,50.0,2007-09-30,35,Junk


In [62]:
# SQL

In [63]:
make_query("ALTER TABLE contributors ADD COLUMN name;")

[]

In [64]:
# Create new column
make_query("PRAGMA table_info(contributors);")

[(0, u'id', u'INTEGER', 1, None, 1),
 (1, u'last_name', u'VARCHAR', 0, None, 0),
 (2, u'first_name', u'VARCHAR', 0, None, 0),
 (3, u'middle_name', u'VARCHAR', 0, None, 0),
 (4, u'street_1', u'VARCHAR', 0, None, 0),
 (5, u'street_2', u'VARCHAR', 0, None, 0),
 (6, u'city', u'VARCHAR', 0, None, 0),
 (7, u'state', u'VARCHAR', 0, None, 0),
 (8, u'zip', u'VARCHAR', 0, None, 0),
 (9, u'amount', u'INTEGER', 0, None, 0),
 (10, u'date', u'DATETIME', 0, None, 0),
 (11, u'candidate_id', u'INTEGER', 1, None, 0),
 (12, u'name', u'', 0, None, 0)]

In [65]:
# Select first and last names, and IDs and return in tuple form
out = make_query("SELECT id, last_name, first_name from contributors;")
out2 = [(e[1] + ", " + e[2], e[0]) for e in out]
out2

[(u'Agee, Steven', 1),
 (u'Ahrens, Don', 2),
 (u'Ahrens, Don', 3),
 (u'Ahrens, Don', 4),
 (u'Akin, Charles', 5),
 (u'Akin, Mike', 6),
 (u'Akin, Rebecca', 7),
 (u'Aldridge, Brittni', 8),
 (u'Allen, John D.', 9),
 (u'Allen, John D.', 10),
 (u'Allison, John W.', 11),
 (u'Allison, Rebecca', 12),
 (u'Allison, Rebecca', 13),
 (u'Altes, R.D.', 14),
 (u'Andres, Dale', 15),
 (u'Anthony, John', 16),
 (u'Arbogast, Robert', 17),
 (u'Arbogast, Robert', 18),
 (u'Ardle, William', 19),
 (u'Atiq, Omar', 20),
 (u'Atiq, Omar', 21),
 (u'Baker, David', 22),
 (u'Bancroft, David', 23),
 (u'Banks, Charles', 24),
 (u'Barbee, John', 25),
 (u'Buckler, Steve', 26),
 (u'Buckler, Steve', 27),
 (u'Buckheit, Bruce', 28),
 (u'Buckel, Linda', 29),
 (u'Buckel, Linda', 30),
 (u'Buckel, Linda', 31),
 (u'Buck, Thomas', 32),
 (u'Buck, Jay', 33),
 (u'Buck, Blaine', 34),
 (u'Buck, Barbara', 35),
 (u'Buck, Barbara', 36),
 (u'Buchman, Mark M', 37),
 (u'Bucher, Ida', 38),
 (u'Buchanek, Elizabeth', 39),
 (u'Buchanan, John', 40),


In [66]:
# Update name for above ids, data passed in tuple form 
alt2 = "UPDATE contributors SET name = ? WHERE id = ?;"
for ele in out2:
    db.cursor().execute(alt2, ele)

In [67]:
# Must save since action was destructive
db.commit()

In [68]:
out = make_query("SELECT * from contributors;")
make_frame(out,cont_cols+["name"]).head()

Unnamed: 0,id,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id,name
0,1,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16,"Agee, Steven"
1,2,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,250.0,2007-05-16,16,"Ahrens, Don"
2,3,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,50.0,2007-06-18,16,"Ahrens, Don"
3,4,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,100.0,2007-06-21,16,"Ahrens, Don"
4,5,Akin,Charles,,10187 Sugar Creek Road,,Bentonville,AR,72712,100.0,2007-06-16,16,"Akin, Charles"


In [71]:
# Modify existing columns, as above
make_query("UPDATE contributors SET name = 'Junk' WHERE state = 'VA';")
db.commit()

In [72]:
out = make_query("SELECT * from contributors where state='VA';")
make_frame(out, cont_cols + ["name"]).head()

Unnamed: 0,id,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id,name
0,1,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16,Junk
1,28,Buckheit,Bruce,,8904 KAREN DR,,FAIRFAX,VA,220312731,100.0,2007-09-19,20,Junk
2,78,Ranganath,Anoop,,2507 Willard Drive,,Charlottesville,VA,22903,-100.0,2008-04-21,32,Junk
3,89,Perreault,Louise,,503 Brockridge Hunt Drive,,Hampton,VA,23666,-34.08,2008-04-21,32,Junk
4,146,ABDELLA,THOMAS,M.,4231 MONUMENT WALL WAY #340,,FAIRFAX,VA,220308440,50.0,2007-09-30,35,Junk


#### No DROP COLUMN in SQLite

Its available in other databases. Here you must just re-create your database, or no about this gotcha from the start.

In [73]:
make_query("ALTER TABLE contributors DROP COLUMN name;")
db.commit()

OperationalError: near "DROP": syntax error

In [79]:
# Pandas, simpler
del dfcwci['name']

In [80]:
dfcwci.head()

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16
1,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,250.0,2007-05-16,16
2,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,50.0,2007-06-18,16
3,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,100.0,2007-06-21,16
4,Akin,Charles,,10187 Sugar Creek Road,,Bentonville,AR,72712,100.0,2007-06-16,16


### Aggregate

In [81]:
dfcwci.describe()

Unnamed: 0,zip,amount,candidate_id
count,175.0,175.0,175.0
mean,378001400.0,3.418114,28.0
std,362827800.0,1028.418999,7.823484
min,2474.0,-2592.0,16.0
25%,93367.0,-175.0,20.0
50%,323331300.0,100.0,32.0
75%,781694600.0,300.0,35.0
max,995153200.0,4600.0,37.0


In [82]:
dfcwci[dfcwci.amount == dfcwci.amount.max()]

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
30,Buckel,Linda,,PO Box 683130,,Park City,UT,840683130,4600.0,2007-08-14,20


In [83]:
out = make_query("SELECT *, MAX(amount) AS maxamt FROM contributors;")
print out
make_frame(out, cont_cols + ['maxamt'])

[(31, u'Buckel', u'Linda', None, u'PO Box 683130', None, u'Park City', u'UT', u'840683130', 4600, u'2007-08-14', 20, u'Buckel, Linda', 4600)]


Unnamed: 0,id,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id,maxamt
0,31,Buckel,Linda,,PO Box 683130,,Park City,UT,840683130,4600,2007-08-14,20,"Buckel, Linda"


In [87]:
out = make_query("SELECT COUNT(amount) AS AMOUNTCOUNT FROM contributors;")
print "Count", out[0][0]
out = make_query("SELECT AVG(amount) FROM contributors;")
print "Avg", out[0][0]

Count 175
Avg 3.41811428571


In [88]:
# Select w/i a Range
dfcwci[dfcwci.amount > dfcwci.amount.max() - 2300]

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
30,Buckel,Linda,,PO Box 683130,,Park City,UT,840683130,4600.0,2007-08-14,20
159,ABATE,MARIA,ELENA,1291 NIGHTINGALE AVENUE,,MIAMI SPRINGS,FL,331663832,2600.0,2008-01-25,37


In [89]:
out = make_query("SELECT * FROM contributors WHERE amount > (select (MAX(amount) - 2300) FROM contributors);")
make_frame(out)

Unnamed: 0,id,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,31,Buckel,Linda,,PO Box 683130,,Park City,UT,840683130,4600,2007-08-14,20
1,160,ABATE,MARIA,ELENA,1291 NIGHTINGALE AVENUE,,MIAMI SPRINGS,FL,331663832,2600,2008-01-25,37


### Grouping

In [96]:
dfcwci.groupby("state").sum()

Unnamed: 0_level_0,zip,amount,candidate_id
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,2985459621,1210.0,111
AR,864790,14200.0,192
AZ,860011121,120.0,37
CA,14736360720,-5013.73,600
CO,2405477834,-5823.0,111
CT,68901376,2300.0,35
DC,800341853,-1549.91,102
FL,8970626520,-4050.0,803
IA,50266,250.0,16
ID,83648,-261.0,32


In [97]:
dfcwci.groupby("state")['amount'].sum()

state
AK     1210.00
AR    14200.00
AZ      120.00
CA    -5013.73
CO    -5823.00
CT     2300.00
DC    -1549.91
FL    -4050.00
IA      250.00
ID     -261.00
IL    -5586.80
KS     -330.00
KY     -200.00
LA     1300.00
MA      -83.00
MD      300.00
ME     2520.00
MI    -1265.00
MN      322.00
MO      100.00
NC      500.00
NH      -24.60
NJ     -817.45
NV      725.00
NY    -6474.50
OH      450.00
OK      800.00
PA    -2146.00
RI      200.00
SC     2400.00
TN      -25.00
TX     1985.24
UT     5050.00
VA      515.92
WA     -500.00
Name: amount, dtype: float64

In [98]:
dfcwci.state.unique()

array(['VA', 'CA', 'AR', 'DC', 'SC', 'IA', 'OH', 'NC', 'UT', 'MO', 'IL',
       'ME', 'FL', 'MD', 'MI', 'CO', 'WA', 'NY', 'TX', 'KY', 'PA', 'TN',
       'MA', 'MN', 'KS', 'NJ', 'NH', 'ID', 'OK', nan, 'NV', 'CT', 'RI',
       'AK', 'LA', 'AZ'], dtype=object)

In [99]:
out = make_query("SELECT state, SUM(amount) FROM contributors GROUP BY state;")
make_frame(out, legend = ['state','sum'])

Unnamed: 0,state,sum
0,,-500.0
1,AK,1210.0
2,AR,14200.0
3,AZ,120.0
4,CA,-5013.73
5,CO,-5823.0
6,CT,2300.0
7,DC,-1549.91
8,FL,-4050.0
9,IA,250.0


### Delete

In [100]:
dfcwci.head()

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16
1,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,250.0,2007-05-16,16
2,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,50.0,2007-06-18,16
3,Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,100.0,2007-06-21,16
4,Akin,Charles,,10187 Sugar Creek Road,,Bentonville,AR,72712,100.0,2007-06-16,16


In [101]:
# In-place drop
df2 = dfcwci.copy()
df2.set_index('last_name', inplace=True)
df2.head()

Unnamed: 0_level_0,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
last_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16
Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,250.0,2007-05-16,16
Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,50.0,2007-06-18,16
Ahrens,Don,,4034 Rennellwood Way,,Pleasanton,CA,94566,100.0,2007-06-21,16
Akin,Charles,,10187 Sugar Creek Road,,Bentonville,AR,72712,100.0,2007-06-16,16


In [102]:
df2.drop(['Ahrens'], inplace=True)
df2.head()

Unnamed: 0_level_0,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
last_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16
Akin,Charles,,10187 Sugar Creek Road,,Bentonville,AR,72712,100.0,2007-06-16,16
Akin,Mike,,181 Baywood Lane,,Monticello,AR,71655,1500.0,2007-05-18,16
Akin,Rebecca,,181 Baywood Lane,,Monticello,AR,71655,500.0,2007-05-18,16
Aldridge,Brittni,,"808 Capitol Square Place, SW",,Washington,DC,20024,250.0,2007-06-06,16


In [103]:
df2.reset_index(inplace=True)
df2.head()

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16
1,Akin,Charles,,10187 Sugar Creek Road,,Bentonville,AR,72712,100.0,2007-06-16,16
2,Akin,Mike,,181 Baywood Lane,,Monticello,AR,71655,1500.0,2007-05-18,16
3,Akin,Rebecca,,181 Baywood Lane,,Monticello,AR,71655,500.0,2007-05-18,16
4,Aldridge,Brittni,,"808 Capitol Square Place, SW",,Washington,DC,20024,250.0,2007-06-06,16


In [104]:
# The recommended way to do it is to create a new dataframe. This might be impractical is things are very large.

In [108]:
dfcwci = dfcwci[dfcwci.last_name!='Ahrens']
dfcwci.head()

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16
4,Akin,Charles,,10187 Sugar Creek Road,,Bentonville,AR,72712,100.0,2007-06-16,16
5,Akin,Mike,,181 Baywood Lane,,Monticello,AR,71655,1500.0,2007-05-18,16
6,Akin,Rebecca,,181 Baywood Lane,,Monticello,AR,71655,500.0,2007-05-18,16
7,Aldridge,Brittni,,"808 Capitol Square Place, SW",,Washington,DC,20024,250.0,2007-06-06,16


In [111]:
# SQL
make_query("DELETE FROM contributors WHERE last_name=\"Ahrens\";")
db.commit()
out = make_query("SELECT * FROM contributors;")
make_frame(out).head()

Unnamed: 0,id,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,1,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16
1,5,Akin,Charles,,10187 Sugar Creek Road,,Bentonville,AR,72712,100.0,2007-06-16,16
2,6,Akin,Mike,,181 Baywood Lane,,Monticello,AR,71655,1500.0,2007-05-18,16
3,7,Akin,Rebecca,,181 Baywood Lane,,Monticello,AR,71655,500.0,2007-05-18,16
4,8,Aldridge,Brittni,,"808 Capitol Square Place, SW",,Washington,DC,20024,250.0,2007-06-06,16


### Limit

In [112]:
out = make_query("SELECT * FROM contributors LIMIT 3;")
make_frame(out).head()

Unnamed: 0,id,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,1,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500,2007-06-30,16
1,5,Akin,Charles,,10187 Sugar Creek Road,,Bentonville,AR,72712,100,2007-06-16,16
2,6,Akin,Mike,,181 Baywood Lane,,Monticello,AR,71655,1500,2007-05-18,16


In [113]:
dfcwci[:3]

Unnamed: 0,last_name,first_name,middle_name,street_1,street_2,city,state,zip,amount,date,candidate_id
0,Agee,Steven,,549 Laurel Branch Road,,Floyd,VA,24091,500.0,2007-06-30,16
4,Akin,Charles,,10187 Sugar Creek Road,,Bentonville,AR,72712,100.0,2007-06-16,16
5,Akin,Mike,,181 Baywood Lane,,Monticello,AR,71655,1500.0,2007-05-18,16
