In [1]:
import re

import pandas as pd
from sklearn.datasets import load_iris

from mypandas import load_births, load_meat, sqldf

In [2]:
# Example tables shipped with mypandas.
births, meat = load_births(), load_meat()

# Build the iris frame. Parentheses and spaces are stripped from the feature
# names up front so the resulting column names are plain identifiers that can
# be referenced unquoted in the SQL queries below.
iris = load_iris()
iris_df = pd.DataFrame(
    data=iris.data,
    columns=[re.sub("[() ]", "", name) for name in iris.feature_names],
)
# Map the integer targets to their names as a categorical "species" column.
iris_df["species"] = pd.Categorical.from_codes(iris.target, iris.target_names)

In [3]:
# Show the prepared frame as the cell output (the rendered output below is
# truncated with an ellipsis row, not the complete table).
iris_df

Unnamed: 0,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [4]:
# Simplest query: every column of iris_df, capped at 10 rows.
# At cell top level locals() is the notebook namespace; it is handed to sqldf
# as the environment argument (presumably how the table name `iris_df` is
# resolved to the DataFrame - confirm against the mypandas docs).
sqldf("SELECT * FROM iris_df LIMIT 10;", locals())

Unnamed: 0,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [5]:
# Column projection: only sepalwidthcm and species, capped at 10 rows.
sqldf("SELECT sepalwidthcm, species FROM iris_df LIMIT 10;", locals())

Unnamed: 0,sepalwidthcm,species
0,3.5,setosa
1,3.0,setosa
2,3.2,setosa
3,3.1,setosa
4,3.6,setosa
5,3.9,setosa
6,3.4,setosa
7,3.4,setosa
8,2.9,setosa
9,3.1,setosa


In [6]:
# Aggregation: average, minimum and maximum sepal width per species.
# The aggregate expressions are not aliased, so the result columns are named
# after the expressions themselves (see the output header).
query = """
      select
        species
        , avg(sepalwidthcm)
        , min(sepalwidthcm)
        , max(sepalwidthcm)
      from
        iris_df
      group by
        species;

"""
sqldf(query, locals())

Unnamed: 0,species,avg(sepalwidthcm),min(sepalwidthcm),max(sepalwidthcm)
0,setosa,3.428,2.3,4.4
1,versicolor,2.77,2.0,3.4
2,virginica,2.974,2.2,3.8


In [7]:
def pysqldf(query):
    """Run ``query`` through ``sqldf`` using this module's global namespace.

    Convenience wrapper so callers do not have to pass the environment
    argument on every call.

    Parameters
    ----------
    query : str
        SQL text, forwarded to ``sqldf`` unchanged.

    Returns
    -------
    The value returned by ``sqldf(query, globals())``.
    """
    namespace = globals()
    return sqldf(query, namespace)

In [8]:
# Re-run the per-species aggregate through the helper. This relies on `query`
# still holding the string assigned in the earlier group-by cell, so the
# cells must be executed in order.
pysqldf(query)

Unnamed: 0,species,avg(sepalwidthcm),min(sepalwidthcm),max(sepalwidthcm)
0,setosa,3.428,2.3,4.4
1,versicolor,2.77,2.0,3.4
2,virginica,2.974,2.2,3.8


In [9]:
# Self-join: pair every row of iris_df (alias a) with every row of the same
# species (alias b) and return a's columns, capped at 10 rows.
# Each `a` row appears once per matching `b` row, which is why the output
# shows the same row repeated.
# NOTE(review): there is no ORDER BY, so which 10 rows come back is up to the
# SQL engine - add one if a stable result matters.
query = """
    select
        a.*
    from
        iris_df a
    inner join
        iris_df b
            on a.species = b.species
    limit 10;
"""
pysqldf(query)

Unnamed: 0,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm,species
0,5.1,3.5,1.4,0.2,setosa
1,5.1,3.5,1.4,0.2,setosa
2,5.1,3.5,1.4,0.2,setosa
3,5.1,3.5,1.4,0.2,setosa
4,5.1,3.5,1.4,0.2,setosa
5,5.1,3.5,1.4,0.2,setosa
6,5.1,3.5,1.4,0.2,setosa
7,5.1,3.5,1.4,0.2,setosa
8,5.1,3.5,1.4,0.2,setosa
9,5.1,3.5,1.4,0.2,setosa


In [10]:
# Row filter: virginica rows whose sepal length is strictly greater than 7.7.
query = """
    select
        *
    from
        iris_df
    where
        species = 'virginica'
        and sepallengthcm > 7.7;
"""
pysqldf(query)

Unnamed: 0,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm,species
0,7.9,3.8,6.4,2.0,virginica


In [11]:
# Add a 0..len-1 integer `id` column, used as the key in the subquery below.
# This mutates iris_df in place, so frames displayed by earlier cells do not
# include this column.
iris_df["id"] = range(len(iris_df))

In [12]:
# Subquery with IN: the inner select collects the ids of rows whose
# sepal width * sepal length exceeds 25; the outer select returns the full
# rows for those ids. Requires the `id` column added in the previous cell.
query = """
    select
        *
    from
        iris_df
    where
        id in (select id from iris_df where sepalwidthcm*sepallengthcm > 25);
"""
pysqldf(query)

Unnamed: 0,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm,species,id
0,5.7,4.4,1.5,0.4,setosa,15
1,7.2,3.6,6.1,2.5,virginica,109
2,7.7,3.8,6.7,2.2,virginica,117
3,7.9,3.8,6.4,2.0,virginica,131


In [13]:
# Join across two different tables: every meat column plus the births value
# for the same date, ordered by date. Only dates present in both tables are
# returned (inner join). .head() keeps just the first rows for display.
query = """
    SELECT
        m.*
        , b.births
    FROM
        meat m
    INNER JOIN
        births b
            on m.date = b.date
    ORDER BY
        m.date;
"""
pysqldf(query).head()

Unnamed: 0,date,beef,veal,pork,lamb_and_mutton,broilers,other_chicken,turkey,births
0,1975-01-01 00:00:00.000000,2106.0,59.0,1114.0,36.0,646.2,,64.9,265775
1,1975-02-01 00:00:00.000000,1845.0,50.0,954.0,31.0,570.2,,47.1,241045
2,1975-03-01 00:00:00.000000,1891.0,57.0,976.0,35.0,616.6,,54.4,268849
3,1975-04-01 00:00:00.000000,1895.0,60.0,1100.0,34.0,688.3,,68.7,247455
4,1975-05-01 00:00:00.000000,1849.0,59.0,934.0,31.0,690.1,,81.9,254545
