In [1]:
import pandas as pd
from sqlalchemy import create_engine, inspect, text

You cannot simply run these statements through `engine.execute()`: PostgreSQL does not allow databases to be created or dropped inside a transaction block, and SQLAlchemy runs every query in a transaction by default. To get around this, take a connection from the engine and make sure no transaction is open when the statement runs:

In [2]:
def drop_db():
    """Recreate the ``datacamp_countries`` database from scratch.

    PostgreSQL rejects ``DROP DATABASE`` / ``CREATE DATABASE`` inside a
    transaction block, so the engine is created with the AUTOCOMMIT isolation
    level and each statement runs outside any transaction.

    A failure while executing either statement is reported on stdout together
    with the underlying exception instead of being raised. Failing to connect
    in the first place still raises.
    """
    # With no database name in the URL, the database name defaults to the username.
    engine = create_engine(
        'postgresql://root:root@localhost:5432/',
        isolation_level='AUTOCOMMIT',
    )
    try:
        connection = engine.connect()
        try:
            connection.execute(text('DROP DATABASE IF EXISTS datacamp_countries'))
            # PostgreSQL has no CREATE DATABASE IF NOT EXISTS, hence drop-then-create.
            connection.execute(text('CREATE DATABASE datacamp_countries'))
        except Exception as exc:
            # Keep the best-effort behaviour, but show what actually failed.
            print('Something went wrong:', repr(exc))
        finally:
            connection.close()
    finally:
        # The engine is only needed here; release its pooled connections.
        engine.dispose()


drop_db()

In [3]:
# Load the schema and data; `&&` keeps psql from running if the cd fails.
!cd datasets/countries && psql postgresql://root:root@localhost:5432/datacamp_countries < ./countries.sql

CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
CREATE TABLE
COPY 236
COPY 206
COPY 955
COPY 380
COPY 190
COPY 190
COPY 224
COPY 434
COPY 206


In [4]:
# Engine for the freshly loaded datacamp_countries database.
engine = create_engine(
    'postgresql://root:root@localhost:5432/datacamp_countries'
)

In [5]:
# Engine.table_names() is deprecated (removed in SQLAlchemy 2.0); the inspector is the supported API.
print(inspect(engine).get_table_names())

['cities', 'countries', 'languages', 'economies', 'currencies', 'populations', 'countries_plus', 'economies2010', 'economies2015']


In [6]:
# Load the `sql` IPython extension, which provides the %sql / %%sql magics used below.
%load_ext sql
# Connect the magics to the datacamp_countries database.
%sql postgresql://root:root@localhost:5432/datacamp_countries

In [7]:
%%sql
-- Preview the populations table.
SELECT *
FROM populations
LIMIT 5;

 * postgresql://root:***@localhost:5432/datacamp_countries
5 rows affected.


pop_id,country_code,year,fertility_rate,life_expectancy,size
20,ABW,2010,1.704,74.9535,101597.0
19,ABW,2015,1.647,75.5736,103889.0
2,AFG,2010,5.746,58.9708,27962200.0
1,AFG,2015,4.653,60.7172,32526600.0
12,AGO,2010,6.416,50.6542,21220000.0


# Window functions
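A few considerations:
1. They are processed after the rest of the query, except for the final `ORDER BY` (and `LIMIT`) clause.
2. They operate on the rows of the result set rather than on the underlying tables directly.
3. They are available in PostgreSQL, MySQL (8.0+), Oracle, ..., and SQLite (3.25+), but not in older MySQL or SQLite versions.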

In [8]:
%%sql
-- Compare each country's 2010 population with the 2010 average across all countries,
-- computing the average with a scalar subquery.
SELECT
    p.country_code,
    p.size AS population,
    (
        SELECT AVG(size)
        FROM populations
        WHERE year = 2010
    ) AS population_avg
FROM populations AS p
WHERE p.year = 2010
ORDER BY population DESC
LIMIT 5;

 * postgresql://root:***@localhost:5432/datacamp_countries
5 rows affected.


country_code,population,population_avg
CHN,1337700000.0,31801329.0230415
IND,1230980000.0,31801329.0230415
USA,309347000.0,31801329.0230415
IDN,241613000.0,31801329.0230415
BRA,198614000.0,31801329.0230415


In [9]:
%%sql
-- Same comparison using a window function: an empty OVER () makes the
-- window span every row that survives the WHERE filter.
SELECT
    p.country_code,
    p.size AS population,
    AVG(p.size) OVER () AS population_avg
FROM populations AS p
WHERE p.year = 2010
ORDER BY population DESC
LIMIT 5;

 * postgresql://root:***@localhost:5432/datacamp_countries
5 rows affected.


country_code,population,population_avg
CHN,1337700000.0,31801329.0230415
IND,1230980000.0,31801329.0230415
USA,309347000.0,31801329.0230415
IDN,241613000.0,31801329.0230415
BRA,198614000.0,31801329.0230415


## RANK()

In [10]:
%%sql
-- Rank every (country, year) row by population, counting a NULL size as 0.
-- The window orders ascending, so the largest population receives the highest rank number.
SELECT
    p.country_code,
    p.size AS population,
    RANK() OVER (
        ORDER BY CASE WHEN p.size IS NOT NULL THEN p.size ELSE 0 END
    ) AS population_rank
FROM populations AS p
ORDER BY population_rank DESC
LIMIT 5;

 * postgresql://root:***@localhost:5432/datacamp_countries
5 rows affected.


country_code,population,population_rank
CHN,1371220000.0,434
CHN,1337700000.0,433
IND,1311050000.0,432
IND,1230980000.0,431
USA,321419000.0,430


## PARTITION BY

In [11]:
%%sql
-- Show each row next to the average population of its own year:
-- PARTITION BY year computes the average separately per year.
SELECT
    p.country_code,
    p.year,
    p.size AS population,
    AVG(p.size) OVER (PARTITION BY p.year) AS population_avg_by_year
FROM populations AS p
WHERE p.size IS NOT NULL
ORDER BY population DESC
LIMIT 6;

 * postgresql://root:***@localhost:5432/datacamp_countries
6 rows affected.


country_code,year,population,population_avg_by_year
CHN,2015,1371220000.0,33879703.875
CHN,2010,1337700000.0,31801329.0230415
IND,2015,1311050000.0,33879703.875
IND,2010,1230980000.0,31801329.0230415
USA,2015,321419000.0,33879703.875
USA,2010,309347000.0,31801329.0230415


## Sliding Windows
Syntax
```sql
ROWS BETWEEN <start> AND <finish>

-- <start> and <finish> are each one of:
n PRECEDING
n FOLLOWING
UNBOUNDED PRECEDING
UNBOUNDED FOLLOWING
CURRENT ROW
```

In [12]:
%%sql
-- Preview the economies table.
SELECT *
FROM economies
LIMIT 5;

 * postgresql://root:***@localhost:5432/datacamp_countries
5 rows affected.


econ_id,code,year,income_group,gdp_percapita,gross_savings,inflation_rate,total_investment,unemployment_rate,exports,imports
1,AFG,2010,Low income,539.667,37.133,2.179,30.402,,46.394,24.381
2,AFG,2015,Low income,615.091,21.466,-1.549,18.602,,-49.11,-7.294
3,AGO,2010,Upper middle income,3599.27,23.534,14.48,14.433,,-3.266,-21.076
4,AGO,2015,Upper middle income,3876.2,-0.425,10.287,9.552,,6.721,-21.778
5,ALB,2010,Upper middle income,4098.13,20.011,3.605,31.305,14.0,10.645,-8.013


In [24]:
%%sql
-- Running total of inflation_rate over all rows, ordered by year.
-- Not a meaningful statistic; it only illustrates a sliding window frame.
-- NOTE: several rows share the same year and there is no final ORDER BY, so the
-- row order (and which five rows are returned) is not guaranteed.
SELECT
    e.code,
    e.inflation_rate,
    SUM(e.inflation_rate) OVER (
        ORDER BY e.year
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS running_total
FROM economies AS e
LIMIT 5;

 * postgresql://root:***@localhost:5432/datacamp_countries
5 rows affected.


code,inflation_rate,running_total
LBN,3.983,3.983
ECU,3.552,7.535
LBR,7.291,14.826
ALB,3.605,18.431
LBY,2.458,20.889
