In [11]:
import psycopg2
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [15]:
DB_USER ="postgres"
DB_PASSWORD = "123456"
DB_NAME="dvdrental"
DB_PORT ="localhost"
# postgresql://username:password@host:port/database
conn_string="postgresql://{}:{}@{}/{}"\
                            .format(DB_USER, DB_PASSWORD, DB_PORT, DB_NAME)

In [16]:
%sql $conn_string

# STEP2: Explore the 3NF Schema

### 2.1 How much? What data sizes are we looking at?

In [18]:
nStores = %sql select count(*) from store;
nFilms = %sql select count(*) from film;
nCustomers = %sql select count(*) from customer;
nRentals = %sql select count(*) from rental;
nPayment = %sql select count(*) from payment;
nStaff = %sql select count(*) from staff;
nCity = %sql select count(*) from city;
nCountry = %sql select count(*) from country;

 * postgresql://postgres:***@localhost/dvdrental
1 rows affected.
 * postgresql://postgres:***@localhost/dvdrental
1 rows affected.
 * postgresql://postgres:***@localhost/dvdrental
1 rows affected.
 * postgresql://postgres:***@localhost/dvdrental
1 rows affected.
 * postgresql://postgres:***@localhost/dvdrental
1 rows affected.
 * postgresql://postgres:***@localhost/dvdrental
1 rows affected.
 * postgresql://postgres:***@localhost/dvdrental
1 rows affected.
 * postgresql://postgres:***@localhost/dvdrental
1 rows affected.


In [24]:
print("nFilms\t\t=", nFilms[0][0])
print("nCustomers\t=", nCustomers[0][0])
print("nRentals\t=", nRentals[0][0])
print("nPayment\t=", nPayment[0][0])
print("nStaff\t\t=", nStaff[0][0])
print("nStores\t\t=", nStores[0][0])
print("nCities\t\t=", nCity[0][0])
print("nCountry\t=", nCountry[0][0])

nFilms		= 1000
nCustomers	= 599
nRentals	= 16044
nPayment	= 14596
nStaff		= 2
nStores		= 2
nCities		= 600
nCountry	= 109


### 2.2 When? What time period are we talking about?

In [26]:
%%sql
select min(payment_date) as start, max(payment_date) as end from payment;

 * postgresql://postgres:***@localhost/dvdrental
1 rows affected.


start,end
2007-02-14 21:21:59.996577,2007-05-14 13:44:29.996577


### 2.3 Where? Where do events in this databse occur?

In [28]:
%%sql
select district, sum(city_id) as n
from address
group by district
order by n desc
limit 10;

 * postgresql://postgres:***@localhost/dvdrental
10 rows affected.


district,n
Shandong,3237
England,2974
So Paulo,2952
West Bengali,2623
Buenos Aires,2572
Uttar Pradesh,2462
California,2444
Southern Tagalog,1931
Tamil Nadu,1807
Hubei,1790


# STEP3: Perform some simple data analysis

## 3.1 Insight 1: Top Grossing Movies
- Payments amounts are in table payment
- Movies are in table film
- They are not directly linked, payment refers to a rental, rental refers to an inventory item and inventory item refers to a film
- payment → rental → inventory → film

In [31]:
%%sql
SELECT title, sum(p.amount) as revenue             
FROM payment p 
JOIN rental r ON (p.rental_id = r.rental_id)
JOIN inventory i ON (r.inventory_id = i.inventory_id)
JOIN film f on (i.film_id = f.film_id)
group by title
order by revenue desc
limit 10

 * postgresql://postgres:***@localhost/dvdrental
10 rows affected.


title,revenue
Telegraph Voyage,215.75
Zorro Ark,199.72
Wife Turn,198.73
Innocent Usual,191.74
Hustler Party,190.78
Saturday Lambs,190.74
Titans Jerk,186.73
Harry Idaho,177.73
Torque Bound,169.76
Dogma Family,168.72


## 3.2 Insight 2: Top grossing cities
- Payments amounts are tin table payment
- Cities are table cities
- payment -> customer -> address -> city

In [33]:
%%sql
SELECT city.city, sum(p.amount) as amount
FROM payment as p
JOIN customer as c
ON p.customer_id = c.customer_id
JOIN address as a
ON a.address_id = c.address_id
JOIN city
ON city.city_id = a.city_id
GROUP BY city
ORDER BY amount DESC
LIMIT 10

 * postgresql://postgres:***@localhost/dvdrental
10 rows affected.


city,amount
Saint-Denis,211.55
Cape Coral,208.58
Santa Brbara dOeste,194.61
Apeldoorn,191.62
Molodetno,189.6
Qomsheh,183.63
London,174.54
Memphis,167.67
Richmond Hill,167.62
Tanza,166.61


## 3.3 Insight 3: Revenue of a movie by customer city and by month

In [34]:
%%sql
SELECT film.title,city.city, sum(payment.amount) AS revenue, 
extract(MONTH FROM payment.payment_date) as month
FROM payment
JOIN customer
ON customer.customer_id = payment.customer_id
JOIN address
ON customer.address_id = address.address_id
JOIN city 
ON address.city_id = city.city_id
JOIN rental
ON payment.rental_id = rental.rental_id
JOIN inventory
ON inventory.inventory_id = rental.inventory_id
JOIN film
ON film.film_id = inventory.film_id
GROUP BY film.title, city.city, payment.payment_date
ORDER BY month, revenue DESC
LIMIT 10

 * postgresql://postgres:***@localhost/dvdrental
10 rows affected.


title,city,revenue,month
Saturday Lambs,Wroclaw,10.99,2
Telegraph Voyage,Datong,10.99,2
Mine Titans,Plock,10.99,2
Satisfaction Confidential,Suihua,10.99,2
Doors President,Zhoushan,10.99,2
Stranger Strangers,Czestochowa,10.99,2
Flintstones Happiness,Alessandria,10.99,2
Autumn Crow,Stockport,10.99,2
Behavior Runaway,Battambang,10.99,2
Telegraph Voyage,Pangkal Pinang,10.99,2
