# Join: `dvd_rentals.rental` and `dvd_rentals.inventory`

## Purpose

In [None]:
-- Purpose: list every rental record belonging to customer_id 130.
SELECT rental.*
FROM dvd_rentals.rental AS rental
WHERE rental.customer_id = 130;

## Hypotheses

In [1]:
-- Hypothesis 1: `rental` and `inventory` contain the same number of distinct
-- inventory_id values.

-- Distinct inventory_id values referenced by rental records.
SELECT COUNT(DISTINCT r.inventory_id)
FROM dvd_rentals.rental AS r;

-- Distinct inventory_id values present in the inventory table.
SELECT COUNT(DISTINCT i.inventory_id)
FROM dvd_rentals.inventory AS i;

-- Verdict: rejected (the two recorded counts differ: 4580 vs 4581).

count
4580


count
4581


In [6]:
-- 2.0
-- H2: raw number of rental records behind each inventory_id, smallest first.
SELECT
    r.inventory_id AS target_column_values,
    COUNT(*) AS record_counts
FROM dvd_rentals.rental AS r
GROUP BY r.inventory_id
ORDER BY record_counts ASC;

/* Not much insight: a per-id listing only shows (a) the extreme record counts
   for a unique inventory_id and/or (b) which inventory_id values rank at the
   ends of the sort, not how the counts are distributed. */

target_column_values,record_counts
2662,1
3372,1
2786,1
1580,1
160,2
700,2
2400,2
3031,2
791,2
4239,2


In [8]:
-- 2.1
-- H2.1: each unique inventory_id appears in multiple dvd_rentals.rental records.

WITH rentals_per_inventory AS (
    -- One row per inventory_id together with its number of rental records.
    SELECT
        r.inventory_id AS target_column_values,
        COUNT(*) AS record_counts
    FROM dvd_rentals.rental AS r
    GROUP BY r.inventory_id
)

-- For every observed record count, how many inventory_id values have it.
SELECT
    rpi.record_counts,
    COUNT(rpi.target_column_values) AS count_of_target_values
FROM rentals_per_inventory AS rpi
GROUP BY rpi.record_counts
ORDER BY rpi.record_counts;


record_counts,count_of_target_values
1,4
2,1126
3,1151
4,1160
5,1139


In [7]:
-- H3: a unique film_id maps to multiple inventory_id records in
-- dvd_rentals.inventory (raw per-film listing, smallest counts first).
SELECT
    i.film_id AS target_column_values,
    COUNT(DISTINCT i.inventory_id) AS row_counts
FROM dvd_rentals.inventory AS i
GROUP BY i.film_id
ORDER BY row_counts ASC;

target_column_values,row_counts
719,2
454,2
517,2
425,2
721,2
422,2
297,2
289,2
903,2
279,2


In [9]:
-- H3: a unique film_id maps to multiple inventory_id records in
-- dvd_rentals.inventory (summarised as a distribution).
WITH copies_per_film AS (
    -- One row per film_id with its number of distinct inventory_id values.
    SELECT
        i.film_id AS target_column_values,
        COUNT(DISTINCT i.inventory_id) AS unique_record_counts
    FROM dvd_rentals.inventory AS i
    GROUP BY i.film_id
)

-- For every observed copy count, how many film_id values have it.
SELECT
    cpf.unique_record_counts,
    COUNT(cpf.target_column_values) AS count_of_target_values
FROM copies_per_film AS cpf
GROUP BY cpf.unique_record_counts
ORDER BY cpf.unique_record_counts;

unique_record_counts,count_of_target_values
2,133
3,131
4,183
5,136
6,187
7,116
8,72


## Returning to the 2 Key Questions

### 1. How many records exist per `inventory_id` in the `rental` or `inventory` tables?

`rental` distribution analysis on `inventory_id` foreign key

In [10]:
-- Distribution of rental rows per inventory_id foreign key value.
WITH rows_per_key AS (
    -- One row per inventory_id with the number of rental rows that carry it.
    SELECT
        r.inventory_id AS foreign_key_values,
        COUNT(*) AS row_counts
    FROM dvd_rentals.rental AS r
    GROUP BY r.inventory_id
)

SELECT
    rpk.row_counts,
    COUNT(rpk.foreign_key_values) AS count_of_foreign_keys
FROM rows_per_key AS rpk
GROUP BY rpk.row_counts
ORDER BY rpk.row_counts;

-- Finding: 1-to-many relationship (one inventory_id can have several rental rows).

row_counts,count_of_foreign_keys
1,4
2,1126
3,1151
4,1160
5,1139


`inventory` distribution analysis on `inventory_id` foreign key

In [14]:
-- Distribution of inventory rows per inventory_id value.
WITH rows_per_key AS (
    -- One row per inventory_id with the number of inventory rows that carry it.
    SELECT
        i.inventory_id AS foreign_key_values,
        COUNT(*) AS row_counts
    FROM dvd_rentals.inventory AS i
    GROUP BY i.inventory_id
)

SELECT
    rpk.row_counts,
    COUNT(rpk.foreign_key_values) AS count_of_foreign_keys
FROM rows_per_key AS rpk
GROUP BY rpk.row_counts
ORDER BY rpk.row_counts;

-- Finding: 1-to-1 relationship (each inventory_id has a single inventory row).

-- Confirmation: sort descending so the largest record_count values surface first.
SELECT
    i.inventory_id,
    COUNT(*) AS record_count
FROM dvd_rentals.inventory AS i
GROUP BY i.inventory_id
ORDER BY record_count DESC
LIMIT 5;
-- 1-to-1 confirmed: even the largest record_count is 1.

row_counts,count_of_foreign_keys
1,4581


inventory_id,record_count
273,1
3936,1
2574,1
951,1
1489,1


### 2. How many overlapping and missing unique `foreign key` values are there between two tables?

In [15]:
-- Foreign key values present in the left table (rental) but missing from the
-- right table (inventory).
SELECT COUNT(DISTINCT r.inventory_id)
FROM dvd_rentals.rental AS r
WHERE NOT EXISTS (
    SELECT 1
    FROM dvd_rentals.inventory AS i
    WHERE i.inventory_id = r.inventory_id
);

count
0


In [17]:
-- Foreign key values present in the right table (inventory) but missing from
-- the left table (rental).
SELECT COUNT(DISTINCT i.inventory_id)
FROM dvd_rentals.inventory AS i
WHERE NOT EXISTS (
    SELECT 1
    FROM dvd_rentals.rental AS r
    WHERE r.inventory_id = i.inventory_id
);

count
1


In [18]:
-- Inspect the inventory rows whose inventory_id never appears in rental,
-- i.e. the foreign key values that exist only in the right table.
SELECT i.*
FROM dvd_rentals.inventory AS i
WHERE NOT EXISTS (
    SELECT 1
    FROM dvd_rentals.rental AS r
    WHERE r.inventory_id = i.inventory_id
);

inventory_id,film_id,store_id,last_update
5,1,2,2006-02-15T05:09:17


In [19]:
-- Overlap: distinct inventory_id values that appear in both rental and inventory.
SELECT COUNT(DISTINCT r.inventory_id)
FROM dvd_rentals.rental AS r
WHERE EXISTS (
    SELECT 1
    FROM dvd_rentals.inventory AS i
    WHERE i.inventory_id = r.inventory_id
);

count
4580


## Implementing Joins 

In [20]:
-- Materialise the rental -> inventory join twice, once as a LEFT JOIN and once
-- as an INNER JOIN, so the two outputs can be compared.
DROP TABLE IF EXISTS left_rental_join;
CREATE TEMP TABLE left_rental_join AS
SELECT
    rental.customer_id,
    rental.inventory_id,
    inventory.film_id
FROM dvd_rentals.rental
LEFT JOIN dvd_rentals.inventory
    ON rental.inventory_id = inventory.inventory_id;

DROP TABLE IF EXISTS inner_rental_join;
CREATE TEMP TABLE inner_rental_join AS
SELECT
    rental.customer_id,
    rental.inventory_id,
    inventory.film_id
FROM dvd_rentals.rental
INNER JOIN dvd_rentals.inventory
    ON rental.inventory_id = inventory.inventory_id;

-- Check row counts and distinct key counts for each output.
-- Parentheses are used only to keep each SELECT visually separate.
-- UNION ALL is used because the join_type literal already makes the two rows
-- distinct, so the de-duplication done by a plain UNION is unnecessary.
-- ORDER BY pins the row order, which is otherwise undefined.
(
    SELECT
        'left join' AS join_type,
        COUNT(*) AS record_count,
        COUNT(DISTINCT inventory_id) AS unique_key_values
    FROM left_rental_join
)
UNION ALL
(
    SELECT
        'inner join' AS join_type,
        COUNT(*) AS record_count,
        COUNT(DISTINCT inventory_id) AS unique_key_values
    FROM inner_rental_join
)
ORDER BY join_type;

join_type,record_count,unique_key_values
inner join,16044,4580
left join,16044,4580


# Part 2 

In [2]:
-- H1: 1-to-many between film_id and rows in the dvd_rentals.inventory table.

WITH rows_per_film AS (
    -- One row per film_id with its number of inventory rows.
    SELECT
        i.film_id AS film_id_values,
        COUNT(*) AS row_count
    FROM dvd_rentals.inventory AS i
    GROUP BY i.film_id
)

-- For every observed row count, how many distinct film_id values have it.
SELECT
    rpf.row_count,
    COUNT(DISTINCT rpf.film_id_values) AS unique_film_id_values
FROM rows_per_film AS rpf
GROUP BY rpf.row_count
ORDER BY rpf.row_count;

row_count,unique_film_id_values
2,133
3,131
4,183
5,136
6,187
7,116
8,72


In [3]:
-- H2: 1-to-1 between film_id and rows in the `film` table.
-- Sorting descending surfaces the largest per-film row counts first.
SELECT
    f.film_id,
    COUNT(*) AS record_count
FROM dvd_rentals.film AS f
GROUP BY f.film_id
ORDER BY record_count DESC
LIMIT 5;


film_id,record_count
273,1
51,1
951,1
839,1
652,1


In [4]:
-- Distribution of `film_id` values within each table -- cf. above 

In [5]:
-- Overlap analysis between inventory (left) and film (right) on film_id.
-- Only in left: film_id values in inventory with no matching film row.
SELECT COUNT(DISTINCT i.film_id)
FROM dvd_rentals.inventory AS i
WHERE NOT EXISTS (
    SELECT 1
    FROM dvd_rentals.film AS f
    WHERE f.film_id = i.film_id
);

count
0


In [8]:
-- Only in right: film_id values in film with no matching inventory row.
SELECT COUNT(DISTINCT f.film_id)
FROM dvd_rentals.film AS f
WHERE NOT EXISTS (
    SELECT 1
    FROM dvd_rentals.inventory AS i
    WHERE i.film_id = f.film_id
);

count
42


In [9]:
-- Only in right: inspect the full film rows that have no matching inventory row.
SELECT f.*
FROM dvd_rentals.film AS f
WHERE NOT EXISTS (
    SELECT 1
    FROM dvd_rentals.inventory AS i
    WHERE i.film_id = f.film_id
);

-- Open question: are these 42 films without any inventory?

film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,last_update,special_features,fulltext
14,ALICE FANTASIA,A Emotional Drama of a A Shark And a Database Administrator who must Vanquish a Pioneer in Soviet Georgia,2006,1,,6,0.99,94,23.99,NC-17,2006-02-15T05:03:42,"[""Trailers"", ""Deleted Scenes"", ""Behind the Scenes""]",'administr':13 'alic':1 'databas':12 'drama':5 'emot':4 'fantasia':2 'georgia':21 'must':15 'pioneer':18 'shark':9 'soviet':20 'vanquish':16
33,APOLLO TEEN,A Action-Packed Reflection of a Crocodile And a Explorer who must Find a Sumo Wrestler in An Abandoned Mine Shaft,2006,1,,5,2.99,153,15.99,PG-13,2006-02-15T05:03:42,"[""Trailers"", ""Commentaries"", ""Deleted Scenes"", ""Behind the Scenes""]",'abandon':22 'action':5 'action-pack':4 'apollo':1 'crocodil':10 'explor':13 'find':16 'mine':23 'must':15 'pack':6 'reflect':7 'shaft':24 'sumo':18 'teen':2 'wrestler':19
36,ARGONAUTS TOWN,A Emotional Epistle of a Forensic Psychologist And a Butler who must Challenge a Waitress in An Abandoned Mine Shaft,2006,1,,7,0.99,127,12.99,PG-13,2006-02-15T05:03:42,"[""Trailers"", ""Commentaries""]",'abandon':20 'argonaut':1 'butler':12 'challeng':15 'emot':4 'epistl':5 'forens':8 'mine':21 'must':14 'psychologist':9 'shaft':22 'town':2 'waitress':17
38,ARK RIDGEMONT,A Beautiful Yarn of a Pioneer And a Monkey who must Pursue a Explorer in The Sahara Desert,2006,1,,6,0.99,68,25.99,NC-17,2006-02-15T05:03:42,"[""Trailers"", ""Commentaries"", ""Deleted Scenes"", ""Behind the Scenes""]",'ark':1 'beauti':4 'desert':20 'explor':16 'monkey':11 'must':13 'pioneer':8 'pursu':14 'ridgemont':2 'sahara':19 'yarn':5
41,ARSENIC INDEPENDENCE,A Fanciful Documentary of a Mad Cow And a Womanizer who must Find a Dentist in Berlin,2006,1,,4,0.99,137,17.99,PG,2006-02-15T05:03:42,"[""Trailers"", ""Deleted Scenes"", ""Behind the Scenes""]",'arsenic':1 'berlin':19 'cow':9 'dentist':17 'documentari':5 'fanci':4 'find':15 'independ':2 'mad':8 'must':14 'woman':12
87,BOONDOCK BALLROOM,A Fateful Panorama of a Crocodile And a Boy who must Defeat a Monkey in The Gulf of Mexico,2006,1,,7,0.99,76,14.99,NC-17,2006-02-15T05:03:42,"[""Behind the Scenes""]",'ballroom':2 'boondock':1 'boy':11 'crocodil':8 'defeat':14 'fate':4 'gulf':19 'mexico':21 'monkey':16 'must':13 'panorama':5
108,BUTCH PANTHER,A Lacklusture Yarn of a Feminist And a Database Administrator who must Face a Hunter in New Orleans,2006,1,,6,0.99,67,19.99,PG-13,2006-02-15T05:03:42,"[""Trailers"", ""Commentaries"", ""Deleted Scenes""]",'administr':12 'butch':1 'databas':11 'face':15 'feminist':8 'hunter':17 'lacklustur':4 'must':14 'new':19 'orlean':20 'panther':2 'yarn':5
128,CATCH AMISTAD,A Boring Reflection of a Lumberjack And a Feminist who must Discover a Woman in Nigeria,2006,1,,7,0.99,183,10.99,G,2006-02-15T05:03:42,"[""Trailers"", ""Behind the Scenes""]",'amistad':2 'bore':4 'catch':1 'discov':14 'feminist':11 'lumberjack':8 'must':13 'nigeria':18 'reflect':5 'woman':16
144,CHINATOWN GLADIATOR,A Brilliant Panorama of a Technical Writer And a Lumberjack who must Escape a Butler in Ancient India,2006,1,,7,4.99,61,24.99,PG,2006-02-15T05:03:42,"[""Trailers"", ""Commentaries"", ""Deleted Scenes""]",'ancient':19 'brilliant':4 'butler':17 'chinatown':1 'escap':15 'gladiat':2 'india':20 'lumberjack':12 'must':14 'panorama':5 'technic':8 'writer':9
148,CHOCOLATE DUCK,A Unbelieveable Story of a Mad Scientist And a Technical Writer who must Discover a Composer in Ancient China,2006,1,,3,2.99,132,13.99,R,2006-02-15T05:03:42,"[""Trailers"", ""Commentaries"", ""Behind the Scenes""]",'ancient':20 'china':21 'chocol':1 'compos':18 'discov':16 'duck':2 'mad':8 'must':15 'scientist':9 'stori':5 'technic':12 'unbeliev':4 'writer':13


In [11]:
-- Overlap: distinct film_id foreign key values shared by the `inventory` and
-- `film` tables.
SELECT COUNT(DISTINCT i.film_id)
FROM dvd_rentals.inventory AS i
WHERE EXISTS (
    SELECT 1
    FROM dvd_rentals.film AS f
    WHERE f.film_id = i.film_id
);


count
958


In [12]:
-- Join implementation and output count comparison: INNER vs. LEFT join of
-- inventory to film on film_id.
DROP TABLE IF EXISTS inner_join_pt_2;
CREATE TEMP TABLE inner_join_pt_2 AS
SELECT
    inventory.inventory_id,
    inventory.film_id,
    film.title
FROM dvd_rentals.inventory
INNER JOIN dvd_rentals.film
    ON inventory.film_id = film.film_id;

DROP TABLE IF EXISTS left_join_pt_2;
CREATE TEMP TABLE left_join_pt_2 AS
SELECT
    inventory.inventory_id,
    inventory.film_id,
    film.title
FROM dvd_rentals.inventory
LEFT JOIN dvd_rentals.film
    ON inventory.film_id = film.film_id;

-- Compare row counts and distinct join-key counts of the two outputs.
-- The distinct count is taken over film_id (the join key of this part), so the
-- column is labelled unique_film_ids; the earlier unique_inventory_ids label
-- did not match the column being counted.
-- UNION ALL is used because the join_type literal already makes the two rows
-- distinct; ORDER BY pins the row order, which is otherwise undefined.
(
    SELECT
        'inner join' AS join_type,
        COUNT(*) AS record_count,
        COUNT(DISTINCT film_id) AS unique_film_ids
    FROM inner_join_pt_2
)
UNION ALL
(
    SELECT
        'left join' AS join_type,
        COUNT(*) AS record_count,
        COUNT(DISTINCT film_id) AS unique_film_ids
    FROM left_join_pt_2
)
ORDER BY join_type;


join_type,record_count,unique_inventory_ids
inner join,4581,958
left join,4581,958


# Joining Parts 1 and 2 

In [13]:
-- Chain parts 1 and 2: rental -> inventory (on inventory_id) -> film (on film_id).
DROP TABLE IF EXISTS join_parts_1_and_2;
CREATE TEMPORARY TABLE join_parts_1_and_2 AS
SELECT
    r.customer_id,
    i.film_id,
    f.title
FROM dvd_rentals.rental AS r
INNER JOIN dvd_rentals.inventory AS i
    ON r.inventory_id = i.inventory_id
INNER JOIN dvd_rentals.film AS f
    ON i.film_id = f.film_id;

-- Preview a few rows of the combined dataset.
SELECT
    customer_id,
    film_id,
    title
FROM join_parts_1_and_2
LIMIT 10;


customer_id,film_id,title
130,80,BLANKET BEVERLY
459,333,FREAKY POCUS
408,373,GRADUATE LORD
333,535,LOVE SUICIDES
222,450,IDOLS SNATCHERS
549,613,MYSTIC TRUMAN
269,870,SWARM GOLD
239,510,LAWLESS VISION
126,565,MATRIX SNOWMAN
399,396,HANGING DEEP


# Parts 3 & 4

In [14]:
-- `film_id` distribution on `dvd_rentals.film`: 1-to-1 (already explored).

-- `film_id` distribution on `dvd_rentals.film_category`: expected 1-to-1.
-- A unique film_id should have only one category, whereas a unique
-- category_id should have multiple rows in film_category.
SELECT
    fc.film_id,
    COUNT(*) AS row_count
FROM dvd_rentals.film_category AS fc
GROUP BY fc.film_id
ORDER BY row_count DESC
LIMIT 5;

WITH rows_per_category AS (
    -- One row per category_id with its number of film_category rows.
    SELECT
        fc.category_id,
        COUNT(*) AS row_count
    FROM dvd_rentals.film_category AS fc
    GROUP BY fc.category_id
)

-- For every observed row count, how many distinct category_id values have it.
SELECT
    rpc.row_count,
    COUNT(DISTINCT rpc.category_id) AS no_of_unique_category_ids
FROM rows_per_category AS rpc
GROUP BY rpc.row_count
ORDER BY rpc.row_count DESC;



film_id,row_count
273,1
51,1
951,1
839,1
652,1


row_count,no_of_unique_category_ids
74,1
73,1
69,1
68,1
66,1
64,1
63,1
62,1
61,2
60,1


In [15]:
-- film_id values in film that have no matching film_category row.
SELECT COUNT(DISTINCT f.film_id)
FROM dvd_rentals.film AS f
WHERE NOT EXISTS (
    SELECT 1
    FROM dvd_rentals.film_category AS fc
    WHERE fc.film_id = f.film_id
);

-- film_id values in film_category that have no matching film row.
SELECT COUNT(DISTINCT fc.film_id)
FROM dvd_rentals.film_category AS fc
WHERE NOT EXISTS (
    SELECT 1
    FROM dvd_rentals.film AS f
    WHERE f.film_id = fc.film_id
);

count
0


count
0


In [16]:
-- Overlap: distinct film_id values present in both film and film_category.
SELECT COUNT(DISTINCT f.film_id)
FROM dvd_rentals.film AS f
WHERE EXISTS (
    SELECT 1
    FROM dvd_rentals.film_category AS fc
    WHERE fc.film_id = f.film_id
);

count
1000


In [17]:
-- Part 3 join implementation: film joined to film_category on film_id,
-- materialised as both an INNER and a LEFT join for comparison.
DROP TABLE IF EXISTS inner_join_pt_3;

CREATE TEMP TABLE inner_join_pt_3 AS
SELECT
    film.film_id,
    film_category.category_id
FROM dvd_rentals.film
INNER JOIN dvd_rentals.film_category
    ON film.film_id = film_category.film_id;


DROP TABLE IF EXISTS left_join_pt_3;

CREATE TEMP TABLE left_join_pt_3 AS
SELECT
    film.film_id,
    film_category.category_id
FROM dvd_rentals.film
LEFT JOIN dvd_rentals.film_category
    ON film.film_id = film_category.film_id;

-- Compare row counts and distinct film_id counts of the two outputs.
-- UNION ALL is used because the join_type literal already makes the two rows
-- distinct; ORDER BY pins the row order, which is otherwise undefined.
(
    SELECT
        'inner join' AS join_type,
        COUNT(*) AS row_record_count,
        COUNT(DISTINCT film_id) AS unique_foreign_value_count
    FROM inner_join_pt_3
)
UNION ALL
(
    SELECT
        'left join' AS join_type,
        COUNT(*) AS row_record_count,
        COUNT(DISTINCT film_id) AS unique_foreign_value_count
    FROM left_join_pt_3
)
ORDER BY join_type;

join_type,row_record_count,unique_foreign_value_count
inner join,1000,1000
left join,1000,1000


# Part 4

In [18]:
-- Distribution of film_category rows per category_id, smallest counts first.
WITH rows_per_category AS (
    -- One row per category_id with its number of film_category rows.
    SELECT
        fc.category_id,
        COUNT(*) AS row_count
    FROM dvd_rentals.film_category AS fc
    GROUP BY fc.category_id
)

-- For every observed row count, how many distinct category_id values have it.
SELECT
    rpc.row_count,
    COUNT(DISTINCT rpc.category_id) AS count_of_unique_cat_id
FROM rows_per_category AS rpc
GROUP BY rpc.row_count
ORDER BY rpc.row_count;

row_count,count_of_unique_cat_id
51,1
56,1
57,2
58,1
60,1
61,2
62,1
63,1
64,1
66,1


In [19]:
-- 1-to-1 check: number of rows per category_id in the category table,
-- largest counts first.
SELECT
    c.category_id,
    COUNT(*) AS record_count
FROM dvd_rentals.category AS c
GROUP BY c.category_id
ORDER BY record_count DESC
LIMIT 5;

category_id,record_count
10,1
6,1
13,1
2,1
4,1


In [20]:
-- Number of unique foreign key values found in only one of the two tables
-- (left = film_category, right = category).

-- In left but not right: category_id values in film_category with no
-- matching category row.
SELECT COUNT(DISTINCT fc.category_id)
FROM dvd_rentals.film_category AS fc
WHERE NOT EXISTS (
    SELECT 1
    FROM dvd_rentals.category AS c
    WHERE c.category_id = fc.category_id
);

-- In right but not left: category_id values in category with no matching
-- film_category row.
SELECT COUNT(DISTINCT c.category_id)
FROM dvd_rentals.category AS c
WHERE NOT EXISTS (
    SELECT 1
    FROM dvd_rentals.film_category AS fc
    WHERE fc.category_id = c.category_id
);

count
0


count
0


In [22]:
-- In both tables: distinct category_id values shared by film_category and category.
SELECT COUNT(DISTINCT fc.category_id)
FROM dvd_rentals.film_category AS fc
WHERE EXISTS (
    SELECT 1
    FROM dvd_rentals.category AS c
    WHERE c.category_id = fc.category_id
);

count
16


In [24]:
-- Part 4 join implementation: film_category joined to category on category_id,
-- materialised as both an INNER and a LEFT join for comparison.
DROP TABLE IF EXISTS inner_join_pt_4;

CREATE TEMP TABLE inner_join_pt_4 AS
SELECT
    film_category.film_id,
    film_category.category_id,
    category.name
FROM dvd_rentals.film_category
INNER JOIN dvd_rentals.category
    ON film_category.category_id = category.category_id;


DROP TABLE IF EXISTS left_join_pt_4;

CREATE TEMP TABLE left_join_pt_4 AS
SELECT
    film_category.film_id,
    film_category.category_id,
    category.name
FROM dvd_rentals.film_category
LEFT JOIN dvd_rentals.category
    ON film_category.category_id = category.category_id;

-- Compare row counts and distinct category_id counts of the two outputs.
-- UNION ALL is used because the join_type literal already makes the two rows
-- distinct; ORDER BY pins the row order, which is otherwise undefined.
(
    SELECT
        'inner join' AS join_type,
        COUNT(*) AS row_record_count,
        COUNT(DISTINCT category_id) AS unique_category_ids
    FROM inner_join_pt_4
)
UNION ALL
(
    SELECT
        'left join' AS join_type,
        COUNT(*) AS row_record_count,
        COUNT(DISTINCT category_id) AS unique_category_ids
    FROM left_join_pt_4
)
ORDER BY join_type;

join_type,row_record_count,unique_category_ids
inner join,1000,16
left join,1000,16


# Doing multiple joins (1, 2, 3, 4)

In [None]:
-- Chain all four joins: rental -> inventory -> film -> film_category -> category.
DROP TABLE IF EXISTS complete_join_dataset;
CREATE TEMPORARY TABLE complete_join_dataset AS
SELECT
    r.customer_id,
    i.film_id,
    f.title,
    fc.category_id,
    c.name AS category_name
FROM dvd_rentals.rental AS r
INNER JOIN dvd_rentals.inventory AS i
    ON r.inventory_id = i.inventory_id
INNER JOIN dvd_rentals.film AS f
    ON i.film_id = f.film_id
INNER JOIN dvd_rentals.film_category AS fc
    ON f.film_id = fc.film_id
INNER JOIN dvd_rentals.category AS c
    ON fc.category_id = c.category_id;

-- Preview a few rows of the combined dataset.
SELECT
    customer_id,
    film_id,
    title,
    category_id,
    category_name
FROM complete_join_dataset
LIMIT 10;

In [25]:
-- Build the full rental -> inventory -> film -> film_category -> category
-- dataset twice, once with INNER joins and once with LEFT joins, so the final
-- row counts can be compared.
DROP TABLE IF EXISTS complete_joint_dataset;

CREATE TEMP TABLE complete_joint_dataset AS
SELECT
    rental.customer_id,
    inventory.film_id,
    film.title,
    film_category.category_id,
    category.name AS category_name
FROM dvd_rentals.rental
INNER JOIN dvd_rentals.inventory
    ON rental.inventory_id = inventory.inventory_id
INNER JOIN dvd_rentals.film
    ON inventory.film_id = film.film_id
INNER JOIN dvd_rentals.film_category
    ON film.film_id = film_category.film_id
INNER JOIN dvd_rentals.category
    ON film_category.category_id = category.category_id;


DROP TABLE IF EXISTS complete_joint_dataset_left;

CREATE TEMP TABLE complete_joint_dataset_left AS
SELECT
    rental.customer_id,
    inventory.film_id,
    film.title,
    film_category.category_id,
    category.name AS category_name
FROM dvd_rentals.rental
LEFT JOIN dvd_rentals.inventory
    ON rental.inventory_id = inventory.inventory_id
LEFT JOIN dvd_rentals.film
    ON inventory.film_id = film.film_id
LEFT JOIN dvd_rentals.film_category
    ON film.film_id = film_category.film_id
LEFT JOIN dvd_rentals.category
    ON film_category.category_id = category.category_id;

-- Compare the final row counts of the two datasets.
-- UNION ALL is used because the join_type literal already makes the two rows
-- distinct; ORDER BY pins the row order, which is otherwise undefined.
SELECT
    'inner join' AS join_type,
    COUNT(*) AS final_record_count
FROM complete_joint_dataset

UNION ALL

SELECT
    'left join' AS join_type,
    COUNT(*) AS final_record_count
FROM complete_joint_dataset_left

ORDER BY join_type;

join_type,final_record_count
inner join,16044
left join,16044
