# Helper Function and Data Imports

In [1]:
from pandasql import sqldf
import pandas as pd

# Create helper function for easier query execution
execute = lambda q: sqldf(q, globals())

In [2]:
import pandas as pd
reviews = pd.read_csv("dataset/reviews.csv", header=None)
reviews = reviews.reset_index()
reviews.columns = ['id',	'film_id',	'num_user',	'num_critic',	'imdb_score',	'num_votes',	'facebook_likes']
print(reviews.columns)
reviews.head()


Index(['id', 'film_id', 'num_user', 'num_critic', 'imdb_score', 'num_votes',
       'facebook_likes'],
      dtype='object')


Unnamed: 0,id,film_id,num_user,num_critic,imdb_score,num_votes,facebook_likes
0,0,3934,588.0,432.0,7.1,203461,46000
1,1,3405,285.0,267.0,6.4,149998,0
2,2,478,65.0,29.0,3.2,8465,491
3,3,74,83.0,25.0,7.6,7071,930
4,4,1254,1437.0,224.0,8.0,241030,13000


In [3]:
people = pd.read_csv("dataset/people.csv", header=None)
people.columns = ['id',	'name',	'birthdate',	'deathdate']
people.head()

Unnamed: 0,id,name,birthdate,deathdate
0,1,50 Cent,1975-07-06,
1,2,A. Michael Baldwin,1963-04-04,
2,3,A. Raven Cruz,,
3,4,A.J. Buckley,1978-02-09,
4,5,A.J. DeLucia,,


In [4]:
films = pd.read_csv("dataset/films.csv", header=None)
films.columns = ['id',	'title',	'release_year',	'country',	'duration',	'language',	'certification',	'gross',	'budget']

films.head()

Unnamed: 0,id,title,release_year,country,duration,language,certification,gross,budget
0,1,Intolerance: Love's Struggle Throughout the Ages,1916.0,USA,123.0,,Not Rated,,385907.0
1,2,Over the Hill to the Poorhouse,1920.0,USA,110.0,,,3000000.0,100000.0
2,3,The Big Parade,1925.0,USA,151.0,,Not Rated,,245000.0
3,4,Metropolis,1927.0,Germany,145.0,German,Not Rated,26435.0,6000000.0
4,5,Pandora's Box,1929.0,Germany,110.0,German,Not Rated,9950.0,


In [5]:
roles = pd.read_csv("dataset/roles.csv", header=None)
roles.columns = ['id',	'film_id',	'person_id',	'role']

roles.head()

Unnamed: 0,id,film_id,person_id,role
0,1,1,1630,director
1,2,1,4843,actor
2,3,1,5050,actor
3,4,1,8175,actor
4,5,2,3000,director


# Aggregate functions and data types

Aggregate functions are another valuable tool for the SQL programmer. They are used extensively across businesses to calculate important metrics, such as the average cost of making a film.

You know five different aggregate functions:

- `AVG()`
- `SUM()`
- `MIN()`
- `MAX()`
- `COUNT()`

Test your knowledge of what data types they are compatible with.

<center><img src="images/03.01.png"  style="width: 400px, height: 300px;"/></center>

# Practice with aggregate functions

Now let's try extracting summary information from a table using these new aggregate functions. Summarizing is helpful in real life when extracting top-line details from your dataset. Perhaps you'd like to know how old the oldest film in the films table is, what the most expensive film is, or how many films you have listed.

Now it's your turn to get more insights about the `films` table!

In [7]:
query = """
-- Query the sum of film durations
SELECT SUM(duration) AS total_duration
FROM films
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,total_duration
0,534882.0


In [8]:
query = """
-- Calculate the average duration of all films
SELECT AVG(duration) AS average_duration
FROM films
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,average_duration
0,107.947931


In [9]:
query = """
-- Find the latest release_year
SELECT MAX(release_year) AS latest_year
FROM films
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,latest_year
0,2016.0


In [10]:
query = """
-- Find the duration of the shortest film
SELECT MIN(duration) AS shortest_film
FROM films
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,shortest_film
0,7.0


# Combining aggregate functions with WHERE

When combining aggregate functions with `WHERE`, you get a powerful tool that allows you to get more granular with your insights, for example, to get the total budget of movies made from the year 2010 onwards.

This combination is useful when you only want to summarize a subset of your data. In your film-industry role, as an example, you may like to summarize each certification category to compare how they each perform or if one certification has a higher average budget than another.

In [11]:
query = """
-- Calculate the sum of gross from the year 2000 or later
SELECT SUM(gross) AS total_gross
FROM films
WHERE release_year >= 2000
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,total_gross
0,150900900000.0


In [12]:
query = """
-- Calculate the average gross of films that start with A
SELECT AVG(gross) AS avg_gross_A
FROM films
WHERE title LIKE 'A%'
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,avg_gross_A
0,47893240.0


In [13]:
query = """
-- Calculate the lowest gross film in 1994
SELECT MIN(gross) AS lowest_gross
FROM films
WHERE release_year = 1994
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,lowest_gross
0,125169.0


In [14]:
query = """
-- Calculate the highest gross film released between 2000-2012
SELECT MAX(gross) AS highest_gross
FROM films
WHERE release_year BETWEEN 2000 AND 2012
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,highest_gross
0,760505847.0


# Using ROUND()

Aggregate functions work great with numerical values; however, these results can sometimes get unwieldy when dealing with long decimal values. Luckily, SQL provides you with the `ROUND()` function to tame these long decimals.

If asked to give the average budget of your films, ten decimal places is not necessary. Instead, you can round to two decimal places to create results that make more sense for currency.

In [15]:
query = """
-- Round the average number of facebook_likes to one decimal place
SELECT ROUND(AVG(facebook_likes), 1) AS avg_facebook_likes
FROM reviews
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,avg_facebook_likes
0,7802.9


# ROUND() with a negative parameter

A useful thing you can do with `ROUND()` is have a negative number as the decimal place parameter. This can come in handy if your manager only needs to know the average number of facebook_likes to the hundreds since granularity below one hundred likes won't impact decision making.

Social media plays a significant role in determining success. If a movie trailer is posted and barely gets any likes, the movie itself may not be successful. Remember how 2020's "Sonic the Hedgehog" movie got a revamp after the public saw the trailer?

Let's apply this to other parts of the dataset and see what the benchmark is for movie budgets so, in the future, it's clear whether the film is above or below budget.

In [17]:
query = """
-- Calculate the average budget rounded to the thousands
SELECT ROUND(AVG(budget), -3) AS avg_budget_thousands
FROM films
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,avg_budget_thousands
0,39902826.0


# Using arithmetic

SQL arithmetic comes in handy when your table is missing a metric you want to review. Suppose you have some data on movie ticket sales, but the table only has fields for ticket price and discount. In that case, you could combine these by subtracting the discount from the ticket price to get the amount the film-goer paid.

You have seen that SQL can act strangely when dividing integers. What is the result if you divide a `discount` of two dollars by the `paid_price` of ten dollars to get the discount percentage?

In [19]:
query = """
SELECT (2/10)
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,(2/10)
0,0


In [18]:
query = """
SELECT (2.0/10.0)
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,(2.0/10.0)
0,0.2


# Aliasing with functions

Aliasing can be a lifesaver, especially as we start to do more complex SQL queries with multiple criteria. Aliases help you keep your code clean and readable. For example, if you want to find the `MAX()` value of several fields without aliasing, you'll end up with the result with several columns called `max` and no idea which is which. You can fix this with aliasing.

In [20]:
query = """
-- Calculate the title and duration_hours from films
SELECT title, duration/60.0 AS duration_hours
FROM films;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,title,duration_hours
0,Intolerance: Love's Struggle Throughout the Ages,2.05
1,Over the Hill to the Poorhouse,1.833333
2,The Big Parade,2.516667
3,Metropolis,2.416667
4,Pandora's Box,1.833333


In [21]:
query = """
-- Calculate the percentage of people who are no longer alive
SELECT COUNT(deathdate) * 100.0 / COUNT(*) AS percentage_dead
FROM people;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,percentage_dead
0,9.372395


In [22]:
query = """
-- Find the number of decades in the films table
SELECT (MAX(release_year) - MIN(release_year)) / 10.0 AS number_of_decades
FROM films;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,number_of_decades
0,10.0


# Rounding results

You found some valuable insights in the previous exercise, but many of the results were inconveniently long. We forgot to round! We won't make you redo them all; however, you'll update the worst offender in this exercise.

In [23]:
query = """
-- Round duration_hours to two decimal places
SELECT title, ROUND(duration / 60.0, 2) AS duration_hours
FROM films;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,title,duration_hours
0,Intolerance: Love's Struggle Throughout the Ages,2.05
1,Over the Hill to the Poorhouse,1.83
2,The Big Parade,2.52
3,Metropolis,2.42
4,Pandora's Box,1.83
