# Helper Function and Data Imports


In [2]:
from pandasql import sqldf
import pandas as pd

# Create helper function for easier query execution
def execute(q: str):
    """Run a SQL query against the DataFrames defined in this notebook.

    Args:
        q: SQL text; it is handed to ``sqldf`` unchanged.

    Returns:
        Whatever ``sqldf`` returns for the query.
    """
    # globals() is looked up on every call, so DataFrames created in later
    # cells are also visible to the query as tables.
    return sqldf(q, globals())

In [3]:
import pandas as pd
# reviews.csv is read without a header row (header=None); reset_index() then
# turns the row positions into a leading column, which is labelled `id` below.
reviews = pd.read_csv("dataset/reviews.csv", header=None).reset_index()
reviews.columns = [
    'id',
    'film_id',
    'num_user',
    'num_critic',
    'imdb_score',
    'num_votes',
    'facebook_likes',
]
print(reviews.columns)
reviews.head()


Index(['id', 'film_id', 'num_user', 'num_critic', 'imdb_score', 'num_votes',
       'facebook_likes'],
      dtype='object')


Unnamed: 0,id,film_id,num_user,num_critic,imdb_score,num_votes,facebook_likes
0,0,3934,588.0,432.0,7.1,203461,46000
1,1,3405,285.0,267.0,6.4,149998,0
2,2,478,65.0,29.0,3.2,8465,491
3,3,74,83.0,25.0,7.6,7071,930
4,4,1254,1437.0,224.0,8.0,241030,13000


In [4]:
# people.csv has no header row, so the field names are assigned after loading.
people = pd.read_csv("dataset/people.csv", header=None)
people.columns = [
    'id',
    'name',
    'birthdate',
    'deathdate',
]
people.head()

Unnamed: 0,id,name,birthdate,deathdate
0,1,50 Cent,1975-07-06,
1,2,A. Michael Baldwin,1963-04-04,
2,3,A. Raven Cruz,,
3,4,A.J. Buckley,1978-02-09,
4,5,A.J. DeLucia,,


In [5]:
# films.csv has no header row, so the field names are assigned after loading.
films = pd.read_csv("dataset/films.csv", header=None)
films.columns = [
    'id',
    'title',
    'release_year',
    'country',
    'duration',
    'language',
    'certification',
    'gross',
    'budget',
]

films.head()

Unnamed: 0,id,title,release_year,country,duration,language,certification,gross,budget
0,1,Intolerance: Love's Struggle Throughout the Ages,1916.0,USA,123.0,,Not Rated,,385907.0
1,2,Over the Hill to the Poorhouse,1920.0,USA,110.0,,,3000000.0,100000.0
2,3,The Big Parade,1925.0,USA,151.0,,Not Rated,,245000.0
3,4,Metropolis,1927.0,Germany,145.0,German,Not Rated,26435.0,6000000.0
4,5,Pandora's Box,1929.0,Germany,110.0,German,Not Rated,9950.0,


In [6]:
# roles.csv has no header row, so the field names are assigned after loading.
roles = pd.read_csv("dataset/roles.csv", header=None)
roles.columns = [
    'id',
    'film_id',
    'person_id',
    'role',
]

roles.head()

Unnamed: 0,id,film_id,person_id,role
0,1,1,1630,director
1,2,1,4843,actor
2,3,1,5050,actor
3,4,1,8175,actor
4,5,2,3000,director


# Sorting text

SQL provides you with the `ORDER BY` keyword to sort one or more fields from your data. It can do this multi-directionally and helps make results easy to interpret.

How does `ORDER BY` sort a column of text values by default?

- Alphabetically (A-Z)

# Sorting single fields

Now that you understand how `ORDER BY` works, you'll put it into practice. In this exercise, you'll work on sorting single fields only. This can be helpful to extract quick insights such as the top-grossing or top-scoring film.

The following exercises will help you gain further insights into the `film` database.

In [7]:
# Sort every name in the `people` table in ascending order.
# ASC is the default sort direction in SQL; it is spelled out here for clarity.
query = """
-- Select name from people and sort alphabetically
SELECT name
FROM people
ORDER BY name ASC
"""
result_df = execute(query)

# Show results (head() displays only the first five rows of the sorted result)
result_df.head()

Unnamed: 0,name
0,50 Cent
1,A. Michael Baldwin
2,A. Raven Cruz
3,A.J. Buckley
4,A.J. DeLucia


In [8]:
# List films from longest to shortest: DESC reverses the default ascending order.
query = """
-- Select the title and duration from longest to shortest film
SELECT title, duration
FROM films
ORDER BY duration DESC
"""
result_df = execute(query)

# Show results (the five longest films)
result_df.head()

Unnamed: 0,title,duration
0,Carlos,334.0
1,"Blood In, Blood Out",330.0
2,Heaven's Gate,325.0
3,The Legend of Suriyothai,300.0
4,Das Boot,293.0


# Sorting multiple fields

`ORDER BY` can also be used to sort on multiple fields. It will sort by the first field specified, then sort by the next, and so on. As an example, you may want to sort the `people` data by age and keep the names in alphabetical order.

In [9]:
# Sort on two fields: release_year decides the order first, and duration only
# orders films that share the same release_year.
# The first rows displayed have no release_year: the ascending sort places
# missing values (NULL/NaN) ahead of every real year.
query = """
-- Select the release year, duration, and title sorted by release year and duration
SELECT release_year, duration, title
FROM films
ORDER BY release_year, duration
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,release_year,duration,title
0,,,Wolf Creek
1,,22.0,"10,000 B.C."
2,,22.0,Anger Management
3,,24.0,Lovesick
4,,24.0,Yu-Gi-Oh! Duel Monsters


In [10]:
# Sort on two fields: certification decides the order first, and release_year
# only orders films that share the same certification.
# The first rows displayed are films where both fields are missing, because the
# ascending sort places missing values (NULL/NaN) first.
query = """
-- Select the certification, release year, and title sorted by certification and release year
SELECT certification, release_year, title
FROM films
ORDER BY certification, release_year
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,certification,release_year,title
0,,,"10,000 B.C."
1,,,A Touch of Frost
2,,,Anger Management
3,,,Animal Kingdom
4,,,BrainDead


# GROUP BY single fields

`GROUP BY` is a SQL keyword that allows you to group and summarize results with the additional use of aggregate functions. For example, films can be grouped by the certification and language before counting the film titles in each group. This allows you to see how many films had a particular certification and language grouping.

In [11]:
# Count the films in each release_year group.
# COUNT(*) counts rows, so films with a missing release_year are collected into
# a group of their own (the NaN row in the result).
query = """
-- Find the release_year and film_count of each year
SELECT release_year, COUNT(*) AS film_count
FROM films
GROUP BY release_year
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,release_year,film_count
0,,42
1,1916.0,1
2,1920.0,1
3,1925.0,1
4,1927.0,1


In [12]:
# Average film duration per release_year: one result row per group.
# Films with a missing release_year are averaged together as their own group
# (the NaN row in the result).
query = """
-- Find the release_year and average duration of films for each year
SELECT release_year, AVG(duration) AS avg_duration
FROM films
GROUP BY release_year
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,release_year,avg_duration
0,,77.439024
1,1916.0,123.0
2,1920.0,110.0
3,1925.0,151.0
4,1927.0,145.0


# GROUP BY multiple fields

`GROUP BY` becomes more powerful when used across multiple fields or combined with `ORDER BY` and `LIMIT`.

Perhaps you're interested in learning about budget changes throughout the years in individual countries. You'll use grouping in this exercise to look at the maximum budget for each country in each year there is data available.

In [13]:
# Group on two fields: the result has one row per (release_year, country) pair,
# holding the largest budget found in that pair.
# MAX ignores NULL budgets, so max_budget is only missing when no film in the
# group has a budget recorded.
query = """
-- Find the release_year, country, and max_budget, then group and order by release_year and country
SELECT release_year, country, MAX(budget) AS max_budget
FROM films
GROUP BY release_year, country
ORDER BY release_year, country
"""
result_df = execute(query)

# Show results (missing release_year / country values sort first)
result_df.head()

Unnamed: 0,release_year,country,max_budget
0,,,
1,,Australia,15000000.0
2,,Canada,
3,,France,
4,,Iceland,


# Answering business questions

In the real world, every SQL query starts with a business question. Then it is up to you to decide how to write the query that answers the question. Let's try this out.

Which `release_year` had the most language diversity?

Take your time to translate this question into code. We'll get you started, then it's up to you to test your queries in the console.

In [14]:
# Business question: which release_year had the most language diversity?
# Diversity is measured as the number of distinct languages among a year's
# films; sorting that count in descending order puts the most diverse year in
# the first row of the result.
query = """
-- Find the release_year and the count of distinct languages, then group by release_year and order by language_count descending
SELECT release_year, COUNT(DISTINCT language) AS language_count
FROM films
GROUP BY release_year
ORDER BY language_count DESC
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,release_year,language_count
0,2006.0,16
1,2015.0,15
2,2005.0,14
3,2013.0,13
4,2008.0,13


# Filter with HAVING

`HAVING` works similarly to `WHERE` in that it is a filtering clause, with the difference that `HAVING` filters grouped data.

Filtering grouped data can be especially handy when working with a large dataset. When working with thousands or even millions of rows, `HAVING` will allow you to filter for just the group of data you want, such as films over two hours in length!

Practice using `HAVING` to find out which countries (or country) have the most varied film certifications.

In [15]:
# HAVING is applied after GROUP BY, so it can filter on the aggregated value.
# The aggregate expression is repeated in HAVING instead of using the alias.
query = """
-- Select the country and distinct count of certification as certification_count
SELECT country, COUNT(DISTINCT certification) AS certification_count
FROM films
-- Group by country
GROUP BY country
-- Filter results to countries with more than 10 different certifications
HAVING COUNT(DISTINCT certification) > 10
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,country,certification_count
0,USA,12


# HAVING and sorting

Filtering and sorting go hand in hand and give you greater interpretability by ordering your results.

Let's see this magic at work by writing a query showing what countries have the highest average film budgets.

In [16]:
# HAVING keeps only the country groups whose AVG(budget) exceeds the threshold;
# ORDER BY then sorts the remaining groups using the SELECT alias average_budget.
# NOTE(review): budgets look like they are recorded in each country's own
# currency, which would explain which countries pass the filter. Confirm before
# comparing averages across countries.
query = """
-- Select the country and average_budget from films
SELECT country, AVG(budget) AS average_budget
FROM films
-- Group by country
GROUP BY country
-- Filter to countries with an average_budget of more than one billion
HAVING AVG(budget) > 1000000000
-- Order by descending order of the aggregated budget
ORDER BY average_budget DESC
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,country,average_budget
0,South Korea,1383960000.0
1,Hungary,1260000000.0


# All together now

This is good preparation for using SQL in the real world, where you'll often be asked to write more complex queries, since some of the basic queries can be answered by playing around in spreadsheet applications.

In this exercise, you'll write a query that returns the average budget and gross earnings for films each year after 1990 if the average budget is greater than 60 million.

In [17]:
# WHERE filters individual film rows before grouping; GROUP BY then collapses
# the remaining rows to one row per release_year.
query = """
-- Select the release_year for each film released after 1990, and group by release_year.
SELECT release_year
FROM films
WHERE release_year > 1990
GROUP BY release_year
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,release_year
0,1991.0
1,1992.0
2,1993.0
3,1994.0
4,1995.0


In [18]:
# For each release_year after 1990, compute the average budget and the average
# gross over that year's films.
query = """
-- Modify the query to also list the average budget and average gross
SELECT release_year, AVG(budget) AS avg_budget, AVG(gross) AS avg_gross
FROM films
WHERE release_year > 1990
GROUP BY release_year;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,release_year,avg_budget,avg_gross
0,1991.0,25176550.0,53844500.0
1,1992.0,25982030.0,63665200.0
2,1993.0,20729790.0,45302090.0
3,1994.0,29013770.0,59395670.0
4,1995.0,32775000.0,44909520.0


In [19]:
# Two filters at two levels: WHERE removes film rows before grouping, while
# HAVING removes whole release_year groups based on their aggregated AVG(budget).
query = """
SELECT release_year, AVG(budget) AS avg_budget, AVG(gross) AS avg_gross
FROM films
WHERE release_year > 1990
GROUP BY release_year
-- Modify the query to see only years with an avg_budget of more than 60 million
HAVING AVG(budget) > 60000000;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,release_year,avg_budget,avg_gross
0,2005.0,70323940.0,41159140.0
1,2006.0,93968930.0,39237860.0


In [20]:
# Full pipeline: filter rows (WHERE), group by year, filter groups (HAVING),
# sort the surviving years by avg_gross, and keep only the top one (LIMIT 1).
query = """
SELECT release_year, AVG(budget) AS avg_budget, AVG(gross) AS avg_gross
FROM films
WHERE release_year > 1990
GROUP BY release_year
HAVING AVG(budget) > 60000000
-- Order the results from highest to lowest average gross and limit to one
ORDER BY avg_gross DESC 
LIMIT 1;
"""
result_df = execute(query)

# Show results (a single row, because of LIMIT 1)
result_df.head()

Unnamed: 0,release_year,avg_budget,avg_gross
0,2005.0,70323940.0,41159140.0
