# Helper Function and Data Imports

In [1]:
from pandasql import sqldf
import pandas as pd

# Helper for easier query execution
def execute(q: str):
    """Run the SQL query ``q`` through pandasql and return its result.

    The notebook's global namespace is handed to ``sqldf`` as the lookup
    environment, so table names used in the query are expected to match
    DataFrame variables defined at notebook level (e.g. ``films``).

    Parameters
    ----------
    q : str
        SQL text to execute.

    Returns
    -------
    Whatever ``sqldf`` returns for the query (a DataFrame for the SELECT
    statements used in this notebook).
    """
    return sqldf(q, globals())

In [2]:
import pandas as pd
# The CSV is read without a header row. reset_index() then promotes the
# default 0-based row index to a leading column, which is labelled 'id'
# together with the six columns coming from the file.
reviews = (
    pd.read_csv("dataset/reviews.csv", header=None)
    .reset_index()
    .set_axis(
        ['id', 'film_id', 'num_user', 'num_critic', 'imdb_score', 'num_votes', 'facebook_likes'],
        axis=1,
    )
)
print(reviews.columns)
reviews.head()


Index(['id', 'film_id', 'num_user', 'num_critic', 'imdb_score', 'num_votes',
       'facebook_likes'],
      dtype='object')


Unnamed: 0,id,film_id,num_user,num_critic,imdb_score,num_votes,facebook_likes
0,0,3934,588.0,432.0,7.1,203461,46000
1,1,3405,285.0,267.0,6.4,149998,0
2,2,478,65.0,29.0,3.2,8465,491
3,3,74,83.0,25.0,7.6,7071,930
4,4,1254,1437.0,224.0,8.0,241030,13000


In [3]:
# Load the header-less people file and label its four columns in one chain.
people = (
    pd.read_csv("dataset/people.csv", header=None)
    .set_axis(['id', 'name', 'birthdate', 'deathdate'], axis=1)
)
people.head()

Unnamed: 0,id,name,birthdate,deathdate
0,1,50 Cent,1975-07-06,
1,2,A. Michael Baldwin,1963-04-04,
2,3,A. Raven Cruz,,
3,4,A.J. Buckley,1978-02-09,
4,5,A.J. DeLucia,,


In [4]:
# Load the header-less films file and label its nine columns in one chain.
films = (
    pd.read_csv("dataset/films.csv", header=None)
    .set_axis(
        ['id', 'title', 'release_year', 'country', 'duration',
         'language', 'certification', 'gross', 'budget'],
        axis=1,
    )
)

films.head()

Unnamed: 0,id,title,release_year,country,duration,language,certification,gross,budget
0,1,Intolerance: Love's Struggle Throughout the Ages,1916.0,USA,123.0,,Not Rated,,385907.0
1,2,Over the Hill to the Poorhouse,1920.0,USA,110.0,,,3000000.0,100000.0
2,3,The Big Parade,1925.0,USA,151.0,,Not Rated,,245000.0
3,4,Metropolis,1927.0,Germany,145.0,German,Not Rated,26435.0,6000000.0
4,5,Pandora's Box,1929.0,Germany,110.0,German,Not Rated,9950.0,


In [5]:
# Load the header-less roles file and label its four columns in one chain.
roles = (
    pd.read_csv("dataset/roles.csv", header=None)
    .set_axis(['id', 'film_id', 'person_id', 'role'], axis=1)
)

roles.head()

Unnamed: 0,id,film_id,person_id,role
0,1,1,1630,director
1,2,1,4843,actor
2,3,1,5050,actor
3,4,1,8175,actor
4,5,2,3000,director


# Filtering results

The `WHERE` clause allows you to filter based on text and numeric values in a table using comparison operators.

In [6]:
# Titles of every film whose release_year is later than 2000.
query = """
SELECT title
FROM films
WHERE release_year > 2000;
"""
result_df = execute(query)

# Show results (head() previews the first five rows)
result_df.head()

Unnamed: 0,title
0,15 Minutes
1,3000 Miles to Graceland
2,A Beautiful Mind
3,A Knight's Tale
4,A.I. Artificial Intelligence


# Using WHERE with numbers

Filtering with `WHERE` allows you to analyze your data better. You may have a dataset that includes a range of different movies, and you need to do a case study on the most notable films with the biggest budgets. In this case, you'll want to filter your data to a specific `budget` range. The queries below practice the same kind of numeric filtering on the `reviews` table (`imdb_score`, `facebook_likes`, and `num_votes`).

In [7]:
# film_id and imdb_score of every review scoring strictly above 7.0.
query = """
-- Select film_ids and imdb_score with an imdb_score over 7.0
SELECT film_id, imdb_score 
FROM reviews
WHERE imdb_score > 7.0
"""
result_df = execute(query)

# Show results (head() previews the first five rows)
result_df.head()

Unnamed: 0,film_id,imdb_score
0,3934,7.1
1,74,7.6
2,1254,8.0
3,4841,8.1
4,3252,7.2


In [8]:
# At most ten reviews (LIMIT 10) with fewer than 1000 facebook_likes.
query = """
-- Select film_ids and facebook_likes for ten records with less than 1000 likes 
SELECT film_id, facebook_likes 
FROM reviews
WHERE facebook_likes < 1000
LIMIT 10
"""
result_df = execute(query)

# Show results
# NOTE: head() defaults to five rows, so only half of the ten requested
# records are displayed here; result_df itself holds all of them.
result_df.head()

Unnamed: 0,film_id,facebook_likes
0,3405,0
1,478,491
2,74,930
3,740,0
4,2869,689


In [9]:
# Number of reviews whose num_votes is 100,000 or more, returned as a
# single-row frame with the column films_over_100K_votes.
query = """
-- Count the records with at least 100,000 votes
SELECT COUNT(num_votes) AS films_over_100K_votes 
FROM reviews
WHERE num_votes >=  100000
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,films_over_100K_votes
0,1211


# Using WHERE with text

`WHERE` can also filter string values.

Imagine you are part of an organization that gives cinematography awards, and you have several international categories. Before you confirm an award for every language listed in your dataset, it may be worth seeing if there are enough films of a specific language to make it a fair competition. If there is only one movie or a significant skew, it may be worth considering a different way of giving international awards.

In [10]:
# Number of films whose language equals 'Spanish', returned as a
# single-row frame with the column count_spanish.
query = """
-- Count the Spanish-language films
SELECT COUNT(language) AS count_spanish
FROM films
WHERE language = 'Spanish'
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,count_spanish
0,40


# Using AND

The following exercises combine `AND` and `OR` with the `WHERE` clause. Using these operators together strengthens your queries and analyses of data.

You will apply these new skills now on the `films` table.

In [12]:
# Both conditions must hold: language is 'German' AND release_year is
# earlier than 2000.
query = """
-- Select the title and release_year for all German-language films released before 2000
SELECT title, release_year
FROM films
WHERE language = 'German' AND release_year < 2000
"""
result_df = execute(query)

# Show results (head() previews the first five rows)
result_df.head()

Unnamed: 0,title,release_year
0,Metropolis,1927.0
1,Pandora's Box,1929.0
2,The Torture Chamber of Dr. Sadism,1967.0
3,Das Boot,1981.0
4,Run Lola Run,1998.0


In [13]:
# Same filter pair as the previous cell with the year comparison flipped:
# German-language films with release_year later than 2000.
query = """
-- Update the query to see all German-language films released after 2000
SELECT title, release_year
FROM films
WHERE release_year > 2000
	AND language = 'German';
"""
result_df = execute(query)

# Show results (head() previews the first five rows)
result_df.head()

Unnamed: 0,title,release_year
0,Good Bye Lenin!,2003.0
1,Downfall,2004.0
2,Summer Storm,2004.0
3,The Lives of Others,2006.0
4,The Baader Meinhof Complex,2008.0


# Using OR

This time you'll write a query to get the `title` and `release_year` of films released in 1990 or 1999, which were in English or Spanish and took in more than $2,000,000 `gross`.

In [21]:
# Three AND-ed filters; the two OR groups are parenthesised so that each is
# evaluated as a unit before being combined with AND:
#   (year is 1990 or 1999) AND (language is English or Spanish) AND gross > 2,000,000
query = """
SELECT title, release_year
FROM films
-- Find the title and year of films from the 1990 or 1999
WHERE (release_year = 1990 OR release_year = 1999)
-- Add a filter to see only English or Spanish-language films
	AND (language = 'English' OR language = 'Spanish')
-- Filter films with more than $2,000,000 gross
	AND gross > 2000000;
"""
result_df = execute(query)

# Show results (head() previews the first five rows)
result_df.head()

Unnamed: 0,title,release_year
0,Arachnophobia,1990.0
1,Back to the Future Part III,1990.0
2,Child's Play 2,1990.0
3,Dances with Wolves,1990.0
4,Days of Thunder,1990.0


# Using BETWEEN

Let's use `BETWEEN` with `AND` on the `films` table to get the `title` and `release_year` of all Spanish- or French-language films released between 1990 and 2000 (inclusive) with budgets over $100 million.

In [20]:
# Films with release_year in the inclusive range 1990-2000, a budget above
# 100,000,000, and a language of either Spanish or French.
query = """
SELECT title, release_year
FROM films
-- Select the title and release_year for films released between 1990 and 2000
WHERE release_year BETWEEN 1990 AND 2000
-- Narrow down your query to films with budgets > $100 million
	AND budget > 100000000
-- Amend the query to include Spanish or French-language films
	AND (language = 'Spanish' OR language = 'French');
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,title,release_year
0,Les couloirs du temps: Les visiteurs II,1998.0
1,Tango,1998.0


# LIKE and NOT LIKE

The `LIKE` and `NOT LIKE` operators can be used to find records that either match or do not match a specified pattern, respectively. They can be coupled with the wildcards `%` and `_`. The `%` will match zero or many characters, and `_` will match a single character.

In [22]:
# Pattern 'B%': a leading B followed by any run of characters (including none).
query = """
-- Select the names that start with B
SELECT name
FROM people  
WHERE name LIKE 'B%'
"""
result_df = execute(query)

# Show results (head() previews the first five rows)
result_df.head()

Unnamed: 0,name
0,B.J. Novak
1,Babak Najafi
2,Babar Ahmed
3,Bahare Seddiqi
4,Bai Ling


In [24]:
# Pattern '_r%': exactly one arbitrary character, then 'r', then anything.
query = """
SELECT name
FROM people
-- Select the names that have r as the second letter
WHERE name LIKE '_r%'
"""
result_df = execute(query)

# Show results (head() previews the first five rows)
result_df.head()

Unnamed: 0,name
0,Ara Celi
1,Aramis Knight
2,Arben Bajraktaraj
3,Arcelia Ramírez
4,Archie Kao


In [25]:
# Keep every name that does not match the pattern 'A%'.
# NOTE(review): assuming pandasql's default SQLite backend, LIKE ignores case
# for ASCII letters only, so names beginning with an accented letter such as
# 'Á' are kept by this filter while names beginning with lowercase 'a' would
# be dropped. Confirm this is the intended reading of "don't start with A".
query = """
SELECT name
FROM people
-- Select names that don't start with A
WHERE name NOT LIKE 'A%'
"""
result_df = execute(query)

# Show results (head() previews the first five rows)
result_df.head()

Unnamed: 0,name
0,50 Cent
1,Álex Angulo
2,Álex de la Iglesia
3,Ángela Molina
4,B.J. Novak


# WHERE IN

You now know you can query multiple conditions using the `IN` operator and a set of parentheses. It is a valuable piece of code that helps us keep our queries clean and concise.

In [26]:
# IN (1990, 2000) matches either of the two listed years (not the range
# between them); the duration filter keeps films longer than 120 minutes.
query = """
-- Find the title and release_year for all films over two hours in length released in 1990 and 2000
SELECT title, release_year
FROM films
WHERE release_year IN (1990, 2000) AND duration > 120
"""
result_df = execute(query)

# Show results (head() previews the first five rows)
result_df.head()

Unnamed: 0,title,release_year
0,Dances with Wolves,1990.0
1,Die Hard 2,1990.0
2,Ghost,1990.0
3,Goodfellas,1990.0
4,Mo' Better Blues,1990.0


In [27]:
# IN with text values: keep films whose language is any one of the three listed.
query = """
-- Find the title and language of all films in English, Spanish, and French
SELECT title, language
FROM films
WHERE language IN ('French', 'English', 'Spanish') 
"""
result_df = execute(query)

# Show results (head() previews the first five rows)
result_df.head()

Unnamed: 0,title,language
0,The Broadway Melody,English
1,Hell's Angels,English
2,A Farewell to Arms,English
3,42nd Street,English
4,She Done Him Wrong,English


In [28]:
# Two IN lists combined with AND: the language must be one of
# Italian/English/Greek and the certification one of NC-17/R.
query = """
-- Find the title, certification, and language all films certified NC-17 or R that are in English, Italian, or Greek
SELECT title, certification, language
FROM films
WHERE language IN ('Italian', 'English', 'Greek') AND certification IN ('NC-17', 'R')
"""
result_df = execute(query)

# Show results (head() previews the first five rows)
result_df.head()

Unnamed: 0,title,certification,language
0,Psycho,R,English
1,A Fistful of Dollars,R,Italian
2,Rosemary's Baby,R,English
3,The Wild Bunch,R,English
4,Catch-22,R,English


# Combining filtering and selecting

Time for a little challenge. So far, your SQL vocabulary from this course includes COUNT(), DISTINCT, LIMIT, WHERE, OR, AND, BETWEEN, LIKE, NOT LIKE, and IN. In this exercise, you will try to use some of these together. Writing more complex queries will be standard for you as you become a qualified SQL programmer.

As this query will be a little more complicated than what you've seen so far, we've included a bit of code to get you started. You will be using DISTINCT here too because, surprise, there are two movies named 'Hamlet' in this dataset!

Follow the instructions to find out which '90s films in our dataset would be suitable for English-speaking teens.

In [29]:
# Count of distinct titles (duplicates such as repeated titles collapse to one)
# among English-language films released 1990-1999 inclusive and certified
# G, PG or PG-13. The result is a single-row frame.
query = """
-- Count the unique titles
SELECT COUNT(DISTINCT title) AS nineties_english_films_for_teens
FROM films
-- Filter to release_years to between 1990 and 1999
WHERE release_year BETWEEN 1990 AND 1999
-- Filter to English-language films
	AND language = 'English'
-- Narrow it down to G, PG, and PG-13 certifications
	AND certification IN ( 'G', 'PG', 'PG-13') ;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,nineties_english_films_for_teens
0,310


# What does NULL mean?

- A missing value

# Practice with NULLs

Now that you know what `NULL` means and what it's used for, it's time for some more practice!

Let's explore the `films` table again to better understand what data you have.

In [30]:
# Titles of films whose budget is NULL; IS NULL is required here because
# NULL cannot be matched with the = operator.
query = """
-- List all film titles with missing budgets
SELECT title AS no_budget_info
FROM films
WHERE budget IS NULL;
"""
result_df = execute(query)

# Show results (head() previews the first five rows)
result_df.head()

Unnamed: 0,no_budget_info
0,Pandora's Box
1,The Prisoner of Zenda
2,The Blue Bird
3,Bambi
4,State Fair


In [31]:
# Number of film rows whose language is not NULL, returned as a single-row
# frame with the column count_language_known.
query = """
-- Count the number of films we have language data for
SELECT COUNT(*) AS count_language_known
FROM films
WHERE language IS NOT NULL;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,count_language_known
0,4957
