# Helper Function and Data Imports

In [20]:
from pandasql import sqldf
import pandas as pd

# Helper for easier query execution
def execute(q):
    """Run the SQL query string ``q`` through pandasql's ``sqldf``.

    The notebook's global namespace is passed as the lookup environment, so
    the DataFrames defined in other cells (``reviews``, ``people``, ``films``,
    ``roles``) can be referenced by name as tables inside the query.

    Returns whatever ``sqldf`` returns; the cells below treat it as a
    DataFrame and preview it with ``.head()``.
    """
    return sqldf(q, globals())

In [21]:
# pandas is already imported as `pd` in the first code cell, so it is not re-imported here.
# The file is read with header=None (no header row). reset_index() then turns the
# default 0-based row index into a leading column, which is labelled 'id' below.
reviews = pd.read_csv("dataset/reviews.csv", header=None).reset_index()
reviews.columns = [
    'id',
    'film_id',
    'num_user',
    'num_critic',
    'imdb_score',
    'num_votes',
    'facebook_likes',
]
print(reviews.columns)
reviews.head()


Index(['id', 'film_id', 'num_user', 'num_critic', 'imdb_score', 'num_votes',
       'facebook_likes'],
      dtype='object')


Unnamed: 0,id,film_id,num_user,num_critic,imdb_score,num_votes,facebook_likes
0,0,3934,588.0,432.0,7.1,203461,46000
1,1,3405,285.0,267.0,6.4,149998,0
2,2,478,65.0,29.0,3.2,8465,491
3,3,74,83.0,25.0,7.6,7071,930
4,4,1254,1437.0,224.0,8.0,241030,13000


In [22]:
# Loaded with header=None, so the column labels are attached explicitly.
people = pd.read_csv("dataset/people.csv", header=None).set_axis(
    ['id', 'name', 'birthdate', 'deathdate'],
    axis=1,
)
people.head()

Unnamed: 0,id,name,birthdate,deathdate
0,1,50 Cent,1975-07-06,
1,2,A. Michael Baldwin,1963-04-04,
2,3,A. Raven Cruz,,
3,4,A.J. Buckley,1978-02-09,
4,5,A.J. DeLucia,,


In [23]:
# Loaded with header=None, so the column labels are attached explicitly.
films = pd.read_csv("dataset/films.csv", header=None).set_axis(
    [
        'id',
        'title',
        'release_year',
        'country',
        'duration',
        'language',
        'certification',
        'gross',
        'budget',
    ],
    axis=1,
)

films.head()

Unnamed: 0,id,title,release_year,country,duration,language,certification,gross,budget
0,1,Intolerance: Love's Struggle Throughout the Ages,1916.0,USA,123.0,,Not Rated,,385907.0
1,2,Over the Hill to the Poorhouse,1920.0,USA,110.0,,,3000000.0,100000.0
2,3,The Big Parade,1925.0,USA,151.0,,Not Rated,,245000.0
3,4,Metropolis,1927.0,Germany,145.0,German,Not Rated,26435.0,6000000.0
4,5,Pandora's Box,1929.0,Germany,110.0,German,Not Rated,9950.0,


In [24]:
# Loaded with header=None, so the column labels are attached explicitly.
roles = pd.read_csv("dataset/roles.csv", header=None).set_axis(
    ['id', 'film_id', 'person_id', 'role'],
    axis=1,
)

roles.head()

Unnamed: 0,id,film_id,person_id,role
0,1,1,1630,director
1,2,1,4843,actor
2,3,1,5050,actor
3,4,1,8175,actor
4,5,2,3000,director


# Learning to COUNT()

Here is a query counting `film_id`. Select the answer below that correctly describes what the query will return.

`SELECT COUNT(film_id) AS count_film_id FROM reviews;`
Run the query in the console to test your theory!

In [5]:
# COUNT(film_id) counts the rows of `reviews` whose film_id is non-missing;
# the query yields a single row with one column, count_film_id.
query = """
SELECT COUNT(film_id) AS count_film_id
FROM reviews;
"""
# `execute` runs the SQL against the DataFrames defined in this notebook.
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,count_film_id
0,4968


# Practice with COUNT()

`COUNT(*)` tells you how many records are in a table. However, if you want to count the number of non-missing values in a particular field, you can call `COUNT()` on just that field.

In [6]:
# COUNT(*) counts every record in `people`, whether or not individual fields are missing.
query = """
-- Count the number of records in the people table
SELECT COUNT(*) AS count_records
FROM   people 
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,count_records
0,8397


In [7]:
# COUNT(birthdate) counts only the rows where birthdate is present, so the
# result can be smaller than the COUNT(*) of the same table.
query = """
-- Count the number of birthdates in the people table
SELECT COUNT(birthdate) AS count_birthdate
FROM  people 
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,count_birthdate
0,6152


In [8]:
# Two field-level counts in one query: each COUNT(<field>) tallies the
# non-missing values of that field in `films`.
query = """
-- Count the languages and countries represented in the films table
SELECT COUNT(language) AS count_languages, COUNT(country) AS count_countries
FROM  films 
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,count_languages,count_countries
0,4957,4966


# SELECT DISTINCT

Often query results will include many duplicate values. You can use the `DISTINCT` keyword to select the unique values from a field.

In [9]:
# NOTE: DISTINCT is a SELECT modifier rather than a function; the parentheses
# below merely wrap the column expression, so this is equivalent to
# `SELECT DISTINCT country`.
query = """
-- Return the unique countries from the films table
SELECT DISTINCT(country)
FROM films
"""
result_df = execute(query)

# Show results (.head() previews only the first five unique countries)
result_df.head()

Unnamed: 0,country
0,USA
1,Germany
2,Japan
3,Denmark
4,UK


In [10]:
# COUNT(DISTINCT ...) counts each unique country value once. The inner
# parentheses around `country` are optional grouping, not a function call.
query = """
-- Count the distinct countries from the films table
SELECT COUNT(DISTINCT(country)) AS count_distinct_countries
FROM films
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,count_distinct_countries
0,64


# Order of execution

Unlike most programming languages, SQL is not processed in the order it is written: the processor first needs to know where to pull the data from before it can make any selections.

It's essential to know your code's order of execution compared to the order it is written in to understand what results you'll get from your query and how to fix any errors that may come up.

<center><img src="images/01.01.png"  style="width: 400px; height: 300px;"/></center>

# Debugging errors

Debugging is an essential skill for all coders, and it comes from making many mistakes and learning from them.

In [11]:
query = """
-- Debug this code
SELECT certfication
FROM films
LIMIT 5;
"""
# This query is intentionally broken (misspelled column name). The error is
# caught and printed so that the failure is still shown, but it no longer
# halts a "Restart Kernel -> Run All".
try:
    result_df = execute(query)
except Exception as err:  # pandasql reports SQL errors as PandaSQLException
    print(f"{type(err).__name__}: {err}")
else:
    # Show results (only reached once the query has been fixed);
    # `display` is provided by the Jupyter/IPython kernel.
    display(result_df.head())

PandaSQLException: (sqlite3.OperationalError) no such column: certfication
[SQL: 
-- Debug this code
SELECT certfication
FROM films
LIMIT 5;
]
(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [12]:
# Corrected version of the previous query: the column name is spelled `certification`.
query = """
-- certification column
SELECT certification
FROM films
LIMIT 5;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,certification
0,Not Rated
1,
2,Not Rated
3,Not Rated
4,Not Rated


In [13]:
query = """
-- Debug this code
SELECT film_id imdb_score num_votes
FROM reviews;
"""
# This query is intentionally broken (no commas between the selected columns).
# The error is caught and printed so that the failure is still shown, but it
# no longer halts a "Restart Kernel -> Run All".
try:
    result_df = execute(query)
except Exception as err:  # pandasql reports SQL errors as PandaSQLException
    print(f"{type(err).__name__}: {err}")
else:
    # Show results (only reached once the query has been fixed);
    # `display` is provided by the Jupyter/IPython kernel.
    display(result_df.head())

PandaSQLException: (sqlite3.OperationalError) near "num_votes": syntax error
[SQL: 
-- Debug this code
SELECT film_id imdb_score num_votes
FROM reviews;
]
(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [14]:
# Corrected version of the previous query: the selected columns are separated by commas.
query = """
-- Use comma
SELECT film_id, imdb_score, num_votes
FROM reviews;
"""
result_df = execute(query)

# Show results (.head() previews the first five rows of the full result)
result_df.head()

Unnamed: 0,film_id,imdb_score,num_votes
0,3934,7.1,203461
1,3405,6.4,149998
2,478,3.2,8465
3,74,7.6,7071
4,1254,8.0,241030


In [15]:
query = """
-- Debug this code
SELECT COUNNT(birthdate) AS count_birthdays
FROM peeple;
"""
# This query is intentionally broken (misspelled COUNT function and table
# name). The error is caught and printed so that the failure is still shown,
# but it no longer halts a "Restart Kernel -> Run All".
try:
    result_df = execute(query)
except Exception as err:  # pandasql reports SQL errors as PandaSQLException
    print(f"{type(err).__name__}: {err}")
else:
    # Show results (only reached once the query has been fixed);
    # `display` is provided by the Jupyter/IPython kernel.
    display(result_df.head())

PandaSQLException: (sqlite3.OperationalError) no such table: peeple
[SQL: 
-- Debug this code
SELECT COUNNT(birthdate) AS count_birthdays
FROM peeple;
]
(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [16]:
# Corrected version of the previous query: both COUNT and the `people` table
# name are now spelled correctly.
query = """
-- people table
SELECT COUNT(birthdate) AS count_birthdays
FROM people;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,count_birthdays
0,6152


# SQL best practices

SQL style guides outline standard best practices for writing code.

This exercise will present several SQL style tips. Your job will be to decide whether they are considered best practices.

We'll be following Holywell's style guide.

<center><img src="images/01.02.png"  style="width: 400px; height: 300px;"/></center>


# Formatting

Readable code is highly valued in the coding community and professional settings. Without proper formatting, code and results can be difficult to interpret. You'll often be working with other people who need to understand your code or be able to explain your results, so having a solid formatting habit is essential.

In [27]:
# Deliberately unformatted query (lower-case keywords, all clauses on one line):
# it runs, but is harder to read than the reformatted version that follows.
query = """
-- Rewrite this query
select person_id, role from roles limit 10
"""
result_df = execute(query)

# Show results (the query is limited to 10 rows; .head() previews the first five)
result_df.head()

Unnamed: 0,person_id,role
0,1630,director
1,4843,actor
2,5050,actor
3,8175,actor
4,3000,director


In [28]:
# Same query as the unformatted one, rewritten with upper-case keywords,
# one clause per line, and a terminating semicolon.
query = """
-- Query with better formatting
SELECT person_id, role 
FROM roles 
LIMIT 10;
"""
result_df = execute(query)

# Show results (the query is limited to 10 rows; .head() previews the first five)
result_df.head()

Unnamed: 0,person_id,role
0,1630,director
1,4843,actor
2,5050,actor
3,8175,actor
4,3000,director


# Non-standard fields

You may occasionally receive a dataset with poorly named fields. Ideally, you would fix these, but in this instance you can work around them with some added punctuation.

A sample query and schema have been provided; imagine you need to be able to run it with a non-standard field name. Select the multiple-choice answer that would correctly fill in the blank to return both a film's id and its number of Facebook likes for all reviews:

<center><img src="images/01.03.jpg"  style="width: 400px; height: 300px;"/></center>


- `"facebook likes"`