In [1]:
import pandas as pd
import psycopg2

def execute_query(sql_query, dbname='sakila', user='postgres', password='postgres', port='5432'):
    """Run a SQL query against PostgreSQL and return the result as a DataFrame.

    A fresh connection is opened for every call and is always closed before
    the function exits, including when the query itself raises.

    Parameters
    ----------
    sql_query
        SQL text handed to ``pd.read_sql``.
    dbname, user, password, port
        Connection settings forwarded unchanged to ``psycopg2.connect``.
        No host is passed.
        NOTE(review): the default credentials are hardcoded; consider
        supplying them from the environment when sharing this notebook.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_sql`` produces for ``sql_query``.

    Notes
    -----
    Errors from ``psycopg2.connect`` or ``pd.read_sql`` propagate to the
    caller. No commit is issued on the connection.
    """
    conn = psycopg2.connect(dbname=dbname, user=user, password=password, port=port)
    try:
        # Load the query results straight into a DataFrame.
        return pd.read_sql(sql_query, conn)
    finally:
        # Release the connection even if read_sql raised, so a failing
        # cell does not leave a session open on the server.
        conn.close()



# A review of the LIKE operator

The LIKE operator allows us to filter our queries by matching one or more characters in text data. 

In [2]:
# Prefix match: film titles that start with 'GOLD'.
query_result = execute_query(sql_query="""
-- Select all columns
SELECT *
FROM film
-- Select only records that begin with the word 'GOLD'
WHERE title LIKE 'GOLD%';
    """)
query_result.head()

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,last_update,special_features,fulltext
0,365,GOLD RIVER,A Taut Documentary of a Database Administrator...,2006,1,,4,4.99,154,21.99,R,2006-02-15 05:03:42,"[Trailers, Commentaries, Deleted Scenes, Behin...",'administr':9 'baloon':21 'databas':8 'documen...
1,366,GOLDFINGER SENSIBILITY,A Insightful Drama of a Mad Scientist And a Hu...,2006,1,,3,0.99,93,29.99,G,2006-02-15 05:03:42,"[Trailers, Commentaries, Behind the Scenes]",'chef':18 'defeat':15 'drama':5 'goldfing':1 '...
2,367,GOLDMINE TYCOON,A Brilliant Epistle of a Composer And a Frisbe...,2006,1,,6,0.99,153,20.99,R,2006-02-15 05:03:42,"[Trailers, Behind the Scenes]",'brilliant':4 'compos':8 'conquer':14 'epistl'...


In [3]:
# Suffix match: film titles that end with 'GOLD'.
query_result = execute_query(sql_query="""
SELECT *
FROM film
-- Select only records that end with the word 'GOLD'
WHERE title LIKE '%GOLD';
    """)
query_result.head()

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,last_update,special_features,fulltext
0,644,OSCAR GOLD,A Insightful Tale of a Database Administrator ...,2006,1,,7,2.99,115,29.99,PG,2006-02-15 05:03:42,[Behind the Scenes],'administr':9 'databas':8 'dog':12 'face':15 '...
1,870,SWARM GOLD,A Insightful Panorama of a Crocodile And a Boa...,2006,1,,4,0.99,123,12.99,PG-13,2006-02-15 05:03:42,"[Trailers, Commentaries]",'boat':11 'conquer':14 'convent':21 'crocodil'...


In [4]:
# Substring match: film titles with 'GOLD' anywhere in them.
query_result = execute_query(sql_query="""
SELECT *
FROM film
-- Select only records that contain the word 'GOLD'
WHERE title LIKE '%GOLD%';
    """)
query_result.head()

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,last_update,special_features,fulltext
0,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,,3,4.99,48,12.99,G,2006-02-15 05:03:42,"[Trailers, Deleted Scenes]",'ace':1 'administr':9 'ancient':19 'astound':4...
1,95,BREAKFAST GOLDFINGER,A Beautiful Reflection of a Student And a Stud...,2006,1,,5,4.99,123,18.99,G,2006-02-15 05:03:42,"[Trailers, Commentaries, Deleted Scenes]",'beauti':4 'berlin':18 'breakfast':1 'fight':1...
2,365,GOLD RIVER,A Taut Documentary of a Database Administrator...,2006,1,,4,4.99,154,21.99,R,2006-02-15 05:03:42,"[Trailers, Commentaries, Deleted Scenes, Behin...",'administr':9 'baloon':21 'databas':8 'documen...
3,366,GOLDFINGER SENSIBILITY,A Insightful Drama of a Mad Scientist And a Hu...,2006,1,,3,0.99,93,29.99,G,2006-02-15 05:03:42,"[Trailers, Commentaries, Behind the Scenes]",'chef':18 'defeat':15 'drama':5 'goldfing':1 '...
4,367,GOLDMINE TYCOON,A Brilliant Epistle of a Composer And a Frisbe...,2006,1,,6,0.99,153,20.99,R,2006-02-15 05:03:42,"[Trailers, Behind the Scenes]",'brilliant':4 'compos':8 'conquer':14 'epistl'...


# What is a tsvector?

In this exercise, we are going to dive deeper into what `to_tsvector` actually returns when it converts a string to a `tsvector`.

In [5]:
# Show each film description after conversion with to_tsvector().
query_result = execute_query(sql_query="""
-- Select the film description as a tsvector
SELECT to_tsvector(description)
FROM film;
    """)
query_result.head()

Unnamed: 0,to_tsvector
0,'battl':13 'canadian':18 'drama':3 'epic':2 'f...
1,'administr':7 'ancient':17 'astound':2 'car':1...
2,'astound':2 'baloon':17 'car':9 'factori':18 '...
3,'chase':12 'documentari':3 'fanci':2 'frisbe':...
4,'chef':9 'dentist':12 'documentari':5 'fast':3...


# Basic full-text search

Searching text will become something you do repeatedly when building applications or exploring data sets for data science. Full-text search is helpful when performing exploratory data analysis for a natural language processing model or building a search feature into your application.

In [6]:
# Full-text match of film titles against the query term 'elf'.
query_result = execute_query(sql_query="""
-- Select the title and description
SELECT title, description
FROM film
-- Convert the title to a tsvector and match it against the tsquery 
WHERE to_tsvector(title) @@ to_tsquery('elf');
    """)
query_result.head()

Unnamed: 0,title,description
0,ELF MURDER,A Action-Packed Story of a Frisbee And a Woman...
1,ENCINO ELF,A Astounding Drama of a Feminist And a Teacher...
2,GHOSTBUSTERS ELF,A Thoughtful Epistle of a Dog And a Feminist w...


# User-defined data types

`ENUM` or enumerated data types are great options to use in your database when you have a column where you want to store a fixed list of values that rarely change. 

In [7]:
# NOTE(review): this cell is left commented out. execute_query() sends every
# statement through pd.read_sql and closes the connection without calling
# commit, so this DDL presumably has to be run another way (e.g. psql) -
# confirm before uncommenting. The pg_type lookup on 'compass_position'
# later in the notebook expects this type to exist already.
# query_result = execute_query(
#     """
# -- Create an enumerated data type, compass_position
# CREATE TYPE compass_position AS ENUM (
#   	-- Use the four cardinal directions
#   	'north', 
#   	'south',
#   	'east', 
#   	'west'
# );
#     """)
# query_result.head()

In [8]:
# Look up the type category recorded in pg_type for compass_position.
query_result = execute_query(sql_query="""
-- Confirm the new data type is in the pg_type system table
SELECT typcategory
FROM pg_type
WHERE typname='compass_position';
    """)
query_result.head()

Unnamed: 0,typcategory
0,E


# Getting info about user-defined data types

The Sakila database has a user-defined enum data type called mpaa_rating. The rating column in the film table is an mpaa_rating type and contains the familiar rating for that film like PG or R. This is a great example of when an enumerated data type comes in handy. Film ratings have a limited number of standard values that rarely change.

In [9]:
# Inspect how INFORMATION_SCHEMA.COLUMNS describes the film.rating column.
query_result = execute_query(sql_query="""
-- Select the column name, data type and udt name columns
SELECT column_name, data_type, udt_name
FROM INFORMATION_SCHEMA.COLUMNS 
-- Filter by the rating column in the film table
WHERE table_name='film'  AND  column_name ='rating';

    """)
query_result.head()

Unnamed: 0,column_name,data_type,udt_name
0,rating,USER-DEFINED,mpaa_rating


In [10]:
# Fetch the full pg_type row for the mpaa_rating type.
query_result = execute_query(sql_query="""
SELECT *
FROM pg_type 
WHERE typname='mpaa_rating'
    """)
query_result.head()

Unnamed: 0,oid,typname,typnamespace,typowner,typlen,typbyval,typtype,typcategory,typispreferred,typisdefined,...,typalign,typstorage,typnotnull,typbasetype,typtypmod,typndims,typcollation,typdefaultbin,typdefault,typacl
0,33102,mpaa_rating,2200,10,4,True,e,E,False,True,...,i,p,False,0,-1,0,0,,,


# User-defined functions in Sakila

If you were running a real-life DVD rental store, there are many questions that you may need to answer repeatedly, like whether a film is in stock at a particular store or what the outstanding balance is for a particular customer. These types of scenarios are where user-defined functions come in very handy. The Sakila database has several user-defined functions pre-defined. These functions are available out of the box and can be used in your queries like many of the built-in functions.

In [11]:
# Pair every inventory item with the title of its film.
query_result = execute_query(sql_query="""
-- Select the film title and inventory ids
SELECT 
	f.title, 
    i.inventory_id
FROM film AS f 
	-- Join the film table to the inventory table
	INNER JOIN inventory AS i ON f.film_id=i.film_id 
    """)
query_result.head()

Unnamed: 0,title,inventory_id
0,ACADEMY DINOSAUR,1
1,ACADEMY DINOSAUR,2
2,ACADEMY DINOSAUR,3
3,ACADEMY DINOSAUR,4
4,ACADEMY DINOSAUR,5


In [12]:
# Same join, plus the result of inventory_held_by_customer() for each item.
query_result = execute_query(sql_query="""
-- Select the film title, rental and inventory ids
SELECT 
	f.title, 
    i.inventory_id,
    -- Determine whether the inventory is held by a customer
    inventory_held_by_customer(i.inventory_id) AS held_by_cust 
FROM film as f 
	-- Join the film table to the inventory table
	INNER JOIN inventory AS i ON f.film_id=i.film_id 
    """)
query_result.head()

Unnamed: 0,title,inventory_id,held_by_cust
0,ACADEMY DINOSAUR,1,
1,ACADEMY DINOSAUR,2,
2,ACADEMY DINOSAUR,3,
3,ACADEMY DINOSAUR,4,
4,ACADEMY DINOSAUR,5,


In [13]:
# Keep only inventory items for which inventory_held_by_customer() is non-null.
query_result = execute_query(sql_query="""
-- Select the film title and inventory ids
SELECT 
	f.title, 
    i.inventory_id,
    -- Determine whether the inventory is held by a customer
    inventory_held_by_customer(i.inventory_id) as held_by_cust
FROM film as f 
	INNER JOIN inventory AS i ON f.film_id=i.film_id 
WHERE
	-- Only include results where the held_by_cust is not null
    inventory_held_by_customer(i.inventory_id) IS NOT NULL
    """)
query_result.head()

Unnamed: 0,title,inventory_id,held_by_cust
0,ACADEMY DINOSAUR,6,554
1,ACE GOLDFINGER,9,366
2,AFFAIR PREJUDICE,21,111
3,AFRICAN EGG,25,590
4,ALI FOREVER,70,108


# Enabling extensions

Before you can use the capabilities of an extension, it must be enabled. As you have previously learned, most PostgreSQL distributions come pre-bundled with many useful extensions to help extend the native features of your database. You will be working with `fuzzystrmatch` and `pg_trgm`.

In [14]:
# NOTE(review): this cell is left commented out. execute_query() sends every
# statement through pd.read_sql and closes the connection without calling
# commit, so this statement presumably has to be run another way
# (e.g. psql) - confirm before uncommenting.
# query_result = execute_query(
#     """
# -- Enable the pg_trgm extension
# CREATE EXTENSION IF NOT EXISTS pg_trgm;
#     """)
# query_result.head()

In [15]:
# List the rows of pg_extension.
query_result = execute_query(sql_query="""
-- Select all rows extensions
SELECT * 
FROM pg_extension;
    """)
query_result.head()

Unnamed: 0,oid,extname,extowner,extnamespace,extrelocatable,extversion,extconfig,extcondition
0,13740,plpgsql,10,11,False,1.0,,
1,33757,pg_trgm,10,2200,True,1.6,,
2,33838,fuzzystrmatch,10,2200,True,1.1,,


# Measuring similarity between two strings

We will measure the similarity between the title and description from the `film` table of the `Sakila` database.

In [16]:
# similarity() between each film's title and its description.
query_result = execute_query(sql_query="""
-- Select the title and description columns
SELECT 
  title, 
  description, 
  -- Calculate the similarity
  similarity(title, description)
FROM 
  film
    """)
query_result.head()

Unnamed: 0,title,description,similarity
0,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,0.02
1,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,0.041237
2,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...,0.045455
3,AFFAIR PREJUDICE,A Fanciful Documentary of a Frisbee And a Lumb...,0.010204
4,AFRICAN EGG,A Fast-Paced Documentary of a Pastry Chef And ...,0.009524


# Levenshtein distance examples

Now let's take a closer look at how we can use the `levenshtein` function to match strings against text data. If you recall, the `levenshtein` distance represents the number of edits required to convert one string to another string being compared.

In [17]:


# NOTE(review): this cell is left commented out. execute_query() sends every
# statement through pd.read_sql and closes the connection without calling
# commit, so this statement presumably has to be run another way
# (e.g. psql) - confirm before uncommenting.
# query_result = execute_query(
#     """
# CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
#     """)
# query_result.head()

In [18]:
# Rank film titles by Levenshtein distance from 'JET NEIGHBOR'
# (ORDER BY 3 sorts on the third selected column, i.e. distance).
query_result = execute_query(sql_query="""
-- Select the title and description columns
SELECT  
  title, 
  description, 
  -- Calculate the levenshtein distance
  levenshtein(title, 'JET NEIGHBOR') AS distance
FROM 
  film
ORDER BY 3
    """)
query_result.head()

Unnamed: 0,title,description,distance
0,JET NEIGHBORS,A Amazing Display of a Lumberjack And a Teache...,1
1,HILLS NEIGHBORS,A Epic Display of a Hunter And a Feminist who ...,6
2,BED HIGHBALL,A Astounding Panorama of a Lumberjack And a Do...,7
3,EGG IGBY,A Beautiful Documentary of a Boat And a Sumo W...,8
4,WEST LION,A Intrepid Drama of a Butler And a Lumberjack ...,8


In [19]:
# List the rows of pg_extension again.
query_result = execute_query(sql_query="""
-- Select all rows extensions
SELECT * 
FROM pg_extension;
    """)
query_result.head()

Unnamed: 0,oid,extname,extowner,extnamespace,extrelocatable,extversion,extconfig,extcondition
0,13740,plpgsql,10,11,False,1.0,,
1,33757,pg_trgm,10,2200,True,1.6,,
2,33838,fuzzystrmatch,10,2200,True,1.1,,


# Putting it all together

In this exercise, we are going to use many of the techniques and concepts we learned throughout the course to generate a data set that we could use to predict whether the words and phrases used to describe a film have an impact on the number of rentals.

In [20]:
# Full-text filter: descriptions matching both 'Astounding' and 'Drama'.
query_result = execute_query(sql_query="""
-- Select the title and description columns
SELECT  
  title, 
  description 
FROM 
  film
WHERE 
  -- Match "Astounding Drama" in the description
  to_tsvector(description) @@ 
  to_tsquery('Astounding & Drama');
    """)
query_result.head()

Unnamed: 0,title,description
0,BIKINI BORROWERS,A Astounding Drama of a Astronaut And a Cat wh...
1,CAMPUS REMEMBER,A Astounding Drama of a Crocodile And a Mad Co...
2,COWBOY DOOM,A Astounding Drama of a Boy And a Lumberjack w...
3,ENCINO ELF,A Astounding Drama of a Feminist And a Teacher...
4,GLASS DYING,A Astounding Drama of a Frisbee And a Astronau...


In [21]:
# Same full-text filter, ordered by similarity to 'Astounding Drama', highest first.
query_result = execute_query(sql_query="""
SELECT 
  title, 
  description, 
  -- Calculate the similarity
  similarity(description, 'Astounding Drama')
FROM 
  film 
WHERE 
  to_tsvector(description) @@ 
  to_tsquery('Astounding & Drama') 
ORDER BY 
	similarity(description, 'Astounding Drama') DESC;
    """)
query_result.head()

Unnamed: 0,title,description,similarity
0,COWBOY DOOM,A Astounding Drama of a Boy And a Lumberjack w...,0.246377
1,GLASS DYING,A Astounding Drama of a Frisbee And a Astronau...,0.239437
2,CAMPUS REMEMBER,A Astounding Drama of a Crocodile And a Mad Co...,0.236111
3,ENCINO ELF,A Astounding Drama of a Feminist And a Teacher...,0.22973
4,BIKINI BORROWERS,A Astounding Drama of a Astronaut And a Cat wh...,0.195402
