# Final Project 
This project demonstrates the use of various software libraries and programming techniques, together with an understanding of data systems and architectural principles.

### Import Required Libraries

In [0]:
#!pip install pymongo
import os
import json
import pymongo
import pyspark.pandas as pd 
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BinaryType
from pyspark.sql.types import ByteType, ShortType, IntegerType, LongType, FloatType, DecimalType

### Instantiate Global Variables

In [0]:
# Azure MySQL Server Connection Information ###################
# Host name of the Azure MySQL server. The template's angle brackets have been removed:
# they are not valid in a host name and the JDBC URLs used by the SQL cells omit them.
jdbc_hostname = "zta9cq-mysql.mysql.database.azure.com"
jdbc_port = 3306
src_database = "sakila_p"

# NOTE(review): credentials are stored in plain text in this notebook. Rotate them and
# read them from a secret store instead of committing them with the code.
connection_properties = {
  "user" : "zta9cq",
  "password" : "700038Za",
  "driver" : "org.mariadb.jdbc.Driver"
}

# MongoDB Atlas Connection Information ########################
atlas_cluster_name = "clusterza"
atlas_database_name = "sakila_p"
atlas_user_name = "zta9cq"
atlas_password = "700038Za"

# Data Files (JSON) Information ###############################
dst_database = "sakila_dlh"

# Root folder of the project on DBFS and the folder backing the destination database.
base_dir = "dbfs:/FileStore/ds2002-finalProject"
database_dir = f"{base_dir}/{dst_database}"

# Source data: batch (reference) files and streaming (fact) files.
data_dir = f"{base_dir}/final_source_data"
batch_dir = f"{data_dir}/batchf"
stream_dir = f"{data_dir}/streamf"

inventory_stream_dir = f"{stream_dir}/inventory"
rental_stream_dir = f"{stream_dir}/rental"

# Bronze / silver / gold output folders for each fact stream.
inventory_output_bronze = f"{database_dir}/fact_inventory/bronze"
inventory_output_silver = f"{database_dir}/fact_inventory/silver"
inventory_output_gold   = f"{database_dir}/fact_inventory/gold"

rental_output_bronze = f"{database_dir}/fact_rental/bronze"
rental_output_silver = f"{database_dir}/fact_rental/silver"
rental_output_gold   = f"{database_dir}/fact_rental/gold"


# Delete the Streaming Files ##################################
# Both folders sit under database_dir, which is removed right after as well.
dbutils.fs.rm(f"{database_dir}/fact_inventory", True)
dbutils.fs.rm(f"{database_dir}/fact_rental", True)

# Delete the Database Files ###################################
# Kept as the last expression so the cell still reports the result of the removal.
dbutils.fs.rm(database_dir, True)

Out[2]: True

### Define Global Functions

In [0]:
# ######################################################################################################################
# Use this Function to Fetch a DataFrame from the MongoDB Atlas database server Using PyMongo.
# ######################################################################################################################
def get_mongo_dataframe(user_id, pwd, cluster_name, db_name, collection, conditions, projection, sort):
    '''Fetch documents from a MongoDB Atlas collection and return them as a DataFrame.

    Parameters:
        user_id:      Atlas user name used in the connection string.
        pwd:          Atlas password used in the connection string.
        cluster_name: Atlas cluster name (first label of the cluster host name).
        db_name:      Database to connect to and query.
        collection:   Name of the collection to read.
        conditions:   Query filter; a falsy value selects every document.
        projection:   Field projection; a falsy value returns every field.
        sort:         Sort specification passed to the cursor; skipped when falsy.

    Returns:
        A DataFrame built with the notebook's `pd` alias from the matching documents.
    '''
    from urllib.parse import quote_plus

    # Build the URI from the arguments (user and password percent-escaped) instead of
    # hard-coded literals, so the function really connects where the caller asks.
    mongo_uri = (
        f"mongodb+srv://{quote_plus(user_id)}:{quote_plus(pwd)}"
        f"@{cluster_name}.i4wcg5v.mongodb.net/{db_name}"
    )

    client = pymongo.MongoClient(mongo_uri)
    try:
        db = client[db_name]
        # Apply each of filter / projection / sort independently; previously a filter or
        # sort was silently ignored unless a projection was supplied too.
        cursor = db[collection].find(conditions or {}, projection or None)
        if sort:
            cursor = cursor.sort(sort)
        dframe = pd.DataFrame(list(cursor))
    finally:
        # Release the connection even when the query fails.
        client.close()

    return dframe

# ######################################################################################################################
# Use this Function to Create New Collections by Uploading JSON file(s) to the MongoDB Atlas server.
# ######################################################################################################################
def set_mongo_collection(user_id, pwd, cluster_name, db_name, src_file_path, json_files):
    '''Re-create MongoDB collections from JSON files.

    Parameters:
        user_id:       Atlas user name used in the connection string.
        pwd:           Atlas password used in the connection string.
        cluster_name:  Atlas cluster name (first label of the cluster host name).
        db_name:       Database in which the collections are created.
        src_file_path: Directory that contains the JSON files.
        json_files:    Mapping of collection name -> JSON file name.

    Returns:
        The result of the last `insert_many` call, or None when `json_files` is empty.
    '''
    from urllib.parse import quote_plus

    # Build the URI from the arguments (user and password percent-escaped) instead of
    # hard-coded literals.
    mongo_uri = (
        f"mongodb+srv://{quote_plus(user_id)}:{quote_plus(pwd)}"
        f"@{cluster_name}.i4wcg5v.mongodb.net/{db_name}"
    )
    client = pymongo.MongoClient(mongo_uri)

    # Defined up front so an empty mapping returns None instead of raising UnboundLocalError.
    result = None
    try:
        db = client[db_name]
        for collection_name, file_name in json_files.items():
            json_path = os.path.join(src_file_path, file_name)
            # Read the file before dropping anything, so an unreadable file does not
            # leave the existing collection deleted.
            with open(json_path, 'r') as openfile:
                documents = json.load(openfile)
            # Drop-then-insert keeps repeated runs from duplicating documents.
            db.drop_collection(collection_name)
            result = db[collection_name].insert_many(documents)
    finally:
        # Release the connection even when a file or insert fails.
        client.close()

    return result

## Populate Dimensions by Ingesting Reference (Cold-path) Data 
#### 1.0. Fetch Reference Data From an Azure MySQL Database

### Create a new Databricks metadata Database sakila_dlh

In [0]:
%sql
-- Start from a clean slate: remove sakila_dlh and, via CASCADE, everything it contains.
DROP DATABASE IF EXISTS sakila_dlh CASCADE;

In [0]:
%sql
-- Create the sakila_dlh metadata database at an explicit DBFS location and tag it
-- with descriptive database properties.
CREATE DATABASE IF NOT EXISTS sakila_dlh
COMMENT "DS-2002 Final Project database"
LOCATION "dbfs:/FileStore/ds2002-finalProject/sakila_dlh"
WITH DBPROPERTIES (contains_pii = true, purpose = "Final Project");

### Create a New Table that Sources Date Dimension Data from a Table in an Azure MySQL database.

#### The Date Dimension Table

In [0]:
%sql
-- Expose the MySQL table "dim_date" (database sakila_p) as a temporary view through
-- Spark's JDBC data source.
-- NOTE(review): the JDBC user and password are embedded in plain text; move them to a
-- secret store and rotate them.
CREATE OR REPLACE TEMPORARY VIEW view_date
USING org.apache.spark.sql.jdbc
OPTIONS (
  url "jdbc:mysql://zta9cq-mysql.mysql.database.azure.com:3306/sakila_p",
  dbtable "dim_date",
  user "zta9cq",
  password "700038Za"
)

/* Prerequisite: the sakila schema and data scripts must already be loaded into MySQL,
   and the dimension tables created as in Labs 2, 3 and 4. */

In [0]:
%sql
-- Make sakila_dlh the current database for the statements that follow.
USE DATABASE sakila_dlh;

-- Materialize the JDBC-backed view_date into a table stored at an explicit DBFS location.
CREATE OR REPLACE TABLE sakila_dlh.dim_date
COMMENT "Date Dimension Table"
LOCATION "dbfs:/FileStore/ds2002-finalProject/sakila_dlh/dim_date"
AS SELECT * FROM view_date

num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Inspect the columns and table metadata of the new date dimension.
DESCRIBE EXTENDED sakila_dlh.dim_date;

col_name,data_type,comment
date_key,int,
full_date,date,
date_name,string,
date_name_us,string,
date_name_eu,string,
day_of_week,int,
day_name_of_week,string,
day_of_month,int,
day_of_year,int,
weekday_weekend,string,


In [0]:
%sql
-- Preview a few rows of the date dimension to confirm the load.
SELECT * FROM sakila_dlh.dim_date LIMIT 5

date_key,full_date,date_name,date_name_us,date_name_eu,day_of_week,day_name_of_week,day_of_month,day_of_year,weekday_weekend,week_of_year,month_name,month_of_year,is_last_day_of_month,calendar_quarter,calendar_year,calendar_year_month,calendar_year_qtr,fiscal_month_of_year,fiscal_quarter,fiscal_year,fiscal_year_month,fiscal_year_qtr
20000101,2000-01-01,2000/01/01,01/01/2000,01/01/2000,7,Saturday,1,1,Weekend,52,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
20000102,2000-01-02,2000/01/02,01/02/2000,02/01/2000,1,Sunday,2,2,Weekend,52,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
20000103,2000-01-03,2000/01/03,01/03/2000,03/01/2000,2,Monday,3,3,Weekday,1,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
20000104,2000-01-04,2000/01/04,01/04/2000,04/01/2000,3,Tuesday,4,4,Weekday,1,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
20000105,2000-01-05,2000/01/05,01/05/2000,05/01/2000,4,Wednesday,5,5,Weekday,1,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3


#### Create a New Table that Sources Staff Dimension Data from an Azure MySQL database.

In [0]:
%sql

-- Expose the MySQL table "dim_staff" (database sakila_p) as the temporary view
-- "view_staff" through Spark's JDBC data source.
-- NOTE(review): the JDBC user and password are embedded in plain text; move them to a
-- secret store and rotate them.

CREATE OR REPLACE TEMPORARY VIEW view_staff
USING org.apache.spark.sql.jdbc
OPTIONS (
  url "jdbc:mysql://zta9cq-mysql.mysql.database.azure.com:3306/sakila_p",
  dbtable "dim_staff",
  user "zta9cq",
  password "700038Za"
)

In [0]:
%sql
-- Make sakila_dlh the current database for the statements that follow.
USE DATABASE sakila_dlh;

-- Materialize the JDBC-backed view "view_staff" into the table "sakila_dlh.dim_staff",
-- stored at an explicit DBFS location. The table comment now describes the staff
-- dimension (it previously read "Date Staff Table", copied from the date dimension).

CREATE OR REPLACE TABLE sakila_dlh.dim_staff
COMMENT "Staff Dimension Table"
LOCATION "dbfs:/FileStore/ds2002-finalProject/sakila_dlh/dim_staff"
AS SELECT * FROM view_staff



num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Inspect the columns and table metadata of the new staff dimension.
DESCRIBE EXTENDED sakila_dlh.dim_staff;

col_name,data_type,comment
staff_key,bigint,
first_name,string,
last_name,string,
address_key,bigint,
email,string,
store_key,bigint,
active,bigint,
username,string,
,,
# Detailed Table Information,,


In [0]:
%sql
-- Show every row of the staff dimension to confirm the load.
SELECT * FROM sakila_dlh.dim_staff

staff_key,first_name,last_name,address_key,email,store_key,active,username
1,Mike,Hillyer,3,Mike.Hillyer@sakilastaff.com,1,1,Mike
2,Jon,Stephens,4,Jon.Stephens@sakilastaff.com,2,1,Jon


### Fetch Reference Data from a MongoDB Atlas Database

#### View the Data Files on the Databricks File System

In [0]:
# List the reference (batch) files staged on DBFS and render the listing.
batch_files = dbutils.fs.ls(batch_dir)
display(batch_files)

path,name,size,modificationTime
dbfs:/FileStore/ds2002-finalProject/final_source_data/batchf/sakila_dimCustomer.csv,sakila_dimCustomer.csv,33843,1682018055000
dbfs:/FileStore/ds2002-finalProject/final_source_data/batchf/sakila_dimStore.json,sakila_dimStore.json,149,1682018055000


#### Create a New MongoDB Database, and Load JSON Data Into a New MongoDB Collection
**NOTE:** The following cell **can** be run more than once because the **set_mongo_collection()** function **is** idempotent.

In [0]:
# Local (FUSE) path of the batch folder, derived from batch_dir so the location is
# defined in one place only: "dbfs:/..." becomes "/dbfs/...".
source_dir = batch_dir.replace("dbfs:", "/dbfs", 1)

# Mapping of target collection name -> JSON file to load into it.
json_file = {"store" : 'sakila_dimStore.json'}

# Re-create the "store" collection in MongoDB Atlas; the call result is the cell output.
set_mongo_collection(atlas_user_name, atlas_password, atlas_cluster_name, atlas_database_name, source_dir, json_file)

Out[14]: <pymongo.results.InsertManyResult at 0x7f4f9feece00>

##### Get store dimension data from the new MongoDB collection

In [0]:
%scala
import com.mongodb.spark._

// Reader bound to the "store" collection of the "sakila_p" MongoDB database.
val storeReader = spark.read
  .format("com.mongodb.spark.sql.DefaultSource")
  .option("database", "sakila_p")
  .option("collection", "store")

// Load the collection and keep only the three store-dimension columns.
val df_store = storeReader
  .load()
  .select("store_key", "manager_staff_key", "address_key")

display(df_store)

store_key,manager_staff_key,address_key
1,1,1
2,2,2


In [0]:
%scala
// Print the schema of the store DataFrame loaded from MongoDB.
df_store.printSchema()

##### Use the Spark dataFrame to create store dimension table in the Databricks Metadata Database

In [0]:
%scala
// Save the store DataFrame as the Delta table sakila_dlh.dim_store, replacing any existing data.
df_store.write.format("delta").mode("overwrite").saveAsTable("sakila_dlh.dim_store")

In [0]:
%sql
-- Inspect the columns and table metadata of the new store dimension.
DESCRIBE EXTENDED sakila_dlh.dim_store

col_name,data_type,comment
store_key,int,
manager_staff_key,int,
address_key,int,
,,
# Detailed Table Information,,
Catalog,spark_catalog,
Database,sakila_dlh,
Table,dim_store,
Type,MANAGED,
Location,dbfs:/FileStore/ds2002-finalProject/sakila_dlh/dim_store,


In [0]:
%sql
-- Preview a few rows of the store dimension to confirm the load.
SELECT * FROM sakila_dlh.dim_store LIMIT 5

store_key,manager_staff_key,address_key
1,1,1
2,2,2


### Fetch Data from a File System

#### PySpark to read from a CSV file

##### Customer dimension table

In [0]:
# Location of the customer dimension extract inside the batch folder.
customer_csv = f"{batch_dir}/sakila_dimCustomer.csv"

# CSV reader that treats the first row as a header and lets Spark infer column types.
customer_reader = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
)

df_customer = customer_reader.load(customer_csv)
display(df_customer)

customer_key,store_key,first_name,last_name,email,address_key
1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5
2,1,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,6
3,1,LINDA,WILLIAMS,LINDA.WILLIAMS@sakilacustomer.org,7
4,2,BARBARA,JONES,BARBARA.JONES@sakilacustomer.org,8
5,1,ELIZABETH,BROWN,ELIZABETH.BROWN@sakilacustomer.org,9
6,2,JENNIFER,DAVIS,JENNIFER.DAVIS@sakilacustomer.org,10
7,1,MARIA,MILLER,MARIA.MILLER@sakilacustomer.org,11
8,2,SUSAN,WILSON,SUSAN.WILSON@sakilacustomer.org,12
9,2,MARGARET,MOORE,MARGARET.MOORE@sakilacustomer.org,13
10,1,DOROTHY,TAYLOR,DOROTHY.TAYLOR@sakilacustomer.org,14


In [0]:
# Print the column types Spark inferred for the customer CSV.
df_customer.printSchema()

root
 |-- customer_key: integer (nullable = true)
 |-- store_key: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- address_key: integer (nullable = true)



In [0]:
# Save the customer DataFrame as the Delta table sakila_dlh.dim_customer, replacing any existing data.
customer_writer = df_customer.write.format("delta").mode("overwrite")
customer_writer.saveAsTable("sakila_dlh.dim_customer")

In [0]:
%sql
-- Inspect the columns and table metadata of the new customer dimension.
DESCRIBE EXTENDED sakila_dlh.dim_customer

col_name,data_type,comment
customer_key,int,
store_key,int,
first_name,string,
last_name,string,
email,string,
address_key,int,
,,
# Detailed Table Information,,
Catalog,spark_catalog,
Database,sakila_dlh,


In [0]:
%sql
-- Preview a few rows of the customer dimension to confirm the load.
SELECT * FROM sakila_dlh.dim_customer LIMIT 5;

customer_key,store_key,first_name,last_name,email,address_key
1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5
2,1,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,6
3,1,LINDA,WILLIAMS,LINDA.WILLIAMS@sakilacustomer.org,7
4,2,BARBARA,JONES,BARBARA.JONES@sakilacustomer.org,8
5,1,ELIZABETH,BROWN,ELIZABETH.BROWN@sakilacustomer.org,9


##### Verify Dimension Tables
The date and staff dimensions come from MySQL, the store dimension from MongoDB, and the customer dimension from a CSV file on the DBFS file system.

In [0]:
%sql
-- Switch to sakila_dlh and list its tables (temporary views are listed as well).
USE sakila_dlh;
SHOW TABLES

database,tableName,isTemporary
sakila_dlh,dim_customer,False
sakila_dlh,dim_date,False
sakila_dlh,dim_staff,False
sakila_dlh,dim_store,False
,view_date,True
,view_staff,True


### Integrate Reference Data with Real-Time Data

### Use AutoLoader to Process Streaming Rental Fact Data 
##### Bronze Table: Process 'Raw' JSON Data

In [0]:
# Use spark.readStream with Auto Loader ("cloudFiles") to read the JSON files in
# "rental_stream_dir" and expose them as the temporary view "rental_raw_tempview".
# The inferred schema is tracked under "rental_output_bronze" ("cloudFiles.schemaLocation").

# All hints must go into ONE comma-separated "cloudFiles.schemaHints" value: setting the
# option repeatedly keeps only the last value. The hints use the column names found in
# the source files (rental_id, inventory_id, ...); the *_key names only appear later, in
# the silver view. rental_date and return_date carry a time of day and are deliberately
# left as inferred strings, which is what the downstream date joins expect.
rental_schema_hints = ", ".join([
    "rental_id BIGINT",
    "inventory_id BIGINT",
    "customer_id BIGINT",
    "staff_id BIGINT",
    "last_update DATE",
])

(spark.readStream
 .format("cloudFiles")
 .option("cloudFiles.format", "json")
 .option("cloudFiles.schemaHints", rental_schema_hints)
 .option("cloudFiles.schemaLocation", rental_output_bronze)
 .option("cloudFiles.inferColumnTypes", "true")
 .option("multiLine", "true")
 .load(rental_stream_dir)
 .createOrReplaceTempView("rental_raw_tempview"))

In [0]:
%sql
-- Bronze view: every raw rental column plus two traceability columns, the time the
-- row was received and the file it was read from.
CREATE OR REPLACE TEMPORARY VIEW rental_bronze_tempview AS
SELECT
  raw.*,
  current_timestamp() AS receipt_time,
  input_file_name()   AS source_file
FROM rental_raw_tempview AS raw

In [0]:
%sql
-- Preview the bronze view, including the receipt_time and source_file columns.
SELECT * FROM rental_bronze_tempview

customer_id,inventory_id,last_update,rental_date,rental_id,return_date,staff_id,_rescued_data,receipt_time,source_file
130,367,2006-02-15,2005-05-24 22:53:30,1,2005-05-26 22:04:30,1,,2023-04-27T15:29:10.574+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
459,1525,2006-02-15,2005-05-24 22:54:33,2,2005-05-28 19:40:33,1,,2023-04-27T15:29:10.574+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
408,1711,2006-02-15,2005-05-24 23:03:39,3,2005-06-01 22:12:39,1,,2023-04-27T15:29:10.574+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
333,2452,2006-02-15,2005-05-24 23:04:41,4,2005-06-03 01:43:41,2,,2023-04-27T15:29:10.574+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
222,2079,2006-02-15,2005-05-24 23:05:21,5,2005-06-02 04:33:21,1,,2023-04-27T15:29:10.574+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
549,2792,2006-02-15,2005-05-24 23:08:07,6,2005-05-27 01:32:07,1,,2023-04-27T15:29:10.574+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
269,3995,2006-02-15,2005-05-24 23:11:53,7,2005-05-29 20:34:53,2,,2023-04-27T15:29:10.574+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
239,2346,2006-02-15,2005-05-24 23:31:46,8,2005-05-27 23:33:46,2,,2023-04-27T15:29:10.574+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
126,2580,2006-02-15,2005-05-25 00:00:40,9,2005-05-28 00:22:40,1,,2023-04-27T15:29:10.574+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
399,1824,2006-02-15,2005-05-25 00:02:21,10,2005-05-31 22:44:21,2,,2023-04-27T15:29:10.574+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json


In [0]:
# Configure a streaming Delta writer over the bronze view, appending new rows and
# checkpointing under the bronze output folder.
rental_bronze_writer = (
    spark.table("rental_bronze_tempview")
    .writeStream
    .format("delta")
    .option("checkpointLocation", f"{rental_output_bronze}/_checkpoint")
    .outputMode("append")
)

# Start the stream into "fact_rental_bronze"; the returned query is the cell output.
rental_bronze_writer.table("fact_rental_bronze")

Out[27]: <pyspark.sql.streaming.query.StreamingQuery at 0x7f4f9fe94520>

##### Silver Table: Include Reference Data

In [0]:
# Read the bronze Delta table as a stream and expose it as "rental_silver_tempview".
rental_bronze_stream = spark.readStream.table("fact_rental_bronze")
rental_bronze_stream.createOrReplaceTempView("rental_silver_tempview")

In [0]:
%sql
-- Preview the rows streamed back out of the bronze table.
SELECT * FROM rental_silver_tempview

customer_id,inventory_id,last_update,rental_date,rental_id,return_date,staff_id,_rescued_data,receipt_time,source_file
130,367,2006-02-15,2005-05-24 22:53:30,1,2005-05-26 22:04:30,1,,2023-04-27T15:29:30.493+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
459,1525,2006-02-15,2005-05-24 22:54:33,2,2005-05-28 19:40:33,1,,2023-04-27T15:29:30.493+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
408,1711,2006-02-15,2005-05-24 23:03:39,3,2005-06-01 22:12:39,1,,2023-04-27T15:29:30.493+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
333,2452,2006-02-15,2005-05-24 23:04:41,4,2005-06-03 01:43:41,2,,2023-04-27T15:29:30.493+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
222,2079,2006-02-15,2005-05-24 23:05:21,5,2005-06-02 04:33:21,1,,2023-04-27T15:29:30.493+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
549,2792,2006-02-15,2005-05-24 23:08:07,6,2005-05-27 01:32:07,1,,2023-04-27T15:29:30.493+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
269,3995,2006-02-15,2005-05-24 23:11:53,7,2005-05-29 20:34:53,2,,2023-04-27T15:29:30.493+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
239,2346,2006-02-15,2005-05-24 23:31:46,8,2005-05-27 23:33:46,2,,2023-04-27T15:29:30.493+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
126,2580,2006-02-15,2005-05-25 00:00:40,9,2005-05-28 00:22:40,1,,2023-04-27T15:29:30.493+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json
399,1824,2006-02-15,2005-05-25 00:02:21,10,2005-05-31 22:44:21,2,,2023-04-27T15:29:30.493+0000,dbfs:/FileStore/ds2002-finalProject/final_source_data/streamf/rental/sakila_rental.json


In [0]:
%sql
-- Check the column types of the silver input view before joining it to the dimensions.
DESCRIBE EXTENDED rental_silver_tempview

col_name,data_type,comment
customer_id,bigint,
inventory_id,bigint,
last_update,date,
rental_date,string,
rental_id,bigint,
return_date,string,
staff_id,bigint,
_rescued_data,string,
receipt_time,timestamp,
source_file,string,


In [0]:
%sql
-- Create the temporary view "fact_rental_silver_tempview" by selecting from
-- "rental_silver_tempview" and joining it to the Staff, Customer, Store and Date
-- dimension tables. The *_id source columns are renamed to *_key.
--
-- rental_date and return_date are STRING datetimes while dim_date.full_date is a DATE,
-- so both are cast to DATE explicitly instead of relying on implicit type coercion.
--
-- NOTE(review): dim_store is joined on manager_staff_key and none of its columns are
-- selected, so that join only restricts rows to rentals handled by a store manager.
-- Confirm that this is intended.
CREATE OR REPLACE TEMPORARY VIEW fact_rental_silver_tempview AS (
  SELECT r.rental_id AS rental_key,
      r.rental_date,
      r.inventory_id AS inventory_key,
      r.customer_id AS customer_key,
      r.return_date,
      r.staff_id AS staff_key,
      r.last_update,
      c.first_name,
      c.last_name,
      c.email AS customer_email,
      c.address_key,
      st.email AS staff_email,
      rid.day_name_of_week AS rental_date_day_name_of_week,
      rid.day_of_month AS rental_date_day_of_month,
      rid.weekday_weekend AS rental_date_weekday_weekend,
      rid.month_name AS rental_date_month_name,
      rid.calendar_quarter AS rental_date_calendar_quarter,
      rid.calendar_year AS rental_date_calendar_year,
      rrd.day_name_of_week AS return_date_name_of_week,
      rrd.day_of_month AS return_date_day_of_month,
      rrd.weekday_weekend AS return_date_weekday_weekend,
      rrd.month_name AS return_date_month_name,
      rrd.calendar_quarter AS return_date_calendar_quarter,
      rrd.calendar_year AS return_date_calendar_year,
      rld.day_name_of_week AS last_update_name_of_week,
      rld.day_of_month AS last_update_day_of_month,
      rld.weekday_weekend AS last_update_weekday_weekend,
      rld.month_name AS last_update_month_name,
      rld.calendar_quarter AS last_update_calendar_quarter,
      rld.calendar_year AS last_update_calendar_year
  FROM rental_silver_tempview AS r
  INNER JOIN sakila_dlh.dim_staff AS st
  ON st.staff_key = r.staff_id
  INNER JOIN sakila_dlh.dim_customer AS c
  ON c.customer_key = r.customer_id
  INNER JOIN sakila_dlh.dim_store AS s
  ON s.manager_staff_key = r.staff_id
  -- Role-playing date dimension: one alias per date column of the fact.
  LEFT OUTER JOIN sakila_dlh.dim_date AS rid
  ON rid.full_date = CAST(r.rental_date AS DATE)
  LEFT OUTER JOIN sakila_dlh.dim_date AS rrd
  ON rrd.full_date = CAST(r.return_date AS DATE)
  LEFT OUTER JOIN sakila_dlh.dim_date AS rld
  ON rld.full_date = r.last_update
  )

In [0]:
# Configure a streaming Delta writer over the enriched silver view, appending new rows
# and checkpointing under the silver output folder.
rental_silver_writer = (
    spark.table("fact_rental_silver_tempview")
    .writeStream
    .format("delta")
    .option("checkpointLocation", f"{rental_output_silver}/_checkpoint")
    .outputMode("append")
)

# Start the stream into "fact_rental_silver"; the returned query is the cell output.
rental_silver_writer.table("fact_rental_silver")

Out[32]: <pyspark.sql.streaming.query.StreamingQuery at 0x7f4f9feedca0>

In [0]:
%sql
-- Preview the silver fact table with the joined customer, staff and date attributes.
SELECT * FROM fact_rental_silver

rental_key,rental_date,inventory_key,customer_key,return_date,staff_key,last_update,first_name,last_name,customer_email,address_key,staff_email,rental_date_day_name_of_week,rental_date_day_of_month,rental_date_weekday_weekend,rental_date_month_name,rental_date_calendar_quarter,rental_date_calendar_year,return_date_name_of_week,return_date_day_of_month,return_date_weekday_weekend,return_date_month_name,return_date_calendar_quarter,return_date_calendar_year,last_update_name_of_week,last_update_day_of_month,last_update_weekday_weekend,last_update_month_name,last_update_calendar_quarter,last_update_calendar_year
1,2005-05-24 22:53:30,367,130,2005-05-26 22:04:30,1,2006-02-15,CHARLOTTE,HUNTER,CHARLOTTE.HUNTER@sakilacustomer.org,134,Mike.Hillyer@sakilastaff.com,Tuesday,24,Weekday,May,2,2005,Thursday,26,Weekday,May,2,2005,Wednesday,15,Weekday,February,1,2006
2,2005-05-24 22:54:33,1525,459,2005-05-28 19:40:33,1,2006-02-15,TOMMY,COLLAZO,TOMMY.COLLAZO@sakilacustomer.org,464,Mike.Hillyer@sakilastaff.com,Tuesday,24,Weekday,May,2,2005,Saturday,28,Weekend,May,2,2005,Wednesday,15,Weekday,February,1,2006
3,2005-05-24 23:03:39,1711,408,2005-06-01 22:12:39,1,2006-02-15,MANUEL,MURRELL,MANUEL.MURRELL@sakilacustomer.org,413,Mike.Hillyer@sakilastaff.com,Tuesday,24,Weekday,May,2,2005,Wednesday,1,Weekday,June,2,2005,Wednesday,15,Weekday,February,1,2006
4,2005-05-24 23:04:41,2452,333,2005-06-03 01:43:41,2,2006-02-15,ANDREW,PURDY,ANDREW.PURDY@sakilacustomer.org,338,Jon.Stephens@sakilastaff.com,Tuesday,24,Weekday,May,2,2005,Friday,3,Weekday,June,2,2005,Wednesday,15,Weekday,February,1,2006
5,2005-05-24 23:05:21,2079,222,2005-06-02 04:33:21,1,2006-02-15,DELORES,HANSEN,DELORES.HANSEN@sakilacustomer.org,226,Mike.Hillyer@sakilastaff.com,Tuesday,24,Weekday,May,2,2005,Thursday,2,Weekday,June,2,2005,Wednesday,15,Weekday,February,1,2006
6,2005-05-24 23:08:07,2792,549,2005-05-27 01:32:07,1,2006-02-15,NELSON,CHRISTENSON,NELSON.CHRISTENSON@sakilacustomer.org,555,Mike.Hillyer@sakilastaff.com,Tuesday,24,Weekday,May,2,2005,Friday,27,Weekday,May,2,2005,Wednesday,15,Weekday,February,1,2006
7,2005-05-24 23:11:53,3995,269,2005-05-29 20:34:53,2,2006-02-15,CASSANDRA,WALTERS,CASSANDRA.WALTERS@sakilacustomer.org,274,Jon.Stephens@sakilastaff.com,Tuesday,24,Weekday,May,2,2005,Sunday,29,Weekend,May,2,2005,Wednesday,15,Weekday,February,1,2006
8,2005-05-24 23:31:46,2346,239,2005-05-27 23:33:46,2,2006-02-15,MINNIE,ROMERO,MINNIE.ROMERO@sakilacustomer.org,243,Jon.Stephens@sakilastaff.com,Tuesday,24,Weekday,May,2,2005,Friday,27,Weekday,May,2,2005,Wednesday,15,Weekday,February,1,2006
9,2005-05-25 00:00:40,2580,126,2005-05-28 00:22:40,1,2006-02-15,ELLEN,SIMPSON,ELLEN.SIMPSON@sakilacustomer.org,130,Mike.Hillyer@sakilastaff.com,Wednesday,25,Weekday,May,2,2005,Saturday,28,Weekend,May,2,2005,Wednesday,15,Weekday,February,1,2006
10,2005-05-25 00:02:21,1824,399,2005-05-31 22:44:21,2,2006-02-15,DANNY,ISOM,DANNY.ISOM@sakilacustomer.org,404,Jon.Stephens@sakilastaff.com,Wednesday,25,Weekday,May,2,2005,Tuesday,31,Weekday,May,2,2005,Wednesday,15,Weekday,February,1,2006


In [0]:
%sql
-- Inspect the columns and table metadata of the silver fact table.
DESCRIBE EXTENDED fact_rental_silver

col_name,data_type,comment
rental_key,bigint,
rental_date,string,
inventory_key,bigint,
customer_key,bigint,
return_date,string,
staff_key,bigint,
last_update,date,
first_name,string,
last_name,string,
customer_email,string,


##### Gold Table: Perform Aggregations

In [0]:
%sql
-- Number of rentals handled by each staff member, busiest first.
SELECT
  rentals.staff_key,
  COUNT(rentals.staff_key) AS staffAmount
FROM sakila_dlh.fact_rental_silver AS rentals
GROUP BY rentals.staff_key
ORDER BY staffAmount DESC

staff_key,staffAmount
2,521
1,479


In [0]:
%sql
-- Count of rented inventory items, split by whether the rental date fell on a
-- weekday or a weekend.
SELECT
  rentals.rental_date_weekday_weekend,
  COUNT(rentals.inventory_key) AS inventoryCount
FROM sakila_dlh.fact_rental_silver AS rentals
GROUP BY rentals.rental_date_weekday_weekend




rental_date_weekday_weekend,inventoryCount
Weekday,650
Weekend,350


#### 9.0. Clean up the File System

In [0]:
# Stop every active streaming query first, so that nothing is still reading from or
# writing to the data, checkpoint and schema folders that are about to be removed.
for active_query in spark.streams.active:
    active_query.stop()

# Recursively delete the project folder on DBFS (same effect as `%fs rm -r`).
# NOTE: base_dir points at this folder, so the source files under final_source_data
# are removed too and have to be uploaded again before the notebook can be re-run.
dbutils.fs.rm("/FileStore/ds2002-finalProject/", True)