In [1]:
import boto3
import os
import configparser
from datetime import datetime
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col, isnan, when, count, trim, desc, sum, asc
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.functions import countDistinct, explode, split, concat_ws, collect_list
from pyspark.sql.types import StructType as R, StructField as Fld, DoubleType as Dbl, StringType as Str, IntegerType as Int, DateType as Date

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# STEP 1: Get the params of the created redshift cluster 
- We need:
    - The redshift cluster <font color='red'>endpoint</font>
    - The <font color='red'>IAM role ARN</font> that give access to Redshift to read from S3

In [2]:
config = configparser.ConfigParser()

#Normally this file should be in ~/.aws/credentials
config.read_file(open('dwh.cfg'))

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                  [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })

os.environ["AWS_ACCESS_KEY_ID"]= config['AWS']['KEY']
os.environ["AWS_SECRET_ACCESS_KEY"]= config['AWS']['SECRET']

In [3]:
# FILL IN THE REDSHIFT ENPOINT HERE
# e.g. DWH_ENDPOINT="redshift-cluster-1.csmamz5zxmle.us-west-2.redshift.amazonaws.com" 
DWH_ENDPOINT="dwhcluster.ci2m6m74tbzm.us-west-2.redshift.amazonaws.com" 
    
#FILL IN THE IAM ROLE ARN you got in step 2.2 of the previous exercise
#e.g DWH_ROLE_ARN="arn:aws:iam::988332130976:role/dwhRole"
DWH_ROLE_ARN="arn:aws:iam::264680862608:role/dwhRole"

In [4]:
spark = SparkSession.builder\
                     .config("spark.jars.packages","org.apache.hadoop:hadoop-aws:2.7.0")\
                     .getOrCreate()

# Step 2: Explore and Assess the Data

### Part 1: Load Data from S3 and clean dataframe

In [5]:
movieSchema = R([
            Fld("movieId",Int()),
            Fld("title",Str()),
            Fld("genres",Str())
            ])

In [6]:
ratingSchema = R([
            Fld("userId",Int()),
            Fld("movieId",Int()),
            Fld("rating",Dbl()),
            Fld("ts",Str())
            ])

In [83]:
tagSchema = R([
            Fld("userId",Int()),
            Fld("movieId",Int()),
            Fld("tag",Str()),
            Fld("ts",Str())
            ])

In [98]:
# read movies, ratings, and tags csv
dfmovies = spark.read.csv("s3a://udacity-input/ml-latest-small/movies.csv", header=True, schema=movieSchema)
dfratings = spark.read.csv("s3a://udacity-input/ml-latest-small/ratings.csv", header = True, schema=ratingSchema)
dftags = spark.read.csv("s3a://udacity-input/ml-latest-small/tags.csv", header = True, schema=tagSchema)

In [9]:
# read awards txt
dfawards = spark.read.option("header", "true") \
    .option("delimiter", "|") \
    .option("inferSchema", "true") \
    .csv("s3a://udacity-input/ml-latest-small/Awards.txt")

dfawards.show(10, truncate=False)

+--------------------------------------------------------+--------+----------+-----------+
|Film                                                    |Year    |Awards    |Nominations|
+--------------------------------------------------------+--------+----------+-----------+
|Parasite                                                |2019    |4.0       |6.0        |
|Ford v Ferrari                                          |2019    |2.0       |4.0        |
|Learning to Skateboard in a Warzone (If You're a Girl)  |2019    |1.0       |1.0        |
|The Neighbors' Window                                   |2019    |1.0       |1.0        |
|Little Women                                            |2019    |1.0       |6.0        |
|Marriage Story                                          |2019    |1.0       |6.0        |
|Jojo Rabbit                                             |2019    |1.0       |6.0        |
|Toy Story 4                                             |2019    |1.0       |2.0        |

In [10]:
# read award_corrected txt
dfawards2 = spark.read.option("header", "true") \
    .option("delimiter", "|") \
    .option("inferSchema", "true") \
    .csv("s3a://udacity-input/ml-latest-small/Award_corrected.txt")

dfawards2.show(10, truncate=False)

+--------------------+-------+----------+-----------+
|Film                |Year   |Awards    |Nominations|
+--------------------+-------+----------+-----------+
|Becket              |1964.0 |1.0       |12         |
|Ben-Hur             |1959.0 |11.0      |12         |
|Dances with Wolves  |1990.0 |7.0       |12         |
|The English Patient |1996.0 |9.0       |12         |
|Gladiator           |2000.0 |5.0       |12         |
|Johnny Belinda      |1948.0 |1.0       |12         |
|Lincoln             |2012.0 |2.0       |12         |
|Mrs. Miniver        |1942.0 |6.0       |12         |
|My Fair Lady        |1964.0 |8.0       |12         |
|On the Waterfront   |1954.0 |8.0       |12         |
+--------------------+-------+----------+-----------+
only showing top 10 rows



In [11]:
dfmovies.printSchema()
dfmovies.show(5, truncate = False)
dfmovies.count()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+----------------------------------+-------------------------------------------+
|movieId|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)          |Comedy|Drama|Romance                       |
|5      |Father of the Bride Part II (1995)|Comedy                                     |
+-------+----------------------------------+-------------------------------------------+
only showing top 5 rows



9742

In [85]:
dfratings = dfratings.withColumn(
    "rate_time",
    F.to_timestamp(F.from_unixtime((col("ts")) , 'yyyy-MM-dd HH:mm:ss.SSS')).cast("Timestamp")
).drop("ts")

In [86]:
dfratings = dfratings.withColumn("year", F.year("rate_time"))

In [87]:
dfratings.printSchema()
dfratings.show(5)
dfratings.count()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- rate_time: timestamp (nullable = true)
 |-- year: integer (nullable = true)

+------+-------+------+-------------------+----+
|userId|movieId|rating|          rate_time|year|
+------+-------+------+-------------------+----+
|     1|      1|   4.0|2000-07-30 18:45:03|2000|
|     1|      3|   4.0|2000-07-30 18:20:47|2000|
|     1|      6|   4.0|2000-07-30 18:37:04|2000|
|     1|     47|   5.0|2000-07-30 19:03:35|2000|
|     1|     50|   5.0|2000-07-30 18:48:51|2000|
+------+-------+------+-------------------+----+
only showing top 5 rows



100836

In [101]:
dftags = dftags.withColumn("tag_time", F.to_timestamp(col("ts") / 1)).drop("ts")
dftags = dftags.withColumn("year", F.year("tag_time"))

In [102]:
dftags.printSchema()
dftags.show(5)
dftags.count()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- tag_time: timestamp (nullable = true)
 |-- year: integer (nullable = true)

+------+-------+---------------+-------------------+----+
|userId|movieId|            tag|           tag_time|year|
+------+-------+---------------+-------------------+----+
|     2|  60756|          funny|2015-10-24 19:29:54|2015|
|     2|  60756|Highly quotable|2015-10-24 19:29:56|2015|
|     2|  60756|   will ferrell|2015-10-24 19:29:52|2015|
|     2|  89774|   Boxing story|2015-10-24 19:33:27|2015|
|     2|  89774|            MMA|2015-10-24 19:33:20|2015|
+------+-------+---------------+-------------------+----+
only showing top 5 rows



3683

In [17]:
dfawards.columns

['Film   ', 'Year   ', 'Awards    ', 'Nominations']

In [18]:
dfawards = dfawards.withColumn("title", dfawards['Film   '].cast(Str())).drop('Film   ')
dfawards = dfawards.withColumn("year", dfawards['Year   '].cast(Date())).drop("Year   ")
dfawards = dfawards.withColumn("year", F.year("year"))
dfawards = dfawards.withColumn("awards", dfawards['Awards    '].cast(Dbl())).drop("Awards    ")
dfawards = dfawards.withColumn("nominations", dfawards['Nominations'].cast(Int()))

In [19]:
dfawards.columns

['nominations', 'title', 'year', 'awards']

In [20]:
dfawards2.columns

['Film   ', 'Year   ', 'Awards    ', 'Nominations']

In [21]:
dfawards2 = dfawards2.withColumn("title", dfawards2['Film   '].cast(Str())).drop('Film   ')
dfawards2 = dfawards2.withColumn("date", F.to_timestamp(col('Year   '))).drop('Year   ')
dfawards2 = dfawards2.withColumn("year", F.year("date")).drop("date")
dfawards2 = dfawards2.withColumn("awards", dfawards2['Awards    '].cast(Dbl())).drop("Awards    ")
dfawards2 = dfawards2.withColumn("nominations", dfawards2['Nominations'].cast(Int()))

In [22]:
dfawards.printSchema()
dfawards.show(5, truncate = False)
dfawards.count()

root
 |-- nominations: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- awards: double (nullable = true)

+-----------+--------------------------------------------------------+----+------+
|nominations|title                                                   |year|awards|
+-----------+--------------------------------------------------------+----+------+
|6          |Parasite                                                |2019|4.0   |
|4          |Ford v Ferrari                                          |2019|2.0   |
|1          |Learning to Skateboard in a Warzone (If You're a Girl)  |2019|1.0   |
|1          |The Neighbors' Window                                   |2019|1.0   |
|6          |Little Women                                            |2019|1.0   |
+-----------+--------------------------------------------------------+----+------+
only showing top 5 rows



1316

In [23]:
dfawards2.printSchema()
dfawards2.show(5, truncate = False)
dfawards2.count()

root
 |-- nominations: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- awards: double (nullable = true)

+-----------+--------------------+----+------+
|nominations|title               |year|awards|
+-----------+--------------------+----+------+
|12         |Becket              |1970|1.0   |
|12         |Ben-Hur             |1970|11.0  |
|12         |Dances with Wolves  |1970|7.0   |
|12         |The English Patient |1970|9.0   |
|12         |Gladiator           |1970|5.0   |
+-----------+--------------------+----+------+
only showing top 5 rows



76

In [24]:
# write to S3
dfawards.write.parquet("s3a://sparkifytest/movies/awards/", mode="overwrite")

In [25]:
dfawards2.write.parquet("s3a://sparkifytest/movies/awards2/", mode="overwrite")

In [25]:
dfmovies.write.parquet("s3a://sparkifytest/movies/movies/", mode="overwrite")

In [55]:
dfratings.write.parquet("s3a://sparkifytest/movies/ratings/", mode="overwrite")

In [27]:
dftags.write.parquet("s3a://sparkifytest/movies/tags/", mode="overwrite")

In [28]:
dfawards = spark.read.parquet("s3a://sparkifytest/movies/awards/*")
dfawards2 = spark.read.parquet("s3a://sparkifytest/movies/awards2/*")
dfmovies = spark.read.parquet("s3a://sparkifytest/movies/movies/*")
dfratings = spark.read.parquet("s3a://sparkifytest/movies/ratings/*")
dftags = spark.read.parquet("s3a://sparkifytest/movies/tags/*")

### Part 2: Explore the Data and data normalization

### Data Wrangling with DataFrames
Identify data quality issues, like missing values, duplicate data, etc.

In [28]:
# check for null values
dfmovies.select([count(when(col(c).isNull(), c)).alias(c) for c in dfmovies.columns]).show()
dfratings.select([count(when(col(c).isNull(), c)).alias(c) for c in dfratings.columns]).show()
dfawards.select([count(when(col(c).isNull(), c)).alias(c) for c in dfawards.columns]).show()
dfawards2.select([count(when(col(c).isNull(), c)).alias(c) for c in dfawards2.columns]).show()

+-------+-----+------+
|movieId|title|genres|
+-------+-----+------+
|      0|    0|     0|
+-------+-----+------+

+------+-------+------+---------+----+
|userId|movieId|rating|rate_time|year|
+------+-------+------+---------+----+
|     0|      0|     0|        0|   0|
+------+-------+------+---------+----+

+-----------+-----+----+------+
|nominations|title|year|awards|
+-----------+-----+----+------+
|          0|    0|  73|     0|
+-----------+-----+----+------+

+-----------+-----+----+------+
|nominations|title|year|awards|
+-----------+-----+----+------+
|          0|    0|   0|     0|
+-----------+-----+----+------+



In [29]:
# show records with year is NaN
dfawards.filter(dfawards.year.isNull()).show(5, truncate = False)

+-----------+----------------------------------------+----+------+
|nominations|title                                   |year|awards|
+-----------+----------------------------------------+----+------+
|1          |Joker   2019                            |null|1.0   |
|0          |Once Upon a Time in Hollywood   2019    |null|1.0   |
|0          |1917    2019                            |null|1.0   |
|0          |Roma    2018                            |null|1.0   |
|0          |The Favourite   2018                    |null|1.0   |
+-----------+----------------------------------------+----+------+
only showing top 5 rows



In [30]:
# check records in dfawards2
dfawards2.filter(trim(dfawards2.title) == "Joker").show()
dfawards2.filter(trim(dfawards2.title) == "Once Upon a Time in Hollywood").show()
dfawards2.filter(trim(dfawards2.title) == "1917").show()
dfawards2.filter(trim(dfawards2.title) == "Roma").show()
dfawards2.filter(trim(dfawards2.title) == "The Favourite").show()

+-----------+--------+----+------+
|nominations|   title|year|awards|
+-----------+--------+----+------+
|         11|Joker   |1970|   2.0|
+-----------+--------+----+------+

+-----------+--------------------+----+------+
|nominations|               title|year|awards|
+-----------+--------------------+----+------+
|         10|Once Upon a Time ...|1970|   2.0|
+-----------+--------------------+----+------+

+-----------+--------+----+------+
|nominations|   title|year|awards|
+-----------+--------+----+------+
|         10|1917    |1970|   3.0|
+-----------+--------+----+------+

+-----------+--------+----+------+
|nominations|   title|year|awards|
+-----------+--------+----+------+
|         10|Roma    |1970|   3.0|
+-----------+--------+----+------+

+-----------+----------------+----+------+
|nominations|           title|year|awards|
+-----------+----------------+----+------+
|         10|The Favourite   |1970|   1.0|
+-----------+----------------+----+------+



In [31]:
# drop records with year == NaN
dfawards = dfawards.dropna(subset=["year"])

In [32]:
dfawards.select([count(when(col(c).isNull(), c)).alias(c) for c in dfawards.columns]).show()
dfawards.show(5, truncate = False)
dfawards.count()

+-----------+-----+----+------+
|nominations|title|year|awards|
+-----------+-----+----+------+
|          0|    0|   0|     0|
+-----------+-----+----+------+

+-----------+--------------------------------------------------------+----+------+
|nominations|title                                                   |year|awards|
+-----------+--------------------------------------------------------+----+------+
|6          |Parasite                                                |2019|4.0   |
|4          |Ford v Ferrari                                          |2019|2.0   |
|1          |Learning to Skateboard in a Warzone (If You're a Girl)  |2019|1.0   |
|1          |The Neighbors' Window                                   |2019|1.0   |
|6          |Little Women                                            |2019|1.0   |
+-----------+--------------------------------------------------------+----+------+
only showing top 5 rows



1243

In [106]:
# union dfawards and dfawards2, and remove duplicates (actually no duplicates in there two dataset)
dfawards3 = dfawards.union(dfawards2).distinct().filter(~col("year").isin([0]) & col("year").isNotNull()).sort(desc('year'))
dfawards3.show(5, truncate = False)

+-----------+--------------------------------------------------------+----+------+
|nominations|title                                                   |year|awards|
+-----------+--------------------------------------------------------+----+------+
|6          |Marriage Story                                          |2019|1.0   |
|1          |Learning to Skateboard in a Warzone (If You're a Girl)  |2019|1.0   |
|4          |Ford v Ferrari                                          |2019|2.0   |
|6          |Parasite                                                |2019|4.0   |
|1          |Rocketman                                               |2019|1.0   |
+-----------+--------------------------------------------------------+----+------+
only showing top 5 rows



In [34]:
# show records with year is not in the right range
dfawards3.where(dfawards3.year < 1927).show(5, truncate = False)

+-----------+-----+----+------+
|nominations|title|year|awards|
+-----------+-----+----+------+
+-----------+-----+----+------+



In [None]:
dfawards3.write.partitionBy("year").parquet("s3a://sparkifytest/movies/awards3/", mode="overwrite")

In [37]:
dfawards.count()

1243

In [38]:
dfawards2.count()

76

In [110]:
dfawards3.count()

1319

### How to deal with null values
#### 1. Deleting Rows 
This method commonly used to handle the null values. Here, we either delete a particular row if it has a null value for a particular feature and a particular column if it has more than 70-75% of missing values. This method is advised only when there are enough samples in the data set. 
#### 2. Replacing With Mean/Median/Mode  
This strategy can be applied on a feature which has numeric data like the age of a person or the rating score. We can calculate the mean, median or mode of the feature and replace it with the missing values. This is an approximation which can add variance to the data set. 
#### 3. Assigning An Unique Category  
A categorical feature will have a definite number of possibilities, such as gender, for example. Since they have a definite number of classes, we can assign another class for the missing values like unknown.
#### 4. Predicting The Missing Values  
Using the features which do not have missing values, we can predict the nulls with the help of a machine learning algorithm. 
#### 5. Using Algorithms Which Support Missing Values  
KNN is a machine learning algorithm which works on the principle of distance measure. This algorithm can be used when there are nulls present in the dataset. While the algorithm is applied, KNN considers the missing values by taking the majority of the K nearest values. 

In [40]:
# check duplicate data, also confirm the dataset is on which level
dfmovies.count()

9742

In [41]:
dfmovies[['movieId']].drop_duplicates().count()

9742

In [42]:
dfratings.count()

100836

In [43]:
# dfratings is on movieid and userid level
dfratings[['movieId', 'userId']].drop_duplicates().count()

100836

In [44]:
dfawards3.count()

1319

In [123]:
# dfawards3 is on title and year level
dfawards3[['title', 'year']].drop_duplicates().count()

1319

In [46]:
# basic count
# number of movies in the dataset
distinct_movie = dfmovies.select("movieId").distinct().count()
print('{} movies in the movies dataset'.format(distinct_movie))

9742 movies in the movies dataset


In [47]:
# number of users in the dataset
distinct_user = dfratings.select("userId").distinct().count()
print('{} users rated the movies'.format(distinct_user))

610 users rated the movies


In [124]:
# number of movies receiving awards
distinct_award = dfawards3.select("title", "year").distinct().count()
print('{} movies received awards'.format(distinct_award))

1319 movies received awards


In [27]:
df1=dfawards3.groupBy("title").count().filter("count > 1")
df1.show(truncate = False)

+------------------------------------------------+-----+
|title                                           |count|
+------------------------------------------------+-----+
|Cyrano de Bergerac                              |2    |
|King Kong                                       |2    |
|Henry V                                         |2    |
|The Lord of the Rings: The Return of the King   |2    |
|A Star Is Born                                  |3    |
|Crouching Tiger, Hidden Dragon                  |2    |
|Little Women                                    |3    |
|The Great Gatsby                                |2    |
|The Old Man and the Sea                         |2    |
|Titanic                                         |2    |
|Schindler's List                                |2    |
|Braveheart                                      |2    |
|Dances with Wolves                              |2    |
|Saving Private Ryan                             |2    |
|Up                            

In [28]:
dfawards3.filter(trim(dfawards3.title) == "A Star Is Born").show()
dfawards3.filter(trim(dfawards3.title) == "Titanic").show()

+-----------+----------------+----+------+
|nominations|           title|year|awards|
+-----------+----------------+----+------+
|          7|A Star Is Born  |1937|   1.0|
|          4|A Star Is Born  |1976|   1.0|
|          8|A Star Is Born  |2018|   1.0|
+-----------+----------------+----+------+

+-----------+--------+----+------+
|nominations|   title|year|awards|
+-----------+--------+----+------+
|         14|Titanic |1997|  11.0|
|          2|Titanic |1953|   1.0|
+-----------+--------+----+------+



In [125]:
# show movies receiving more than 10 awards
dfawards3.where(dfawards3.awards > 10).show(truncate = False)

+-----------+------------------------------------------------+----+------+
|nominations|title                                           |year|awards|
+-----------+------------------------------------------------+----+------+
|14         |Titanic                                         |1997|11.0  |
|11         |The Lord of the Rings: The Return of the King   |2003|11.0  |
|12         |Ben-Hur                                         |1970|11.0  |
|11         |The Lord of the Rings: The Return of the King   |1970|11.0  |
+-----------+------------------------------------------------+----+------+



In [127]:
# movie received the most award
awards_cnt = dfawards3.groupBy("title", "year").agg(F.sum("awards").alias('cnt')).orderBy(desc('cnt'))

In [128]:
awards_cnt.show(truncate = False)

+------------------------------------------------+----+----+
|title                                           |year|cnt |
+------------------------------------------------+----+----+
|Titanic                                         |1997|11.0|
|The Lord of the Rings: The Return of the King   |1970|11.0|
|Ben-Hur                                         |1970|11.0|
|The Lord of the Rings: The Return of the King   |2003|11.0|
|West Side Story                                 |1970|10.0|
|The Last Emperor                                |1987|9.0 |
|Gigi                                            |1958|9.0 |
|The English Patient                             |1970|9.0 |
|On the Waterfront                               |1970|8.0 |
|Gandhi                                          |1970|8.0 |
|From Here to Eternity                           |1953|8.0 |
|My Fair Lady                                    |1970|8.0 |
|Slumdog Millionaire                             |1970|8.0 |
|Cabaret                

In [96]:
# Minimum number of ratings per user
# Minimum number of ratings per movie 
tmp1 = dfratings.groupBy("userID").count().toPandas()['count'].min()
tmp2 = dfratings.groupBy("movieId").count().toPandas()['count'].min()
print('For the users that rated movies and the movies that were rated:')
print('Minimum number of ratings per user is {}'.format(tmp1))
print('Minimum number of ratings per movie is {}'.format(tmp2))

For the users that rated movies and the movies that were rated:
Minimum number of ratings per user is 20
Minimum number of ratings per movie is 1


In [97]:
# number of movies rated by only one user
tmp1 = sum(dfratings.groupBy("movieId").count().toPandas()['count'] == 1)
tmp2 = dfratings.select('movieId').distinct().count()
print('{} out of {} movies are rated by only one user'.format(tmp1, tmp2))

3446 out of 9724 movies are rated by only one user


In [26]:
# split the mixed genres by '|'
dfmovies2 = dfmovies.withColumn('genre', explode(split(dfmovies.genres, '\|')))

In [27]:
dfmovies2.show(11)

+-------+--------------------+--------------------+---------+
|movieId|               title|              genres|    genre|
+-------+--------------------+--------------------+---------+
|      1|    Toy Story (1995)|Adventure|Animati...|Adventure|
|      1|    Toy Story (1995)|Adventure|Animati...|Animation|
|      1|    Toy Story (1995)|Adventure|Animati...| Children|
|      1|    Toy Story (1995)|Adventure|Animati...|   Comedy|
|      1|    Toy Story (1995)|Adventure|Animati...|  Fantasy|
|      2|      Jumanji (1995)|Adventure|Childre...|Adventure|
|      2|      Jumanji (1995)|Adventure|Childre...| Children|
|      2|      Jumanji (1995)|Adventure|Childre...|  Fantasy|
|      3|Grumpier Old Men ...|      Comedy|Romance|   Comedy|
|      3|Grumpier Old Men ...|      Comedy|Romance|  Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|   Comedy|
+-------+--------------------+--------------------+---------+
only showing top 11 rows



In [28]:
dfgenre = dfmovies2.select("movieId", "genre").dropDuplicates().dropna(subset=["movieId", "genre"]).withColumn("genreId", F.monotonically_increasing_id())

In [29]:
#dfgenre.filter(dfgenre.title.contains('Toy Story (1995)')).show()
dfgenre.filter(dfgenre.movieId == 1).show()

+-------+---------+-------------+
|movieId|    genre|      genreId|
+-------+---------+-------------+
|      1|   Comedy|  77309411328|
|      1|Adventure| 206158430208|
|      1|  Fantasy| 773094113280|
|      1| Children|1348619730944|
|      1|Animation|1434519076864|
+-------+---------+-------------+



In [None]:
dfgenre.write.parquet("s3a://sparkifytest/movies/genres/", mode="overwrite")

In [34]:
dfgenre.count()

22074

In [48]:
# use the dataframe dfmovies2 to match every movie to a single genre
genre_movies = dfmovies2 \
                    .groupBy(dfmovies2.genre) \
                    .agg(concat_ws(',', collect_list(dfmovies2.movieId)) \
                    .alias('MovieIds')) \
                    .orderBy('genre')

In [49]:
genre_movies.show()

+------------------+--------------------+
|             genre|            MovieIds|
+------------------+--------------------+
|(no genres listed)|114335,122888,122...|
|            Action|6,9,10,15,20,23,4...|
|         Adventure|1,2,8,10,13,15,29...|
|         Animation|1,13,48,239,313,3...|
|          Children|1,2,8,13,27,34,38...|
|            Comedy|1,3,4,5,7,11,12,1...|
|             Crime|6,16,20,21,22,23,...|
|       Documentary|77,99,108,116,128...|
|             Drama|4,11,14,16,17,20,...|
|           Fantasy|1,2,29,44,60,126,...|
|         Film-Noir|164,320,347,913,9...|
|            Horror|12,22,70,92,93,15...|
|              IMAX|150,364,595,1797,...|
|           Musical|48,107,199,242,34...|
|           Mystery|22,29,32,47,50,10...|
|           Romance|3,4,7,11,15,17,25...|
|            Sci-Fi|24,29,32,66,76,10...|
|          Thriller|6,10,20,21,22,23,...|
|               War|41,73,110,151,155...|
|           Western|163,210,266,303,3...|
+------------------+--------------

In [50]:
# count number of movies in each genre
df2=dfmovies2.groupBy("genre").count().filter(trim(dfmovies2.genre) != '(no genres listed)').sort(desc('count'))
df2.show(truncate = False)

+-----------+-----+
|genre      |count|
+-----------+-----+
|Drama      |4361 |
|Comedy     |3756 |
|Thriller   |1894 |
|Action     |1828 |
|Romance    |1596 |
|Adventure  |1263 |
|Crime      |1199 |
|Sci-Fi     |980  |
|Horror     |978  |
|Fantasy    |779  |
|Children   |664  |
|Animation  |611  |
|Mystery    |573  |
|Documentary|440  |
|War        |382  |
|Musical    |334  |
|Western    |167  |
|IMAX       |158  |
|Film-Noir  |87   |
+-----------+-----+



In [39]:
# pick distinct genre
genres_dummies = dfmovies2.filter(dfmovies2.genre != '(no genres listed)') \
                     .select(dfmovies2.genre).distinct() \
                     .orderBy(dfmovies2.genre)

In [40]:
genres_dummies.show()

+-----------+
|      genre|
+-----------+
|     Action|
|  Adventure|
|  Animation|
|   Children|
|     Comedy|
|      Crime|
|Documentary|
|      Drama|
|    Fantasy|
|  Film-Noir|
|     Horror|
|       IMAX|
|    Musical|
|    Mystery|
|    Romance|
|     Sci-Fi|
|   Thriller|
|        War|
|    Western|
+-----------+



### Data Wrangling with Spark SQL and OLAP

In [77]:
dfratings.createOrReplaceTempView("ratings")     #userid, movieid, rating, timestamp, year
dfmovies.createOrReplaceTempView("movies")       #movieid, title, genre
dftags.createOrReplaceTempView("tags")           #userid, movieid, tag, timestamp, year
dfawards3.createOrReplaceTempView("awards")      #title, year, awards, nominations
dfgenre.createOrReplaceTempView("genres")        #genreid, genre, movieId

In [78]:
# Split title and release year in separate columns in movies dataframe.       
movies = spark.sql("select movieId, substr(title, 0, length(title)-7) as title, substr(title, -5, 4) as year from movies")
movies.show()
movies.createOrReplaceTempView("movies") 

+-------+--------------------+----+
|movieId|               title|year|
+-------+--------------------+----+
|      1|           Toy Story|1995|
|      2|             Jumanji|1995|
|      3|    Grumpier Old Men|1995|
|      4|   Waiting to Exhale|1995|
|      5|Father of the Bri...|1995|
|      6|                Heat|1995|
|      7|             Sabrina|1995|
|      8|        Tom and Huck|1995|
|      9|        Sudden Death|1995|
|     10|           GoldenEye|1995|
|     11|American Presiden...|1995|
|     12|Dracula: Dead and...|1995|
|     13|               Balto|1995|
|     14|               Nixon|1995|
|     15|    Cutthroat Island|1995|
|     16|              Casino|1995|
|     17|Sense and Sensibi...|1995|
|     18|          Four Rooms|1995|
|     19|Ace Ventura: When...|1995|
|     20|         Money Train|1995|
+-------+--------------------+----+
only showing top 20 rows



In [90]:
# year of movies in the dataset
spark.sql("""select 
             min(year) as min_year,
             max(year) as max_year
             from movies 
             where year > 0
""").show()

+--------+--------+
|min_year|max_year|
+--------+--------+
|    1902|    2018|
+--------+--------+



In [83]:
# year of rating in the dataset
spark.sql("""select 
             min(year) as min_year,
             max(year) as max_year
             from ratings
""").show()

+--------+--------+
|min_year|max_year|
+--------+--------+
|    1996|    2018|
+--------+--------+



In [84]:
# year of awards in the dataset
spark.sql("""select 
             min(year) as min_year,
             max(year) as max_year
             from awards
""").show()

+--------+--------+
|min_year|max_year|
+--------+--------+
|    1927|    2019|
+--------+--------+



In [129]:
# number of movies not rated
spark.sql("""select 
          count(distinct movies.movieId)
          from movies 
          where movies.movieId not in
          (select distinct ratings.movieId from ratings)
          """).show()

+-----------------------+
|count(DISTINCT movieId)|
+-----------------------+
|                     18|
+-----------------------+



In [143]:
# number of movies not rated but receiving awards
# 431 movies receiving awards and shown in ratings dataset
spark.sql("""select count(distinct movieId) as in_ratings from 
          (select distinct a.title, a.year, m.movieId as movieId
          from awards as a inner join movies as m on trim(a.title) == trim(m.title) and a.year = m.year
          where a.year > 0 and m.year > 0) t
          where movieId in 
          (select distinct ratings.movieId from ratings)
          """).show()

+----------+
|in_ratings|
+----------+
|       431|
+----------+



In [144]:
# the top 5 movies with high ratings
avg_rating = spark.sql("""select distinct
    m.title as title,
    m.year as year,
    sum(case when r.rating >= 0 then 1 else 0 end) as num_rating,
    avg(r.rating) as avg_rating
    from movies as m inner join ratings as r on m.movieId = r.movieId
    group by m.title, m.year
    order by avg_rating desc
""")
avg_rating.show(5)
avg_rating.createOrReplaceTempView("avg_rating") 

+--------------------+----+----------+----------+
|               title|year|num_rating|avg_rating|
+--------------------+----+----------+----------+
|SORI: Voice from ...|2016|         1|       5.0|
|National Lampoon'...|2007|         1|       5.0|
|      Blue Planet II|2017|         1|       5.0|
|                9/11|2002|         1|       5.0|
|Sun Alley (Sonnen...|1999|         1|       5.0|
+--------------------+----+----------+----------+
only showing top 5 rows



In [145]:
# the awards a movie got
tot_awards = spark.sql("""select distinct
                    title,
                    year,
                    sum(awards) as tot_awards
                    from awards
                    group by title, year
                    order by tot_awards desc
""")
tot_awards.show(5)
tot_awards.createOrReplaceTempView("tot_awards") 

+--------------------+----+----------+
|               title|year|tot_awards|
+--------------------+----+----------+
|            Titanic |1997|      11.0|
|The Lord of the R...|1970|      11.0|
|            Ben-Hur |1970|      11.0|
|The Lord of the R...|2003|      11.0|
|    West Side Story |1970|      10.0|
+--------------------+----+----------+
only showing top 5 rows



In [146]:
# the rating score of movie with awards
movie_awards_rating = spark.sql("""select distinct
             a.title,
             a.year,
             a.tot_awards,
             r.avg_rating
             from tot_awards as a inner join avg_rating as r on trim(a.title) == trim(r.title) and a.year == r.year
             where a.year > 0 and r.year > 0
             order by tot_awards desc, avg_rating desc
""")
movie_awards_rating.show()
movie_awards_rating.createOrReplaceTempView("movie_awards_rating") 

+--------------------+----+----------+------------------+
|               title|year|tot_awards|        avg_rating|
+--------------------+----+----------+------------------+
|            Titanic |1997|      11.0| 3.414285714285714|
|            Gigi    |1958|       9.0|              3.25|
|From Here to Eter...|1953|       8.0|3.9545454545454546|
|Gone with the Wind  |1939|       8.0|3.6444444444444444|
|Schindler's List    |1993|       7.0|             4.225|
|            Patton  |1970|       7.0| 4.121212121212121|
|Dances with Wolves  |1990|       7.0|3.8353658536585367|
|Shakespeare in Love |1998|       7.0| 3.777173913043478|
|    All About Eve   |1950|       6.0| 4.229166666666667|
|    Forrest Gump    |1994|       6.0| 4.164133738601824|
|           Chicago  |2002|       6.0|3.7244897959183674|
|        La La Land  |2016|       6.0| 3.388888888888889|
|It Happened One N...|1934|       5.0| 4.321428571428571|
|In the Heat of th...|1967|       5.0| 4.181818181818182|
|Saving Privat

In [147]:
spark.sql("select count(*) from tot_awards").show()
spark.sql("select count(*) from avg_rating").show()
spark.sql("select count(*) from movie_awards_rating").show()

+--------+
|count(1)|
+--------+
|    1319|
+--------+

+--------+
|count(1)|
+--------+
|    9719|
+--------+

+--------+
|count(1)|
+--------+
|     430|
+--------+



# STEP 3: Connect to the Redshift Cluster

### Part 1: Extract data and transform into fact and dimension tables

In [69]:
s3 = boto3.resource('s3',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                     )

s3bucket =  s3.Bucket("sparkifytest") # private

s3_data = iter(s3bucket.objects.filter(Prefix="movies/"))
for _ in range(5): print(next(s3_data))


s3.ObjectSummary(bucket_name='sparkifytest', key='movies/awards/_SUCCESS')
s3.ObjectSummary(bucket_name='sparkifytest', key='movies/awards/part-00000-077834ae-64f4-439c-b206-3fec87ef79ea-c000.snappy.parquet')
s3.ObjectSummary(bucket_name='sparkifytest', key='movies/awards2/_SUCCESS')
s3.ObjectSummary(bucket_name='sparkifytest', key='movies/awards2/part-00000-3b457c38-e2bd-4101-9241-50451481a9e4-c000.snappy.parquet')
s3.ObjectSummary(bucket_name='sparkifytest', key='movies/awards3/_temporary/0/_temporary/attempt_20200720024818_0064_m_000189_447/part-00189-ff818c24-24ee-4217-8f44-00b4f69e4d84-c000.snappy.parquet')


In [60]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [61]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
print(conn_string)
%sql $conn_string

postgresql://dwhuser:Passw0rd@dwhcluster.ci2m6m74tbzm.us-west-2.redshift.amazonaws.com:5439/dwh


'Connected: dwhuser@dwh'

In [74]:
%%sql
DROP TABLE IF EXISTS dimDate CASCADE;
CREATE TABLE dimDate
(
  date_key timestamp NOT NULL PRIMARY KEY,
  year smallint NOT NULL,
  month smallint NOT NULL,
  day smallint NOT NULL,
  week smallint NOT NULL
);

DROP TABLE IF EXISTS dimRatings;
CREATE TABLE dimRatings
(
  userId             smallint NOT NULL,
  movieId            smallint NOT NULL,
  rating             numeric NOT NULL,
  rate_time          timestamp REFERENCES dimdate (date_key),
  year               smallint NOT NULL,
  PRIMARY KEY(userId, movieId)
);

DROP TABLE IF EXISTS dimGenres;
CREATE TABLE dimGenres
(
  genreId            smallint NOT NULL PRIMARY KEY,
  genre              varchar(10) NOT NULL,
  movieId            smallint NOT NULL
);

DROP TABLE IF EXISTS dimAwards;
CREATE TABLE dimAwards
(
  title        varchar(45) NOT NULL,
  year         smallint NOT NULL,
  awards       smallint NOT NULL,
  nominations  smallint NOT NULL,
  PRIMARY KEY(title, year)
);

DROP TABLE IF EXISTS dimMovies0;
CREATE TABLE dimMovies0
(
  movieId      smallint NOT NULL PRIMARY KEY,
  title        varchar(45) NOT NULL,
  genres       varchar(300) NOT NULL
);

DROP TABLE IF EXISTS dimMovies;
CREATE TABLE dimMovies
(
  movieId      smallint NOT NULL PRIMARY KEY,
  title        varchar(45) NOT NULL,
  year         smallint NOT NULL
);

 * postgresql://dwhuser:***@dwhcluster.ci2m6m74tbzm.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.


[]

In [64]:
dfawards = spark.read.parquet("s3a://sparkifytest/movies/awards/*")
dfawards2 = spark.read.parquet("s3a://sparkifytest/movies/awards2/*")
dfmovies = spark.read.parquet("s3a://sparkifytest/movies/movies/*")
dfratings = spark.read.parquet("s3a://sparkifytest/movies/ratings/*")
dftags = spark.read.parquet("s3a://sparkifytest/movies/tags/*")

In [70]:
%%time

qry = """
    copy dimRatings from 's3://sparkifytest/movies/ratings/*.parquet' 
    credentials 'aws_iam_role={}' 
    FORMAT AS PARQUET;
""".format(DWH_ROLE_ARN)

%sql $qry

 * postgresql://dwhuser:***@dwhcluster.ci2m6m74tbzm.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
CPU times: user 4.67 ms, sys: 0 ns, total: 4.67 ms
Wall time: 359 ms


In [71]:
%%time

qry = """
    copy dimMovies0 from 's3://sparkifytest/movies/movies/*.parquet' 
    credentials 'aws_iam_role={}' 
    FORMAT AS PARQUET;
""".format(DWH_ROLE_ARN)

%sql $qry

 * postgresql://dwhuser:***@dwhcluster.ci2m6m74tbzm.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
CPU times: user 6.36 ms, sys: 0 ns, total: 6.36 ms
Wall time: 8.87 s


In [None]:
%%time

qry = """
    copy dimMovies0 from 's3://udacity-input/ml-latest-small/Awards.txt' 
    credentials 'aws_iam_role={}' 
    region 'us-west-2';
""".format(DWH_ROLE_ARN)

%sql $qry

In [None]:
%%time

qry = """
    copy dimAwards from 's3://sparkifytest/movies/awards3/*.parquet' 
    credentials 'aws_iam_role={}' 
    FORMAT AS PARQUET;
""".format(DWH_ROLE_ARN)

%sql $qry

In [None]:
%%time

qry = """
    copy dimGenres from 's3://sparkifytest/movies/genres/*.parquet' 
    credentials 'aws_iam_role={}' 
    FORMAT AS PARQUET;
""".format(DWH_ROLE_ARN)

%sql $qry

In [None]:
%%sql
INSERT INTO dimDate (date_key, year, month, day, week)
SELECT DISTINCT(rate_time)                                       AS date_key,
       EXTRACT(year FROM rate_time)                              AS year,
       EXTRACT(month FROM rate_time)                             AS month,
       EXTRACT(day FROM rate_time)                               AS day,
       EXTRACT(week FROM rate_time)                              AS week
FROM dimRating;

In [None]:
%%sql
INSERT INTO dimMovies (movieId, title, year)
SELECT movieId                                                   AS movieId,
       substr(title, 0, length(title)-7)                         AS title, 
       substr(title, -5, 4)                                      AS year
FROM dimMovies0