In [49]:
import configparser
from datetime import datetime
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.functions import countDistinct, explode, split, concat_ws, collect_list, isnan
from pyspark.sql.types import StructType as R, StructField as Fld, DoubleType as Dbl, StringType as Str, IntegerType as Int, DateType as Date

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# STEP 1: Get the params of the created redshift cluster 
- We need:
    - The redshift cluster <font color='red'>endpoint</font>
    - The <font color='red'>IAM role ARN</font> that give access to Redshift to read from S3

In [77]:
config = configparser.ConfigParser()

#Normally this file should be in ~/.aws/credentials
config.read_file(open('dwh.cfg'))

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                  [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwhCluster
4,DWH_DB,dwh
5,DWH_DB_USER,dwhuser
6,DWH_DB_PASSWORD,Passw0rd
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,dwhRole


In [78]:
# FILL IN THE REDSHIFT ENPOINT HERE
# e.g. DWH_ENDPOINT="redshift-cluster-1.csmamz5zxmle.us-west-2.redshift.amazonaws.com" 
DWH_ENDPOINT="dwhcluster.ci2m6m74tbzm.us-west-2.redshift.amazonaws.com" 
    
#FILL IN THE IAM ROLE ARN you got in step 2.2 of the previous exercise
#e.g DWH_ROLE_ARN="arn:aws:iam::988332130976:role/dwhRole"
DWH_ROLE_ARN="arn:aws:iam::264680862608:role/dwhRole"

In [None]:
spark = SparkSession.builder\
                    .appName("moive analysis") \
                    .config("spark.jars.packages","org.apache.hadoop:hadoop-aws:2.7.0")\
                    .getOrCreate()

# STEP 2: Connect to the Redshift Cluster

In [None]:
%load_ext sql

In [None]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
print(conn_string)
%sql $conn_string

In [None]:
s3 = boto3.resource('s3',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                     )

s3bucket =  s3.Bucket("udacity-input") # private

s3_data = iter(s3bucket.objects.filter(Prefix="ml-latest-small"))
for _ in range(5): print(next(s3_data))


# Step 3: Explore and Assess the Data

#### Load Data from S3

### Part 1: Data Wrangling with DataFrames

In [52]:
movieSchema = R([
            Fld("movieId",Int()),
            Fld("title",Str()),
            Fld("genres",Str())
            ])

In [53]:
ratingSchema = R([
            Fld("userId",Int()),
            Fld("movieId",Int()),
            Fld("rating",Dbl()),
            Fld("timestamp",Str())
            ])

In [55]:
tagSchema = R([
            Fld("userId",Int()),
            Fld("movieId",Int()),
            Fld("tag",Str()),
            Fld("timestamp",Str())
            ])

In [None]:
awardSchema = R([
            Fld("title",Str()),
            Fld("year",Int()),
            Fld("awards",Int()),
            Fld("nominations",Int())
            ])

In [56]:
dfmovies = spark.read.load("s3a://udacity-input/ml-latest-small/movies.csv", format='csv', header = True, schema=movieSchema)
dfratings = spark.read.load("s3a://udacity-input/ml-latest-small/ratings.csv", format='csv', header = True, schema=ratingSchema)
dftags = spark.read.load("s3a://udacity-input/ml-latest-small/tags.csv", format='csv', header = True, schema=tagSchema)
dfawards = spark.read.text("s3a://udacity-input/ml-latest-small/Awards.txt", format='txt', header = True, schema=awardSchema)

In [None]:
dfmovies.printSchema()
dfmovies.show(5)
dfmovies.count()

In [None]:
dfratings.printSchema()
dfratings.show(5)
dfratings.count()

In [None]:
dftags.printSchema()
dftags.show(5)
dftags.count()

In [None]:
dfawards.printSchema()
dfawards.show(5)
dfawards.count()

#### Explore the Data
Identify data quality issues, like missing values, duplicate data, etc.

In [None]:
# check for null values
dfmovies.isnull().sum()
dfratings.isnull().sum()
dfawards.isnull().sum()

#### How to deal with null values
#### 1. Deleting Rows 
This method commonly used to handle the null values. Here, we either delete a particular row if it has a null value for a particular feature and a particular column if it has more than 70-75% of missing values. This method is advised only when there are enough samples in the data set. 
#### 2. Replacing With Mean/Median/Mode  
This strategy can be applied on a feature which has numeric data like the age of a person or the rating score. We can calculate the mean, median or mode of the feature and replace it with the missing values. This is an approximation which can add variance to the data set. 
#### 3. Assigning An Unique Category  
A categorical feature will have a definite number of possibilities, such as gender, for example. Since they have a definite number of classes, we can assign another class for the missing values like unknown.
#### 4. Predicting The Missing Values  
Using the features which do not have missing values, we can predict the nulls with the help of a machine learning algorithm. 
#### 5. Using Algorithms Which Support Missing Values  
KNN is a machine learning algorithm which works on the principle of distance measure. This algorithm can be used when there are nulls present in the dataset. While the algorithm is applied, KNN considers the missing values by taking the majority of the K nearest values. 

In [None]:
# check duplicate data, also confirm the dataset is on which level
dfmovies.shape
dfmovies[['movieId']].drop_duplicates().shape

In [None]:
dfratings.shape
dfratings[['movieId', 'userId']].drop_duplicates().shape

In [None]:
dfawards.shape
dfawards[['movieId', 'year']].drop_duplicates().shape

In [None]:
# basic count
distinct_movie = dfmovies['movieId'].nunique()
print('{} movies in the movies dataset'.format(distinct_movie))

In [None]:
distinct_user = dfratings['userId'].nunique()
print('{} users rated the movies'.format(distinct_user))

In [None]:
distinct_award = dfawards['title'].nunique()
print('{} movies received awards'.format(distinct_award))

In [None]:
tmp1 = dfratings.groupBy("userID").count().toPandas()['count'].min()
tmp2 = dfratings.groupBy("movieId").count().toPandas()['count'].min()
print('For the users that rated movies and the movies that were rated:')
print('Minimum number of ratings per user is {}'.format(tmp1))
print('Minimum number of ratings per movie is {}'.format(tmp2))

In [None]:
tmp1 = sum(dfratings.groupBy("movieId").count().toPandas()['count'] == 1)
tmp2 = dfratings.select('movieId').distinct().count()
print('{} out of {} movies are rated by only one user'.format(tmp1, tmp2))

In [None]:
dfratings = dfratings.withColumn(
    "rate_time",
    F.to_timestamp(F.from_unixtime((col("timestamp") / 1000) , 'yyyy-MM-dd HH:mm:ss.SSS')).cast("Timestamp")
)

In [None]:
# check the most recent date in the dataset
tmp1 = dfratings['rate_time'].min()
tmp2 = dfratings['rate_time'].max()
print('ratings were made during {} and {}'.format(tmp1, tmp2))

In [None]:
# split the mixed genres by '|'
dfmovies2 = dfmovies.withColumn('genre', explode(split(dfmovies.genres, '\|')))

In [None]:
dfmovies2.show(5)

In [None]:
# pick distinct genre
genres_dummies = dfmovies2.filter(dfmovies2.genre != '(no genres listed)') \
                     .select(dfmovies2.genre).distinct() \
                     .orderBy(dfmovies2.genre)
display(genres_dummies)

### Part 2: Data Wrangling with Spark SQL and OLAP

In [57]:
dfratings.createOrReplaceTempView("ratings")     #userid, movieid, rating, timestamp
dfmovies.createOrReplaceTempView("movies")       #movieid, title, genres
dflinks.createOrReplaceTempView("links")         #movieid, imdbId, tmdbId
dftags.createOrReplaceTempView("tags")           #userid, movieid, tag, timestamp

In [None]:
movie_not_rated = spark.sql("""select 
                      count(distinct movies.movieId)
                      from movies 
                      where movies.movieId not in
                      (select distinct ratings.movieId from ratings)
                      """)
print('{} movies are not rated'.format(movie_not_rated))

# STEP 3: Create Tables

In [None]:
%%sql
DROP TABLE IF EXISTS dimDate
CREATE TABLE dimDate
(
  date_key timestamp NOT NULL PRIMARY KEY,
  year smallint NOT NULL,
  month smallint NOT NULL,
  day smallint NOT NULL,
  week smallint NOT NULL,
  weekday varchar(3) NOT NULL
);

DROP TABLE IF EXISTS dimRating
CREATE TABLE dimRating
(
  userId             smallint NOT NULL PRIMARY KEY,
  movieId            smallint NOT NULL,
  rating             numeric NOT NULL,
  ts                 timestamp REFERENCES dimdate (date_key)
);

DROP TABLE IF EXISTS dimGenres
CREATE TABLE dimGenres
(
  genreId            smallint NOT NULL PRIMARY KEY
  genres             text NOT NULL,
  movieId            smallint NOT NULL
);

DROP TABLE IF EXISTS factMovies
CREATE TABLE factMovies
(
  MovieId      smallint NOT NULL PRIMARY KEY,
  title        varchar(45) NOT NULL,
  release_year year NOT NULL,
  genreId      smallint REFERENCES dimGenres(genreId),
  awards       smallint NOT NULL,
  Nominations  smallint NOT NULL
);



In [None]:
%%sql
INSERT INTO dimDate (date_key, date, year, month, day, week, is_weekend)
SELECT DISTINCT(TO_CHAR(timestamp :: DATE, 'yyyyMMDD')::integer) AS date_key,
       date(timestamp)                                           AS date,
       EXTRACT(year FROM timestamp)                              AS year,
       EXTRACT(quarter FROM timestamp)                           AS quarter,
       EXTRACT(month FROM timestamp)                             AS month,
       EXTRACT(day FROM timestamp)                               AS day,
       EXTRACT(week FROM timestamp)                              AS week,
       dayofweek(timestamp)                                      AS dow
FROM ratings;

In [75]:
# Split title and release year in separate columns in movies dataframe. Convert year to timestamp.       

# movies = df.select(split(col("title"),"(").getItem(0).as("titleArray").split(col("title"),"(").getItem(-1)).as("titleArray").drop("title"),
dfmovies.createOrReplaceTempView("movies")
spark.sql("select movieId, substr(title, 0, length(title)-7) as title, substr(title, -5, 4) as year from movies").show()



+-------+--------------------+----+
|movieId|               title|year|
+-------+--------------------+----+
|      1|           Toy Story|1995|
|      2|             Jumanji|1995|
|      3|    Grumpier Old Men|1995|
|      4|   Waiting to Exhale|1995|
|      5|Father of the Bri...|1995|
|      6|                Heat|1995|
|      7|             Sabrina|1995|
|      8|        Tom and Huck|1995|
|      9|        Sudden Death|1995|
|     10|           GoldenEye|1995|
|     11|American Presiden...|1995|
|     12|Dracula: Dead and...|1995|
|     13|               Balto|1995|
|     14|               Nixon|1995|
|     15|    Cutthroat Island|1995|
|     16|              Casino|1995|
|     17|Sense and Sensibi...|1995|
|     18|          Four Rooms|1995|
|     19|Ace Ventura: When...|1995|
|     20|         Money Train|1995|
+-------+--------------------+----+
only showing top 20 rows



In [61]:
ratings.printSchema()
ratings.show(5)
ratings.count()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: string (nullable = true)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



100836

In [28]:
links.printSchema()
links.show(5)
links.count()

root
 |-- movieId: integer (nullable = true)
 |-- imdbId: integer (nullable = true)
 |-- tmdbId: integer (nullable = true)

+-------+------+------+
|movieId|imdbId|tmdbId|
+-------+------+------+
|      1|114709|   862|
|      2|113497|  8844|
|      3|113228| 15602|
|      4|114885| 31357|
|      5|113041| 11862|
+-------+------+------+
only showing top 5 rows



9742

In [35]:
tags.printSchema()
tags.show(5)
tags.count()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: string (nullable = true)

+------+-------+---------------+----------+
|userId|movieId|            tag| timestamp|
+------+-------+---------------+----------+
|     2|  60756|          funny|1445714994|
|     2|  60756|Highly quotable|1445714996|
|     2|  60756|   will ferrell|1445714992|
|     2|  89774|   Boxing story|1445715207|
|     2|  89774|            MMA|1445715200|
+------+-------+---------------+----------+
only showing top 5 rows



3683

In [36]:
tags = tags.withColumn(
    "start_time",
    F.to_timestamp(F.from_unixtime((col("timestamp") / 1000) , 'yyyy-MM-dd HH:mm:ss.SSS')).cast("Timestamp")
)

In [37]:
tags.printSchema()
tags.show(5)
tags.count()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- start_time: timestamp (nullable = true)

+------+-------+---------------+----------+-------------------+
|userId|movieId|            tag| timestamp|         start_time|
+------+-------+---------------+----------+-------------------+
|     2|  60756|          funny|1445714994|1970-01-17 17:35:14|
|     2|  60756|Highly quotable|1445714996|1970-01-17 17:35:14|
|     2|  60756|   will ferrell|1445714992|1970-01-17 17:35:14|
|     2|  89774|   Boxing story|1445715207|1970-01-17 17:35:15|
|     2|  89774|            MMA|1445715200|1970-01-17 17:35:15|
+------+-------+---------------+----------+-------------------+
only showing top 5 rows



3683

In [14]:
ratings = ratings.withColumn(
    "rate_time",
    F.to_timestamp(F.from_unixtime((col("timestamp") / 1000) , 'yyyy-MM-dd HH:mm:ss.SSS')).cast("Timestamp")
)

In [15]:
ratings.printSchema()
ratings.show(5)
ratings.count()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- start_time: timestamp (nullable = true)

+------+-------+------+---------+-------------------+
|userId|movieId|rating|timestamp|         start_time|
+------+-------+------+---------+-------------------+
|     1|      1|   4.0|964982703|1970-01-12 04:03:02|
|     1|      3|   4.0|964981247|1970-01-12 04:03:01|
|     1|      6|   4.0|964982224|1970-01-12 04:03:02|
|     1|     47|   5.0|964983815|1970-01-12 04:03:03|
|     1|     50|   5.0|964982931|1970-01-12 04:03:02|
+------+-------+------+---------+-------------------+
only showing top 5 rows



100836

## time table

In [16]:
time_table = ratings.select("start_time").dropDuplicates() \
.withColumn("hour", hour("start_time")).withColumn("day", dayofmonth("start_time")) \
.withColumn("week", weekofyear("start_time")).withColumn("month", month("start_time")) \
.withColumn("year", year("start_time")).withColumn("weekday", date_format("start_time", "E")).dropna(how="any")

In [17]:
time_table.show(5)
time_table.count()

+-------------------+----+---+----+-----+----+-------+
|         start_time|hour|day|week|month|year|weekday|
+-------------------+----+---+----+-----+----+-------+
|1970-01-17 08:29:27|   8| 17|   3|    1|1970|    Sat|
|1970-01-15 22:39:24|  22| 15|   3|    1|1970|    Thu|
|1970-01-17 17:49:32|  17| 17|   3|    1|1970|    Sat|
|1970-01-18 05:44:58|   5| 18|   3|    1|1970|    Sun|
|1970-01-16 11:13:43|  11| 16|   3|    1|1970|    Fri|
+-------------------+----+---+----+-----+----+-------+
only showing top 5 rows



10061

In [7]:
song_field = ["title", "duration", "year", "artist_id"]
songs_table = df.select(song_field).dropDuplicates().withColumn("song_id", F.monotonically_increasing_id()).filter(~col("year").isin([0]) & col("year").isNotNull() & col("artist_id").isNotNull())

In [8]:
songs_table.printSchema()
songs_table.show(5)
songs_table.count()

root
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- song_id: long (nullable = false)

+--------------------+---------+----+------------------+-----------+
|               title| duration|year|         artist_id|    song_id|
+--------------------+---------+----+------------------+-----------+
|Always And Never ...|  229.642|2005|AR10USD1187B99F3F1| 8589934592|
|    Commercial Reign|283.76771|1990|AR9AM2N1187B9AD2F1|17179869184|
|Terapia De Amor I...|340.29669|1988|ARBJSO81187B9BA09B|17179869187|
|Walking With The ...|152.16281|1966|ARE5F2F1187B9AB7E9|25769803776|
|               Pills| 256.1824|2000|ARW63XP1187FB5AB99|34359738369|
+--------------------+---------+----+------------------+-----------+
only showing top 5 rows



406

In [9]:
artist_field = ["artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude"]
artists_table = df.select(artist_field).dropDuplicates().dropna(subset=["artist_id","artist_name"])

In [10]:
artists_table.printSchema()
artists_table.show(5)
artists_table.count()

root
 |-- artist_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_longitude: double (nullable = true)

+------------------+-----------------+--------------------+---------------+----------------+
|         artist_id|      artist_name|     artist_location|artist_latitude|artist_longitude|
+------------------+-----------------+--------------------+---------------+----------------+
|AR1S3NH1187B98C2BC|        Anthony B|Clarks Town, Jamaica|           null|            null|
|ARPIKA31187FB4C233|       The Action|            New York|       40.71455|       -74.00712|
|ARYL56G11C8A41634E|    Mick Flannery|                    |           null|            null|
|AR1XL241187FB3F4AB|Nortec Collective|                    |           null|            null|
|ARMI4NV1187B99D55D|          Man Man|    Philadelphia, PA|       39.95227|       -75.16237|
+------------------+----

591

In [39]:
songs_table.write.partitionBy("year", "artist_id").parquet("s3a://sparkifytest/songs/", mode="overwrite")

In [40]:
artists_table.write.parquet("s3a://sparkifytest/artists/", mode="overwrite")

# Log Data

In [11]:
df = spark.read.json("s3a://udacity-dend/log_data/2018/11/*.json")

In [12]:
df.printSchema()
df.show(5)
df.count()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)

+-----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+---------------+------+-------------+--------------------+------+
|     artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            lo

8056

In [13]:
df = df.filter(df.page == 'NextSong')

In [14]:
df.printSchema()
df.show(5)
df.count()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)

+-----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+--------------------+------+-------------+--------------------+------+
|     artist|     auth|firstName|gender|itemInSession|lastName|   length|level|         

6820

In [15]:
user_field = [" userId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level"]
users_table = df.selectExpr(user_field).dropDuplicates().dropna(how = "any")

In [16]:
users_table.printSchema()
users_table.show(5)
users_table.count()

root
 |-- user_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     26|      Ryan|    Smith|     M| free|
|      7|    Adelyn|   Jordan|     F| free|
|     71|    Ayleen|     Wise|     F| free|
|     81|    Sienna|    Colon|     F| free|
|     87|    Dustin|      Lee|     M| free|
+-------+----------+---------+------+-----+
only showing top 5 rows



104

In [None]:
users_table.write.parquet("s3a://sparkifytest/users/", mode="overwrite")

In [17]:
df = df.withColumn(
    "start_time",
    F.to_timestamp(F.from_unixtime((col("ts") / 1000) , 'yyyy-MM-dd HH:mm:ss.SSS')).cast("Timestamp")
)

In [18]:
df.printSchema()
df.show(5)
df.count()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- start_time: timestamp (nullable = true)

+-----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+--------------------+------+-------------+--------------------+------+-------------------+
|     artist|     auth|

6820

In [19]:
df = df.withColumn("test_time", F.to_timestamp(col("ts") / 1000))

In [20]:
df.printSchema()
df.show(5)
df.count()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- test_time: timestamp (nullable = true)

+-----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+--------------------+------+-------------+--------------------+------+

6820

In [21]:
time_table = df.select("start_time").dropDuplicates() \
.withColumn("hour", hour("start_time")).withColumn("day", dayofmonth("start_time")) \
.withColumn("week", weekofyear("start_time")).withColumn("month", month("start_time")) \
.withColumn("year", year("start_time")).withColumn("weekday", date_format("start_time", "E")).dropna(how="any")

In [22]:
time_table.printSchema()
time_table.show(5)
time_table.count()

root
 |-- start_time: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekday: string (nullable = true)

+-------------------+----+---+----+-----+----+-------+
|         start_time|hour|day|week|month|year|weekday|
+-------------------+----+---+----+-----+----+-------+
|2018-11-21 10:52:12|  10| 21|  47|   11|2018|    Wed|
|2018-11-21 19:46:29|  19| 21|  47|   11|2018|    Wed|
|2018-11-14 04:37:40|   4| 14|  46|   11|2018|    Wed|
|2018-11-14 12:14:41|  12| 14|  46|   11|2018|    Wed|
|2018-11-14 16:19:02|  16| 14|  46|   11|2018|    Wed|
+-------------------+----+---+----+-----+----+-------+
only showing top 5 rows



6813

In [None]:
time_table.write.partitionBy("year", "month").parquet("s3a://sparkifytest/time/", mode="overwrite")

In [23]:
song_df = spark.read.parquet("s3a://sparkifytest/songs/*/*/*")
artist_df = spark.read.parquet("s3a://sparkifytest/artists/*")

In [24]:
Join_song = df.join(song_df, ((song_df.title == df.song) & (song_df.duration == df.length)))
artists_songs_logs = Join_song.join(artist_df, (Join_song.artist == artist_df.artist_name))
songplays = artists_songs_logs.join(time_table, (artists_songs_logs.start_time == time_table.start_time), 'left').drop(artists_songs_logs.start_time)

In [25]:
songplays.printSchema()
songplays.show(5)
songplays.count()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- test_time: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- song_id: long (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_lat

0

In [26]:
songplays_field = ["start_time", "userId as user_id", "level", "song_id", "artist_id", "sessionid as session_id", "artist_location as location", "userAgent as user_agent", "year", "month"]

In [27]:
songplays_table = songplays.selectExpr(songplays_field).dropDuplicates().dropna(subset=["user_id", "artist_id", \
"song_id"]).withColumn("songplay_id", F.monotonically_increasing_id())

In [28]:
songplays_table.printSchema()
songplays_table.show(5)
songplays_table.count()

root
 |-- start_time: timestamp (nullable = true)
 |-- user_id: string (nullable = true)
 |-- level: string (nullable = true)
 |-- song_id: long (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- location: string (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- songplay_id: long (nullable = false)

+----------+-------+-----+-------+---------+----------+--------+----------+----+-----+-----------+
|start_time|user_id|level|song_id|artist_id|session_id|location|user_agent|year|month|songplay_id|
+----------+-------+-----+-------+---------+----------+--------+----------+----+-----+-----------+
+----------+-------+-----+-------+---------+----------+--------+----------+----+-----+-----------+



0

In [None]:
songplays_table.write.partitionBy("year", "month").parquet(output_date + "songplays/", mode="overwrite")
 

In [None]:
songplays_table.createOrReplaceTempView("songplays")

spark.sql("""
    SELECT month, count(song_id) as song_num
    FROM songplays
    GROUP by month
    order by song_num desc
""").show()