In [1]:
import boto3
import os
import configparser
import getpass

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pyspark.sql.functions as F

# Set up Spark Session Locally and Connect to AWS for S3 Files

In [2]:
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""

In [3]:
spark = SparkSession.builder\
                     .config("spark.jars.packages","org.apache.hadoop:hadoop-aws:2.7.0")\
                     .getOrCreate()

In [4]:
spark

## Read Song Data Files

In [13]:
song_raw = spark.read.json("s3a://udacity-dend/song_data/A/A/A/*.json")

In [14]:
song_raw.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [15]:
song_raw.count()

24

In [16]:
song_raw.limit(2).toPandas()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARTC1LV1187B9A4858,51.4536,"Goldsmith's College, Lewisham, Lo",-0.01802,The Bonzo Dog Band,301.40036,1,SOAFBCP12A8C13CC7D,King Of Scurf (2007 Digital Remaster),1972
1,ARA23XO1187B9AF18F,40.57885,"Carteret, New Jersey",-74.21956,The Smithereens,192.522,1,SOKTJDS12AF72A25E5,Drown In My Own Tears (24-Bit Digitally Remast...,0


## Read Events JSON Files

In [17]:
event_raw = spark.read.json("s3a://udacity-dend/log_data/2018/11/2018-11-12-events.json")

In [18]:
event_raw.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [19]:
event_raw.count()

213

In [20]:
event_raw.limit(2).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,,Logged In,Celeste,F,0,Williams,,free,"Klamath Falls, OR",GET,Home,1541078000000.0,438,,200,1541990217796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",53
1,Pavement,Logged In,Sylvie,F,0,Cruz,99.16036,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",PUT,NextSong,1540266000000.0,345,Mercy:The Laundromat,200,1541990258796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",10


# ETL

## Dimension Tables
* users - users in the app
    * user_id, first_name, last_name, gender, level
  
  
* songs - songs in music database
    * song_id, title, artist_id, year, duration
  
  
* artists - artists in music database
    * artist_id, name, location, lattitude, longitude
  
  
* time - timestamps of records in songplays broken down into specific units
    * start_time, hour, day, week, month, year, weekday

In [21]:
event_raw.createOrReplaceTempView("stg_event_raw")

In [39]:
song_raw.createOrReplaceTempView("stg_song_raw")

In [37]:
# users table
users_tbl = spark.sql("""
    select distinct userId as user_id
          ,firstName as first_name
          ,lastName as last_name
          ,gender
          ,level
    from stg_event_raw
    where userId is not null
        and userId <> ''
""")

In [38]:
users_tbl.count()

26

In [42]:
# songs table
songs_tbl = spark.sql("""
    select distinct song_id
          ,title
          ,artist_id
          ,year
          ,duration
    from stg_song_raw
    where song_id is not null
        and song_id <> ''
""")

In [46]:
songs_tbl.sort(F.desc("song_id")).show(5)

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOXZYWX12A6310ED0C|     It's About Time|ARC1IHZ1187FB4E920|   0| 246.9873|
|SOTAZDY12AB0187616|            Drillbit|ARZKCQM1257509D107|   0|374.62159|
|SOSMJFC12A8C13DE0C|Is That All There...|AR1KTV21187B9ACD72|   0|343.87546|
|SORRNOC12AB017F52B|The Last Beat Of ...|ARSZ7L31187FB4E610|2004|337.81506|
|SOQPWCR12A6D4FB2A3|A Poor Recipe For...|AR73AIO1187B9AD57B|2005|118.07302|
+------------------+--------------------+------------------+----+---------+
only showing top 5 rows



In [49]:
# artists table
artists_tbl = spark.sql("""
    select distinct artist_id
          ,artist_name as name
          ,artist_location as location
          ,artist_latitude as latitude
          ,artist_longitude as longitude
    from stg_song_raw
    where artist_id is not null
        and artist_id <> ''
""")

In [52]:
artists_tbl.count()

24

In [66]:
# time table
spark.sql("""
    select start_time
          ,hour(start_time) as hour
          ,dayofmonth(start_time) as day
    from
    (
        select distinct to_timestamp(ts/1000) as start_time
        from stg_event_raw
        where ts is not null
    )
""").show(5)

+--------------------+----+---+
|          start_time|hour|day|
+--------------------+----+---+
|2018-11-12 10:55:...|  10| 12|
|2018-11-12 10:57:...|  10| 12|
|2018-11-12 14:45:...|  14| 12|
|2018-11-12 00:42:...|   0| 12|
|2018-11-12 15:50:...|  15| 12|
+--------------------+----+---+
only showing top 5 rows



In [64]:
time_tbl = spark.sql("""
    select start_time
          ,hour(start_time) as hour
          ,dayofmonth(start_time) as day
          ,weekofyear(start_time) as week
          ,month(start_time) as month
          ,year(start_time) as year
          ,dayofweek(start_time) as weekday
    from
    (
        select distinct to_timestamp(ts/1000) as start_time
        from stg_event_raw
        where ts is not null
    )
""")

In [68]:
time_tbl.limit(5).toPandas()

Unnamed: 0,start_time,hour,day,week,month,year,weekday
0,2018-11-12 10:55:33.796,10,12,46,11,2018,2
1,2018-11-12 10:57:55.796,10,12,46,11,2018,2
2,2018-11-12 14:45:03.796,14,12,46,11,2018,2
3,2018-11-12 00:42:00.796,0,12,46,11,2018,2
4,2018-11-12 15:50:14.796,15,12,46,11,2018,2


## Facts Table

songplays - records in log data associated with song plays i.e. records with page NextSong
* songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent

In [72]:
spark.sql("""
    select *
    from stg_event_raw as log
    where page = 'NextSong'
""").limit(2).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Pavement,Logged In,Sylvie,F,0,Cruz,99.16036,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",PUT,NextSong,1540266000000.0,345,Mercy:The Laundromat,200,1541990258796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",10
1,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Logged In,Celeste,F,1,Williams,277.15873,free,"Klamath Falls, OR",PUT,NextSong,1541078000000.0,438,Horn Concerto No. 4 in E flat K495: II. Romanc...,200,1541990264796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",53


In [90]:
spark.sql("""
    select monotonically_increasing_id() as songplay_id
          ,to_timestamp(log.ts/1000) as start_time
          ,log.userId as user_id
          ,log.level 
          ,log.sessionId as session_id
          ,log.location
          ,log.userAgent as user_agent
    from stg_event_raw as log
    where page = 'NextSong'
""").limit(5).toPandas()

Unnamed: 0,songplay_id,start_time,user_id,level,session_id,location,user_agent
0,0,2018-11-11 21:37:38.796,10,free,345,"Washington-Arlington-Alexandria, DC-VA-MD-WV","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."
1,1,2018-11-11 21:37:44.796,53,free,438,"Klamath Falls, OR","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK..."
2,2,2018-11-11 21:42:21.796,53,free,438,"Klamath Falls, OR","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK..."
3,3,2018-11-11 21:45:52.796,53,free,438,"Klamath Falls, OR","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK..."
4,4,2018-11-11 21:47:22.796,29,paid,389,"Atlanta-Sandy Springs-Roswell, GA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."


In [93]:
songplays_tbl = spark.sql("""
    select monotonically_increasing_id() as songplay_id
          ,to_timestamp(log.ts/1000) as start_time
          ,log.userId as user_id
          ,log.level 
          ,song.song_id
          ,song.artist_id
          ,log.sessionId as session_id
          ,log.location
          ,log.userAgent as user_agent
    from stg_event_raw as log
    join stg_song_raw as song
        on log.artist = song.artist_name
        and log.song = song.title
    where page = 'NextSong'
""")

In [94]:
songplays_tbl.limit(5).toPandas()

Unnamed: 0,songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent


# Write Output

In [95]:
# test

In [96]:
time_tbl.write.mode("overwrite").parquet("./songs_table.parquet/")

In [98]:
os.listdir("songs_table.parquet/")

['.part-00000-09a632c3-cb32-4cbc-82f4-616450c691db-c000.snappy.parquet.crc',
 '.part-00002-09a632c3-cb32-4cbc-82f4-616450c691db-c000.snappy.parquet.crc',
 '.part-00006-09a632c3-cb32-4cbc-82f4-616450c691db-c000.snappy.parquet.crc',
 '.part-00007-09a632c3-cb32-4cbc-82f4-616450c691db-c000.snappy.parquet.crc',
 '.part-00009-09a632c3-cb32-4cbc-82f4-616450c691db-c000.snappy.parquet.crc',
 '.part-00010-09a632c3-cb32-4cbc-82f4-616450c691db-c000.snappy.parquet.crc',
 '.part-00011-09a632c3-cb32-4cbc-82f4-616450c691db-c000.snappy.parquet.crc',
 '.part-00012-09a632c3-cb32-4cbc-82f4-616450c691db-c000.snappy.parquet.crc',
 '.part-00013-09a632c3-cb32-4cbc-82f4-616450c691db-c000.snappy.parquet.crc',
 '.part-00014-09a632c3-cb32-4cbc-82f4-616450c691db-c000.snappy.parquet.crc',
 '.part-00015-09a632c3-cb32-4cbc-82f4-616450c691db-c000.snappy.parquet.crc',
 '.part-00016-09a632c3-cb32-4cbc-82f4-616450c691db-c000.snappy.parquet.crc',
 '.part-00017-09a632c3-cb32-4cbc-82f4-616450c691db-c000.snappy.parquet.crc',