In [1]:
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

In [2]:
spark = SparkSession.builder\
                    .master("local")\
                    .appName("SparkifyMusic")\
                    .getOrCreate()

In [3]:
spark

In [4]:
spark.sparkContext.getConf().getAll()

[('spark.master', 'local'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.app.name', 'SparkifyMusic'),
 ('spark.app.id', 'local-1561176073104'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.host', '192.168.1.10'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.driver.port', '39495')]

In [5]:
path = "./data/log-data/2018-11-01-events.json"

In [6]:
user_log = spark.read.json(path)

In [7]:
user_log.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [8]:
user_log.show(1)

+------+---------+---------+------+-------------+--------+------+-----+--------------------+------+----+-----------------+---------+----+------+-------------+--------------------+------+
|artist|     auth|firstName|gender|itemInSession|lastName|length|level|            location|method|page|     registration|sessionId|song|status|           ts|           userAgent|userId|
+------+---------+---------+------+-------------+--------+------+-----+--------------------+------+----+-----------------+---------+----+------+-------------+--------------------+------+
|  null|Logged In|   Walter|     M|            0|    Frye|  null| free|San Francisco-Oak...|   GET|Home|1.540919166796E12|       38|null|   200|1541105830796|"Mozilla/5.0 (Mac...|    39|
+------+---------+---------+------+-------------+--------+------+-----+--------------------+------+----+-----------------+---------+----+------+-------------+--------------------+------+
only showing top 1 row



In [9]:
user_log.take(5)

[Row(artist=None, auth='Logged In', firstName='Walter', gender='M', itemInSession=0, lastName='Frye', length=None, level='free', location='San Francisco-Oakland-Hayward, CA', method='GET', page='Home', registration=1540919166796.0, sessionId=38, song=None, status=200, ts=1541105830796, userAgent='"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"', userId='39'),
 Row(artist=None, auth='Logged In', firstName='Kaylee', gender='F', itemInSession=0, lastName='Summers', length=None, level='free', location='Phoenix-Mesa-Scottsdale, AZ', method='GET', page='Home', registration=1540344794796.0, sessionId=139, song=None, status=200, ts=1541106106796, userAgent='"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"', userId='8'),
 Row(artist="Des'ree", auth='Logged In', firstName='Kaylee', gender='F', itemInSession=1, lastName='Summers', length=246.30812, level='free'

In [10]:
out_path = "./user_log.csv"
user_log.write.save(out_path, format='csv', header = True)

AnalysisException: 'path file:/home/vanducng/git/NanoDegree-Project04/user_log.csv already exists.;'

In [11]:
user_log2 = spark.read.csv(out_path, header = True)

In [12]:
user_log2.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: string (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: string (nullable = true)
 |-- sessionId: string (nullable = true)
 |-- song: string (nullable = true)
 |-- status: string (nullable = true)
 |-- ts: string (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [13]:
user_log2.take(2)

[Row(artist=None, auth='Logged In', firstName='Walter', gender='M', itemInSession='0', lastName='Frye', length=None, level='free', location='San Francisco-Oakland-Hayward, CA', method='GET', page='Home', registration='1.540919166796E12', sessionId='38', song=None, status='200', ts='1541105830796', userAgent='"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"', userId='39'),
 Row(artist=None, auth='Logged In', firstName='Kaylee', gender='F', itemInSession='0', lastName='Summers', length=None, level='free', location='Phoenix-Mesa-Scottsdale, AZ', method='GET', page='Home', registration='1.540344794796E12', sessionId='139', song=None, status='200', ts='1541106106796', userAgent='"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"', userId='8')]

In [15]:
user_log.describe("artist").show()

+-------+---------------+
|summary|         artist|
+-------+---------------+
|  count|             11|
|   mean|           null|
| stddev|           null|
|    min|Black Eyed Peas|
|    max| The Mars Volta|
+-------+---------------+



In [16]:
user_log.describe("sessionId").show()

+-------+------------------+
|summary|         sessionId|
+-------+------------------+
|  count|                15|
|   mean|             127.0|
| stddev|45.639894828976104|
|    min|                 9|
|    max|               169|
+-------+------------------+



In [17]:
user_log.count()

15

In [19]:
user_log.select("page").dropDuplicates().sort("page").show()

+--------+
|    page|
+--------+
|    Home|
|NextSong|
| Upgrade|
+--------+



In [28]:
get_hour = udf(lambda x: datetime.datetime.fromtimestamp(x/1000.0).hour)

In [29]:
user_log = user_log.withColumn("hour", get_hour(user_log.ts))

In [32]:
songs_in_hour = user_log.filter(user_log.page == "NextSong").groupby(user_log.hour).count().orderBy(user_log.hour.cast("float"))

## Process Song Data

In [60]:
import pandas as pd
from pyspark.sql.types import *

In [102]:
song_data = "./data/song-data/song_data/*/*/*/*.json"
# song_data = "./data/song-data/song_data/A/A/A/TRAAAAW128F429D538.json"
df = spark.read.json(song_data)

In [103]:
df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [104]:
df.toPandas()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARDR4AC1187FB371A1,,,,Montserrat Caballé;Placido Domingo;Vicente Sardinero;Judith Blegen;Sherrill Milnes;Georg Solti,511.16363,1,SOBAYLL12A8C138AF9,Sono andati? Fingevo di dormire,0
1,AREBBGV1187FB523D2,,"Houston, TX",,Mike Jones (Featuring CJ_ Mello & Lil' Bran),173.66159,1,SOOLYAZ12A6701F4A6,Laws Patrolling (Album Version),0
2,ARMAC4T1187FB3FA4C,40.82624,"Morris Plains, NJ",-74.47995,The Dillinger Escape Plan,207.77751,1,SOBBUGU12A8C13E95D,Setting Fire to Sleeping Giants,2004
3,ARPBNLO1187FB3D52F,40.71455,"New York, NY",-74.00712,Tiny Tim,43.36281,1,SOAOIBZ12AB01815BE,I Hold Your Hand In Mine [Live At Royal Albert Hall],2000
4,ARNF6401187FB57032,40.79086,"New York, NY [Manhattan]",-73.96644,Sophie B. Hawkins,305.16200,1,SONWXQJ12A8C134D94,The Ballad Of Sleeping Beauty,1994
5,ARDNS031187B9924F0,32.67828,Georgia,-83.22295,Tim Wilson,186.48771,1,SONYPOM12A8C13B2D7,I Think My Wife Is Running Around On Me (Taco Hell),2005
6,ARLTWXK1187FB5A3F8,32.74863,"Fort Worth, TX",-97.32925,King Curtis,326.00771,1,SODREIN12A58A7F2E5,A Whiter Shade Of Pale (Live @ Fillmore West),0
7,ARPFHN61187FB575F6,41.88415,"Chicago, IL",-87.63241,Lupe Fiasco,279.97995,1,SOWQTQZ12A58A7B63E,Streets On Fire (Explicit Album Version),0
8,ARI2JSK1187FB496EF,51.50632,"London, England",-0.12714,Nick Ingman;Gavyn Wright,111.62077,1,SODUJBS12A8C132150,Wessex Loses a Bride,0
9,AR9AWNF1187B9AB0B4,,"Seattle, Washington USA",,Kenny G featuring Daryl Hall,236.93016,1,SOZHPGD12A8C1394FE,Baby Come To Me,0


In [115]:
songs_schema = StructType([
    StructField("artist_id", StringType(), False),
    StructField("artist_latitude", DoubleType(), True),
    StructField("artist_location", StringType(), True),
    StructField("artist_longitude", DoubleType(), True),
    StructField("artist_name", StringType(), True),
    StructField("duration", DoubleType(), True),
    StructField("num_songs", LongType(), True),
    StructField("song_id", StringType(), False),
    StructField("title", StringType(), True),
    StructField("year", IntegerType(), True)
])

In [116]:
pd.set_option("max_colWidth", 1500)

In [123]:
df = spark.read.json(song_data,
                    schema=songs_schema,
                    mode="DROPMALFORMED")

In [124]:
df.limit(3).toPandas()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARDR4AC1187FB371A1,,,,Montserrat Caballé;Placido Domingo;Vicente Sardinero;Judith Blegen;Sherrill Milnes;Georg Solti,511.16363,1,SOBAYLL12A8C138AF9,Sono andati? Fingevo di dormire,0
1,AREBBGV1187FB523D2,,"Houston, TX",,Mike Jones (Featuring CJ_ Mello & Lil' Bran),173.66159,1,SOOLYAZ12A6701F4A6,Laws Patrolling (Album Version),0
2,ARMAC4T1187FB3FA4C,40.82624,"Morris Plains, NJ",-74.47995,The Dillinger Escape Plan,207.77751,1,SOBBUGU12A8C13E95D,Setting Fire to Sleeping Giants,2004


In [125]:
df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)



In [126]:
df.createOrReplaceTempView("songs")

In [127]:
songs_table = spark.sql("""
                            SELECT song_id, 
                            title,
                            artist_id,
                            year,
                            duration
                            FROM songs
                        """)

In [129]:
songs_table.show(4)

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOBAYLL12A8C138AF9|Sono andati? Fing...|ARDR4AC1187FB371A1|   0|511.16363|
|SOOLYAZ12A6701F4A6|Laws Patrolling (...|AREBBGV1187FB523D2|   0|173.66159|
|SOBBUGU12A8C13E95D|Setting Fire to S...|ARMAC4T1187FB3FA4C|2004|207.77751|
|SOAOIBZ12AB01815BE|I Hold Your Hand ...|ARPBNLO1187FB3D52F|2000| 43.36281|
+------------------+--------------------+------------------+----+---------+
only showing top 4 rows



In [130]:
songs_table.write.mode("overwrite").partitionBy("year", "artist_id").parquet("./output_data/songs_table")

In [132]:
artists_table = spark.sql("""
                            SELECT DISTINCT artist_id,
                            artist_name name,
                            artist_location location,
                            artist_latitude latitude,
                            artist_longitude longitude
                            FROM songs
                        """)
artists_table.show(5)

+------------------+------------+---------------+--------+----------+
|         artist_id|        name|       location|latitude| longitude|
+------------------+------------+---------------+--------+----------+
|ARPBNLO1187FB3D52F|    Tiny Tim|   New York, NY|40.71455| -74.00712|
|ARBEBBY1187B9B43DB|   Tom Petty|Gainesville, FL|    null|      null|
|AR0IAWL1187B9A96D0|Danilo Perez|         Panama|  8.4177| -80.11278|
|ARMBR4Y1187B9990EB|David Martin|California - SF|37.77916|-122.42005|
|ARD0S291187B9B7BF5|     Rated R|           Ohio|    null|      null|
+------------------+------------+---------------+--------+----------+
only showing top 5 rows

