In [1]:
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

In [2]:
spark = SparkSession.builder\
                    .master("local")\
                    .appName("SparkifyMusic")\
                    .getOrCreate()

In [3]:
spark

In [4]:
spark.sparkContext.getConf().getAll()

[('spark.master', 'local'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.app.name', 'SparkifyMusic'),
 ('spark.app.id', 'local-1561176073104'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.host', '192.168.1.10'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.driver.port', '39495')]

In [5]:
path = "./data/log-data/2018-11-01-events.json"

In [6]:
user_log = spark.read.json(path)

In [7]:
user_log.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [8]:
user_log.show(1)

+------+---------+---------+------+-------------+--------+------+-----+--------------------+------+----+-----------------+---------+----+------+-------------+--------------------+------+
|artist|     auth|firstName|gender|itemInSession|lastName|length|level|            location|method|page|     registration|sessionId|song|status|           ts|           userAgent|userId|
+------+---------+---------+------+-------------+--------+------+-----+--------------------+------+----+-----------------+---------+----+------+-------------+--------------------+------+
|  null|Logged In|   Walter|     M|            0|    Frye|  null| free|San Francisco-Oak...|   GET|Home|1.540919166796E12|       38|null|   200|1541105830796|"Mozilla/5.0 (Mac...|    39|
+------+---------+---------+------+-------------+--------+------+-----+--------------------+------+----+-----------------+---------+----+------+-------------+--------------------+------+
only showing top 1 row



In [9]:
user_log.take(5)

[Row(artist=None, auth='Logged In', firstName='Walter', gender='M', itemInSession=0, lastName='Frye', length=None, level='free', location='San Francisco-Oakland-Hayward, CA', method='GET', page='Home', registration=1540919166796.0, sessionId=38, song=None, status=200, ts=1541105830796, userAgent='"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"', userId='39'),
 Row(artist=None, auth='Logged In', firstName='Kaylee', gender='F', itemInSession=0, lastName='Summers', length=None, level='free', location='Phoenix-Mesa-Scottsdale, AZ', method='GET', page='Home', registration=1540344794796.0, sessionId=139, song=None, status=200, ts=1541106106796, userAgent='"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"', userId='8'),
 Row(artist="Des'ree", auth='Logged In', firstName='Kaylee', gender='F', itemInSession=1, lastName='Summers', length=246.30812, level='free'

In [10]:
out_path = "./user_log.csv"
user_log.write.save(out_path, format='csv', header = True)

AnalysisException: 'path file:/home/vanducng/git/NanoDegree-Project04/user_log.csv already exists.;'

In [11]:
user_log2 = spark.read.csv(out_path, header = True)

In [12]:
user_log2.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: string (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: string (nullable = true)
 |-- sessionId: string (nullable = true)
 |-- song: string (nullable = true)
 |-- status: string (nullable = true)
 |-- ts: string (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [13]:
user_log2.take(2)

[Row(artist=None, auth='Logged In', firstName='Walter', gender='M', itemInSession='0', lastName='Frye', length=None, level='free', location='San Francisco-Oakland-Hayward, CA', method='GET', page='Home', registration='1.540919166796E12', sessionId='38', song=None, status='200', ts='1541105830796', userAgent='"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"', userId='39'),
 Row(artist=None, auth='Logged In', firstName='Kaylee', gender='F', itemInSession='0', lastName='Summers', length=None, level='free', location='Phoenix-Mesa-Scottsdale, AZ', method='GET', page='Home', registration='1.540344794796E12', sessionId='139', song=None, status='200', ts='1541106106796', userAgent='"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"', userId='8')]

In [15]:
user_log.describe("artist").show()

+-------+---------------+
|summary|         artist|
+-------+---------------+
|  count|             11|
|   mean|           null|
| stddev|           null|
|    min|Black Eyed Peas|
|    max| The Mars Volta|
+-------+---------------+



In [16]:
user_log.describe("sessionId").show()

+-------+------------------+
|summary|         sessionId|
+-------+------------------+
|  count|                15|
|   mean|             127.0|
| stddev|45.639894828976104|
|    min|                 9|
|    max|               169|
+-------+------------------+



In [17]:
user_log.count()

15

In [19]:
user_log.select("page").dropDuplicates().sort("page").show()

+--------+
|    page|
+--------+
|    Home|
|NextSong|
| Upgrade|
+--------+



In [28]:
get_hour = udf(lambda x: datetime.datetime.fromtimestamp(x/1000.0).hour)

In [29]:
user_log = user_log.withColumn("hour", get_hour(user_log.ts))

In [32]:
songs_in_hour = user_log.filter(user_log.page == "NextSong").groupby(user_log.hour).count().orderBy(user_log.hour.cast("float"))

## Process Song Data

In [40]:
song_data = "./data/song-data/song_data/*/*/*/*.json"
# song_data = "./data/song-data/song_data/A/A/A/TRAAAAW128F429D538.json"
df = spark.read.json(song_data)

In [41]:
df.count()

1