In [1]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc , asc , udf , sum as Fsum
from pyspark.sql.types import StringType , IntegerType

import datetime

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:

# Build the Spark configuration once: application name plus a local master.
configure = SparkConf().setAppName("Sparkify-Data").setMaster("local")

# getOrCreate() hands back the already-running context when this cell is
# executed again; constructing SparkContext(conf=...) a second time in the same
# kernel raises "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=configure)

# The session builder reuses the active SparkContext instead of starting a new one.
spark = SparkSession \
        .builder \
        .appName("Sparkify-Data") \
        .getOrCreate()

# Show the effective configuration as the cell output.
spark.sparkContext.getConf().getAll()

[('spark.master', 'local'),
 ('spark.app.id', 'local-1590195565466'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.driver.port', '50968'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.name', 'Sparkify-Data'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.driver.host', 'DESKTOP-I7971JS')]

In [3]:
# Location of the Sparkify event log (JSON), given relative to the working directory.
path = "data/sparkify_data/mini_sparkify_event_data.json"

# Load the event log into a Spark DataFrame; no schema is supplied, so Spark infers it.
user_log = spark.read.format("json").load(path)

In [4]:
# Print the column names, types and nullability of the loaded event log.
user_log.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [5]:
# Expose the DataFrame to Spark SQL as "user_log_table"; an existing temporary
# view with the same name is replaced, so re-running this cell is safe.
user_log.createOrReplaceTempView("user_log_table")

In [6]:
# Peek at two rows to see what a single event record looks like.
spark.sql('''
    SELECT *
    FROM user_log_table
    LIMIT 2
''').show()

+----------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+---------+------+-------------+--------------------+------+
|          artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page| registration|sessionId|     song|status|           ts|           userAgent|userId|
+----------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+---------+------+-------------+--------------------+------+
|  Martha Tilston|Logged In|    Colin|     M|           50| Freeman|277.89016| paid|     Bakersfield, CA|   PUT|NextSong|1538173362000|       29|Rockpools|   200|1538352117000|Mozilla/5.0 (Wind...|    30|
|Five Iron Frenzy|Logged In|    Micah|     M|           79|    Long|236.09424| free|Boston-Cambridge-...|   PUT|NextSong|1538331630000|        8|   Canada|   200|1538352180000|"Moz

In [7]:
# Total number of event rows in the log.
spark.sql('''
    SELECT COUNT(*)
    FROM user_log_table

''').show()

+--------+
|count(1)|
+--------+
|  286500|
+--------+



In [8]:
# List the distinct page names in alphabetical order.
# NOTE: show() with no arguments prints at most 20 rows and truncates cell text
# to 20 characters, so the listing can be incomplete; pass a larger n and
# truncate=False to show() to see every page name in full.
spark.sql('''
    SELECT DISTINCT page
    FROM user_log_table
    ORDER BY page ASC
''').show()

+--------------------+
|                page|
+--------------------+
|               About|
|          Add Friend|
|     Add to Playlist|
|              Cancel|
|Cancellation Conf...|
|           Downgrade|
|               Error|
|                Help|
|                Home|
|               Login|
|              Logout|
|            NextSong|
|            Register|
|         Roll Advert|
|       Save Settings|
|            Settings|
|    Submit Downgrade|
| Submit Registration|
|      Submit Upgrade|
|         Thumbs Down|
+--------------------+
only showing top 20 rows



In [17]:
# A Python function must be registered with the session before Spark SQL can
# call it by name.
#
# get_hour: epoch timestamp in milliseconds -> hour of day (0-23).
#   * A NULL timestamp yields NULL instead of raising TypeError inside the
#     Python worker, which would fail the whole query.
#   * No returnType is passed, so the UDF is registered with the default string
#     return type; queries that need numeric ordering must cast the result.
#   * NOTE(review): datetime.fromtimestamp() converts using the local timezone
#     of the machine running the UDF — confirm that this is intended.
spark.udf.register(
    "get_hour",
    lambda x: None if x is None else int(datetime.datetime.fromtimestamp(x/1000.0).hour),
)

<function __main__.<lambda>(x)>

In [20]:
# Sanity-check the registered get_hour UDF by appending an "hour" column,
# derived from ts, to two sample rows.
spark.sql('''
    SELECT *,get_hour(ts) AS hour
    FROM user_log_table
    LIMIT 2
''').show()

+----------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+---------+------+-------------+--------------------+------+----+
|          artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page| registration|sessionId|     song|status|           ts|           userAgent|userId|hour|
+----------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+---------+------+-------------+--------------------+------+----+
|  Martha Tilston|Logged In|    Colin|     M|           50| Freeman|277.89016| paid|     Bakersfield, CA|   PUT|NextSong|1538173362000|       29|Rockpools|   200|1538352117000|Mozilla/5.0 (Wind...|    30|   5|
|Five Iron Frenzy|Logged In|    Micah|     M|           79|    Long|236.09424| free|Boston-Cambridge-...|   PUT|NextSong|1538331630000|        8|   Canada|   20

In [27]:
# Count "NextSong" events (song plays) for each hour of the day.
# get_hour() produces a string column, so the ORDER BY casts it to int to sort
# numerically (0, 1, 2, ...) rather than lexicographically ('0', '1', '10', ...).
# The query is only defined here; Spark evaluates it when an action is called.
songs_in_hour = spark.sql('''
    SELECT get_hour(ts) AS hour , COUNT(*) AS plays_per_hour
    FROM user_log_table
    WHERE page="NextSong"  
    GROUP BY hour
    ORDER BY cast(hour as int) ASC
''')

In [28]:
# Run the query and bring every result row back to the driver as a list of Row
# objects; the result has one row per hour value, so it is small.
songs_in_hour.collect()

[Row(hour='0', plays_per_hour=10930),
 Row(hour='1', plays_per_hour=10856),
 Row(hour='2', plays_per_hour=10458),
 Row(hour='3', plays_per_hour=10122),
 Row(hour='4', plays_per_hour=9630),
 Row(hour='5', plays_per_hour=9062),
 Row(hour='6', plays_per_hour=8388),
 Row(hour='7', plays_per_hour=8336),
 Row(hour='8', plays_per_hour=8355),
 Row(hour='9', plays_per_hour=8147),
 Row(hour='10', plays_per_hour=8178),
 Row(hour='11', plays_per_hour=8054),
 Row(hour='12', plays_per_hour=8091),
 Row(hour='13', plays_per_hour=8253),
 Row(hour='14', plays_per_hour=8498),
 Row(hour='15', plays_per_hour=8882),
 Row(hour='16', plays_per_hour=8830),
 Row(hour='17', plays_per_hour=9326),
 Row(hour='18', plays_per_hour=9644),
 Row(hour='19', plays_per_hour=10494),
 Row(hour='20', plays_per_hour=11354),
 Row(hour='21', plays_per_hour=11636),
 Row(hour='22', plays_per_hour=11460),
 Row(hour='23', plays_per_hour=11124)]

In [30]:
# Convert the aggregated Spark result into a pandas DataFrame for local analysis.
song_in_hour_df = songs_in_hour.toPandas()

In [32]:
# Summary statistics of the hourly play counts. By default describe() covers
# numeric columns only, so the string-typed "hour" column is left out.
song_in_hour_df.describe()

Unnamed: 0,plays_per_hour
count,24.0
mean,9504.5
std,1248.671816
min,8054.0
25%,8350.25
50%,9194.0
75%,10584.5
max,11636.0
