In [1]:
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pandas as pd

In [2]:
# Local Spark session named 'etl' running on all available cores;
# getOrCreate() hands back the already-active session if there is one.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('etl')
    .getOrCreate()
)

In [3]:
# Input locations, given as relative paths.
# NOTE(review): 'log-data' uses a hyphen while 'song_data' uses an underscore —
# confirm both match the directory names on disk.
log_path = 'data/log-data/'
# Glob matching .json files nested three directory levels below data/song_data.
song_path = 'data/song_data/*/*/*/*.json'

In [4]:
# Load the event logs as JSON; no schema is supplied, so Spark infers it.
user_log = spark.read.json(log_path)

In [5]:
# Inspect the inferred column names and types.
user_log.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [6]:
# Summary statistics per column; truncate=10 shortens each printed value to 10 characters.
user_log.describe().show(truncate=10)

+-------+----------+----------+---------+------+-------------+--------+----------+-----+----------+------+-------+------------+----------+----------+----------+----------+----------+----------+
|summary|    artist|      auth|firstName|gender|itemInSession|lastName|    length|level|  location|method|   page|registration| sessionId|      song|    status|        ts| userAgent|    userId|
+-------+----------+----------+---------+------+-------------+--------+----------+-----+----------+------+-------+------------+----------+----------+----------+----------+----------+----------+
|  count|      6820|      8056|     7770|  7770|         8056|    7770|      6820| 8056|      7770|  8056|   8056|        7770|      8056|      6820|      8056|      8056|      7770|      8056|
|   mean|     266.5|      null|     null|  null|   21.1988...|    null|247.032...| null|      null|  null|   null|  1.54077...|598.167...|1388.36...|202.897...|1.54248...|      null|54.4639...|
| stddev|109.002...|      null

In [7]:
# Row count of the raw log data, before any filtering.
user_log.count()

8056

In [8]:
# The fact table only covers song plays, so keep just the rows where page == "NextSong".
# Note: this rebinds `user_log`; every later cell sees the filtered frame, not the raw logs.
user_log = user_log.filter(user_log.page == "NextSong")

In [9]:
# Re-check the summary statistics after the NextSong filter.
user_log.describe().show(truncate=10)

+-------+----------+---------+---------+------+-------------+--------+----------+-----+----------+------+--------+------------+----------+----------+----------+----------+----------+----------+
|summary|    artist|     auth|firstName|gender|itemInSession|lastName|    length|level|  location|method|    page|registration| sessionId|      song|    status|        ts| userAgent|    userId|
+-------+----------+---------+---------+------+-------------+--------+----------+-----+----------+------+--------+------------+----------+----------+----------+----------+----------+----------+
|  count|      6820|     6820|     6820|  6820|         6820|    6820|      6820| 6820|      6820|  6820|    6820|        6820|      6820|      6820|      6820|      6820|      6820|      6820|
|   mean|     266.5|     null|     null|  null|   22.7611...|    null|247.032...| null|      null|  null|    null|  1.54077...|599.181...|1388.36...|     200.0|1.54248...|      null|54.6812...|
| stddev|109.002...|     null|

In [10]:
# Row count after the NextSong filter.
user_log.count()

6820

In [11]:
# Check for missing values.
#
# A value counts as missing when it is NULL or, for string columns only, an
# empty string. The previous check compared every column to '' and therefore
# never counted real NULLs (and compared numeric columns against a string
# literal). All counts are computed in a single aggregation, so the data is
# scanned once instead of once per column.
from pyspark.sql import functions as F
from pyspark.sql.functions import col  # `col` stays importable for any later cell that uses it

# List of all df columns
columns = user_log.columns

missing_exprs = []
for name, dtype in user_log.dtypes:
    is_missing = col(name).isNull()
    if dtype == "string":
        # Only string columns can hold '' — skip the comparison for numeric types.
        is_missing = is_missing | (col(name) == "")
    # when() without otherwise() yields NULL for non-matching rows, which count() ignores.
    missing_exprs.append(F.count(F.when(is_missing, 1)).alias(name))

# One-row result: column name -> number of missing values.
missing_row = user_log.agg(*missing_exprs).first()

pd.DataFrame({'column': columns, 'nulls': [missing_row[name] for name in columns]})

Unnamed: 0,column,nulls
0,artist,0
1,auth,0
2,firstName,0
3,gender,0
4,itemInSession,0
5,lastName,0
6,length,0
7,level,0
8,location,0
9,method,0


In [12]:
# Load every song file matched by the song_path glob; the schema is inferred by Spark.
song_data = spark.read.json(song_path)

In [13]:
# Summary statistics for the song dataset.
song_data.describe().show()

+-------+------------------+------------------+---------------+------------------+-----------+------------------+---------+------------------+--------------------+-----------------+
|summary|         artist_id|   artist_latitude|artist_location|  artist_longitude|artist_name|          duration|num_songs|           song_id|               title|             year|
+-------+------------------+------------------+---------------+------------------+-----------+------------------+---------+------------------+--------------------+-----------------+
|  count|                71|                31|             71|                31|         71|                71|       71|                71|                  71|               71|
|   mean|              null| 36.55297161290323|           null|-73.25123258064517|       null|239.72967605633804|      1.0|              null|                null|785.9577464788732|
| stddev|              null|12.431023413063542|           null| 36.05807592882608|       n

In [14]:
# Preview the song rows (show() prints at most 20 rows by default).
song_data.show()

+------------------+---------------+--------------------+----------------+--------------------+---------+---------+------------------+--------------------+----+
|         artist_id|artist_latitude|     artist_location|artist_longitude|         artist_name| duration|num_songs|           song_id|               title|year|
+------------------+---------------+--------------------+----------------+--------------------+---------+---------+------------------+--------------------+----+
|ARDR4AC1187FB371A1|           null|                    |            null|Montserrat Caball...|511.16363|        1|SOBAYLL12A8C138AF9|Sono andati? Fing...|   0|
|AREBBGV1187FB523D2|           null|         Houston, TX|            null|Mike Jones (Featu...|173.66159|        1|SOOLYAZ12A6701F4A6|Laws Patrolling (...|   0|
|ARMAC4T1187FB3FA4C|       40.82624|   Morris Plains, NJ|       -74.47995|The Dillinger Esc...|207.77751|        1|SOBBUGU12A8C13E95D|Setting Fire to S...|2004|
|ARPBNLO1187FB3D52F|       40.7145

### Dimension Tables

#### USERS:
- user_id, first_name, last_name, gender, level

In [24]:
# Build the USERS dimension: one row per user with the attributes from that
# user's most recent log event.
#
# dropDuplicates(["user_id"]) keeps an arbitrary row per user, so a user who
# switched between 'free' and 'paid' could end up with either level, and the
# result could differ between runs. Ranking each user's events by `ts`
# (descending) and keeping rank 1 makes the choice deterministic and current.
from pyspark.sql import Window
from pyspark.sql import functions as F
# Explicit import instead of `import *`; add further types here if later cells need them.
from pyspark.sql.types import IntegerType

latest_event_first = Window.partitionBy("userId").orderBy(F.col("ts").desc())

users = (
    user_log
    .withColumn("_recency_rank", F.row_number().over(latest_event_first))
    .filter(F.col("_recency_rank") == 1)
    .select(
        # userId is inferred as a string from the JSON logs; store it as an integer key.
        F.col("userId").cast(IntegerType()).alias("user_id"),
        F.col("firstName").alias("first_name"),
        F.col("lastName").alias("last_name"),
        "gender",
        "level",
    )
)

In [25]:
# Confirm the renamed columns and that user_id was cast to integer.
users.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)



In [26]:
# Preview the users dimension (show() prints at most 20 rows by default).
users.show()

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     10|    Sylvie|     Cruz|     F| free|
|    100|     Adler|  Barrera|     M| free|
|    101|    Jayden|      Fox|     M| free|
|     11| Christian|   Porter|     F| free|
|     12|    Austin|  Rosales|     M| free|
|     13|       Ava| Robinson|     F| free|
|     14|  Theodore|   Harris|     M| free|
|     15|      Lily|     Koch|     F| paid|
|     16|     Rylan|   George|     M| paid|
|     17|  Makinley|    Jones|     F| free|
|     18|     Jacob|   Rogers|     M| free|
|     19|   Zachary|   Thomas|     M| free|
|      2|   Jizelle| Benjamin|     F| free|
|     20|     Aiden|  Ramirez|     M| paid|
|     22|      Sean|   Wilson|     F| free|
|     23|    Morris|  Gilmore|     M| free|
|     24|     Layla|  Griffin|     F| paid|
|     25|    Jayden|   Graves|     M| paid|
|     26|      Ryan|    Smith|     M| free|
|     27|    Carlos|   Carter|  

#### SONGS:
- song_id, title, artist_id, year, duration

#### ARTISTS:
- artist_id, name, location, latitude, longitude

#### TIME:
- start_time, hour, day, week, month, year, weekday

In [None]:
# Placeholder cell: it holds only a string describing the target columns of the
# songplays fact table; there is no code in this cell that builds the table.
'''Fact Table
songplays - records in log data associated with song plays i.e. records with page NextSong
    songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent'''
