In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, dayofweek


In [2]:
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed. Check that result so a missing dl.cfg fails here with a
# clear message instead of surfacing below as an unexplained KeyError: 'AWS'.
if not config.read('dl.cfg'):
    raise FileNotFoundError(
        "dl.cfg not found or unreadable; it must provide an [AWS] section with "
        "AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY"
    )

# Export the credentials as environment variables of this process
# (presumably for the hadoop-aws connector requested by the Spark session -- confirm).
os.environ['AWS_ACCESS_KEY_ID']=config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY']=config['AWS']['AWS_SECRET_ACCESS_KEY']


In [4]:
# Confirm the access key id was loaded WITHOUT echoing its value: whatever a
# cell displays is saved with the notebook and shared along with it.
# NOTE(review): the key id that was previously displayed by this cell should be
# treated as exposed and rotated.
bool(config['AWS']['AWS_ACCESS_KEY_ID'])

True

In [3]:
# Build (or reuse) the Spark session, requesting the hadoop-aws package at startup.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

# Process songs data

In [45]:
# Glob for the song files: everything three directory levels below the local song_data folder.
song_data = './data/song-data/song_data/*/*/*'

In [46]:
# Read every file matched by the song_data glob as JSON into one DataFrame.
df_song =  spark.read.json(song_data)

In [6]:
# Inspect the column names and types of the song data.
df_song.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [47]:
# Songs dimension: keep only the song-level columns of the song data.
songs_table = df_song.select(
    'song_id',
    'title',
    'artist_id',
    'year',
    'duration',
)

In [48]:
# Row count of the songs projection before any de-duplication.
songs_table.count()

71

In [49]:
# Row count after dropping rows that are identical across all selected columns;
# compare with the plain count to see whether full-row duplicates exist.
songs_table.dropDuplicates().count()

71

In [31]:
# Persist one row per song_id as parquet, partitioned by year and artist_id.
# "overwrite" replaces whatever already exists at the target path.
(
    songs_table
    .dropDuplicates(['song_id'])
    .write
    .partitionBy("year", "artist_id")
    .mode("overwrite")
    .parquet("data/parquet/songs.parquet")
)

+-------+------------------+--------------------+------------------+-----------------+------------------+
|summary|           song_id|               title|         artist_id|             year|          duration|
+-------+------------------+--------------------+------------------+-----------------+------------------+
|  count|                71|                  71|                71|               71|                71|
|   mean|              null|                null|              null|785.9577464788732|239.72967605633804|
| stddev|              null|                null|              null|980.9571191533839|106.56277912134071|
|    min|SOAOIBZ12AB01815BE|A Higher Place (A...|AR051KA1187B98B2FF|                0|          29.54404|
|    max|SOZVMJI12AB01808AF|   ¿Dónde va Chichi?|ARYKCQI1187FB3B18F|             2008|         599.24853|
+-------+------------------+--------------------+------------------+-----------------+------------------+



In [38]:
# Artists dimension, projected from the song data with the artist_ prefix removed.
# This must read df_song: the name `df` is only bound much later (to the
# log/song join), so on a fresh kernel the original raised NameError, and when
# run out of order it would silently restrict artists to those that appear in the logs.
artist_table = df_song.selectExpr(
    'artist_id',
    'artist_name as name',
    'artist_location as location',
    'artist_latitude as latitude',
    'artist_longitude as longitude',
)

In [57]:
# Keep a single row per artist_id (rebinds artist_table to the de-duplicated frame).
artist_table = artist_table.dropDuplicates(['artist_id'])

In [58]:
# Number of rows in artist_table.
artist_table.count()

69

In [60]:
# Persist one row per artist_id as parquet, replacing any previous output.
(
    artist_table
    .dropDuplicates(['artist_id'])
    .write
    .mode("overwrite")
    .parquet("data/parquet/artists.parquet")
)

# Process log data


In [7]:
# Glob for the event-log files: every .json file directly inside the local log-data folder.
log_data = './data/log-data/*.json'

In [8]:
# Read every file matched by the log_data glob as JSON into one DataFrame.
df_log = spark.read.json(log_data)

In [None]:
# Preview the first rows of the raw log data.
df_log.show()

In [10]:
# Keep only the events whose page is "NextSong"; everything downstream works on this subset.
df_log = df_log.where(df_log["page"] == "NextSong")

In [11]:
# Inspect the column names and types of the filtered log data.
df_log.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [49]:
# Users dimension: project the listener attributes out of the play events,
# renaming the camelCase log fields to snake_case.
users_table = df_log.selectExpr(
    'userId as user_id',
    'firstName as first_name',
    'lastName as last_name',
    'gender',
    'level',
)
users_table.show()

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     26|      Ryan|    Smith|     M| free|
|     26|      Ryan|    Smith|     M| free|
|     26|      Ryan|    Smith|     M| free|
|     61|    Samuel| Gonzalez|     M| free|
|     80|     Tegan|   Levine|     F| paid|
|     80|     Tegan|   Levine|     F| paid|
|     80|     Tegan|   Levine|     F| paid|
|     80|     Tegan|   Levine|     F| paid|
|     80|     Tegan|   Levine|     F| paid|
|     80|     Tegan|   Levine|     F| paid|
|     80|     Tegan|   Levine|     F| paid|
|     15|      Lily|     Koch|     F| paid|
|     80|     Tegan|   Levine|     F| paid|
|     15|      Lily|     Koch|     F| paid|
|     15|      Lily|     Koch|     F| paid|
|     15|      Lily|     Koch|     F| paid|
|     15|      Lily|     Koch|     F| paid|
|     26|      Ryan|    Smith|     M| free|
|     26|      Ryan|    Smith|     M| free|
|     49|     Chloe|   Cuevas|  

In [64]:
# Persist the users table as parquet after dropping rows that are identical in
# every column; any previous output at the path is replaced.
(
    users_table
    .dropDuplicates()
    .write
    .mode("overwrite")
    .parquet('data/parquet/users.parquet')
)

In [19]:
from pyspark.sql.types import TimestampType

In [20]:
def get_timestamp(ts_col):
    """Convert an epoch-milliseconds column into a Spark timestamp column.

    Replaces the former Python UDF with a native cast: Spark interprets a
    numeric value cast to timestamp as seconds since the epoch, so dividing
    the millisecond value by 1000.0 yields the same instant. Unlike the UDF,
    this needs no per-row Python call, does not go through a naive local
    ``datetime``, and yields null (rather than raising) for a null input.

    Args:
        ts_col: Column holding milliseconds since the Unix epoch, or the name
            of such a column as a string.

    Returns:
        A Column of TimestampType.
    """
    # The UDF accepted a column name as well as a Column; keep that working.
    if isinstance(ts_col, str):
        ts_col = col(ts_col)
    return (ts_col / 1000.0).cast(TimestampType())

In [21]:
# Add a "timestamp" column derived from the ts field via get_timestamp.
df_log = df_log.withColumn(
    "timestamp",
    get_timestamp(df_log["ts"]),
)

In [22]:
# Break the event timestamp into its calendar components, one new column each.
df_log = (
    df_log
    .withColumn('hour', hour(df_log['timestamp']))
    .withColumn('day', dayofmonth(df_log['timestamp']))
    .withColumn('week', weekofyear(df_log['timestamp']))
    .withColumn('month', month(df_log['timestamp']))
    .withColumn('year', year(df_log['timestamp']))
    .withColumn('weekday', dayofweek(df_log['timestamp']))
)


In [23]:
# Time dimension: the event timestamp (exposed as start_time) plus its calendar parts.
time_table = df_log.selectExpr(
    'timestamp as start_time',
    'hour',
    'day',
    'week',
    'month',
    'year',
    'weekday',
)

In [None]:
# Fetch every distinct event timestamp back to the driver as a list of Rows.
# NOTE(review): collect() materialises the whole result locally -- prefer
# count() or show() when the logs are large.
df_log.select(df_log.timestamp).distinct().collect()

In [53]:
# List the distinct month values present in the time table.
time_table.select('month').distinct().collect()

[Row(month=11)]

In [18]:
# Persist the time dimension as parquet, partitioned by year and month.
# Several play events can share the same timestamp, so keep one row per
# start_time, matching how the other dimension tables are de-duplicated
# before being written. All other columns are derived from start_time, so no
# information is lost.
(
    time_table
    .dropDuplicates(['start_time'])
    .write
    .partitionBy("year", "month")
    .mode("overwrite")
    .parquet("data/parquet/time.parquet")
)

In [24]:
# Inspect the log schema again, now including the timestamp and calendar columns.
df_log.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekday: integer (nullable = true)



In [13]:
# Re-display the song schema for reference before joining it with the logs.
df_song.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [40]:
# Match each play event to its song metadata: artist name and song title must both agree.
relation = [
    df_log["artist"] == df_song["artist_name"],
    df_log["song"] == df_song["title"],
]
# The song-side "year" column is dropped before joining, so the only "year"
# column in the result is the one derived from the event timestamp.
df = df_log.join(df_song.drop('year'), on=relation, how='inner')

In [41]:
# Inspect the schema of the joined log/song DataFrame.
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- artist_id: string (nullable = 

In [42]:
# Songplays fact table: one row per matched play event, with snake_case names.
# month and year are carried along because the write step partitions on them.
songplays = df.selectExpr(
    'timestamp as start_time',
    'userId as user_id',
    'level',
    'song_id',
    'artist_id',
    'sessionId as session_id',
    'location',
    'userAgent as user_agent',
    'month',
    'year',
)

In [43]:
# Inspect the column names and types of the songplays table.
songplays.printSchema()

root
 |-- start_time: timestamp (nullable = true)
 |-- user_id: string (nullable = true)
 |-- level: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- location: string (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)



In [44]:
# Persist the songplays table as parquet, partitioned by year and month,
# replacing any previous output at the path.
(
    songplays
    .write
    .partitionBy("year", "month")
    .mode("overwrite")
    .parquet('data/parquet/songplays.parquet')
)