In [3]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

In [4]:
# Obtain the notebook's Spark session: an already-active session is reused,
# otherwise a new one is started under the application name "Sparkify".
spark = (
    SparkSession.builder
    .appName("Sparkify")
    .getOrCreate()
)

In [5]:
# Glob matching every JSON event-log file under data/log-data/.
path = "data/log-data/*.json"
# Load all matching files into one DataFrame; no schema is supplied, so Spark infers it.
log = spark.read.format("json").load(path)

In [7]:
# Preview the first five event rows as a list of Row objects.
log.head(5)

[Row(artist='Harmonia', auth='Logged In', firstName='Ryan', gender='M', itemInSession=0, lastName='Smith', length=655.77751, level='free', location='San Jose-Sunnyvale-Santa Clara, CA', method='PUT', page='NextSong', registration=1541016707796.0, sessionId=583, song='Sehr kosmisch', status=200, ts=1542241826796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"', userId='26'),
 Row(artist='The Prodigy', auth='Logged In', firstName='Ryan', gender='M', itemInSession=1, lastName='Smith', length=260.07465, level='free', location='San Jose-Sunnyvale-Santa Clara, CA', method='PUT', page='NextSong', registration=1541016707796.0, sessionId=583, song='The Big Gundown', status=200, ts=1542242481796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"', userId='26'),
 Row(artist='Train', auth='Logged In'

In [8]:
# Print the DataFrame's column names, types and nullability as a tree.
log.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [31]:
# Bring every non-zero `ts` value back to the driver as a list of Row objects.
# NOTE(review): collect() is unbounded, so the whole column ends up in the cell
# output; consider take(n) / show(n) when only a sample is needed.
log.select("ts").filter(log["ts"] != 0).collect()

[Row(ts=1542241826796),
 Row(ts=1542242481796),
 Row(ts=1542242741796),
 Row(ts=1542247071796),
 Row(ts=1542252577796),
 Row(ts=1542253449796),
 Row(ts=1542253460796),
 Row(ts=1542260074796),
 Row(ts=1542260277796),
 Row(ts=1542260935796),
 Row(ts=1542261224796),
 Row(ts=1542261356796),
 Row(ts=1542261662796),
 Row(ts=1542261713796),
 Row(ts=1542262057796),
 Row(ts=1542262233796),
 Row(ts=1542262434796),
 Row(ts=1542262456796),
 Row(ts=1542262679796),
 Row(ts=1542262728796),
 Row(ts=1542262893796),
 Row(ts=1542263158796),
 Row(ts=1542263378796),
 Row(ts=1542265424796),
 Row(ts=1542265716796),
 Row(ts=1542265929796),
 Row(ts=1542266927796),
 Row(ts=1542267115796),
 Row(ts=1542267351796),
 Row(ts=1542267665796),
 Row(ts=1542267815796),
 Row(ts=1542267925796),
 Row(ts=1542268043796),
 Row(ts=1542268164796),
 Row(ts=1542268187796),
 Row(ts=1542268205796),
 Row(ts=1542268264796),
 Row(ts=1542268578796),
 Row(ts=1542274783796),
 Row(ts=1542275422796),
 Row(ts=1542275430796),
 Row(ts=15422754

In [31]:
def fromTimeStamp(x):
    """Convert an epoch timestamp given in milliseconds to a ``datetime``.

    ``x`` is divided by 1000 to obtain seconds and handed to
    ``datetime.fromtimestamp``. Because no ``tz`` argument is passed, the
    result is a naive datetime expressed in the local timezone of the machine
    that executes the call.
    """
    epoch_seconds = x / 1000
    return datetime.fromtimestamp(epoch_seconds)
# UDF extracting the hour of day from a millisecond epoch timestamp.
# `ts` is nullable in the log schema, so a null is passed through as null instead
# of letting `None / 1000` raise inside the Python worker and abort the job.
# No return type is given, so Spark applies udf's default (StringType): the hour
# comes back as a string such as '7' -- cast it before sorting or comparing numerically.
get_hour = udf(lambda x: None if x is None else fromTimeStamp(x).hour)

In [32]:
# Attach an "hour" column computed from each event's `ts` by the get_hour UDF.
log = log.withColumn("hour", get_hour(log["ts"]))

In [33]:
# Display the first row of the DataFrame (a single Row, or None if it is empty).
log.first()

Row(artist='Harmonia', auth='Logged In', firstName='Ryan', gender='M', itemInSession=0, lastName='Smith', length=655.77751, level='free', location='San Jose-Sunnyvale-Santa Clara, CA', method='PUT', page='NextSong', registration=1541016707796.0, sessionId=583, song='Sehr kosmisch', status=200, ts=1542241826796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"', userId='26', hour='7')