# 1.1 Imports

In [1]:
# import libraries

import numpy as np
import pandas as pd

from pyspark.sql import SparkSession, Window
from pyspark import SparkFiles
from pyspark.sql.functions import avg, col, concat, count, desc, \
asc, explode, lit, min, max, split, stddev, udf, isnan, when, rank, \
log, sqrt, cbrt, exp
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.types import IntegerType

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, \
LogisticRegressionModel, RandomForestClassifier, \
RandomForestClassificationModel, GBTClassifier, \
GBTClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

from pyspark.ml.feature import CountVectorizer, IDF, Normalizer, \
PCA, RegexTokenizer, Tokenizer, StandardScaler, StopWordsRemover, \
StringIndexer, VectorAssembler, MaxAbsScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.clustering import KMeans
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [2]:
# create parksession

spark = SparkSession.builder.master('local') \
        .appName('Sparkify').getOrCreate()

21/07/15 16:40:58 WARN Utils: Your hostname, Yats-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.192.139.70 instead (on interface en0)
21/07/15 16:40:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/07/15 16:40:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# load data

path = 'mini_sparkify_event_data.json'
df = spark.read.json(path)
df.head()



Row(artist='Martha Tilston', auth='Logged In', firstName='Colin', gender='M', itemInSession=50, lastName='Freeman', length=277.89016, level='paid', location='Bakersfield, CA', method='PUT', page='NextSong', registration=1538173362000, sessionId=29, song='Rockpools', status=200, ts=1538352117000, userAgent='Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0', userId='30')

In [4]:
# check schema
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



# 1.2 Missing Values

# 1.2.1 Observations

In [5]:
# total number of rows
print(f'total number of rows: {df.count()}')

total number of rows: 286500


In [8]:
# number of missing values

for c in df.columns:
    df.select(
        [count(when(df[c].isNull(),True))]
    ).show()

+-----------------------------------------------+
|count(CASE WHEN (artist IS NULL) THEN true END)|
+-----------------------------------------------+
|                                          58392|
+-----------------------------------------------+

+---------------------------------------------+
|count(CASE WHEN (auth IS NULL) THEN true END)|
+---------------------------------------------+
|                                            0|
+---------------------------------------------+

+--------------------------------------------------+
|count(CASE WHEN (firstName IS NULL) THEN true END)|
+--------------------------------------------------+
|                                              8346|
+--------------------------------------------------+

+-----------------------------------------------+
|count(CASE WHEN (gender IS NULL) THEN true END)|
+-----------------------------------------------+
|                                           8346|
+-----------------------------------------

# 1.2.2 Missing Values in userId and sessionId

In [9]:
# from above findings, 0 rows in userId and sessionId is NULL
# so we check for samples

df.select(['userId']).drop_duplicates().orderBy(df['userId']).show(10)

+------+
|userId|
+------+
|      |
|    10|
|   100|
|100001|
|100002|
|100003|
|100004|
|100005|
|100006|
|100007|
+------+
only showing top 10 rows



In [10]:
# the first row is an empty string

In [11]:
# create a temporary view for running SQL queries
df.createOrReplaceTempView('df_table')

In [12]:
# show the page events of the user with empty string userId

spark.sql(
    '''
    SELECT DISTINCT page
    FROM df_table
    WHERE userId == ''
    '''
).show()

+-------------------+
|               page|
+-------------------+
|               Home|
|              About|
|Submit Registration|
|              Login|
|           Register|
|               Help|
|              Error|
+-------------------+



In [None]:
# END