# Installing required packages

In [None]:
# Installing required packages
!pip install pyspark
!pip install findspark
!pip install pandas

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=68ad019f004fe75124b296887dd6b5fb924a82a41901625961dff4da22f66706
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


# Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from matplotlib import pyplot as plt

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score

# Reading the Dataset

In [None]:
# Load the music-streaming dataset from the CSV file in the working directory.
csv_path = 'music_streaming.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first three rows of the raw dataset.
df.iloc[:3]

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Genre
0,Bruno Mars,That's What I Like (feat. Gucci Mane),60.0,0.854,0.564,1.0,-4.964,1,0.0485,0.0171,,0.0849,0.899,134.071,234596.0,4,5
1,Boston,Hitch a Ride,54.0,0.382,0.814,3.0,-7.23,1,0.0406,0.0011,0.00401,0.101,0.569,116.454,251733.0,4,10
2,The Raincoats,No Side to Fall In,35.0,0.434,0.614,6.0,-8.334,1,0.0525,0.486,0.000196,0.394,0.787,147.681,109667.0,4,6


# Data Preprocessing

In [None]:
# Dimensions of the loaded dataset as a (rows, columns) tuple.
df.shape

(15517, 17)

In [None]:
# Print a summary of the dataset: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15517 entries, 0 to 15516
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Artist Name         15517 non-null  object 
 1   Track Name          15517 non-null  object 
 2   Popularity          15123 non-null  float64
 3   danceability        15517 non-null  float64
 4   energy              15517 non-null  float64
 5   key                 13774 non-null  float64
 6   loudness            15517 non-null  float64
 7   mode                15517 non-null  int64  
 8   speechiness         15517 non-null  float64
 9   acousticness        15517 non-null  float64
 10  instrumentalness    11930 non-null  float64
 11  liveness            15517 non-null  float64
 12  valence             15517 non-null  float64
 13  tempo               15517 non-null  float64
 14  duration_in min/ms  15517 non-null  float64
 15  time_signature      15517 non-null  int64  
 16  Genr

In [None]:
# Count the missing (NaN) values in every column.
df.isna().sum()

Artist Name              0
Track Name               0
Popularity             394
danceability             0
energy                   0
key                   1743
loudness                 0
mode                     0
speechiness              0
acousticness             0
instrumentalness      3587
liveness                 0
valence                  0
tempo                    0
duration_in min/ms       0
time_signature           0
Genre                    0
dtype: int64

# Check for null values

In [None]:
# Look at a few rows whose Popularity is missing to confirm they really hold "NaN".
popularity_missing = df['Popularity'].isna()
df.loc[popularity_missing].head(3)

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Genre
21,"IndianRaga, Akshay Anantapadmanabhan, Madhu Iy...",Swagatham Krishna - Mohanam - Adi,,0.548,0.711,10.0,-8.44,1,0.107,0.542,9.6e-05,0.355,0.618,119.675,4.015633,3,0
36,Lilly Wood and The Prick,A Song,,0.618,0.438,11.0,-7.495,1,0.0762,0.407,,0.5,0.651,155.014,209307.0,4,6
43,How Lucky,"Kurt Vile, John Prine",,0.523,0.441,5.0,-9.928,1,0.0311,0.465,0.00268,0.365,0.49,171.246,3.385117,4,0


In [None]:
# Remove the 'instrumentalness' column entirely.
# It has by far the most missing values of all columns, so dropping the affected
# rows instead would throw away too much of the dataset.
# Ignoring the column is therefore the simplest option.
df = df.drop(columns=['instrumentalness'])

In [None]:
# Keep only the rows that have a value in the "key" column.
# The column is not considered very important and only a small share of rows lacks it.
df = df.loc[df['key'].notna()]

In [None]:
# Popularity is an important column, so its missing values are imputed instead of dropped.
# The plan is to fill each gap with the mean Popularity of the same artist's tracks,
# so first count how many distinct artists the dataset contains.
len(df['Artist Name'].dropna().unique())

7600

In [None]:
# Multivariate imputation: fill a missing Popularity with the mean Popularity
# of the same artist's tracks.
# transform('mean') computes every per-artist mean in one vectorised pass and
# broadcasts it back to the original rows (instead of calling a Python lambda per artist).
artist_mean_popularity = df.groupby('Artist Name')['Popularity'].transform('mean')

# assign() builds a new frame, so nothing is written into a frame derived from a filter.
# Rows whose artist has no known Popularity at all keep NaN (their group mean is NaN).
df = df.assign(Popularity=df['Popularity'].fillna(artist_mean_popularity))

In [None]:
# Count the Popularity values that are still missing after the per-artist imputation.
# A value stays NaN when none of that artist's tracks has a known Popularity,
# because the mean of the group is then NaN as well.
# These remaining rows cannot be imputed this way, so they are dropped next.
df['Popularity'].isna().sum()

147

In [None]:
# Keep only the rows whose Popularity is known; the rest could not be imputed.
df = df.loc[df['Popularity'].notna()]

In [None]:
# Verify that no column contains a missing value any more.
df.isna().sum()

Artist Name           0
Track Name            0
Popularity            0
danceability          0
energy                0
key                   0
loudness              0
mode                  0
speechiness           0
acousticness          0
liveness              0
valence               0
tempo                 0
duration_in min/ms    0
time_signature        0
Genre                 0
dtype: int64

In [None]:
# Dimensions after the null handling: we still have a lot of rows.
df.shape

(13627, 16)

# Check for duplicated rows

In [None]:
# Flag every row that is an exact repeat of an earlier row, then report how many there are.
is_duplicate = df.duplicated()
duplicate_rows = df.loc[is_duplicate]
print("Number of duplicate rows:", duplicate_rows.shape[0])

Number of duplicate rows: 1


In [None]:
# Keep the first occurrence of every fully identical row and discard the repeats,
# then show the resulting (rows, columns).
df = df.loc[~df.duplicated()]
df.shape

(13626, 16)

# Feature Engineering

In [None]:
# Convert the duration from min/ms into minutes only.
# The 'duration_in min/ms' column mixes two units: most rows hold milliseconds
# (e.g. 234596.0) while others already hold minutes (e.g. 4.015633). Dividing every
# row by 60000 would shrink the minute values to almost zero, so only the
# millisecond values are converted.
MS_PER_MINUTE = 60000
# NOTE(review): values below this cutoff are assumed to already be minutes
# (a track of < 60 ms is implausible) - TODO confirm against the data source.
MINUTES_CUTOFF = 60

raw_duration = df['duration_in min/ms']
# Series.where keeps the value where the condition holds and uses the converted value elsewhere.
duration_min = raw_duration.where(raw_duration < MINUTES_CUTOFF, raw_duration / MS_PER_MINUTE)

# assign() returns a new frame, which avoids the SettingWithCopyWarning raised by
# writing a new column into a frame derived from earlier filtering.
df = df.assign(**{'duration in min': duration_min}).drop(columns='duration_in min/ms')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['duration in min'] = df['duration_in min/ms'] / 60000


In [None]:
# Preview the first two rows to check the new 'duration in min' column.
df.iloc[:2]

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,liveness,valence,tempo,time_signature,Genre,duration in min
0,Bruno Mars,That's What I Like (feat. Gucci Mane),60.0,0.854,0.564,1.0,-4.964,1,0.0485,0.0171,0.0849,0.899,134.071,4,5,3.909933
1,Boston,Hitch a Ride,54.0,0.382,0.814,3.0,-7.23,1,0.0406,0.0011,0.101,0.569,116.454,4,10,4.19555


# Spark Set Up

In [None]:
# Creating a spark session.
# builder.getOrCreate() returns the already-running session when there is one,
# so this cell can be re-run without failing (constructing SparkContext() directly
# raises an error when a context is already active in the kernel).
# The placeholder option "spark.some.config.option" was removed; it configured nothing.
spark = (
    SparkSession.builder
    .appName("Music Genre Dataset")
    .getOrCreate()
)

# The spark context, taken from the session instead of being constructed separately.
sc = spark.sparkContext

In [None]:
# Display the SparkSession object created above as the cell output.
spark

## Creating the Spark Dataframe (sdf) and showing some general info

In [None]:
# Build the Spark DataFrame from the cleaned pandas DataFrame.
raw_sdf = spark.createDataFrame(df)

# Replace the spaces in the column names with underscores and apply the new names.
new_columns = list(map(lambda name: name.replace(' ', '_'), raw_sdf.columns))
sdf = raw_sdf.toDF(*new_columns)

In [None]:
# Print the Spark DataFrame schema: column names, types and nullability.
sdf.printSchema()

root
 |-- Artist_Name: string (nullable = true)
 |-- Track_Name: string (nullable = true)
 |-- Popularity: double (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: long (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: long (nullable = true)
 |-- Genre: long (nullable = true)
 |-- duration_in_min: double (nullable = true)



In [None]:
# Print the first two rows of the Spark DataFrame.
sdf.show(n=2)

+-----------+--------------------+----------+------------+------+---+--------+----+-----------+------------+--------+-------+-------+--------------+-----+------------------+
|Artist_Name|          Track_Name|Popularity|danceability|energy|key|loudness|mode|speechiness|acousticness|liveness|valence|  tempo|time_signature|Genre|   duration_in_min|
+-----------+--------------------+----------+------------+------+---+--------+----+-----------+------------+--------+-------+-------+--------------+-----+------------------+
| Bruno Mars|That's What I Lik...|      60.0|       0.854| 0.564|1.0|  -4.964|   1|     0.0485|      0.0171|  0.0849|  0.899|134.071|             4|    5|3.9099333333333335|
|     Boston|        Hitch a Ride|      54.0|       0.382| 0.814|3.0|   -7.23|   1|     0.0406|      0.0011|   0.101|  0.569|116.454|             4|   10|           4.19555|
+-----------+--------------------+----------+------------+------+---+--------+----+-----------+------------+--------+-------+-----

In [None]:
# Number of rows in the Spark DataFrame.
sdf.count()

13626

## Create a Table View

In [None]:
# Register the Spark DataFrame as the temporary view "music" so it can be queried with spark.sql.
sdf.createOrReplaceTempView("music")

# Spark SQL Queries Part


## a) Which genre has the highest average popularity?

In [None]:
# Group by genre, calculate the average popularity and keep the genre with the highest average.
# The query result is stored first and shown afterwards: .show() only prints and
# returns None, so chaining it onto the assignment would store None in the variable.
highest_genre_avg_popularity = spark.sql('select Genre, avg(Popularity) as AVGPopularity from music group by Genre order by AVGPopularity desc limit 1')
highest_genre_avg_popularity.show()



+-----+------------------+
|Genre|     AVGPopularity|
+-----+------------------+
|    4|57.345565749235476|
+-----+------------------+



## b) Display which artists have recorded the most number of songs with a duration of more than 5 minutes

In [None]:
# Count, per artist, the songs longer than 5 minutes and list the artists with the most first.
long_songs_query = 'select Artist_Name, count(*) as songCount from music where duration_in_min > 5 group by Artist_Name order by songCount desc'
spark.sql(long_songs_query).show()

+--------------------+---------+
|         Artist_Name|songCount|
+--------------------+---------+
|           Metallica|       18|
|                TOOL|       11|
|        Led Zeppelin|       11|
|         Arcade Fire|        8|
|           Pearl Jam|        8|
|         Sonic Youth|        6|
|               Kyuss|        6|
|Kenny Wayne Shepherd|        6|
|       Wooden Shjips|        6|
|             Pantera|        6|
|       Joe Bonamassa|        6|
|           Aerosmith|        6|
|      Monster Magnet|        5|
|                  U2|        5|
|               Opeth|        5|
|         Patti Smith|        5|
|        Dire Straits|        5|
|     Bernard Allison|        5|
|                 Yes|        4|
|       The Beta Band|        4|
+--------------------+---------+
only showing top 20 rows



## c) How many songs are included in every Genre?

In [None]:
# Count the songs in every genre, largest genre first.
songs_per_genre_query = 'select Genre, count(*) as numOfSongs from music group by Genre order by numOfSongs desc'
spark.sql(songs_per_genre_query).show()

+-----+----------+
|Genre|numOfSongs|
+-----+----------+
|   10|      3753|
|    6|      1993|
|    9|      1594|
|    8|      1543|
|    1|      1113|
|    5|      1100|
|    2|      1003|
|    0|       485|
|    7|       377|
|    3|       338|
|    4|       327|
+-----+----------+



## d) Which artists dominated the charts?

In [None]:
# Find the artist with the largest number of songs in the dataset.
top_artist_query = 'select  Artist_Name, count(*) as numOfSongs from music group by Artist_Name order by numOfSongs desc limit 1'
spark.sql(top_artist_query).show()


+---------------+----------+
|    Artist_Name|numOfSongs|
+---------------+----------+
|Backstreet Boys|        57|
+---------------+----------+



## e) Recommend at least 5 fun/not-boring songs that can be played at a party, you can use features like energy, danceability etc.. to represent cheerfulness.

In [None]:
# Recommend five party songs: energetic, danceable, cheerful (valence) and not slow.
# LIMIT without ORDER BY returns an arbitrary five of the matching rows, so the
# matches are ranked by their combined energy + danceability + valence score
# (ties broken by track name) to make the recommendation deterministic.
spark.sql("""
    select Track_Name
    from music
    where energy >= 0.7 AND danceability >= 0.7 AND valence >= 0.7 AND tempo >= 100
    ORDER BY (energy + danceability + valence) DESC, Track_Name
    LIMIT 5
""").show()

+--------------------+
|          Track_Name|
+--------------------+
| Since You Been Gone|
|   Shots In The Dark|
|Combo (feat. Mr E...|
|             B Mine?|
|          Get to You|
+--------------------+



# SparkML Part

# Preparing Data for Machine Learning

In [None]:
# Defining the features columns.
# All columns except for the Artist Name, Track Name and Genre are used:
# the Artist Name and Track Name are treated as irrelevant for Machine Learning,
# because we only care about the audio features that describe the songs themselves.
# The Genre is the label we want to predict, so it is not a feature either.
numericCols = ['Popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness','liveness', 'valence', 'tempo','time_signature','duration_in_min']

# Featurization Process (Pipeline): assemble the feature columns into a single vector column.
featurizationPipeline = Pipeline(stages = [VectorAssembler(inputCols = numericCols, outputCol = "feature_vector")])

# Drop a "feature_vector" column left over from a previous run of this cell.
# DataFrame.drop is a no-op when the column is absent, so the first run is unaffected,
# and re-running no longer fails because the output column already exists.
sdf = sdf.drop("feature_vector")

featurizationPipelineModel = featurizationPipeline.fit(sdf)
sdf = featurizationPipelineModel.transform(sdf)

# Split the data into train (80%) and test (20%) sets with a fixed seed.
train, test = sdf.randomSplit([0.8, 0.2], seed = 2018)

# First classification method: **Logistic Regression**

In [None]:
# Configure and fit the logistic regression on the training split.
#   regParam        - regularization strength; helps prevent overfitting.
#   standardization - standardize the features before fitting the model.
lr = LogisticRegression(
    featuresCol='feature_vector',
    labelCol='Genre',
    maxIter=50,
    regParam=0.0001,
    standardization=True,
)
lr_Model = lr.fit(train)

In [None]:
# Make predictions: apply the fitted logistic regression model to the test split.
lr_predictions = lr_Model.transform(test)

In [None]:
# Model Evaluation.
# Spark-side (lazy) views of the true and the predicted labels.
true_labels = lr_predictions.select('Genre')
lr_predict = lr_predictions.select('prediction')

# Collect labels and predictions together in ONE Spark job, so both columns come
# from the same evaluation and are row-aligned (two separate toPandas() calls
# execute the plan twice and rely on both runs returning rows in the same order).
lr_eval_pdf = lr_predictions.select('Genre', 'prediction').toPandas()

# Calculate accuracy.
lr_accuracy = accuracy_score(lr_eval_pdf['Genre'], lr_eval_pdf['prediction'])
print("Logistic Regression Accuracy =", lr_accuracy*100, "%")

Logistic Regression Accuracy = 48.314190440904035 %


# Second classification method: **Decision Trees**

In [None]:
# Configure the decision tree classifier with its default hyper-parameters
# and fit it on the training split.
dt = DecisionTreeClassifier(
    labelCol='Genre',
    featuresCol='feature_vector',
)
dt_Model = dt.fit(train)

In [None]:
# Make predictions: apply the fitted decision tree model to the test split.
dt_predictions = dt_Model.transform(test)

In [None]:
# Model Evaluation.
# Spark-side (lazy) views of the true and the predicted labels.
true_labels = dt_predictions.select('Genre')
dt_predict = dt_predictions.select('prediction')

# Collect labels and predictions together in ONE Spark job, so both columns come
# from the same evaluation and are row-aligned (two separate toPandas() calls
# execute the plan twice and rely on both runs returning rows in the same order).
dt_eval_pdf = dt_predictions.select('Genre', 'prediction').toPandas()

# Calculate accuracy.
dt_accuracy = accuracy_score(dt_eval_pdf['Genre'], dt_eval_pdf['prediction'])
print("Decision Tree Accuracy =", dt_accuracy*100, "%")

Decision Tree Accuracy = 42.645424231196735 %


# Third classification method: **Random Forests**

In [None]:
# Train the model: a random forest of 80 trees with a maximum depth of 7.
# The seed is fixed (same value as the train/test split) so the random sampling
# done by the forest, and therefore the reported accuracy, is reproducible between runs.
rf = RandomForestClassifier(featuresCol = 'feature_vector', labelCol = 'Genre', numTrees=80, maxDepth=7, seed=2018)
rf_Model = rf.fit(train)

In [None]:
# Make predictions: apply the fitted random forest model to the test split.
rf_predictions = rf_Model.transform(test)

In [None]:
# Model Evaluation.
# Spark-side (lazy) views of the true and the predicted labels.
true_labels = rf_predictions.select('Genre')
rf_predict = rf_predictions.select('prediction')

# Collect labels and predictions together in ONE Spark job, so both columns come
# from the same evaluation and are row-aligned (two separate toPandas() calls
# execute the plan twice and rely on both runs returning rows in the same order).
rf_eval_pdf = rf_predictions.select('Genre', 'prediction').toPandas()

# Calculate accuracy.
rf_accuracy = accuracy_score(rf_eval_pdf['Genre'], rf_eval_pdf['prediction'])
print("Random Forest Accuracy =", rf_accuracy*100, "%")

Random Forest Accuracy = 49.16635791033716 %


In [None]:
# The best model is the Random Forest model, although all of them perform rather poorly.
# When tuning some of the parameters of each model, the accuracy stays roughly in the range
# 25 - 48. None of the models really excels over the others.