**Name**: Melvin Vellera  
**Analytic Goal**: Predict genre of a song  
**Machine Learning Algorithms**: Random Forest, Logistic Regression  
**Cluster**: 3-node i3.xlarge cluster running Databricks Runtime 10.3 (includes Apache Spark 3.2.1, Scala 2.12)

In [0]:
from time import time
# Record the start timestamp so the total notebook run time can be printed in the last cells
start = time()

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf

In [0]:
def indexStringColumns(train, test, cols):
    """Replace string columns with numeric indexes learned from the training data.

    For every name in ``cols`` a ``StringIndexer`` is fitted on ``train`` only,
    and the fitted model is then applied to both ``train`` and ``test`` so the
    two frames share one mapping. The indexed column takes over the name of
    the column it replaces.

    Args:
        train: DataFrame the indexers are fitted on.
        test: DataFrame transformed with the indexers fitted on ``train``.
        cols: Names of the columns to index.

    Returns:
        A ``(train, test)`` tuple of the transformed DataFrames.
    """
    for name in cols:
        indexed_name = name + "-num"
        indexer_model = StringIndexer(inputCol=name, outputCol=indexed_name).fit(train)
        # Swap the raw column for its indexed version, keeping the original name
        train, test = (
            indexer_model.transform(frame).drop(name).withColumnRenamed(indexed_name, name)
            for frame in (train, test)
        )
    return train, test

def oneHotEncodeColumns(train, test, cols):
    """One-hot encode already-indexed columns using encoders fitted on ``train``.

    For every name in ``cols`` a ``OneHotEncoder`` (with ``dropLast=False``) is
    fitted on ``train`` and applied to both frames. The encoded vector column
    takes over the name of the column it replaces.

    Args:
        train: DataFrame the encoders are fitted on.
        test: DataFrame transformed with the encoders fitted on ``train``.
        cols: Names of the columns to encode.

    Returns:
        A ``(train, test)`` tuple of the transformed DataFrames.
    """
    def _encode(frame, model, source, encoded):
        # Add the encoded column, then let it take the source column's name
        return model.transform(frame).drop(source).withColumnRenamed(encoded, source)

    for source in cols:
        encoded = source + "-onehot"
        encoder = OneHotEncoder(inputCol=source, outputCol=encoded, dropLast=False)
        fitted = encoder.fit(train)
        train = _encode(train, fitted, source, encoded)
        test = _encode(test, fitted, source, encoded)
    return train, test

In [0]:
# Build (or reuse) the Spark session used throughout the notebook.
# `spark.jars.packages` takes ONE comma-separated list of Maven coordinates.
# Setting the same key twice keeps only the last value, which previously
# dropped the MongoDB connector from the session configuration.
spark_packages = ",".join([
    'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1',
    'org.apache.hadoop:hadoop-aws:3.3.1',
])
spark = (
    SparkSession.builder
    .config('spark.jars.packages', spark_packages)
    .config("spark.network.timeout", "36000000s")
    .config("spark.executor.heartbeatInterval", "3600s")
    .config("spark.dynamicAllocation.enabled", "false")
    .appName("OMS")
    .getOrCreate()
)


In [0]:
# Specify MongoDB configuration for pulling data
import os
from urllib.parse import quote_plus

database = 'oms'
collection = 'song_genre'
# Credentials can be supplied through the environment so that secrets do not
# have to live in the notebook; the defaults keep the previous behaviour.
# NOTE(review): prefer rotating this credential and removing the hard-coded fallback.
user_name = os.environ.get('OMS_MONGO_USER', 'oms')
password = os.environ.get('OMS_MONGO_PASSWORD', 'oms')
address = 'oms-cluster.0navm.mongodb.net'
# Percent-encode the user info: characters such as '@', ':' or '/' in a
# username or password would otherwise corrupt the connection URI.
connection_string = (
    f"mongodb+srv://{quote_plus(user_name)}:{quote_plus(password)}"
    f"@{address}/{database}.{collection}"
)

#### Import data from MongoDB

In [0]:
# Load the song/genre collection from MongoDB through the "mongo" data source
mongo_reader = spark.read.format("mongo")
mongo_reader = mongo_reader.option("uri", connection_string)
df = mongo_reader.load()

In [0]:
# View data: print the first 5 rows of the loaded DataFrame
df.show(5)

#### Preprocess data to make it suitable for modeling

In [0]:
# Keep only the modelling columns (ids and columns with missing values are left out)
selected_columns = [
    "year", "danceability", "duration", "energy", "key", "loudness", "mode",
    "start_of_fade_out", "tempo", "time_signature", "lyrics_embedding", "genre",
]
filtered_df = df.select(*selected_columns).cache()
# count() is an action, so it materializes the cache and shows the row count
filtered_df.count()

In [0]:
# Get summary of the selected columns (count, mean, stddev, min, quartiles, max)
# and render it as a table
filtered_df.summary().display()

summary,year,danceability,duration,energy,key,loudness,mode,start_of_fade_out,tempo,time_signature,genre
count,116758,116758.0,116758.0,116758.0,116758.0,116758.0,116758.0,116758.0,116758.0,116758.0,116758
mean,,0.0,238.1666238087324,0.0,5.282490279038695,-8.37901927919287,0.6832936501139109,228.96376347658523,126.72507448740244,3.6539252128333817,
stddev,,0.0,90.43637062858336,0.0,3.5637849128748544,4.096195070708321,0.4651938211965309,88.58162056559385,34.507761103844594,1.126713853940391,
min,1920s,0.0,0.522,0.0,0.0,-47.441,0.0,0.522,0.0,0.0,Avant Garde
25%,,0.0,189.962,0.0,2.0,-10.578,0.0,181.841,100.007,4.0,
50%,,0.0,228.98893,0.0,5.0,-7.391,1.0,219.469,123.323,4.0,
75%,,0.0,272.3522,0.0,9.0,-5.368,1.0,261.851,148.763,4.0,
max,2010s,0.0,3024.66567,0.0,11.0,2.865,1.0,3020.301,280.955,7.0,Vocal


We can see from the above summary that the danceability and energy columns each contain only a single value (they are constant columns), so they carry no information for the model.

In [0]:
# Drop the columns that hold a single constant value
constant_columns = ("danceability", "energy")
filtered_df = filtered_df.drop(*constant_columns).cache()
# Materialize the cache for the reduced frame and show its row count
filtered_df.count()

In [0]:
# Check if any null values are present in any column.
# All per-column null counts are computed in one aggregation (a single Spark
# job) instead of launching a separate filter + count job for every column.
# The result is a dict mapping column name -> number of null rows.
from pyspark.sql import functions as F

filtered_df.select(
    # when() yields NULL for non-matching rows and count() skips NULLs,
    # so each aggregate counts exactly the rows where the column is null
    [F.count(F.when(F.col(c).isNull(), 1)).alias(c) for c in filtered_df.columns]
).first().asDict()

In [0]:
# Collect the distinct genres; their number is the number of output classes
genre_column = filtered_df.select('genre')
unique_genre = genre_column.distinct()
n_output_classes = unique_genre.count()
print(n_output_classes)
unique_genre.show()

#### Split data into train and test

In [0]:
# 80/20 random split into train and test, with a fixed seed
train_test_df = filtered_df.randomSplit([0.8, 0.2], seed=1)
train_df, test_df = train_test_df
# Mark both splits for caching, then materialize them by printing their sizes
for split_df in (train_df, test_df):
    split_df.cache()
for split_df in (train_df, test_df):
    print(split_df.count())

#### Convert categorical columns to indexes

In [0]:
# Convert categorical columns to indexes.
# Both the "year" feature and the "genre" label are string columns; the
# indexers are fitted on the train split and applied to both splits.
categorical_cols = ["year", "genre"]
train_df_numeric, test_df_numeric = indexStringColumns(train_df, test_df, categorical_cols)

In [0]:
# Inspect the indexed training data (year and genre are now numeric indexes)
train_df_numeric.display()

duration,key,loudness,mode,start_of_fade_out,tempo,time_signature,lyrics_embedding,year,genre
277.15873,1,-16.184,1,273.908,107.924,4,"List(-0.031483667242415074, -0.10618563572670653, 0.5031864509825975, -1.5874433166089472, 0.7748701095651014)",9.0,12.0
183.35302,9,-15.101,1,173.935,97.915,4,"List(-0.11660222896747291, 0.03755403051525354, 0.21827486750483513, -1.950095944352448, 0.7970992401689291)",7.0,12.0
183.48363,8,-15.415,1,175.177,84.918,4,"List(-0.5511955075175026, -0.026105823945634217, 0.15994950634788493, -2.8463053813831385, 0.7133604537657633)",7.0,12.0
184.73751,10,-14.749,0,179.241,99.602,4,"List(-0.9219694815482944, -0.13360975734516978, 0.09295503515005112, -1.179882988223806, 0.6906660223901272)",7.0,11.0
196.44036,7,-15.593,1,189.608,153.755,3,"List(-0.6670095400995243, 0.11506408996538701, 0.5347637996172249, -2.3407715423431217, 0.8673367663365552)",7.0,3.0
210.75546,8,-15.274,1,205.694,94.662,5,"List(-0.819624266526194, 0.13409620176138576, -0.5517601467929292, -2.367925111946597, 1.1398944478511492)",7.0,8.0
245.44608,9,-21.15,0,239.543,93.092,4,"List(-0.4799413123495273, 0.644405758270138, -0.3276586180603182, -2.585679351573659, 0.919800955620972)",7.0,12.0
250.3571,10,-12.886,1,243.688,70.407,5,"List(0.41723413544814814, 0.5724066215122274, 0.8571092899283165, -2.908158897130929, 1.1910362219061759)",7.0,8.0
314.43546,10,-23.093,0,291.172,75.182,1,"List(-0.9050677694751234, 0.8024701687647424, -0.17525894834139083, -2.67306925962537, 0.8380430637527321)",7.0,12.0
118.30812,2,-18.034,1,113.093,79.765,1,"List(-0.10264137551098704, 0.8392042293920216, 0.06756274577185332, -3.720788072570261, 1.6302583571299014)",8.0,8.0


#### One Hot Encoding of categorical columns

In [0]:
# Get one-hot encoding of the categorical columns - except genre, which is the
# dependent variable and keeps its numeric index.
# Rebinding to a filtered copy (instead of list.remove) keeps this cell safe to
# re-run: list.remove raises ValueError once 'genre' has already been removed.
categorical_cols = [c for c in categorical_cols if c != 'genre']
train_df_ohe, test_df_ohe = oneHotEncodeColumns(train_df_numeric, test_df_numeric, categorical_cols)

In [0]:
# Inspect the one-hot encoded test data (year is now a sparse vector column)
test_df_ohe.display()

duration,key,loudness,mode,start_of_fade_out,tempo,time_signature,lyrics_embedding,genre,year
194.11546,0,-12.99,1,188.801,160.108,1,"List(0.06951100105858024, 0.918235227561501, 0.3457141685354359, -2.4571679100048427, 0.32572315203792906)",12.0,"Map(vectorType -> sparse, length -> 10, indices -> List(7), values -> List(1.0))"
148.34893,7,-15.937,1,143.343,86.701,4,"List(-0.10093192497098988, 0.5270097218792547, -0.20730323425748132, -2.6212513351474294, 0.8508528613908724)",3.0,"Map(vectorType -> sparse, length -> 10, indices -> List(8), values -> List(1.0))"
153.28608,9,-19.7,1,149.722,103.632,1,"List(-0.16473731404495617, 0.44556468488216133, 0.1399134922595251, -3.0198523972242595, 1.1343182717405613)",12.0,"Map(vectorType -> sparse, length -> 10, indices -> List(8), values -> List(1.0))"
158.64118,10,-20.742,1,148.706,166.442,4,"List(-1.095739191377544, -0.33569403283332505, 0.5700708655295549, -1.514370242771865, 0.18855515158631733)",8.0,"Map(vectorType -> sparse, length -> 10, indices -> List(8), values -> List(1.0))"
162.58567,6,-14.574,1,155.91,115.29,4,"List(-0.07279156959490983, 0.6489708108724807, -1.155317559720963, -2.6320624480323724, 0.8732775573116383)",12.0,"Map(vectorType -> sparse, length -> 10, indices -> List(8), values -> List(1.0))"
110.86322,10,-8.991,1,107.462,180.031,4,"List(-0.14080675253971367, -0.058336252980587776, -0.33362146721205166, -2.1315276742349134, 0.7512298199429847)",3.0,"Map(vectorType -> sparse, length -> 10, indices -> List(6), values -> List(1.0))"
118.282,10,-11.087,0,111.868,101.252,4,"List(-0.15470334921439408, -0.06986111088048833, -0.13852788872926342, -1.5099580670350612, 0.615566268291792)",3.0,"Map(vectorType -> sparse, length -> 10, indices -> List(6), values -> List(1.0))"
119.09179,8,-12.564,0,113.847,92.749,4,"List(-0.04737658899378132, 0.5871402372245316, -0.3557645679191426, -2.3197033848437303, 1.3889184022271954)",8.0,"Map(vectorType -> sparse, length -> 10, indices -> List(6), values -> List(1.0))"
128.49587,7,-11.757,1,123.855,87.996,4,"List(0.13386038961862004, 0.63374834393026, 0.12477801731829323, -2.7262932199067365, 0.834827324903723)",3.0,"Map(vectorType -> sparse, length -> 10, indices -> List(6), values -> List(1.0))"
130.95138,6,-8.865,0,126.973,85.738,4,"List(-0.25718177392220887, 0.3255126940939814, -0.3637891166701252, -2.8861798196962205, 0.8777768702826605)",4.0,"Map(vectorType -> sparse, length -> 10, indices -> List(6), values -> List(1.0))"


#### VectorAssembler

In [0]:
# VectorAssembler cannot consume a plain array column, so turn the lyrics
# embedding array into a dense ML vector and drop the original array column
list_to_vector_udf = udf(lambda embedding: Vectors.dense(embedding), VectorUDT())
train_df_ohe, test_df_ohe = (
    frame.withColumn("lyrics_vector", list_to_vector_udf("lyrics_embedding")).drop('lyrics_embedding')
    for frame in (train_df_ohe, test_df_ohe)
)

In [0]:
# Use VectorAssembler to prepare data for the model.
# Every column except the label column ('genre') becomes part of the feature vector.
input_cols = list(train_df_ohe.columns)
input_cols.remove('genre')
va = VectorAssembler(inputCols=input_cols, outputCol="features")

# Assemble the features and expose the label under the default name "label"
train = va.transform(train_df_ohe).withColumnRenamed("genre", "label").select("features", "label")
test = va.transform(test_df_ohe).withColumnRenamed("genre", "label").select("features", "label")

# Cache the train and test data; the counts materialize the caches
for assembled in (train, test):
    assembled.cache()
for assembled in (train, test):
    print(assembled.count())

#### Fit Random Forest Model

Cross-validation takes around 2 minutes to run. The following 2 code cells can be commented out to skip it.

In [0]:
# 3-fold cross-validation over a 3x3 grid of Random Forest hyperparameters
rf = RandomForestClassifier()
mce = MulticlassClassificationEvaluator()

grid_builder = ParamGridBuilder()
grid_builder.addGrid(rf.maxDepth, [3, 5, 7])
grid_builder.addGrid(rf.minInstancesPerNode, [1, 3, 5])
paramGrid = grid_builder.build()

cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=mce,
    numFolds=3,
)
cvmodel = cv.fit(train)

In [0]:
# Tabulate the average cross-validation metric next to each hyperparameter combination
import pandas as pd

params = [
    {param.name: value for param, value in param_map.items()}
    for param_map in cvmodel.getEstimatorParamMaps()
]
result_rows = []
for ps, metric in zip(params, cvmodel.avgMetrics):
    # One row per grid point: the metric first, then its parameter values
    result_rows.append({cvmodel.getEvaluator().getMetricName(): metric, **ps})
pd.DataFrame.from_dict(result_rows)

Unnamed: 0,f1,maxDepth,minInstancesPerNode
0,0.645683,3,1
1,0.645683,3,3
2,0.645683,3,5
3,0.64964,5,1
4,0.649566,5,3
5,0.649764,5,5
6,0.650723,7,1
7,0.651053,7,3
8,0.650976,7,5


We can choose maxDepth=7 and minInstancesPerNode=3 as the parameters for the Random Forest model based on the cross-validation scores.

In [0]:
# Build a Random Forest Classifier to predict the probability of a song belonging to a certain genre.
# The seed is fixed so the fit is reproducible and uses the same seed (0) as the
# weighted Random Forest trained later in this notebook; without it the two
# models would differ in random state as well as in class weighting.
rf = RandomForestClassifier(maxDepth=7, minInstancesPerNode=3, seed=0)
rfmodel = rf.fit(train)

#### Test Set Evaluation

In [0]:
# Score the held-out test set with the fitted Random Forest
rfpredicts = rfmodel.transform(test)

# The evaluator is used with its default metric first, reported here as F1
metrics = MulticlassClassificationEvaluator()
rf_f1 = metrics.evaluate(rfpredicts)
print(f"F1: {rf_f1:.4f}")

# Switch the same evaluator over to accuracy
metrics.setMetricName("accuracy")
rf_accuracy = metrics.evaluate(rfpredicts)
print(f"Accuracy: {rf_accuracy:.4f}")

#### Logistic Regression Model

In [0]:
# Fit a regularized logistic regression (with intercept) on the same training data
lr = LogisticRegression(
    maxIter=1000,
    regParam=0.01,
    fitIntercept=True,
)
lrmodel = lr.fit(train)

In [0]:
# Score the held-out test set with the fitted Logistic Regression
lrpredicts = lrmodel.transform(test)

# `metrics` is the evaluator created in the Random Forest evaluation cell; it was
# left on "accuracy", so reset it to F1 before reporting
metrics.setMetricName("f1")
lr_f1 = metrics.evaluate(lrpredicts)
print(f"F1: {lr_f1:.4f}")

# Then report accuracy with the same evaluator
metrics.setMetricName("accuracy")
lr_accuracy = metrics.evaluate(lrpredicts)
print(f"Accuracy: {lr_accuracy:.4f}")

Logistic Regression performed about as well as the Random Forest model. We could choose either model.

#### Try class weights for Random Forest model

In [0]:
# Try class weights: each label gets weight log(total rows / rows with that label),
# so rarer genres receive larger weights
weight_df = train.groupby('label').count().cache()
train_len = train.count()
n_classes = weight_df.count()
weight_df = weight_df.withColumn('weights', log(train_len / col('count'))).drop('count')

# Attach the per-label weight to every row of both splits
train_w, test_w = (frame.join(weight_df, 'label') for frame in (train, test))

# Cache the weighted frames and materialize them by printing their sizes
for weighted in (train_w, test_w):
    weighted.cache()
for weighted in (train_w, test_w):
    print(weighted.count())

In [0]:
# Train the Random Forest again, this time weighting rows by the 'weights' column
rf = RandomForestClassifier(
    featuresCol='features',
    labelCol='label',
    weightCol='weights',
    maxDepth=7,
    minInstancesPerNode=3,
    seed=0,
)
rfmodel = rf.fit(train_w)

# Score the weighted test frame
rfpredicts = rfmodel.transform(test_w)

# Fresh evaluator: default metric first (reported as F1), then accuracy
metrics = MulticlassClassificationEvaluator()
weighted_f1 = metrics.evaluate(rfpredicts)
print(f"F1: {weighted_f1:.4f}")
metrics.setMetricName("accuracy")
weighted_accuracy = metrics.evaluate(rfpredicts)
print(f"Accuracy: {weighted_accuracy:.4f}")

Unfortunately, the weighted RF model did not improve on the previous model. Hence, we can choose the original unweighted RF model as the final model.

In [0]:
# Report the elapsed time since `start` was recorded in the first cell
print(f'Time taken to run: {time() - start:.2f} seconds')

In [0]:
# Shut down the Spark session now that the notebook is finished
spark.stop()