# Goal: Song Popularity Prediction using non-linear models
## Name: Ashwani Rajan
### Machine Learning Algo: Decision Tree and Random Forest

In [0]:
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import SparkSession 
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer

from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Reuse the already-running Spark session when there is one, otherwise
# start a new session.
ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext

# The following cells refer to the session as `spark`. Bind that name here so
# the notebook does not depend on the runtime having pre-defined it.
spark = ss

In [0]:
# Show the active Spark session object as the cell output.
spark

## Reading Data

In [0]:
import os
from urllib.parse import quote_plus

# Connection settings for the MongoDB collection that holds the song data.
# Every value can be overridden with an environment variable so that real
# credentials do not have to be committed with the notebook; the fallbacks
# are the values this notebook has always used.
database = os.environ.get('OMS_MONGO_DATABASE', 'oms')
collection = os.environ.get('OMS_MONGO_COLLECTION', 'song_popularity_data')
user_name = os.environ.get('OMS_MONGO_USER', 'oms')
password = os.environ.get('OMS_MONGO_PASSWORD', 'oms')
address = os.environ.get('OMS_MONGO_ADDRESS', 'oms-cluster.0navm.mongodb.net')

# Percent-encode the credentials: characters such as '@', ':' or '/' in a
# user name or password would otherwise be parsed as URI delimiters.
connection_string = (
    f"mongodb+srv://{quote_plus(user_name)}:{quote_plus(password)}"
    f"@{address}/{database}.{collection}"
)

In [0]:
# Read the configured collection into a Spark DataFrame via the "mongo"
# data source, pointing it at the URI built in the previous cell.
df_oms = (
    spark.read
    .format("mongo")
    .option("uri", connection_string)
    .load()
)

In [0]:
%%time
# Print the first 4 rows as a quick sanity check of the loaded data;
# the cell magic above reports how long the cell takes.
df_oms.show(4)

CPU times: user 28.2 ms, sys: 12.8 ms, total: 41 ms
Wall time: 11.5 s

### Train Test Split
Use all rows with a non-null "song_hotttnesss" as the training data, and make predictions for the rows where "song_hotttnesss" is null.

In [0]:
# Split on the target column: rows that have no popularity score form the
# set to predict on, rows that have one are used for training.
hotness = df_oms['song_hotttnesss']
df_test = df_oms.where(hotness.isNull())
df_train = df_oms.where(hotness.isNotNull())

In [0]:
%%time
df_test.count()
df_train.count()

CPU times: user 30 ms, sys: 11.2 ms, total: 41.2 ms
Wall time: 11.7 s

In [0]:
# Keep only the columns used for modelling: eight input columns plus the
# target column `song_hotttnesss`. Both frames get the same projection.
selected_columns = [
    'duration', 'start_of_fade_out', 'time_signature', 'tempo',
    'key', 'mode', 'year', 'loudness', 'song_hotttnesss',
]
df_train = df_train.select(*selected_columns)
df_test = df_test.select(*selected_columns)

# Mark both projections for caching since they are reused by later cells.
df_test.cache()
df_train.cache()

### Train validation split

In [0]:
# Randomly divide the labelled rows with weights 0.8 / 0.2, using a fixed
# seed; the first part is used for training, the second for validation.
data_split = df_train.randomSplit([0.8, 0.2], seed=10)
train_data, val_data = data_split

# Both parts are reused below, so mark them for caching.
train_data.cache()
val_data.cache()

In [0]:
# Render the training partition as a table.
# NOTE(review): `.display()` is not defined in this notebook; presumably it is
# supplied by the notebook runtime — confirm before running elsewhere.
train_data.display()

duration,start_of_fade_out,time_signature,tempo,key,mode,year,loudness,song_hotttnesss
33.41016,33.41,3,94.111,9,0,1960s,-14.766,0.0
44.93016,44.93,1,120.177,2,1,2000s,-16.527,0.0
53.15873,49.319,1,188.972,7,0,1960s,-2.751,0.4191258929591558
53.28934,50.614,4,123.068,0,1,2000s,-10.458,0.3603705827121406
61.67465,61.675,4,124.363,10,0,1960s,-7.409,0.0
61.962,61.962,4,79.293,0,0,2000s,-11.74,0.7118776226158788
63.03302,63.033,4,122.795,10,1,1960s,-15.395,0.450992311636255
74.91873,71.686,3,160.805,3,1,1990s,-23.334,0.5946422445649223
77.19138,77.191,4,73.749,8,1,1960s,-9.435,0.3943948561005826
77.89669,77.897,4,191.936,1,1,2000s,-4.773,0.6266128026956008


## String-Indexing

Only the `year` column needs string indexing.

In [0]:
# Fit the string -> index mapping for `year` on the training partition only.
# handleInvalid="keep" sends values that did not occur in the training
# partition to one extra index instead of failing when the validation or
# prediction data is transformed.
si = StringIndexer(inputCol='year', outputCol="year-num", handleInvalid="keep")
sm = si.fit(train_data)


def _index_year(df):
    """Replace the string `year` column of `df` with its numeric index.

    Applies the fitted indexer `sm`, drops the original string column,
    renames the indexed column back to `year`, and marks the result for
    caching.
    """
    indexed = sm.transform(df).drop('year')
    return indexed.withColumnRenamed("year-num", "year").cache()


train_data_num = _index_year(train_data)
valid_data_num = _index_year(val_data)
df_test_num = _index_year(df_test)

In [0]:
%%time
# Render the indexed training data as a table; `year` now holds the numeric
# index produced by the StringIndexer cell.
train_data_num.display()

duration,start_of_fade_out,time_signature,tempo,key,mode,loudness,song_hotttnesss,year
33.41016,33.41,3,94.111,9,0,-14.766,0.0,0.0
44.93016,44.93,1,120.177,2,1,-16.527,0.0,1.0
53.15873,49.319,1,188.972,7,0,-2.751,0.4191258929591558,0.0
53.28934,50.614,4,123.068,0,1,-10.458,0.3603705827121406,1.0
61.67465,61.675,4,124.363,10,0,-7.409,0.0,0.0
61.962,61.962,4,79.293,0,0,-11.74,0.7118776226158788,1.0
63.03302,63.033,4,122.795,10,1,-15.395,0.450992311636255,0.0
74.91873,71.686,3,160.805,3,1,-23.334,0.5946422445649223,2.0
77.19138,77.191,4,73.749,8,1,-9.435,0.3943948561005826,0.0
77.89669,77.897,4,191.936,1,1,-4.773,0.6266128026956008,1.0


CPU times: user 480 µs, sys: 54 µs, total: 534 µs
Wall time: 620 µs

In [0]:
# Print the number of training rows for each indexed `year` value.
train_data_num.groupBy('year').count().show()

## Vector Assembler

In [0]:
from pyspark.ml.feature import VectorAssembler
# Input columns that are packed, in this order, into the feature vector.
cols = ['duration', 'start_of_fade_out', 'time_signature', 'tempo', 'key', 'mode', 'loudness', 'year']
va = VectorAssembler(inputCols=cols, outputCol="features")


def _to_features_and_label(df):
    """Return `df` reduced to a "features" vector column and a "label" column.

    The vector is assembled from `cols` by `va`; the label is the
    `song_hotttnesss` column under its new name.
    """
    assembled = va.transform(df)
    projected = assembled.select("features", "song_hotttnesss")
    return projected.withColumnRenamed('song_hotttnesss', 'label')


train_data_vec = _to_features_and_label(train_data_num)
valid_data_vec = _to_features_and_label(valid_data_num)
test_data_vec = _to_features_and_label(df_test_num)

In [0]:
%%time
# Render the assembled training data: one "features" vector and one "label"
# value per row.
train_data_vec.display()

features,label
"Map(vectorType -> dense, length -> 8, values -> List(33.41016, 33.41, 3.0, 94.111, 9.0, 0.0, -14.766, 0.0))",0.0
"Map(vectorType -> dense, length -> 8, values -> List(44.93016, 44.93, 1.0, 120.177, 2.0, 1.0, -16.527, 1.0))",0.0
"Map(vectorType -> dense, length -> 8, values -> List(53.15873, 49.319, 1.0, 188.972, 7.0, 0.0, -2.751, 0.0))",0.4191258929591558
"Map(vectorType -> dense, length -> 8, values -> List(53.28934, 50.614, 4.0, 123.068, 0.0, 1.0, -10.458, 1.0))",0.3603705827121406
"Map(vectorType -> dense, length -> 8, values -> List(61.67465, 61.675, 4.0, 124.363, 10.0, 0.0, -7.409, 0.0))",0.0
"Map(vectorType -> dense, length -> 8, values -> List(61.962, 61.962, 4.0, 79.293, 0.0, 0.0, -11.74, 1.0))",0.7118776226158788
"Map(vectorType -> dense, length -> 8, values -> List(63.03302, 63.033, 4.0, 122.795, 10.0, 1.0, -15.395, 0.0))",0.450992311636255
"Map(vectorType -> dense, length -> 8, values -> List(74.91873, 71.686, 3.0, 160.805, 3.0, 1.0, -23.334, 2.0))",0.5946422445649223
"Map(vectorType -> dense, length -> 8, values -> List(77.19138, 77.191, 4.0, 73.749, 8.0, 1.0, -9.435, 0.0))",0.3943948561005826
"Map(vectorType -> dense, length -> 8, values -> List(77.89669, 77.897, 4.0, 191.936, 1.0, 1.0, -4.773, 1.0))",0.6266128026956008


## Decision Tree Regressor with cross-validation

maxDepth: [5,10,15,20,25,30] — increasing the maximum depth allows deeper, i.e. bushier, trees\
maxBins: [16,32,48] — increasing maxBins allows finer-grained splits

Evaluation Metric: RMSE

In [0]:

from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator


# Scores predictions by root-mean-squared error between the "label" and
# "prediction" columns. The same evaluator is reused for later models.
evaluator = RegressionEvaluator().setLabelCol("label")\
                .setPredictionCol("prediction")\
                .setMetricName('rmse')

# Hyper-parameter grid: every combination of tree depth and bin count.
dt = DecisionTreeRegressor()
paramGrid = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [5, 10, 15, 20, 25, 30])
    .addGrid(dt.maxBins, [16, 32, 48])
    .build()
)

# 3-fold cross-validation over the grid. The seed fixes how rows are
# assigned to folds so that repeated runs pick the model from the same
# folds, in line with the seeded split and seeded forest elsewhere in
# this notebook.
cv = CrossValidator(estimator=dt,
                      evaluator=evaluator,
                      numFolds=3,
                      estimatorParamMaps=paramGrid,
                      seed=10)






In [0]:
%%time
# Run the cross-validated grid search on the training vectors. This is the
# expensive step: the grid holds 6 depths x 3 bin counts and is evaluated
# with 3 folds.
cvmodel = cv.fit(train_data_vec)

In [0]:
%%time
dtpredicts = cvmodel.bestModel.transform(valid_data_vec)

print("Best Model Max depth : %s" % cvmodel.bestModel.getMaxDepth())
print("Best Model Max bins : %s" % cvmodel.bestModel.getMaxBins())
print("RMSE : %s" % evaluator.evaluate(dtpredicts))

## Random Forest Model

In [0]:
from pyspark.ml.regression import RandomForestRegressor

In [0]:

%%time
rf = RandomForestRegressor(seed = 2, maxDepth=20)
rf_model = rf.fit(train_data_vec)
print(rf_model.toDebugString)

RF Model time :5.14 minutes

In [0]:
# Score the random forest on the validation vectors with the same RMSE
# evaluator that was used for the decision tree.
rfpredicts = rf_model.transform(valid_data_vec)
rf_rmse = evaluator.evaluate(rfpredicts)
print("RMSE : %s" % rf_rmse)

In [0]:
# Show the number of trees in the fitted forest.
# NOTE(review): this is accessed as an attribute, not called; confirm that
# the installed PySpark version exposes `getNumTrees` as a property on the
# fitted model rather than as a method.
rf_model.getNumTrees

Number of Trees: 20, Max_depth:20