# Goal: Getting the song popularity score using song based features
# Name: Shubham Thakur

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import *

In [0]:
# Set parquet configuration to interpret binary byte array as string
spark = SparkSession.builder.config('spark.sql.parquet.binaryAsString', 'true') \
            .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1')\
            .appName("OMS") \
            .getOrCreate()

# sc = spark.sparkContext
# sc.setLogLevel("ERROR")

In [0]:
spark

# Add configuration for accessing S3

In [0]:
aws_access_key = 'ADD_ACCESS_ID'
aws_secret_key = 'ADD_SECRET_ACCESS_ID'
spark._jsc.hadoopConfiguration().set('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.1')
#spark._jsc.hadoopConfiguration().set('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1')
spark._jsc.hadoopConfiguration().set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
spark._jsc.hadoopConfiguration().set("fs.s3a.access.key", aws_access_key)
spark._jsc.hadoopConfiguration().set("fs.s3a.secret.key", aws_secret_key)

In [0]:
spark._jsc.hadoopConfiguration().set('spark.network.timeout','7200s')
spark._jsc.hadoopConfiguration().set('spark.executor.heartbeatInterval','1200s')

In [0]:
# Read million song dataset
songs_df = spark.read.parquet("s3a://million-song-dataset-16/data")

# Select columns relevant to the genre detection task
selected_columns =['artist_id','artist_latitude','artist_location','artist_longitude','artist_name','danceability','duration','energy','loudness','key','mode','song_id','start_of_fade_out','tempo','time_signature','track_id','year','song_hotttnesss','title']
songs_df2 =songs_df.select(*selected_columns)

# Cache new dataframe

# Check dataframe
songs_df2.display()

artist_id,artist_latitude,artist_location,artist_longitude,artist_name,danceability,duration,energy,loudness,key,mode,song_id,start_of_fade_out,tempo,time_signature,track_id,year,song_hotttnesss,title
AR4C14Z119B86691F7,,,,Gisbert Zu Knyphausen,0.0,307.06893,0.0,-9.716,4,0,SOAQHIM12A8C13CBB0,294.975,130.842,4,TRTBIRS128F42849D2,2008,,Gute Nachrichten
ARNMEN31187B98E773,37.77916,San Francisco,-122.42005,Joyce Cooling,0.0,152.92036,0.0,-17.126,9,0,SOKHLDP12AB0187FE3,142.263,90.745,4,TRTBILX12903CF36F3,1999,0.582918458497712,Out Of A Movie
ARML8X41187B990644,,"Glasgow, Scotland",,Simple Minds,0.0,294.19057,0.0,-13.361,8,1,SOGDUXP12A6D4F3F98,286.581,157.017,4,TRTBIEH128F427C13C,1989,0.6726332362065223,Let It All Come Down
ARAUTIF124549A3D88,,,,Anita Lipnicka I John Porter,0.0,185.15546,0.0,-13.327,3,1,SOGOAHY12A8C138A79,175.305,114.088,4,TRTBIED128F4268E0C,0,0.5040343408054411,Lover Turn Around
AR1D1ES1187FB57228,18.11526,"St. Andrews, Jamaica",-77.27348,Augustus Pablo,0.0,208.97914,0.0,-14.907,9,0,SOHJBHV12AB0183CFC,195.216,140.135,1,TRTBIZX128F932A5E4,1999,0.3458022339653294,Burning Drums
ARM7BUE1187FB5961C,,,,Dada_ Obernik & Harris,0.0,380.99546,0.0,-6.705,7,0,SOLSDSE12AB018288F,353.739,126.969,4,TRTBIPK12903CAC961,0,0.0,Stereo Flo
ARBQH151187B9994FD,,,,Need New Body,0.0,179.09506,0.0,-4.563,10,0,SOVKMED12A8C13C30E,179.095,179.294,4,TRTBISC128F42AB42E,2003,0.3725300393660796,Hot Shot
ARLCKQI1187FB4C111,,,,D'espairsRay,0.0,271.98649,0.0,-5.364,7,1,SOLXVQC12A58A7C14A,255.228,97.378,5,TRTBISR128F92DDC36,2009,0.7480722476363,Lizard
ARNYBIP1187FB3FE37,,,,A Hundred Birds,0.0,348.89098,0.0,-8.813,6,0,SOYBOEE12AC4689E18,331.807,190.865,3,TRTBIJD12903D0EF0D,2008,0.3926041362684404,Jaguar (Main Mix)
ARGIPGX1187FB52D67,,Denmark,,Mike Tramp,0.0,282.56608,0.0,-4.094,1,1,SOKHLXN12AC46878B3,278.721,190.166,4,TRTBIQX12903CED75F,0,,No Tormorrow


In [0]:
songs_df2.count()

Total rows in the dataframe is coming out to be 1,000,000.

# Reading the DataFrame from MongoDB

In [0]:
df.display(5)

_id,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,danceability,duration,energy,key,loudness,mode,song_hotttnesss,song_id,start_of_fade_out,tempo,time_signature,title,track_id,year
List(621bed850fe1903ed3f99097),ARIGNJP1187B9A9751,,,,Paul Westerberg,0.0,191.92118,0.0,11,-4.407,1,,SOKDLSE12AC9072731,187.025,149.912,4,You've Had It With You (Album Version),TRUWJDZ12903CEF5C5,1996
List(621bed850fe1903ed3f99098),ARVUO9S1187B989763,64.8369,"YUKON, ???",-136.79199,Pave The Rocket,0.0,179.61751,0.0,2,-7.353,1,,SODJZJH12AB018D1CB,174.376,111.745,4,Tyro,TRUWJGI12903CD5EF4,2000
List(621bed850fe1903ed3f99099),ARJDZK31187B9A2E44,,Capetown South Africa,,Freshlyground,0.0,212.79302,0.0,7,-14.878,1,0.4301606370184558,SOBFUET12AB0186030,204.759,119.141,4,Crimson Smile,TRUWJGZ128F93205EA,2007
List(621bed850fe1903ed3f9909a),ARHKT7D1187FB4DF8F,,,,Arno Cost,0.0,427.25832,0.0,9,-9.605,0,0.5868032958183274,SOZXEXQ12A6D4F5106,420.798,124.93,4,Magenta,TRUWJHB12903CE1C8A,2005
List(621bed850fe1903ed3f9909b),ARWTCIC11E2835CDD0,,,,Charlie Haden Quartet West,0.0,246.56934,0.0,2,-14.328,0,0.2465991884896518,SOOELMB12A6D4F9889,235.886,175.866,7,Back Home Blues,TRUWJGE128F1498B34,1996
List(621bed850fe1903ed3f9909c),ARYOPH81187B98AEED,40.65507,"Boston, Brooklyn USA",-73.94888,Converge,0.0,468.68853,0.0,10,-5.468,0,,SOEAWCH12A81C238AC,460.579,123.041,4,Tilt,TRUWJUB128F4255387,0
List(621bed850fe1903ed3f9909d),AREMGDO1187FB5C4A2,,,,Scarlett Johansson,0.0,303.46404,0.0,0,-9.384,0,0.7872489195469039,SOORDDG12A8AE461D0,284.694,98.63,3,Town With No Cheer,TRUWJAG128F422B17B,2008
List(621bed850fe1903ed3f9909e),ARIHEHW11F4C841F7B,,,,John Blum,0.0,151.43138,0.0,5,-21.657,1,0.2707759989463275,SOEQELR12A8C13889C,144.823,83.698,3,Dismal Cry,TRUWJWC128F427D92C,0
List(621bed850fe1903ed3f9909f),ARVXXQD1257509E117,,,,La Billo,0.0,185.7824,0.0,3,-22.009,1,,SOSXHVK12AB0184E39,185.782,205.753,1,Que Dira la Gente,TRUWJPT12903CB3B2E,0
List(621bed850fe1903ed3f990a0),ARGR5JB1187B9B5D15,,,,Bombay Dub Orchestra,0.0,137.09016,0.0,4,-26.368,0,0.4994458348006321,SOMJCEP12AAF3B2F4C,127.234,120.134,3,Remembrance,TRUWJFP128F92F91A5,2006


# Data Analysis

**Checking Location based features**

In [0]:
songs_df2.select('artist_location','artist_latitude','artist_longitude' ).describe().display()

summary,artist_location,artist_latitude,artist_longitude
count,1000000,357492.0,357492.0
mean,165.1141304347826,38.99942487376531,-58.37080413099251
stddev,204.17271904035488,15.1963241278978,54.95555544123087
min,,-53.1,-162.4365
max,"�tersund, Sweden",70.69576,178.69096


From the summary, we can see that artist latitude and artist_longitude have large proporation of missing data. Also the artist loaction is empty whenever the latitude and longitude is NA. Thus we decided to remove location based features

Analysis of song based features

In [0]:
song_based_features = [item[0] for item in songs_df2.dtypes if not item[1].startswith('string')] #+ ['_id', 'artist_latitude','artist_longitude']
songs_df2.select(*song_based_features).summary().display()

summary,artist_latitude,artist_longitude,danceability,duration,energy,loudness,key,mode,start_of_fade_out,tempo,time_signature,year,song_hotttnesss
count,357492.0,357492.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,581965.0
mean,38.999424873765136,-58.37080413099294,0.0,249.50075499365008,0.0,-10.12403926,5.321964,0.666408,240.62502357900016,123.88921846999996,3.593947,1030.325652,0.3560510395614871
stddev,15.196324127897798,54.95555544123087,0.0,126.22963612052266,0.0,5.197244740088691,3.601595223535284,0.4714961291936548,124.07992379653606,35.0559812572634,1.2219253063475088,998.745002439739,0.2344409011741647
min,-53.1,-162.4365,0.0,0.31302,0.0,-58.178,0.0,0.0,0.313,0.0,0.0,0.0,0.0
25%,34.07215,-91.52382,0.0,180.71465,0.0,-12.674,2.0,0.0,173.163,97.993,3.0,0.0,0.2150803185092279
50%,40.65507,-77.38023,0.0,228.8322,0.0,-8.964,5.0,1.0,219.742,122.078,4.0,1969.0,0.3775316449767025
75%,47.60356,-2.23218,0.0,289.88036,0.0,-6.387,9.0,1.0,279.673,144.079,4.0,2002.0,0.5319846094668594
max,70.69576,178.69096,0.0,3034.90567,0.0,4.318,11.0,1.0,3030.622,302.3,7.0,2011.0,1.0


from the summary, we can see that the variance of energy and danceability is 0, which suggests that these features won't add any value to our model. We will later  remove id based features from our model while data modeling. Song hotness has around 46% data missing, thus we decided to drop those while creating the training set.

# Feature Engineering

**Rather than particular year, decade generally dictactes the types of songs that are going to be popular**

In [0]:
songs_df2.select('year').groupBy().min().display()

min(year)
0


We see that the min(year) is showns as zero which is not realistic. We then decided to fill those missing value with median of all the availables values

# Missing Values Imputation of Year

In [0]:
def replace_year(x):
    if int(x) < 100:
        return None
    else :
        return x

replace_year_udf = udf(replace_year, IntegerType())
df_yr =songs_df2.select(replace_year_udf('year').alias('year'),'duration', 'key', 'loudness', 'mode', 'song_hotttnesss', 'start_of_fade_out', 'tempo','time_signature', 'title','song_id', 'artist_id','artist_name')

In [0]:
df_yr_imputed = df_yr.na.fill(int(songs_df2.select('year').na.drop().groupBy().agg(percentile_approx('year', 0.5)).first()[0]), ["year"])
df_yr_imputed.cache()

In [0]:
int(songs_df2.select('year').na.drop().groupBy().agg(percentile_approx('year', 0.5)).first()[0])

median of the year is 1969. We will fill this value with all the missing values in the year column

Converting years into decades

In [0]:
def year_decade(x):
    if int(x) < 1930:
        return '1920s'
    elif int(x) < 1940:
        return '1930s'
    elif int(x) < 1950:
        return '1940s'
    elif int(x) < 1960:
        return '1950s'
    elif int(x) < 1970:
        return '1960s'
    elif int(x) < 1980:
        return '1970s'
    elif int(x) < 1990:
        return '1980s'
    elif int(x) < 2000:
        return '1990s'
    elif int(x) < 2010:
        return '2000s'
    elif int(x) < 2020:
        return '2010s'
    else:
        return '2020s'

In [0]:
year_decade_udf = udf(year_decade, StringType())
df_yr_decade =df_yr_imputed.select(year_decade_udf('year').alias('year'),'duration', 'key', 'loudness', 'mode', 'song_hotttnesss', 'start_of_fade_out', 'title','tempo','time_signature', 'song_id', 'artist_id','artist_name'  )

**Writing the data in MongoDB**

In [0]:
database = 'oms'
collection = 'song_popularity_data'
user_name = 'user'
password = 'user'
address = 'oms-cluster.0navm.mongodb.net'
connection_string = f"mongodb+srv://{user_name}:{password}@{address}/{database}.{collection}"
    
df_yr_decade.write.format("mongo").option("uri",connection_string).mode("overwrite").save()

**Reading the data from MongoDB**

In [0]:
database = 'oms'
collection = 'song_popularity_data'
user_name = 'user'
password = 'user'
address = 'oms-cluster.0navm.mongodb.net'
connection_string = f"mongodb+srv://{user_name}:{password}@{address}/{database}.{collection}"
df_yr_decade = spark.read.format("mongo").option("uri",connection_string).load()

In [0]:
df_yr_decade.count()

In [0]:
df_yr_decade.display()

_id,artist_id,artist_name,duration,key,loudness,mode,song_hotttnesss,song_id,start_of_fade_out,tempo,time_signature,title,year
List(622bea71bef0293cbbceb35b),AR9CHCU1187B9A2317,Manchild,331.49342,11,-7.812,0,,SOQJSEV12AB017BD67,331.493,127.439,4,Let Me Tell You Something,1960s
List(622bea71bef0293cbbceb35c),ARUDSNG11F4C83FE36,17,227.05587,7,-8.459,1,,SODMQWC12A8C131E05,213.496,167.907,4,What's Happin,1960s
List(622bea71bef0293cbbceb35d),ARINJOL1187FB3FE42,Methadrone,384.10404,11,-10.185,0,0.3920087714713069,SOUDYLM12AB018C348,372.036,140.353,4,Flight to Nowhere,1960s
List(622bea71bef0293cbbceb35e),AR27YUB1187B9A0939,Jahcoozi,292.46649,7,-4.722,1,,SOVOKHE12AB0189C08,282.308,250.02,3,Namedropper,1960s
List(622bea71bef0293cbbceb35f),ARS371M1187B992F09,Trout Fishing in America,203.4673,5,-14.047,1,0.0,SOPQQLJ12A58A7D689,190.543,116.432,4,Closer to the Truth,1960s
List(622bea71bef0293cbbceb360),ARNBQR21187B9B20E1,Blodwyn Pig,156.55138,4,-6.21,0,,SOOSAZQ12AC4686D6F,145.548,166.652,5,Same Old Story (2006 Digital Remaster),1960s
List(622bea71bef0293cbbceb361),ARY6PB61187FB45550,Dub War,212.45342,4,-7.187,1,0.0,SOYBXAW12AC468EAFC,205.032,142.73,1,Dowit,1990s
List(622bea71bef0293cbbceb362),ARWJ29911A348F030B,Johanna Zeul,194.16771,4,-6.596,0,,SOXANSP12AB0188006,189.841,115.076,4,Schwimm Nicht Zu Weit,2000s
List(622bea71bef0293cbbceb363),ARUUQW41187FB50FFB,Mariem Hassan,258.69016,3,-10.119,1,0.0,SOTKKZJ12AC3DF7138,251.391,206.834,7,Fergan Leyuad,1960s
List(622bea71bef0293cbbceb364),ARD3Y081187B9ABB07,Del Amitri,210.18077,0,-10.291,1,,SOWFOLV12A8C13E40D,199.105,153.559,3,Food For Songs,1990s


As we saw from the summary above, that there is some fraction of missing values in song_popularity. We will predict those missing value after building the model

In [0]:
df_yr_decade.show(10)

We will keep all the Data having not null song popularity in the training data + validation data and then predict those null values.

In [0]:
df_train = df_yr_decade.where(df_yr_decade.song_hotttnesss.isNotNull()).cache()#.drop('song_id')
df_test = df_yr_decade.where(df_yr_decade.song_hotttnesss.isNull()).cache()#.drop('song_id')
print(df_train.count())
print(df_test.count())

We will train the model on training to data and further evaluate the model for validation data. As a initial step, we just picked the features we considered as important for training the model. eg. duration, key, loudness, mode, start_of_fade_out, tempo, time_signature, year, artist_id, song_hotttnesss

In [0]:
test_final = df_test.select('duration', 'key','loudness','mode','start_of_fade_out','tempo','time_signature','year','artist_id','song_hotttnesss').cache()
test_extra = df_test.select('title', 'song_id', 'artist_id','artist_name').cache()

In [0]:
train_val_set = df_train.randomSplit([0.8, 0.2], 1)
train_set = train_val_set[0].select('duration', 'key','loudness','mode','start_of_fade_out','tempo','time_signature','year','artist_id','song_hotttnesss').cache()
val_set = train_val_set[1].select('duration', 'key','loudness','mode','start_of_fade_out','tempo','time_signature','year','artist_id','song_hotttnesss').cache()

train_extra = train_val_set[0].select('title', 'song_id', 'artist_id','artist_name').cache()
val_extra = train_val_set[1].select('title', 'song_id', 'artist_id','artist_name').cache()

In [0]:
val_extra.count()

String Indexing of decade and artist_id

In [0]:

def indexStringColumns(train_set,val_set,test_final, cols):
    newdf_train = train_set
    newdf_val = val_set
    newdf_test = test_final
    for c in cols:
        si = StringIndexer(inputCol=c, outputCol=c+"-num",handleInvalid='keep')
        sm = si.fit(newdf_train)
         
        newdf_train = sm.transform(newdf_train).drop(c)
        newdf_val = sm.transform(newdf_val).drop(c)
        newdf_test = sm.transform(newdf_test).drop(c)
        
        newdf_train = newdf_train.withColumnRenamed(c+"-num", c)
        newdf_val = newdf_val.withColumnRenamed(c+"-num", c)
        newdf_test = newdf_test.withColumnRenamed(c+"-num", c)
    return newdf_train,newdf_val, newdf_test

In [0]:
cols = ['year','artist_id' ]
df_train_si,df_val_si, df_test_si = indexStringColumns(train_set,val_set,test_final,cols)

In [0]:
df_test_si.show(5)

In [0]:
from pyspark.ml.feature import OneHotEncoder
def oneHotEncodeColumns(df_train_si,df_val_si, df_test_si, cols):
    newdf_train_ohi = df_train_si
    newdf_val_ohi = df_val_si
    newdf_test_ohi = df_test_si
    for c in cols:
        ohe = OneHotEncoder(inputCol=c, outputCol=c+"-onehot", dropLast=False,handleInvalid="keep")
        ohe_model = ohe.fit(newdf_train_ohi)
        
        newdf_train_ohi = ohe_model.transform(newdf_train_ohi).drop(c)
        newdf_val_ohi = ohe_model.transform(newdf_val_ohi).drop(c)
        newdf_test_ohi = ohe_model.transform(newdf_test_ohi).drop(c)
        
        newdf_train_ohi = newdf_train_ohi.withColumnRenamed(c+"-onehot", c)
        newdf_val_ohi = newdf_val_ohi.withColumnRenamed(c+"-onehot", c)
        newdf_test_ohi = newdf_test_ohi.withColumnRenamed(c+"-onehot", c)
    return newdf_train_ohi,newdf_val_ohi, newdf_test_ohi


newdf_train_ohi,newdf_val_ohi, newdf_test_ohi = oneHotEncodeColumns(df_train_si,df_val_si, df_test_si,cols = ['year','artist_id' ]) 

In [0]:
from pyspark.ml.feature import VectorAssembler
input_cols=  ['year','artist_id', 'duration', 'key', 'mode', 'start_of_fade_out', 'tempo', 'time_signature', 'loudness']

va = VectorAssembler(outputCol="features", inputCols=input_cols)

newdf_train_va = va.transform(newdf_train_ohi).select("features", "song_hotttnesss").withColumnRenamed("song_hotttnesss", "label")
newdf_val_va = va.transform(newdf_val_ohi).select("features", "song_hotttnesss").withColumnRenamed("song_hotttnesss", "label")
newdf_test_va = va.transform(newdf_test_ohi).select("features", "song_hotttnesss").withColumnRenamed("song_hotttnesss", "label")

In [0]:
newdf_val_va.show(10)

In [0]:
newdf_train_va.cache()
newdf_val_va.cache()
newdf_test_va.cache()

Machine Learning Models- Linear Regression

In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics
lr = LinearRegression(regParam=0.01, maxIter=1000, fitIntercept=True)

In [0]:
lrmodel = lr.fit(newdf_train_va)
valpredicts_lr = lrmodel.transform(newdf_val_va)
#valpredicts = lrmodel.transform(newdf_val_va)
prediction_label_lr = valpredicts_lr.select("prediction", "label").rdd
metrics = RegressionMetrics(prediction_label_lr)

running regression took 2 minutes 40 seconds

In [0]:
metrics.rootMeanSquaredError

We got the root mean squared error of 0.1575 with our regression model

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator().setMetricName('r2') 
paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.0001, 0.001, 0.01, 0.1, 0.5]).build()
cv = CrossValidator(estimator=lr, 
                    evaluator=evaluator, 
                    numFolds=3,
                    estimatorParamMaps=paramGrid)

cvmodel = cv.fit(newdf_train_va)

Running cross validation took 4 minutes 22 seconds

In [0]:
cvmodel.bestModel.getRegParam()

In [0]:
valpredicts_cv = cvmodel.transform(newdf_val_va)
prediction_label_cv = valpredicts_cv.select("prediction", "label").rdd
metrics = RegressionMetrics(prediction_label_cv)
metrics.rootMeanSquaredError

Basically we can just use the available ratings in our data and predict on the test data

In [0]:
training_labels = newdf_train_va.select('label').withColumnRenamed("label", "prediction")
validation_labels =  newdf_val_va.select('label').withColumnRenamed("label", "prediction") 
testing_pred = cvmodel.bestModel.transform(newdf_test_va).select("prediction")

In [0]:
from functools import reduce  
  
# explicit functions
def unionAll(*dfs):
    return reduce(DataFrame.unionAll, dfs)
  
all_predictions = unionAll(*[training_labels, validation_labels, testing_pred]).cache()
all_ids = unionAll(*[train_extra, val_extra, test_extra]).cache()

In [0]:
from pyspark.sql.functions import monotonically_increasing_id


df1 = all_predictions.withColumn("id", monotonically_increasing_id())

df2 = all_ids.withColumn("id", monotonically_increasing_id())

df3 = df2.join(df1, "id", "outer").drop("id")

df3.show()

In [0]:
df3.display(10)

title,song_id,artist_id,artist_name,prediction
Flight to Nowhere,SOUDYLM12AB018C348,ARINJOL1187FB3FE42,Methadrone,0.3920087714713069
When The Trees Were Silenced,SOXNBWZ12AB0183882,AR074XP1187B9985C5,Hagalaz' Runedance,0.6464840667661809
I Love You More Everyday,SODLANK12AB018D1A3,AROO1EW1187FB3B2A5,Laurel Aitken,0.2707759989463275
Ubik (The Dance Original Mix),SOZYYRB12A6D4FA327,ARCYE581187B9A1482,Timo Maas,0.3347065490692206
Soumba,SOVQIQZ12A6D4F99BE,ARI0DX71187FB4EE83,Mory Kanté,0.4275828862076803
Liquor Store,SOOIWSG12AB0180811,ARRFN7X1187B9B2CA8,7 Shot Screamers,0.340922755864378
Only In God (Psalm 62),SOXEDPK12AC907512C,ARAPI451187B9B6E6F,John Michael Talbot,0.26586104921065
He Loves Me,SOQNSSS12AAF3B417B,ARC7ZMH1187FB3701D,Elvira Nikolaisen,0.3277366831778408
Ours to Kill,SOVWLHY12AB018443C,ARO46FA1187B9B526C,Sad Lovers & Giants,0.3775316449767025
The Call Out,SOEQATE12A6D4FBEF5,AR8K7FS1187FB3EC9E,Run Kid Run,0.5402729306702049


In [0]:
database = 'oms'
collection = 'song_popularity_predictions_data'
user_name = 'user'
password = 'user'
address = 'oms-cluster.0navm.mongodb.net'
connection_string = f"mongodb+srv://{user_name}:{password}@{address}/{database}.{collection}"

In [0]:
df3.write.format("mongo").option("uri",connection_string).mode("overwrite").save()

In [0]:
df3.write.mode("overwrite").saveAsTable("default.song_popularity_prediction_data")

In [0]:
df_new = spark.read.format("mongo").option("uri",connection_string).load()

In [0]:
df_new.display(10)

_id,artist_id,artist_name,prediction,song_id,title
List(622bf8118d92353dae58df17),ARUUQW41187FB50FFB,Mariem Hassan,0.0,SOTKKZJ12AC3DF7138,Fergan Leyuad
List(622bf8118d92353dae58df18),AR571EH1187B98F500,Mortiis,0.4196742788095532,SOOPVER12AB0183279,Fodt Til A Herske Part 2
List(622bf8118d92353dae58df19),ARRALIO12802FDDD17,Frédéric Fromet,0.0,SOHOIYE12AC468E50C,Faux derches
List(622bf8118d92353dae58df1a),ARIAFCM1187B9A104F,Tenth Avenue North,0.7229999190639794,SOJMVBR12A8AE47629,Hold My Heart
List(622bf8118d92353dae58df1b),ARXI8LF1187FB3B09A,Sidestepper,0.5922132313867565,SOYEJUP12A6D4F951D,Donde va mi soledad
List(622bf8118d92353dae58df1c),ARAGS2Z1187B9BA436,Natiruts,0.0,SOKKEXH12A8C139561,Voo Do Carcara/Jamaica Roots
List(622bf8118d92353dae58df1d),ARJXWRS11E2835CB4D,ALO,0.7689923291933543,SOVAFVF12A8C133F79,Empty Vessel
List(622bf8118d92353dae58df1e),ARI9WBK119B3403B69,Jerry Dimmer,0.0,SOYLFHQ12A58A79D09,Attendre
List(622bf8118d92353dae58df1f),ARU19DA1187B9B4036,Fates Warning,0.5729201479465231,SOKWNYT12AB018490D,Part IX
List(622bf8118d92353dae58df20),ARIRD6J1187FB5A98C,Sugar Minott,0.0,SOGQUMI12A6D4FB93E,Do It Sweet
