In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import *

In [0]:
# Set parquet configuration to interpret binary byte array as string
spark = SparkSession.builder.config('spark.sql.parquet.binaryAsString', 'true') \
            .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1')\
            .appName("OMS") \
            .getOrCreate()

# sc = spark.sparkContext
# sc.setLogLevel("ERROR")

In [0]:
spark

**Load Song Popularity Data**

In [0]:
database = 'oms'
collection = 'song_popularity_predictions_data'
user_name = 'user'
password = 'user'
address = 'oms-cluster.0navm.mongodb.net'
connection_string = f"mongodb+srv://{user_name}:{password}@{address}/{database}.{collection}"
df_song_pop = spark.read.format("mongo").option("uri",connection_string).load()

**Loading Play count Data Set**

In [0]:
database = 'oms'
collection = 'play_counts'
user_name = 'user'
password = 'user'
address = 'oms-cluster.0navm.mongodb.net'
connection_string = f"mongodb+srv://{user_name}:{password}@{address}/{database}.{collection}"

In [0]:
df_playcounts_ = spark.read.format("mongo").option("uri",connection_string).load()
df_playcounts =df_playcounts_.select('play_count', 'song_id', 'user_id').cache()

In [0]:
df_playcounts.display(1)

**Load song Genre**

In [0]:
database = 'oms'
collection = 'song_genre'
user_name = 'user'
password = 'user'
address = 'oms-cluster.0navm.mongodb.net'
connection_string = f"mongodb+srv://{user_name}:{password}@{address}/{database}.{collection}"

In [0]:
df_genre_ = spark.read.format("mongo").option("uri",connection_string).load()
df_genre = df_genre_.select('genre','song_id').cache()

In [0]:
df_genre.display(10)

**Joining the genre to playcount data**

In [0]:
df_genre_playcounts = df_genre.join(df_playcounts,'song_id' ,'inner').cache()

In [0]:
df_genre_playcounts.count()

In [0]:
df_genre_playcounts.display(10)

song_id,genre,play_count,user_id
SOAAZPG12A6D4F8D8B,Pop Rock,1,2199769200b689c44cd8442f51e4f82047ecf679
SOAAZPG12A6D4F8D8B,Pop Rock,1,4af4db5cdd51f8cdd0917cc51fa9f53e7bcb031b
SOAAZPG12A6D4F8D8B,Pop Rock,1,3b4ad4a00c1a8edb790435e9ea4ed6f354445f94
SOAAZPG12A6D4F8D8B,Pop Rock,1,3a73046fe1a5aae7d8fa6173cbd053db85c00988
SOAAZPG12A6D4F8D8B,Pop Rock,1,6fd7851e1d02a3aeaf90855bba34e72f9d256173
SOAAZPG12A6D4F8D8B,Pop Rock,1,4b49956d0855007f80462b498f41e2a5b93a6f6e
SOAAZPG12A6D4F8D8B,Pop Rock,1,f3fd89959b9a004eb8bc31c1e41f2b756a39927a
SOAAZPG12A6D4F8D8B,Pop Rock,1,c6535de39bc7a062daf12a71be78e37e090dd3c6
SOAAZPG12A6D4F8D8B,Pop Rock,1,bb84b605789d898993e2c6fbda4d57a8bc8da369
SOAAZPG12A6D4F8D8B,Pop Rock,1,24145dac630d4f7c33544a12c054affd85358537


In [0]:
df_genre_playcounts_grouped =spark.sql("select genre, user_id, AVG(play_count) as avg_play_count from df_genre_playcounts group by genre, user_id")

In [0]:
df_genre_playcounts.write.saveAsTable('default.df_genre_playcounts_grouped')

In [0]:
top_genre = spark.sql("select genre, user_id, avg_play_count, rank() over (partition by genre order by avg_play_count desc ) as rank_ from df_genre_playcounts_grouped")

In [0]:
top_genre.display(10)

song_id,genre,user_id,play_count,rank_
SOYFYHE12A8C142082,International,dcf6a11b2fea3af24fb001f504851ed23aae9965,686,1
SOZGGXU12A67ADD3E4,International,762c6fa3791c68f5b530b91a3413a8f480df2099,407,2
SOAJDBZ12A6D4FA3C8,International,e3d47f8d33e91da3c476577cdf0b5dc3fe1e2d2a,296,3
SOZGGXU12A67ADD3E4,International,8cbcb30da89e66b46136dca4f3ab5d70893f7db8,229,4
SOAJDBZ12A6D4FA3C8,International,e3f6ac98a2b45f6ac0b0b1e118774828d8be2029,227,5
SOMECGO12AB0186D31,International,d9b255c131ba531ef58a393e3c5fb3f43f2dc53c,224,6
SOCSAOO12AB017E850,International,3cc0475a6842e690fc9ed3dcf232ead0fd01e0c5,167,7
SOUWNIQ12A67ADE5B6,International,64074f44ffae272464a75b3ca3fbcddcec95eda6,160,8
SOSBZQZ12A6D4FA577,International,52eeb1ecde85689adc8bd3c6f393a6ff04f9aaba,156,9
SOAJDBZ12A6D4FA3C8,International,6aa76847a05f6f10fcd7c1bf4ed6f6ab1ac4e90e,150,10


In [0]:
top_genre.write.saveAsTable('default.top_genre_sql')

In [0]:
top_genre.columns

In [0]:
top_genre_final= spark.sql('select song_id, genre, user_id, play_count\
                        from top_genre_sql where rank_=1')

In [0]:
top_genre_final.display(10)

song_id,genre,user_id,play_count
SONSTND12AB018516E,Pop Rock,a263000355e6a46de29ec637820771ac7620369f,2368
SOZMECL12AB0184058,Avant Garde,c53ac12f0b9396ab3f44f7b521d5925b10e991d3,13
SORELVF12AF72A22DF,Folk,8937886b62f28418d62f2556af7edaba74cf703c,606
SOPROWU12A58A7BBDD,Holiday,d4ca0ff508531f8e2f53407924f12a3adfc72a86,64
SOUDLVN12AAFF43658,RnB,98cb9a3feb48d15a8328ce16f1b32609698e11f2,835
SOZBMTI12A8C13A804,Stage,78d77cba6776c72ca78a52aa66f2526eabd86ea7,141
SOKNMJE12A67AE0421,Electronic,780913e6a5d61405f4653aef0231876e6721120b,1369
SOEELNH12A6D4F6522,Reggae,8665ae2a21ff60a45bf638c941a61a0cb75d7687,217
SOYFYHE12A8C142082,International,dcf6a11b2fea3af24fb001f504851ed23aae9965,686
SOBRMPB12A67ADAFE2,Jazz,42b31eaf0cbaa10e593438e2858ce74cdfa01191,242


In [0]:
df_first_page = df_playcounts.select('song_id', 'user_id', 'play_count').join(top_genre_final, 'user_id', 'inner').dropDuplicates()

In [0]:
df_first_page.display()

user_id,song_id,play_count,song_id.1,genre,play_count.1
780913e6a5d61405f4653aef0231876e6721120b,SOBQJBF12AF72A2EE6,2,SOKNMJE12A67AE0421,Electronic,1369
780913e6a5d61405f4653aef0231876e6721120b,SOHXGXU12A8C1413EB,2,SOKNMJE12A67AE0421,Electronic,1369
780913e6a5d61405f4653aef0231876e6721120b,SOIGZMC12A6D4F979B,1,SOKNMJE12A67AE0421,Electronic,1369
780913e6a5d61405f4653aef0231876e6721120b,SOKNMJE12A67AE0421,1369,SOKNMJE12A67AE0421,Electronic,1369
780913e6a5d61405f4653aef0231876e6721120b,SOKWNPY12A8C13A8FA,1,SOKNMJE12A67AE0421,Electronic,1369
780913e6a5d61405f4653aef0231876e6721120b,SOKWVQL12A67ADF7DA,20,SOKNMJE12A67AE0421,Electronic,1369
780913e6a5d61405f4653aef0231876e6721120b,SOOQPPL12AB017D2E3,29,SOKNMJE12A67AE0421,Electronic,1369
780913e6a5d61405f4653aef0231876e6721120b,SOOWJZM12A6D4F7995,1,SOKNMJE12A67AE0421,Electronic,1369
780913e6a5d61405f4653aef0231876e6721120b,SOSFOFO12AF72AD198,2,SOKNMJE12A67AE0421,Electronic,1369
780913e6a5d61405f4653aef0231876e6721120b,SOTVFIU12AC46878B7,1,SOKNMJE12A67AE0421,Electronic,1369


In [0]:
df_first_page.write.saveAsTable('default.df_first_page_data')

In [0]:
spark.read.table(df_first_page)

# For page-2 : Customized songs

In [0]:
database = 'oms'
collection = 'recommendations'
user_name = 'user'
password = 'user'
address = 'oms-cluster.0navm.mongodb.net'
connection_string = f"mongodb+srv://{user_name}:{password}@{address}/{database}.{collection}"

In [0]:
df_song_emb = spark.read.format("mongo").option("uri",connection_string).load()

In [0]:
df_song_emb.printSchema()

In [0]:
df_song_emb2 = df_song_emb.select("user_id-num","recommendations.song_id-num","recommendations.rating")\
.withColumn("tmp", arrays_zip("song_id-num", "rating"))\
.withColumn("tmp", explode("tmp"))\
.select("user_id-num", col("tmp.song_id-num"), col("tmp.rating"))

In [0]:
users_table =spark.read.table( "default.df_first_page_final")
users_profile= users_table.select('user_id').drop_duplicates(subset=['user_id'])

In [0]:
database = 'oms'
collection = 'play_counts'
user_name = 'user'
password = 'user'
address = 'oms-cluster.0navm.mongodb.net'
connection_string = f"mongodb+srv://{user_name}:{password}@{address}/{database}.{collection}"

In [0]:
mapping = spark.read.format("mongo").option("uri",connection_string).load()

In [0]:
user_mapping = mapping.drop_duplicates(subset=['user_id']).select('user_id','user_id-num')
item_mapping = mapping.drop_duplicates(subset=['song_id']).select('song_id','song_id-num') 

In [0]:
df_recom=df_song_emb2.join(user_mapping,on='user_id-num').join(item_mapping,on='song_id-num')

In [0]:
df_recom.show()

In [0]:
from pyspark.sql.functions import broadcast

page2 =df_recom.join(broadcast(users_profile ), on='user_id')

In [0]:
page2.cache()

In [0]:
database = 'oms'
collection = 'song_popularity_predictions_data'
user_name = 'user'
password = 'user'
address = 'oms-cluster.0navm.mongodb.net'
connection_string = f"mongodb+srv://{user_name}:{password}@{address}/{database}.{collection}"
df_song_pop = spark.read.format("mongo").option("uri",connection_string).load()

In [0]:
page2_pop= page2.join(df_song_pop.select('song_id','prediction') ,on='song_id')

In [0]:
page2.count()

In [0]:
page2_pop = broadcast( page2).join( df_song_pop.select('song_id','prediction','artist_name','title') , on ='song_id' )

In [0]:
page2_pop.cache()
page2_pop.display()

song_id,user_id,song_id-num,user_id-num,rating,prediction,artist_name,title
SORSXPU12A8C138ABB,6a145b64e3f73ea3018ed88c790c0804c2176ccd,250662,603137,87.71204376220703,0.2998774882739778,Armchair Martian,The Credible Hulk (1995)
SOFNLOY12A8C1339C7,98cb9a3feb48d15a8328ce16f1b32609698e11f2,272445,586630,1377.10986328125,0.402548583328269,Basement Jaxx,U Dont Know Me (Original Radio Mix)
SOZLVDB12AF72A25EA,6a145b64e3f73ea3018ed88c790c0804c2176ccd,116535,603137,100.81348419189452,0.3347065490692206,Mystikal featuring Snoop Dogg & Silkk The Shocker,Let's Go Do It
SOFRRFT12A8C140C5C,af8b6cf5ad80a0f5696192586824783a49c29805,158233,630473,1019.0083618164062,0.3458022339653294,Jack the Ripper,Words
SOEGPHZ12AB0187E42,d4ca0ff508531f8e2f53407924f12a3adfc72a86,222964,682261,278.8706359863281,0.3969052200664554,Al Jarreau,It's Not Hard To Love You
SOEGPHZ12AB0187E42,42b31eaf0cbaa10e593438e2858ce74cdfa01191,222964,177075,343.1670837402344,0.3969052200664554,Al Jarreau,It's Not Hard To Love You
SOEGPHZ12AB0187E42,9ae2abce15efa19998984882900850b927ebf7e1,222964,140161,796.82373046875,0.3969052200664554,Al Jarreau,It's Not Hard To Love You
SOEGPHZ12AB0187E42,ca9fb6d60f3261f783936a7ff48df2df176d5b3a,222964,82871,221.36373901367188,0.3969052200664554,Al Jarreau,It's Not Hard To Love You
SOWAPQH12A67AE116E,296e5769223ae111e530d96c8bcf95fd63d8af9b,220027,524782,1222.310302734375,0.3026125116666508,The Sagittarian,Liferider
SOLOVPW12A8C136297,78d77cba6776c72ca78a52aa66f2526eabd86ea7,90850,134963,151.1387481689453,0.340922755864378,Bobby Womack,Communication (Single Version)


In [0]:
windowSpec  = Window.partitionBy("user_id").orderBy("rating")
pag2_pop_rank =page2_pop.withColumn("rank",10-row_number().over(windowSpec))

In [0]:
pag2_pop_rank2=pag2_pop_rank.orderBy(['user_id-num','rank'],ascending=[True,True]).cache()

In [0]:
pag2_pop_rank2.display()

song_id,user_id,song_id-num,user_id-num,rating,prediction,artist_name,title,rank
SOASAPE12A8C138AC9,5a44bfcaa1081e8b36ff76ec32c2ef73425ad2e9,326979,61523,241.9148406982422,0.4176416475968032,The Big Dish,Prospect Street,0
SOFJWWF12A8C142012,5a44bfcaa1081e8b36ff76ec32c2ef73425ad2e9,48586,61523,161.59962463378906,0.5069050641974842,Tryo,Abdallâh,1
SONSTND12AB018516E,5a44bfcaa1081e8b36ff76ec32c2ef73425ad2e9,49948,61523,160.41375732421875,0.4925057832453368,Jason Falkner,I Go Astray (LP Version),2
SODUWZY12AB0183594,5a44bfcaa1081e8b36ff76ec32c2ef73425ad2e9,120472,61523,147.87689208984375,0.2998766298926159,The Most Powerful Telescope In the Universe,Posted,3
SOSVFGM12AB0182373,5a44bfcaa1081e8b36ff76ec32c2ef73425ad2e9,251152,61523,142.5316162109375,0.3231121208952123,Rolfe Kent,Without Bill the Jedi Changed,4
SOCDGYC12A8C13B8F6,5a44bfcaa1081e8b36ff76ec32c2ef73425ad2e9,133215,61523,133.59849548339844,0.3277366831778408,This Mortal Coil,Thais II,5
SOVKABC12A6D4F79CB,5a44bfcaa1081e8b36ff76ec32c2ef73425ad2e9,118013,61523,128.5667724609375,0.6397153693318615,Keren Ann,Dans Ma Ville,6
SOPLRDD12AB01845D0,5a44bfcaa1081e8b36ff76ec32c2ef73425ad2e9,112683,61523,127.22706604003906,0.4395256052991028,Fred Hammond & Radical For Christ,We're Blessed,7
SOEGOHE12AB0184A68,5a44bfcaa1081e8b36ff76ec32c2ef73425ad2e9,288510,61523,124.72428131103516,0.2669551862755385,Juice Leskinen,Tule vastaan,8
SODFKWT12AB0186CCE,5a44bfcaa1081e8b36ff76ec32c2ef73425ad2e9,138553,61523,124.12940216064452,0.3985168429652171,Beverlei Brown,In the Summertime,9


**Popular Songs**

In [0]:
database = 'oms'
collection = 'play_counts'
user_name = 'user'
password = 'user'
address = 'oms-cluster.0navm.mongodb.net'
connection_string = f"mongodb+srv://{user_name}:{password}@{address}/{database}.{collection}"
df_play_counts = spark.read.format("mongo").option("uri",connection_string).load()

In [0]:
df_songs_count =df_play_counts.groupBy('song_id').agg(sum('play_count').alias("total_play_count")).orderBy('total_play_count',ascending=False)

In [0]:
df_songs_count.limit(10).select('song_id').collect()

In [0]:
top_list = ['SOBONKR12A58A7A7E0','SOAUWYT12A81C206F1','SOSXLTC12AF72A7F54','SOFRQTD12A81C233C0','SOEGIYH12A6D4FC0E3','SOAXGDH12A8C13F8A1'\
            ,'SONYKOW12AB01849C9','SOPUCYA12A8C13A694','SOUFTBI12AB0183F65','SOVDSJC12A58A7A271']
popular_songs =df_song_pop.filter(df_song_pop.song_id.isin(top_list)).select('song_id','prediction','artist_name','title')

In [0]:
popular_songs.display()

song_id,prediction,artist_name,title
SOVDSJC12A58A7A271,0.5419947473131744,Sam Cooke,Ain't Misbehavin
SOAUWYT12A81C206F1,0.9374431571868832,Björk,Undo
SOAXGDH12A8C13F8A1,1.0,Florence + The Machine,Dog Days Are Over (Radio Edit)
SOSXLTC12AF72A7F54,0.9184270923627972,Kings Of Leon,Revelry
SOUFTBI12AB0183F65,0.4137398358601097,Tub Ring,Invalid
SONYKOW12AB01849C9,1.0,OneRepublic,Secrets
SOPUCYA12A8C13A694,0.3458022339653294,Five Iron Frenzy,Canada
SOFRQTD12A81C233C0,1.0,Harmonia,Sehr kosmisch
SOBONKR12A58A7A7E0,0.476435219642529,Dwight Yoakam,You're The One
SOEGIYH12A6D4FC0E3,-0.0038567672310621,Barry Tuckwell/Academy of St Martin-in-the-Fields/Sir Neville Marriner,Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile)
