In [1]:
import org.apache.spark

Intitializing Scala interpreter ...

Spark Web UI available at http://LAPTOP-PSTRJPQO:4040
SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1737639393182)
SparkSession available as 'spark'


import org.apache.spark


In [None]:
// Obtain (or reuse) the active SparkContext.
// The fully qualified name is deliberate: the bare identifier `spark` can denote either the
// imported `org.apache.spark` package or the kernel-provided SparkSession, and only the
// package exposes a `SparkContext` member.
val sc = org.apache.spark.SparkContext.getOrCreate()

In [2]:
// Root folder (relative path) that holds the pre-processed CSV datasets.
val path_to_datasets = "../../../datasets/processed"

// One CSV file per entity, each located directly under the dataset root.
val path_to_tracks = s"${path_to_datasets}/tracks.csv"
val path_to_playlists = s"${path_to_datasets}/playlists.csv"
val path_to_track_in_playlists = s"${path_to_datasets}/tracks_in_playlist.csv"
val path_to_artists = s"${path_to_datasets}/artists.csv"

path_to_datasets: String = ../../../datasets/processed
path_to_tracks: String = ../../../datasets/processed/tracks.csv
path_to_playlists: String = ../../../datasets/processed/playlists.csv
path_to_track_in_playlists: String = ../../../datasets/processed/tracks_in_playlist.csv
path_to_artists: String = ../../../datasets/processed/artists.csv


In [3]:
/**
 * Line-level parsers for the dataset CSV files.
 *
 * Every `parse*Line` method splits one text line on commas that lie outside double-quoted
 * fields, trims each field and converts numeric columns. A line that cannot be parsed
 * (too few columns, non-numeric value in an `Int` column, `null` input) yields `None`
 * instead of throwing, so the methods can be used directly with `flatMap`.
 *
 * Surrounding double quotes are NOT stripped from the returned fields.
 */
object CsvParser {
  import java.util.regex.Pattern
  import scala.util.Try

  val noGenresListed = "(no genres listed)"

  /** Matches a comma followed by an even number of double quotes, i.e. a comma outside a quoted field. */
  val commaRegex = ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"

  /** Same idea as [[commaRegex]], for the pipe character. */
  val pipeRegex = "\\|(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
  val quotes = "\""

  // Compiled once: `String.split(regex)` would compile the expression again for every line.
  private val commaPattern = Pattern.compile(commaRegex)

  /**
   * Splits `line` into trimmed fields and applies `build` to them.
   *
   * @param line  raw CSV line
   * @param build converts the trimmed fields into the target tuple; may throw on bad input
   * @return `Some(result)` on success, `None` when splitting or `build` fails with a non-fatal error
   */
  private def parseFields[A](line: String)(build: Array[String] => A): Option[A] =
    Try(build(commaPattern.split(line).map(_.trim))).toOption

  /**
   * Parses a playlist line.
   *
   * @return `Some((PID, playlist_name, num_followers))`, or `None` for a malformed line
   */
  def parsePlayListLine(line: String): Option[(String, String, Int)] =
    parseFields(line)(f => (f(0), f(1), f(2).toInt))

  /**
   * Parses a track line.
   *
   * @return `Some((track_uri, track_name, duration_ms, artist_uri, album_uri, album_name))`,
   *         or `None` for a malformed line
   */
  def parseTrackLine(line: String): Option[(String, String, Int, String, String, String)] =
    parseFields(line)(f => (f(0), f(1), f(2).toInt, f(3), f(4), f(5)))

  /**
   * Parses an artist line.
   *
   * @return `Some((artist_uri, artist_name))`, or `None` when fewer than two columns are present
   */
  def parseArtistLine(line: String): Option[(String, String)] =
    parseFields(line)(f => (f(0), f(1)))

  /**
   * Parses a track-in-playlist line.
   *
   * @return `Some((PID, track_uri, pos))`, or `None` for a malformed line
   */
  def parseTrackInPlaylistLine(line: String): Option[(String, String, Int)] =
    parseFields(line)(f => (f(0), f(1), f(2).toInt))
}

defined object CsvParser


In [4]:
// Load every CSV file as an RDD of typed tuples.
// Lines for which the parser returns None are dropped by flatMap.
val rddTracks = sc
  .textFile(path_to_tracks)
  .flatMap(line => CsvParser.parseTrackLine(line))

val rddPlaylists = sc
  .textFile(path_to_playlists)
  .flatMap(line => CsvParser.parsePlayListLine(line))

val rddTrackInPlaylists = sc
  .textFile(path_to_track_in_playlists)
  .flatMap(line => CsvParser.parseTrackInPlaylistLine(line))

val rddArtists = sc
  .textFile(path_to_artists)
  .flatMap(line => CsvParser.parseArtistLine(line))

rddTracks: org.apache.spark.rdd.RDD[(String, String, Int, String, String, String)] = MapPartitionsRDD[2] at flatMap at <console>:31
rddPlaylists: org.apache.spark.rdd.RDD[(String, String, Int)] = MapPartitionsRDD[5] at flatMap at <console>:34
rddTrackInPlaylists: org.apache.spark.rdd.RDD[(String, String, Int)] = MapPartitionsRDD[8] at flatMap at <console>:37
rddArtists: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[11] at flatMap at <console>:40


In [30]:
// URI of the reference track used by the cells below to find its co-occurring tracks.
val idSong: String = "spotify:track:5xlWA3V2l7ZiqYF8Ag5EM8"

idSong: String = spotify:track:5xlWA3V2l7ZiqYF8Ag5EM8


In [31]:
// Project the (pid, track_uri, pos) records down to (pid, trackUri) pairs.
val trackInPlaylistReduce = rddTrackInPlaylists.map { case (pid, trackUri, _) => (pid, trackUri) }

// Ids of the playlists that contain the reference song.
val playlistForTrack = trackInPlaylistReduce
  .filter(_._2 == idSong)
  .keys


trackInPlaylistReduce: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[76] at map at <console>:28
playlistForTrack: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[78] at map at <console>:31


In [32]:
// Turn the playlist ids into (pid, null) pairs so they can be joined with trackInPlaylistReduce.
// distinct() is required: a playlist that contains the reference song more than once would
// otherwise show up several times as a join key, and the join would emit each of its tracks
// once per duplicate, inflating the co-occurrence counts computed later.
val RDDPlaylistForTrack = playlistForTrack.distinct().map((_, null))

// Keep only the rows of playlists that contain the reference song,
// and drop the rows of the reference song itself.
val trackInSamePlaylists = trackInPlaylistReduce
        .join(RDDPlaylistForTrack)
        .filter { case (_, (trackUri, _)) => trackUri != idSong }
        .map { case (pid, (trackUri, _)) => (pid, trackUri) }

// result: RDD of (pid, trackUri) restricted to the playlists that contain the reference song,
// listing the other tracks of those playlists

RDDPlaylistForTrack: org.apache.spark.rdd.RDD[(String, Null)] = MapPartitionsRDD[79] at map at <console>:29
trackInSamePlaylists: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[84] at map at <console>:35


In [33]:
// Pair the reference song with every co-occurring track:
// one ((idSong, otherTrack), 1) record per playlist row, ready to be summed per key.
val rddTrackPairs = trackInSamePlaylists
    .values
    .map(otherTrack => ((idSong, otherTrack), 1))

rddTrackPairs: org.apache.spark.rdd.RDD[((String, String), Int)] = MapPartitionsRDD[85] at map at <console>:28


In [34]:
// Sum the 1s of identical (idSong, otherTrack) keys to obtain how often each pair occurs.
val occurrencesCount = rddTrackPairs.reduceByKey((left, right) => left + right)


occurrencesCount: org.apache.spark.rdd.RDD[((String, String), Int)] = ShuffledRDD[86] at reduceByKey at <console>:26


In [35]:
// Select the pair with the largest count; when two counts are equal the right-hand operand wins.
// NOTE(review): reduce presumably fails on an empty RDD (reference song in no playlist) — confirm
// and guard upstream if that case can occur.
val mostOccurrencesPair = occurrencesCount.reduce { (left, right) =>
  if (right._2 >= left._2) right else left
}


mostOccurrencesPair: ((String, String), Int) = ((spotify:track:5xlWA3V2l7ZiqYF8Ag5EM8,spotify:track:194uD6fRM58ztn7Z1Mfyr2),1)


In [37]:
// Lookup table (trackUri, trackName) used to attach names to both tracks of the winning pair.
val trackDetails = rddTracks.map { case (uri, name, _, _, _, _) => (uri, name) }

// Wrap the single winning pair in an RDD so that it can be joined with trackDetails.
val mostOccurrencesPairRDD = sc.parallelize(Seq(mostOccurrencesPair))

val enrichedResults = {
  // Key by the reference track and attach its name.
  val withReferenceName = mostOccurrencesPairRDD
    .map { case ((track1, track2), count) => (track1, (track2, count)) }
    .join(trackDetails)

  // Re-key by the co-occurring track and attach its name.
  val withBothNames = withReferenceName
    .map { case (track1, ((track2, count), track1Name)) => (track2, (track1, track1Name, count)) }
    .join(trackDetails)

  withBothNames.map { case (track2, ((track1, track1Name, count), track2Name)) =>
    (track1, track1Name, track2, track2Name, count)
  }
}

// Save the result as a single part file.
enrichedResults.coalesce(1).saveAsTextFile("output/result")

trackDetails: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[100] at map at <console>:31
mostOccurrencesPairRDD: org.apache.spark.rdd.RDD[((String, String), Int)] = ParallelCollectionRDD[101] at parallelize at <console>:34
enrichedResults: org.apache.spark.rdd.RDD[(String, String, String, String, Int)] = MapPartitionsRDD[110] at map at <console>:41


In [55]:
 /*

       // FOR ALL SONGS — RUN ONLY ON A FEW FILES

// For every song, find the most similar song, i.e., the one that appears most frequently in the same playlist.
// (Join-Join-Aggregate)

// auto-join of track_in_playlist 
// to get all song pairs in the same playlist

// RDD of (pid, trackUri)
val playlistTracks = rddTrackInPlaylists.map(x => (x._1, x._2))

val rddTrackPairs = playlistTracks.join(playlistTracks)
    .filter { case (_, (track1, track2)) => track1 != track2 }  // no auto-pair
    .map { case (_, (track1, track2)) => ((track1, track2), 1) }   

// Shape: ((track1, track2), 1)

<console>:  error: incomplete input

In [20]:
// Sum the 1s per (track1, track2) key so each pair carries its total number of co-occurrences.
val cooccurencesCount = rddTrackPairs.reduceByKey((left, right) => left + right)

cooccurencesCount: org.apache.spark.rdd.RDD[((String, String), Int)] = ShuffledRDD[56] at reduceByKey at <console>:26


In [21]:
// For every track keep the partner with the highest co-occurrence count
// (when two counts are equal the right-hand operand wins).
val mostCommonPair = cooccurencesCount
          .map { case ((track1, track2), count) => (track1, (track2, count)) }
          .reduceByKey { (left, right) => if (right._2 >= left._2) right else left }

mostCommonPair: org.apache.spark.rdd.RDD[(String, (String, Int))] = ShuffledRDD[58] at reduceByKey at <console>:28


In [8]:
//mostCommonPair.collect()

res0: Array[(String, (String, Int))] = Array((spotify:track:1mjcyWQPFsoG1Cb6Gl33Tk,(spotify:track:3kAwGo8mtUPsrctroxYLku,2)), (spotify:track:7y7bmax41rKlTLOmgNvzKR,(spotify:track:4awpwf3TeFWOtLiswRbKfr,1)), (spotify:track:7inXu0Eaeg02VsM8kHNvzM,(spotify:track:3T7dNA7O8c3Axj5WyDNcH3,11)), (spotify:track:1WHCuLyhiISWVENR0qXZ51,(spotify:track:57TUYBa41jfW56U2U9652l,1)), (spotify:track:7ccnwVhaD3ITUQ2x8EkilA,(spotify:track:5xoUgPXbMNUmoHU0Enwtwq,2)), (spotify:track:5oUV6yWdDM0R9Q2CizRhIt,(spotify:track:3ZMv9EzGoteNi5Qnx0KpEO,5)), (spotify:track:7AUOZzM5P8UDuA0zga0PP8,(spotify:track:57yfmPoMfWljcEl3qI1ADp,2)), (spotify:track:4bGEfWw5uEAnvYuTbESsMa,(spotify:track:4CbKVDZkYKdv69I4bCaKUq,1)), (spotify:track:2m6Wm0nBUJdDfCggvpWAnV,(spotify:track:12TE7Vt592RcM1G3EaaZ0f,2)), (spotify:track:3CpoeW0...


In [22]:
// Lookup table (track_uri, track_name).
val trackDetails = rddTracks.map { case (uri, name, _, _, _, _) => (uri, name) }

// Attach the name of the first track of each pair.
val firstResult = mostCommonPair
        .join(trackDetails)
        .map { case (track1, (partner, track1Name)) => (track1, track1Name, partner._1, partner._2) }



trackDetails: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[59] at map at <console>:27
firstResult: org.apache.spark.rdd.RDD[(String, String, String, Int)] = MapPartitionsRDD[63] at map at <console>:32


In [23]:
// Re-key by the second track, attach its name and flatten to
// (track1, track1Name, track2, track2Name, count).
val finalResult = firstResult
        .map(row => (row._3, (row._1, row._2, row._4)))
        .join(trackDetails)
        .map { case (track2, (first, track2Name)) =>
          (first._1, first._2, track2, track2Name, first._3)
        }
        

finalResult: org.apache.spark.rdd.RDD[(String, String, String, String, Int)] = MapPartitionsRDD[68] at map at <console>:29


In [24]:
// Render each result tuple as one comma-separated line, fields in tuple order.
val csv = finalResult.map(row => row.productIterator.mkString(","))

csv.saveAsTextFile("output/result")

 */

csv: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[69] at map at <console>:25
