In [1]:
import org.apache.spark

Initializing Scala interpreter ...

Spark Web UI available at http://10.201.102.105:4040
SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1738086391068)
SparkSession available as 'spark'


import org.apache.spark


In [None]:
// Obtain the active SparkContext, creating one if none exists yet.
// The fully qualified name is used on purpose: in this notebook the bare identifier `spark`
// is also bound by the kernel to the SparkSession, so `spark.SparkContext` only resolves to
// the package when the `import org.apache.spark` cell has been run first.
val sc = org.apache.spark.SparkContext.getOrCreate()

In [2]:
// Root folder of the pre-processed CSV files, relative to the notebook location.
val path_to_datasets = "../../../datasets/processed"

// One CSV file per entity, all living directly under the root folder.
val path_to_tracks = s"$path_to_datasets/tracks.csv"
val path_to_playlists = s"$path_to_datasets/playlists.csv"
val path_to_track_in_playlists = s"$path_to_datasets/tracks_in_playlist.csv"
val path_to_artists = s"$path_to_datasets/artists.csv"

path_to_datasets: String = ../../../datasets/processed
path_to_tracks: String = ../../../datasets/processed/tracks.csv
path_to_playlists: String = ../../../datasets/processed/playlists.csv
path_to_track_in_playlists: String = ../../../datasets/processed/tracks_in_playlist.csv
path_to_artists: String = ../../../datasets/processed/artists.csv


In [3]:
/** Line parsers for the CSV files of the playlist dataset.
  *
  * Every parser splits the line on commas that are outside double-quoted sections, trims the
  * fields it reads and returns `None` when the line has too few fields or when its numeric
  * column is not a valid integer. Quote characters around a field are kept as they are.
  * As with `String.split`, trailing empty fields are discarded before indexing.
  */
object CsvParser {
  import scala.util.Try

  val noGenresListed = "(no genres listed)"
  // A separator matches only if an even number of double quotes follows it up to the end of
  // the line, i.e. when the separator is not inside a quoted field.
  val commaRegex = ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
  val pipeRegex = "\\|(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
  val quotes = "\""

  // Compiled once: `String.split(regex)` would recompile this look-ahead pattern on every line.
  private val commaPattern = java.util.regex.Pattern.compile(commaRegex)

  /** Splits a CSV line into its raw (untrimmed) fields. */
  private def splitFields(line: String): Array[String] = commaPattern.split(line)

  /** Parses a playlist line.
    *
    * @param line raw CSV line
    * @return `Some((PID, playlist_name, num_followers))`, or `None` if the line cannot be parsed
    */
  def parsePlayListLine(line: String): Option[(String, String, Int)] =
    Try {
      val fields = splitFields(line)
      (fields(0).trim, fields(1).trim, fields(2).trim.toInt)
    }.toOption

  /** Parses a track line.
    *
    * @param line raw CSV line
    * @return `Some((track_uri, track_name, duration_ms, artist_uri, album_uri, album_name))`,
    *         or `None` if the line cannot be parsed
    */
  def parseTrackLine(line: String): Option[(String, String, Int, String, String, String)] =
    Try {
      val fields = splitFields(line)
      (
        fields(0).trim,
        fields(1).trim,
        fields(2).trim.toInt,
        fields(3).trim,
        fields(4).trim,
        fields(5).trim
      )
    }.toOption

  /** Parses an artist line.
    *
    * @param line raw CSV line
    * @return `Some((artist_uri, artist_name))`, or `None` if the line has fewer than two fields
    */
  def parseArtistLine(line: String): Option[(String, String)] =
    Try {
      val fields = splitFields(line)
      (fields(0).trim, fields(1).trim)
    }.toOption

  /** Parses a track-in-playlist line.
    *
    * @param line raw CSV line
    * @return `Some((PID, track_uri, pos))`, or `None` if the line cannot be parsed
    */
  def parseTrackInPlaylistLine(line: String): Option[(String, String, Int)] =
    Try {
      val fields = splitFields(line)
      (fields(0).trim, fields(1).trim, fields(2).trim.toInt)
    }.toOption
}

defined object CsvParser


In [4]:
// Each CSV file is read as plain text lines; lines for which the parser returns None are
// dropped by flatMap, so the resulting RDDs hold only successfully parsed tuples.

// (track_uri, track_name, duration_ms, artist_uri, album_uri, album_name)
val rddTracks = sc
  .textFile(path_to_tracks)
  .flatMap(line => CsvParser.parseTrackLine(line))

// (PID, playlist_name, num_followers)
val rddPlaylists = sc
  .textFile(path_to_playlists)
  .flatMap(line => CsvParser.parsePlayListLine(line))

// (PID, track_uri, pos)
val rddTracksInPlaylist = sc
  .textFile(path_to_track_in_playlists)
  .flatMap(line => CsvParser.parseTrackInPlaylistLine(line))

// (artist_uri, artist_name)
val rddArtists = sc
  .textFile(path_to_artists)
  .flatMap(line => CsvParser.parseArtistLine(line))

rddTracks: org.apache.spark.rdd.RDD[(String, String, Int, String, String, String)] = MapPartitionsRDD[2] at flatMap at <console>:31
rddPlaylists: org.apache.spark.rdd.RDD[(String, String, Int)] = MapPartitionsRDD[5] at flatMap at <console>:34
rddTracksInPlaylist: org.apache.spark.rdd.RDD[(String, String, Int)] = MapPartitionsRDD[8] at flatMap at <console>:37
rddArtists: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[11] at flatMap at <console>:40


In [5]:
import org.apache.spark.HashPartitioner
// Number of partitions: taken from the default parallelism of the Spark context.
val numPartitions = sc.defaultParallelism
val partitioner = new HashPartitioner(numPartitions)

// Lookup tables for the broadcast join. Both are collected on the driver and shipped to the
// executors as read-only maps.

// track_uri -> artist_uri
val broadcastTracks = sc.broadcast(
  rddTracks.map {
    case (trackUri, _, _, artistUri, _, _) => (trackUri, artistUri)
  }.collectAsMap()
)

// artist_uri -> artist_name (rddArtists is already keyed by artist_uri, no re-mapping needed)
val broadcastArtists = sc.broadcast(
  rddArtists.collectAsMap()
)

// Main transformation: join every playlist entry with its artist through the broadcast maps.
// Emits ((PID, artist_uri), 1) for each entry whose track is known and whose artist is listed
// in the artists map; all other entries are dropped.
val rddPidArtistNTracks = rddTracksInPlaylist
  .mapPartitions { rows =>
    // Read the broadcast values once per partition instead of once per row.
    val artistByTrack = broadcastTracks.value
    val knownArtists = broadcastArtists.value

    rows.flatMap { case (pid, trackUri, _) =>
      artistByTrack
        .get(trackUri)
        .filter(artistUri => knownArtists.contains(artistUri))
        .map(artistUri => ((pid, artistUri), 1))
    }
  }

// Hash-partition the pairs by their (PID, artist_uri) key.
// Note: the result is not persisted; it is recomputed by every action that uses it.
val rddPidArtistNTracksPartitioned = rddPidArtistNTracks
  .partitionBy(partitioner)



((0,spotify:artist:2wIVse2owClT7go1WT98tk),1)


import org.apache.spark.HashPartitioner
numPartitions: Int = 8
partitioner: org.apache.spark.HashPartitioner = org.apache.spark.HashPartitioner@8
broadcastTracks: org.apache.spark.broadcast.Broadcast[scala.collection.Map[String,String]] = Broadcast(5)
broadcastArtists: org.apache.spark.broadcast.Broadcast[scala.collection.Map[String,String]] = Broadcast(7)
rddPidArtistNTracks: org.apache.spark.rdd.RDD[((String, String), Int)] = MapPartitionsRDD[14] at mapPartitions at <console>:48
rddPidArtistNTracksPartitioned: org.apache.spark.rdd.RDD[((String, String), Int)] = ShuffledRDD[15] at partitionBy at <console>:69
res0: ((String, String), Int) = ((0,spotify:artist:6vWDO969PvNqNYHIOW5v0m),1)


In [6]:
// (PID, artist_uri) -> number of tracks of that artist in that playlist.
// The reduction starts from the un-partitioned pairs and passes the shared partitioner to
// reduceByKey, so counts are combined on the map side before the shuffle. Reducing the output
// of partitionBy instead would shuffle every single ((pid, artist), 1) record first.
// The result has the same contents and the same partitioner as before.
val artistTrackCount = rddPidArtistNTracks
  .reduceByKey(partitioner, (a: Int, b: Int) => a + b)

// Drop the artist from the key: PID -> track count of one artist in that playlist.
val pidToArtistTracks = artistTrackCount.map { case ((pid, _), nTracks) => (pid, nTracks) }

// Per playlist, accumulate (total tracks, number of distinct artists) and turn the pair into
// the mean number of tracks per artist. Every key has at least one value, so the divisor is
// never zero.
val averageSongsPerArtist = pidToArtistTracks
  .aggregateByKey((0, 0))(
    // Within a partition: add one artist's track count and count the artist.
    (acc, nTracks) => (acc._1 + nTracks, acc._2 + 1),
    // Across partitions: merge the partial (tracks, artists) pairs.
    (left, right) => (left._1 + right._1, left._2 + right._2)
  )
  .mapValues { case (totalTracks, totalArtists) => totalTracks.toDouble / totalArtists }

// Overall mean of the per-playlist averages: a single pass that accumulates
// (sum of averages, number of playlists). Unlike reduce, aggregate also works on an RDD
// without partitions, returning the zero value.
val (sumOfAverages, totalPlaylists) = averageSongsPerArtist.values.aggregate((0.0, 0L))(
  (acc, avg) => (acc._1 + avg, acc._2 + 1L),
  (left, right) => (left._1 + right._1, left._2 + right._2)
)

// Evaluates to NaN when there are no playlists (0.0 / 0).
val overallAverage = sumOfAverages / totalPlaylists



artistTrackCount: org.apache.spark.rdd.RDD[((String, String), Int)] = MapPartitionsRDD[16] at reduceByKey at <console>:26
pidToArtistTracks: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[17] at map at <console>:29
averageSongsPerArtist: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[19] at mapValues at <console>:36
sumOfAverages: Double = 429654.73025031225
totalPlaylists: Long = 199000
overallAverage: Double = 2.1590689962327247


In [7]:
// Release every RDD that the context still tracks as persisted.
sc.getPersistentRDDs.foreach { case (_, persistedRdd) => persistedRdd.unpersist() }