In [1]:
import org.apache.spark

Intitializing Scala interpreter ...

Spark Web UI available at http://10.201.102.32:4040
SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1737559999714)
SparkSession available as 'spark'


import org.apache.spark


In [None]:
val sc = spark.SparkContext.getOrCreate()

In [3]:
val path_to_datasets = "../../../datasets/processed"

val path_to_tracks = path_to_datasets + "/tracks.csv"
val path_to_playlists = path_to_datasets + "/playlists.csv"
val path_to_track_in_playlists = path_to_datasets + "/tracks_in_playlist.csv"
val path_to_artists = path_to_datasets + "/artists.csv"

path_to_datasets: String = ../../../datasets/processed
path_to_tracks: String = ../../../datasets/processed/tracks.csv
path_to_playlists: String = ../../../datasets/processed/playlists.csv
path_to_track_in_playlists: String = ../../../datasets/processed/tracks_in_playlist.csv
path_to_artists: String = ../../../datasets/processed/artists.csv


In [4]:
object CsvParser {

  val noGenresListed = "(no genres listed)"
  val commaRegex = ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
  val pipeRegex = "\\|(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
  val quotes = "\""

  // (PID, playlist_name, num_followers)
  def parsePlayListLine(line: String): Option[(String, String, Int)] = {
    try {
      val input = line.split(commaRegex)
      Some(input(0).trim, input(1).trim, input(2).trim.toInt)
    } catch {
      case _: Exception => None
    }
  }

  // (track_uri, track_name, duration_ms, artist_uri, album_uri, album_name)
  def parseTrackLine(line: String): Option[(String, String, Int, String, String, String)] = {
    try {
      val input = line.split(commaRegex)
      Some(input(0).trim, input(1).trim, input(2).trim.toInt, input(3).trim, input(4).trim, input(5).trim)
    } catch {
      case _: Exception => None
    }
  }

  // (artist_uri, artist_name)
  def parseArtistLine(line: String): Option[(String, String)] = {
    try {
      val input = line.split(commaRegex)
      Some(input(0).trim, input(1).trim)
    } catch {
      case _: Exception => None
    }
  }

  // (PID, track_uri, pos)
  def parseTrackInPlaylistLine(line: String): Option[(String, String, Int)] = {
    try {
      val input = line.split(commaRegex)
      Some(input(0).trim, input(1).trim, input(2).trim.toInt)
    } catch {
      case _: Exception => None
    }
  }
}

defined object CsvParser


In [5]:
val rddTracks = sc.textFile(path_to_tracks).
  flatMap(CsvParser.parseTrackLine)

val rddPlaylists = sc.textFile(path_to_playlists).
  flatMap(CsvParser.parsePlayListLine)

val rddTrackInPlaylists = sc.textFile(path_to_track_in_playlists).
  flatMap(CsvParser.parseTrackInPlaylistLine)

val rddArtists = sc.textFile(path_to_artists).
  flatMap(CsvParser.parseArtistLine)

rddTracks: org.apache.spark.rdd.RDD[(String, String, Int, String, String, String)] = MapPartitionsRDD[2] at flatMap at <console>:31
rddPlaylists: org.apache.spark.rdd.RDD[(String, String, Int)] = MapPartitionsRDD[5] at flatMap at <console>:34
rddTrackInPlaylists: org.apache.spark.rdd.RDD[(String, String, Int)] = MapPartitionsRDD[8] at flatMap at <console>:37
rddArtists: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[11] at flatMap at <console>:40


In [8]:
import org.apache.spark.HashPartitioner
val numPartitions = 8 // Numero di partizioni, dipende dalle risorse del cluster
val partitioner = new HashPartitioner(8)

val rddTrackInPlaylistsPartitioned = rddTrackInPlaylists.keyBy(_._1).partitionBy(partitioner)
val rddTracksPartitioned = rddTracks.keyBy(_._1).partitionBy(partitioner)
val rddArtistsPartitioned = rddArtists.keyBy(_._1).partitionBy(partitioner)

val rddTracksInPlaylistTracks = rddTrackInPlaylistsPartitioned
  .join(rddTracksPartitioned)
  .map { case (_, (pid, track)) =>
    (pid, track._2, track._3, track._4, track._5, track._6)
  }

val rddTracksInPlaylistTracksArtists = rddTracksInPlaylistTracks.keyBy(_._4)
  .join(rddArtistsPartitioned)
  .map { case (_, (track, artist)) =>
    (track._1, track._2, track._3, artist._2)
  }



import org.apache.spark.HashPartitioner
numPartitions: Int = 8
partitioner: org.apache.spark.HashPartitioner = org.apache.spark.HashPartitioner@8
rddTrackInPlaylistsPartitioned: org.apache.spark.rdd.RDD[(String, (String, String, Int))] = ShuffledRDD[34] at partitionBy at <console>:37
rddTracksPartitioned: org.apache.spark.rdd.RDD[(String, (String, String, Int, String, String, String))] = ShuffledRDD[36] at partitionBy at <console>:38
rddArtistsPartitioned: org.apache.spark.rdd.RDD[(String, (String, String))] = ShuffledRDD[38] at partitionBy at <console>:39
rddTracksInPlaylistTracks: org.apache.spark.rdd.RDD[((String, String, Int), String, Int, String, String, String)] = MapPartitionsRDD[42] at map at <console>:43
rddTracksInPlaylistTracksArtists: org.apache.spark.rdd.RDD[((String, Strin...


In [7]:
val pidArtistTrack = rddTracksInPlaylistTracksArtists
  .map(x => ((x._1, x._4), 1))

// Passo 1: Calcolo del numero totale di brani per ogni artista in ogni playlist
val artistTrackCount = pidArtistTrack.reduceByKey(_ + _)

// Passo 2: Calcolo della somma e del conteggio per ogni playlist
val pidToArtistTracks = artistTrackCount.map(x => (x._1._1, x._2))

val averageSongsPerArtist = pidToArtistTracks.aggregateByKey((0, 0))(
  // Combina localmente (somma parziale e conteggio)
  (acc, value) => (acc._1 + value, acc._2 + 1),
  // Combina globalmente i risultati delle partizioni
  (acc1, acc2) => (acc1._1 + acc2._1, acc1._2 + acc2._2)
).mapValues { case (totalTracks, totalArtists) =>
  totalTracks.toDouble / totalArtists
}

// Passo 3: Calcolo della media complessiva
val totalPlaylists = averageSongsPerArtist.count() // Numero totale di playlist
val sumOfAverages = averageSongsPerArtist.map(_._2).sum() // Somma di tutte le medie

val overallAverage = sumOfAverages / totalPlaylists // Media complessiva


pidArtistTrack: org.apache.spark.rdd.RDD[(((String, String, Int), String), Int)] = MapPartitionsRDD[27] at map at <console>:26
artistTrackCount: org.apache.spark.rdd.RDD[(((String, String, Int), String), Int)] = ShuffledRDD[28] at reduceByKey at <console>:29
pidToArtistTracks: org.apache.spark.rdd.RDD[((String, String, Int), Int)] = MapPartitionsRDD[29] at map at <console>:32
averageSongsPerArtist: org.apache.spark.rdd.RDD[((String, String, Int), Double)] = MapPartitionsRDD[31] at mapValues at <console>:39
totalPlaylists: Long = 0
sumOfAverages: Double = 0.0
overallAverage: Double = NaN
