In [1]:
import org.apache.spark

Initializing Scala interpreter ...

Spark Web UI available at http://10.201.102.227:4040
SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1737988953691)
SparkSession available as 'spark'


import org.apache.spark


In [None]:
// Obtain the running SparkContext, or create one if none exists yet.
// The fully qualified name is used because `spark` is ambiguous in this
// notebook: the kernel predefines a SparkSession called `spark`, while
// `import org.apache.spark` also brings the package into scope under that name.
val sc = org.apache.spark.SparkContext.getOrCreate()

In [2]:
// Root folder (relative path) that holds the preprocessed CSV datasets.
val path_to_datasets = "../../../datasets/processed"

// One CSV file per entity; every path is derived from the root above.
val path_to_tracks = s"$path_to_datasets/tracks.csv"
val path_to_playlists = s"$path_to_datasets/playlists.csv"
val path_to_track_in_playlists = s"$path_to_datasets/tracks_in_playlist.csv"
val path_to_artists = s"$path_to_datasets/artists.csv"

path_to_datasets: String = ../../../datasets/processed
path_to_tracks: String = ../../../datasets/processed/tracks.csv
path_to_playlists: String = ../../../datasets/processed/playlists.csv
path_to_track_in_playlists: String = ../../../datasets/processed/tracks_in_playlist.csv
path_to_artists: String = ../../../datasets/processed/artists.csv


In [3]:
/**
 * Line-level parsers for the dataset CSV files.
 *
 * Every parser splits a raw line on the commas that are not enclosed in
 * double quotes, trims the fields it needs and returns them as a tuple.
 * A line with too few fields, or with a non-numeric value where an `Int`
 * is expected, yields `None`, so callers can drop bad rows with `flatMap`.
 * Surrounding double quotes are NOT stripped from quoted fields.
 */
object CsvParser {
  import java.util.regex.Pattern
  import scala.util.Try

  // Not referenced by the parsers below; kept because it is a public member
  // that other code may rely on.
  val noGenresListed = "(no genres listed)"
  // Matches a comma followed by an even number of double quotes up to the end
  // of the line, i.e. a comma that lies outside any quoted field.
  val commaRegex = ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
  // Same construction for the pipe character (not used by the parsers below).
  val pipeRegex = "\\|(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
  // Not referenced by the parsers below; kept as a public member.
  val quotes = "\""

  // Compiled once: `String.split(regex)` would compile the pattern again for
  // every single line.
  private val commaPattern = Pattern.compile(commaRegex)

  /**
   * Splits `line` into trimmed fields and hands them to `build`.
   *
   * @param line  raw CSV line
   * @param build turns the trimmed fields into the result tuple; it may throw
   *              (missing field, malformed number)
   * @return `Some(result)`, or `None` when splitting or `build` fails with a
   *         non-fatal exception
   */
  private def parseFields[T](line: String)(build: Array[String] => T): Option[T] =
    Try(build(commaPattern.split(line).map(_.trim))).toOption

  /**
   * Parses a playlist line.
   *
   * @return `(PID, playlist_name, num_followers)`, or `None` if the line is malformed
   */
  def parsePlayListLine(line: String): Option[(String, String, Int)] =
    parseFields(line) { f => (f(0), f(1), f(2).toInt) }

  /**
   * Parses a track line.
   *
   * @return `(track_uri, track_name, duration_ms, artist_uri, album_uri, album_name)`,
   *         or `None` if the line is malformed
   */
  def parseTrackLine(line: String): Option[(String, String, Int, String, String, String)] =
    parseFields(line) { f => (f(0), f(1), f(2).toInt, f(3), f(4), f(5)) }

  /**
   * Parses an artist line.
   *
   * Note: there is no numeric field here, so any line with at least two
   * fields is accepted.
   *
   * @return `(artist_uri, artist_name)`, or `None` if the line has fewer than two fields
   */
  def parseArtistLine(line: String): Option[(String, String)] =
    parseFields(line) { f => (f(0), f(1)) }

  /**
   * Parses a track-in-playlist line.
   *
   * @return `(PID, track_uri, pos)`, or `None` if the line is malformed
   */
  def parseTrackInPlaylistLine(line: String): Option[(String, String, Int)] =
    parseFields(line) { f => (f(0), f(1), f(2).toInt) }
}

defined object CsvParser


In [4]:
// Load each CSV file as text and keep only the lines its parser accepts:
// a parser returns None for a line it cannot parse, and flatMap drops it.

// (track_uri, track_name, duration_ms, artist_uri, album_uri, album_name)
val rddTracks = sc
  .textFile(path_to_tracks)
  .flatMap(line => CsvParser.parseTrackLine(line))

// (PID, playlist_name, num_followers)
val rddPlaylists = sc
  .textFile(path_to_playlists)
  .flatMap(line => CsvParser.parsePlayListLine(line))

// (PID, track_uri, pos)
val rddTrackInPlaylists = sc
  .textFile(path_to_track_in_playlists)
  .flatMap(line => CsvParser.parseTrackInPlaylistLine(line))

// (artist_uri, artist_name)
val rddArtists = sc
  .textFile(path_to_artists)
  .flatMap(line => CsvParser.parseArtistLine(line))

rddTracks: org.apache.spark.rdd.RDD[(String, (String, Int, String, String, String))] = MapPartitionsRDD[3] at map at <console>:32
rddPlaylists: org.apache.spark.rdd.RDD[(String, String, Int)] = MapPartitionsRDD[6] at flatMap at <console>:38
rddTrackInPlaylists: org.apache.spark.rdd.RDD[(String, (String, Int))] = MapPartitionsRDD[10] at map at <console>:42
rddArtists: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[13] at flatMap at <console>:47


In [5]:
import org.apache.spark.HashPartitioner
// Number of partitions used for the repartitioning below; the right value
// depends on the resources of the cluster.
val numPartitions = sc.defaultParallelism
val partitioner = new HashPartitioner(numPartitions)

// Key every dataset by the column it is joined on.
val rddTrackInPlaylistsWithKey = rddTrackInPlaylists.keyBy { case (_, trackUri, _) => trackUri }
val rddTracksWithKey = rddTracks.keyBy { case (trackUri, _, _, _, _, _) => trackUri }
val rddArtistsWithKey = rddArtists.keyBy { case (artistUri, _) => artistUri }

// Playlist entries joined with their track on track_uri, then re-keyed by
// artist_uri: (artist_uri, (pid, track_name, duration_ms, album_uri, album_name)).
val rddTracksInPlaylistTracks = rddTrackInPlaylistsWithKey
  .join(rddTracksWithKey)
  .map { case (_, ((pid, _, _), (_, trackName, durationMs, artistUri, albumUri, albumName))) =>
    (artistUri, (pid, trackName, durationMs, albumUri, albumName))
  }

// Inner join with the artists: only entries whose artist_uri appears in the
// artists dataset survive. Each match becomes one (pid, artist_uri) pair.
val rddTracksInPlaylistTracksArtists = rddTracksInPlaylistTracks
  .join(rddArtistsWithKey)
  .map { case (artistUri, ((pid, _, _, _, _), _)) => (pid, artistUri) }

// Hash-partition the pairs by pid and mark them as cached.
val rddTracksInPlaylistTracksArtistsPartitioned = rddTracksInPlaylistTracksArtists
  .partitionBy(partitioner)
  .cache()



import org.apache.spark.HashPartitioner
numPartitions: Int = 8
partitioner: org.apache.spark.HashPartitioner = org.apache.spark.HashPartitioner@8
rddTrackInPlaylistsWithKey: org.apache.spark.rdd.RDD[(String, (String, Int))] = MapPartitionsRDD[10] at map at <console>:42
rddTracksWithKey: org.apache.spark.rdd.RDD[(String, (String, Int, String, String, String))] = MapPartitionsRDD[3] at map at <console>:32
rddArtistsWithKey: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[13] at flatMap at <console>:47
rddTracksInPlaylistTracks: org.apache.spark.rdd.RDD[(String, (String, String, Int, String, String))] = MapPartitionsRDD[17] at map at <console>:39
rddTracksInPlaylistTracksArtists: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[21] at map at <console>:45
rddTrack...


In [6]:
// Goal: for every playlist compute the average number of tracks per artist,
// then average that value over all playlists.

// Step 1: number of tracks of each artist in each playlist.
// Every (pid, artist_uri) pair counts as one track; summing per key gives
// ((pid, artist_uri), trackCount).
val pidArtistTrack = rddTracksInPlaylistTracksArtistsPartitioned
  .map(pidAndArtist => (pidAndArtist, 1))

val artistTrackCount = pidArtistTrack.reduceByKey(_ + _)

// Step 2: per playlist, total tracks and number of distinct artists,
// turned into the playlist's average tracks per artist.
val pidToArtistTracks = artistTrackCount.map { case ((pid, _), trackCount) => (pid, trackCount) }

val averageSongsPerArtist = pidToArtistTracks
  .aggregateByKey((0, 0))(
    // Within a partition: accumulate (sum of track counts, number of artists).
    (acc, trackCount) => (acc._1 + trackCount, acc._2 + 1),
    // Across partitions: merge the partial (sum, count) pairs.
    (left, right) => (left._1 + right._1, left._2 + right._2)
  )
  .mapValues { case (totalTracks, totalArtists) => totalTracks.toDouble / totalArtists }

// Step 3: overall average, computed in a single pass as (sum of the
// per-playlist averages, number of playlists). `aggregate` starts from the
// zero value, so it also works when the RDD is empty, where `reduce` would
// throw. It replaces a separate count() followed by a sum() over the same RDD.
val (sumOfAverages, totalPlaylists) = averageSongsPerArtist
  .values
  .aggregate((0.0, 0L))(
    (acc, playlistAverage) => (acc._1 + playlistAverage, acc._2 + 1L),
    (left, right) => (left._1 + right._1, left._2 + right._2)
  )

// Evaluates to NaN when there are no playlists (0.0 / 0).
val overallAverage = sumOfAverages / totalPlaylists


pidArtistTrack: org.apache.spark.rdd.RDD[((String, String), Int)] = MapPartitionsRDD[23] at map at <console>:27
artistTrackCount: org.apache.spark.rdd.RDD[((String, String), Int)] = ShuffledRDD[24] at reduceByKey at <console>:30
pidToArtistTracks: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[25] at map at <console>:33
averageSongsPerArtist: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[27] at mapValues at <console>:40
sumOfAverages: Double = 429654.73025031225
totalPlaylists: Long = 199000
overallAverage: Double = 2.1590689962327247


In [7]:
// Empty the cache: unpersist every RDD this context still tracks as persistent.
sc.getPersistentRDDs.values.foreach(_.unpersist())