In [1]:
import org.apache.spark

Initializing Scala interpreter ...

Spark Web UI available at http://10.201.102.32:4040
SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1737567627354)
SparkSession available as 'spark'


import org.apache.spark


In [None]:
// Obtain the active SparkContext (or create one if none exists yet).
// The companion object is referenced by its fully-qualified name because `spark`
// is ambiguous in this notebook: it is both the kernel-provided SparkSession value
// and, after `import org.apache.spark`, the name of the Spark package.
val sc = org.apache.spark.SparkContext.getOrCreate()

In [2]:
// Root directory of the pre-processed CSV datasets, relative to this notebook.
val path_to_datasets = "../../../datasets/processed"

// One CSV file per entity, all located directly under the dataset root.
val path_to_tracks = s"${path_to_datasets}/tracks.csv"
val path_to_playlists = s"${path_to_datasets}/playlists.csv"
val path_to_track_in_playlists = s"${path_to_datasets}/tracks_in_playlist.csv"
val path_to_artists = s"${path_to_datasets}/artists.csv"

path_to_datasets: String = ../../../datasets/processed
path_to_tracks: String = ../../../datasets/processed/tracks.csv
path_to_playlists: String = ../../../datasets/processed/playlists.csv
path_to_track_in_playlists: String = ../../../datasets/processed/tracks_in_playlist.csv
path_to_artists: String = ../../../datasets/processed/artists.csv


In [3]:
/**
 * Line-oriented CSV parsing helpers for the dataset files.
 *
 * Every `parse*Line` method takes one raw text line and returns `Some(record)`
 * when the line has enough fields and its numeric fields are valid integers,
 * or `None` otherwise, so the methods can be passed straight to `flatMap`.
 *
 * Fields are split on commas that are followed by an even number of double
 * quotes, i.e. commas outside a quoted section. Surrounding quotes are NOT
 * removed from the returned values; fields are only trimmed.
 */
object CsvParser {
  import java.util.regex.Pattern
  import scala.util.Try

  val noGenresListed = "(no genres listed)"
  // Matches a comma only when the rest of the line holds an even number of quotes.
  val commaRegex = ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
  // Same idea as `commaRegex`, but for the pipe character.
  val pipeRegex = "\\|(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
  val quotes = "\""

  // Compiled once: `String.split(regex)` would recompile the pattern for every line.
  private val commaPattern: Pattern = Pattern.compile(commaRegex)

  /** Splits a line into trimmed fields (trailing empty fields are dropped by `split`). */
  private def fields(line: String): Array[String] =
    commaPattern.split(line).map(_.trim)

  /**
   * Parses a playlist line.
   *
   * @param line raw CSV line
   * @return `Some((PID, playlist_name, num_followers))`, or `None` if the line is malformed
   */
  def parsePlayListLine(line: String): Option[(String, String, Int)] =
    Try {
      val f = fields(line)
      (f(0), f(1), f(2).toInt)
    }.toOption

  /**
   * Parses a track line.
   *
   * @param line raw CSV line
   * @return `Some((track_uri, track_name, duration_ms, artist_uri, album_uri, album_name))`,
   *         or `None` if the line is malformed
   */
  def parseTrackLine(line: String): Option[(String, String, Int, String, String, String)] =
    Try {
      val f = fields(line)
      (f(0), f(1), f(2).toInt, f(3), f(4), f(5))
    }.toOption

  /**
   * Parses an artist line.
   *
   * Note: no field is numeric here, so any line with at least two fields is
   * accepted (a header row, if the file has one, is not filtered out).
   *
   * @param line raw CSV line
   * @return `Some((artist_uri, artist_name))`, or `None` if the line has fewer than two fields
   */
  def parseArtistLine(line: String): Option[(String, String)] =
    Try {
      val f = fields(line)
      (f(0), f(1))
    }.toOption

  /**
   * Parses a track-in-playlist line.
   *
   * @param line raw CSV line
   * @return `Some((PID, track_uri, pos))`, or `None` if the line is malformed
   */
  def parseTrackInPlaylistLine(line: String): Option[(String, String, Int)] =
    Try {
      val f = fields(line)
      (f(0), f(1), f(2).toInt)
    }.toOption
}

defined object CsvParser


In [4]:
// Read each CSV as an RDD of text lines and parse it with the matching
// CsvParser method; lines for which the parser returns None are dropped.

// (track_uri, track_name, duration_ms, artist_uri, album_uri, album_name)
val rddTracks = sc
  .textFile(path_to_tracks)
  .flatMap(line => CsvParser.parseTrackLine(line))

// (PID, playlist_name, num_followers)
val rddPlaylists = sc
  .textFile(path_to_playlists)
  .flatMap(line => CsvParser.parsePlayListLine(line))

// (PID, track_uri, pos)
val rddTrackInPlaylists = sc
  .textFile(path_to_track_in_playlists)
  .flatMap(line => CsvParser.parseTrackInPlaylistLine(line))

// (artist_uri, artist_name)
val rddArtists = sc
  .textFile(path_to_artists)
  .flatMap(line => CsvParser.parseArtistLine(line))

rddTracks: org.apache.spark.rdd.RDD[(String, String, Int, String, String, String)] = MapPartitionsRDD[2] at flatMap at <console>:31
rddPlaylists: org.apache.spark.rdd.RDD[(String, String, Int)] = MapPartitionsRDD[5] at flatMap at <console>:34
rddTrackInPlaylists: org.apache.spark.rdd.RDD[(String, String, Int)] = MapPartitionsRDD[8] at flatMap at <console>:37
rddArtists: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[11] at flatMap at <console>:40


In [None]:
import org.apache.spark.HashPartitioner

// Number of partitions; tune it according to the resources of the cluster.
val numPartitions = 4
val partitioner = new HashPartitioner(numPartitions)

// Key each dataset by its join column and hash-partition it with the same partitioner.
// keyBy keeps the WHOLE record as the value, e.g. (artist_uri, (artist_uri, artist_name)).
val rddTrackInPlaylistsPartitioned = rddTrackInPlaylists.keyBy(_._2) // keyed by track_uri
  .partitionBy(partitioner)
val rddTracksPartitioned = rddTracks.keyBy(_._1).partitionBy(partitioner) // keyed by track_uri
val rddArtistsPartitioned = rddArtists.keyBy(_._1).partitionBy(partitioner) // keyed by artist_uri

// Join tracks with their playlist occurrences on track_uri, then re-key the result
// by artist_uri so that it can be joined with the artists.
val rddTracksInPlaylistTracks = rddTracksPartitioned
  .join(rddTrackInPlaylistsPartitioned)
  .map { case (_, (track, tip)) => (
    track._4, // artist_uri
    (tip._1, // PID
      track._1, // track_uri
      track._2, // track_name
      track._3, // duration_ms
      track._5, // album_uri
      track._6) // album_name
  )
  }.cache()

// Debug helpers (uncomment to inspect the intermediate join):
//println("Numero di tracce in playlist: " + rddTracksInPlaylistTracks.first())

//println("Numero di tracce in playlist: " + rddTracksInPlaylistTracks.count())

// Join with the artists on artist_uri.
// The artist side carries the full (artist_uri, artist_name) record as its value
// (see keyBy above), so it has to be destructured to reach the name itself.
// NOTE(review): this is an inner join on the raw string keys; if it comes back empty,
// check that artist_uri is formatted identically in tracks.csv and artists.csv.
val rddTracksInPlaylistTracksArtists = rddArtistsPartitioned.join(rddTracksInPlaylistTracks).map {
  case (artist_uri, ((_, artist_name), (pid, _, track_name, duration_ms, album_uri, album_name))) =>
    (pid, track_name, duration_ms, artist_uri, album_uri, album_name, artist_name)
}

// take(1).headOption instead of first(): first() throws on an empty RDD,
// which would hide the actual finding (the join produced no rows).
println("Numero di tracce in playlist con artisti: " +
  rddTracksInPlaylistTracksArtists.take(1).headOption.map(_.toString).getOrElse("<empty RDD>"))



In [15]:
// Emit one ((pid, artist_uri), 1) pair per track occurrence in a playlist.
// The upstream records are 7-tuples:
// (pid, track_name, duration_ms, artist_uri, album_uri, album_name, artist_name),
// so the pattern must name all seven positions.
val pidArtistTrack = rddTracksInPlaylistTracksArtists
  .map({
    case (pid, _, _, artistUri, _, _, _) => ((pid, artistUri), 1)
  })
println(pidArtistTrack.count())

// Step 1: total number of tracks for each artist within each playlist.
val artistTrackCount = pidArtistTrack.reduceByKey(_ + _)

// Step 2: drop the artist from the key, leaving one (pid, tracksOfOneArtist) pair
// per (playlist, artist) combination, ready for a per-playlist sum and count.
val pidToArtistTracks = artistTrackCount.map(
  {
    case ((pid, _), count) => (pid, count)
  }
)

println(pidToArtistTracks.count())


0
0


pidArtistTrack: org.apache.spark.rdd.RDD[((String, String), Int)] = MapPartitionsRDD[90] at map at <console>:36
artistTrackCount: org.apache.spark.rdd.RDD[((String, String), Int)] = ShuffledRDD[91] at reduceByKey at <console>:42
pidToArtistTracks: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[92] at map at <console>:45
