In [2]:
import org.apache.spark

import org.apache.spark


In [None]:
// Reuse the SparkContext already active in this session, or create one if none exists.
val sc: spark.SparkContext = spark.SparkContext.getOrCreate()

In [3]:
// Folder holding the processed CSV files, given relative to the working directory.
val path_to_datasets = "../../../datasets/processed"

// One CSV file per entity, each located directly under the folder above.
val path_to_tracks = s"${path_to_datasets}/tracks.csv"
val path_to_playlists = s"${path_to_datasets}/playlists.csv"
val path_to_track_in_playlists = s"${path_to_datasets}/tracks_in_playlist.csv"
val path_to_artists = s"${path_to_datasets}/artists.csv"

path_to_datasets: String = ../../../datasets/processed
path_to_tracks: String = ../../../datasets/processed/tracks.csv
path_to_playlists: String = ../../../datasets/processed/playlists.csv
path_to_track_in_playlists: String = ../../../datasets/processed/tracks_in_playlist.csv
path_to_artists: String = ../../../datasets/processed/artists.csv


In [4]:
/**
 * Line-level parsers for the dataset CSV files.
 *
 * Every parser splits the line on commas that are not enclosed in double
 * quotes, trims each field, and returns `None` when the line has too few
 * fields or a numeric field is not a valid integer (which is also how a
 * header row of a file with a numeric column gets rejected). Quoted fields
 * keep their surrounding quotes; extra trailing fields are ignored.
 */
object CsvParser {

  import java.util.regex.Pattern
  import scala.util.Try

  val noGenresListed = "(no genres listed)"
  // Both regexes match the separator only when it is followed by an even number
  // of double quotes up to the end of the line, i.e. when it is outside quotes.
  val commaRegex = ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
  val pipeRegex = "\\|(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
  val quotes = "\""

  // Compiled once: String.split(regex) would recompile the pattern for every line.
  private val commaPattern = Pattern.compile(commaRegex)

  /**
   * Splits a line into trimmed fields.
   *
   * The limit of -1 keeps trailing empty fields, so a row whose last column is
   * empty still yields the expected number of fields instead of being dropped.
   */
  private def splitFields(line: String): Array[String] =
    commaPattern.split(line, -1).map(_.trim)

  /** Parses the first three fields as (String, String, Int); shared by the two parsers with that layout. */
  private def parseStringStringInt(line: String): Option[(String, String, Int)] =
    Try {
      val input = splitFields(line)
      (input(0), input(1), input(2).toInt)
    }.toOption

  /**
   * Parses a playlist row.
   *
   * @param line one raw CSV line
   * @return `Some((PID, playlist_name, num_followers))`, or `None` if the line is malformed
   */
  def parsePlayListLine(line: String): Option[(String, String, Int)] =
    parseStringStringInt(line)

  /**
   * Parses a track row.
   *
   * @param line one raw CSV line
   * @return `Some((track_uri, track_name, duration_ms, artist_uri, album_uri, album_name))`,
   *         or `None` if the line is malformed
   */
  def parseTrackLine(line: String): Option[(String, String, Int, String, String, String)] =
    Try {
      val input = splitFields(line)
      (input(0), input(1), input(2).toInt, input(3), input(4), input(5))
    }.toOption

  /**
   * Parses an artist row.
   *
   * @param line one raw CSV line
   * @return `Some((artist_uri, artist_name))`, or `None` if the line has fewer than two fields
   */
  def parseArtistLine(line: String): Option[(String, String)] =
    Try {
      val input = splitFields(line)
      (input(0), input(1))
    }.toOption

  /**
   * Parses a track-in-playlist row.
   *
   * @param line one raw CSV line
   * @return `Some((PID, track_uri, pos))`, or `None` if the line is malformed
   */
  def parseTrackInPlaylistLine(line: String): Option[(String, String, Int)] =
    parseStringStringInt(line)
}

defined object CsvParser


In [5]:
// Read every CSV as plain text and run each line through its parser;
// lines for which the parser returns None are dropped by flatMap.
val rddTracks = sc.textFile(path_to_tracks).flatMap(line => CsvParser.parseTrackLine(line))

val rddPlaylists = sc.textFile(path_to_playlists).flatMap(line => CsvParser.parsePlayListLine(line))

val rddTrackInPlaylists =
  sc.textFile(path_to_track_in_playlists).flatMap(line => CsvParser.parseTrackInPlaylistLine(line))

val rddArtists = sc.textFile(path_to_artists).flatMap(line => CsvParser.parseArtistLine(line))

rddTracks: org.apache.spark.rdd.RDD[(String, String, Int, String, String, String)] = MapPartitionsRDD[2] at flatMap at <console>:31
rddPlaylists: org.apache.spark.rdd.RDD[(String, String, Int)] = MapPartitionsRDD[5] at flatMap at <console>:34
rddTrackInPlaylists: org.apache.spark.rdd.RDD[(String, String, Int)] = MapPartitionsRDD[8] at flatMap at <console>:37
rddArtists: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[11] at flatMap at <console>:40


In [6]:
// Join playlist entries with tracks on track_uri.
// Result rows: (PID, track_name, duration_ms, artist_uri, album_uri, album_name)
val rddTracksInPlaylistTracks = {
  val entriesByTrackUri = rddTrackInPlaylists.keyBy { case (_, trackUri, _) => trackUri }
  val tracksByUri = rddTracks.keyBy { case (trackUri, _, _, _, _, _) => trackUri }

  entriesByTrackUri.join(tracksByUri).map {
    // keep the playlist PID plus every track field except the track_uri join key
    case (_, ((pid, _, _), (_, trackName, durationMs, artistUri, albumUri, albumName))) =>
      (pid, trackName, durationMs, artistUri, albumUri, albumName)
  }
}

// Join the rows above with artists on artist_uri.
// Result rows: (PID, track_name, duration_ms, artist_uri, album_uri, album_name, artist_name)
val rddTracksInPlaylistTracksArtists = {
  val rowsByArtistUri = rddTracksInPlaylistTracks.keyBy { case (_, _, _, artistUri, _, _) => artistUri }
  val artistsByUri = rddArtists.keyBy { case (artistUri, _) => artistUri }

  rowsByArtistUri.join(artistsByUri).map {
    // append the artist name to the existing row
    case (_, ((pid, trackName, durationMs, artistUri, albumUri, albumName), (_, artistName))) =>
      (pid, trackName, durationMs, artistUri, albumUri, albumName, artistName)
  }
}

val joinedRowCount = rddTracksInPlaylistTracksArtists.count()
println("conteggio tracce in playlist: " + joinedRowCount)


conteggio tracce in playlist: 6600977


rddTracksInPlaylistTracks: org.apache.spark.rdd.RDD[(String, String, Int, String, String, String)] = MapPartitionsRDD[17] at map at <console>:33
rddTracksInPlaylistTracksArtists: org.apache.spark.rdd.RDD[(String, String, Int, String, String, String, String)] = MapPartitionsRDD[23] at map at <console>:37


In [65]:
// Input rows: (PID, track_name, duration_ms, artist_uri, album_uri, album_name, artist_name)
// Emit one record per track occurrence, keyed by (PID, artist_uri).
val pidArtistTrack = rddTracksInPlaylistTracksArtists.map(x => ((x._1, x._4), 1))
// ((PID, artist_uri), number of tracks by that artist in that playlist)
val artistTrackCount = pidArtistTrack
  .reduceByKey(_ + _)

// (PID, track count of one artist) -- one record per distinct artist in the playlist
val pidToArtistTracks = artistTrackCount.map(x => (x._1._1, x._2))
// (PID, average number of tracks per artist in that playlist)
val averageSongsPerArtist = pidToArtistTracks
  // Accumulate (distinct artists, total tracks) per playlist. Unlike groupByKey,
  // this combines map-side and never materialises the per-playlist list of counts.
  .aggregateByKey((0, 0))(
    (acc, count) => (acc._1 + 1, acc._2 + count),
    (left, right) => (left._1 + right._1, left._2 + right._2)
  )
  .mapValues { case (totalArtists, totalTracks) => totalTracks.toDouble / totalArtists }
  // Two actions follow (count and sum); caching avoids recomputing the whole
  // lineage, joins included, for the second one.
  .cache()

// Overall average: mean of the per-playlist averages
val totalPlaylists = averageSongsPerArtist.count()
val sumOfAverages = averageSongsPerArtist.map(_._2).sum()

// With no playlists this is 0.0 / 0, i.e. NaN, rather than an exception.
val overallAverage = sumOfAverages / totalPlaylists

// Print the result
println(s"La media complessiva di canzoni per artista è: $overallAverage")




PLAYLIST: 7000, NewNew, 2
La media complessiva di canzoni per artista è: 2.162394557670738


pidArtistTrack: org.apache.spark.rdd.RDD[((String, String), Int)] = MapPartitionsRDD[1014] at map at <console>:35
artistTrackCount: org.apache.spark.rdd.RDD[((String, String), Int)] = ShuffledRDD[1015] at reduceByKey at <console>:37
pidToArtistTracks: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[1016] at map at <console>:39
averageSongsPerArtist: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[1018] at mapValues at <console>:43
totalPlaylists: Long = 99000
sumOfAverages: Double = 214077.06120940307
overallAverage: Double = 2.162394557670738
