In [7]:
import org.apache.spark

import org.apache.spark


In [None]:
// DO NOT EXECUTE - placeholder so that later cells referring to `sc` do not show errors.
// Obtains the already-running SparkContext, or creates one if none exists.
val sc = org.apache.spark.SparkContext.getOrCreate()

In [8]:
// Relative path of the folder holding the dataset JSON slices (trailing slash included,
// because file names are appended to it by plain string concatenation).
val path_to_datasets: String = "../../../dataset/spotify/data/"

path_to_datasets: String = ../../../dataset/spotify/data/


In [9]:
/** A single track, as extracted from a playlist's track list.
  *
  * @param uri       track identifier (read from the `track_uri` JSON field)
  * @param name      track title (read from `track_name`)
  * @param duration  track length (read from `duration_ms`)
  * @param artistUri identifier of the track's artist (read from `artist_uri`)
  * @param albumUri  identifier of the track's album (read from `album_uri`)
  * @param albumName album title (read from `album_name`)
  */
case class Track(
  uri: String,
  name: String,
  duration: Int,
  artistUri: String,
  albumUri: String,
  albumName: String
)

/** Playlist-level metadata.
  *
  * @param pid          playlist id (read from the `pid` JSON field)
  * @param name         playlist name
  * @param numFollowers follower count (read from `num_followers`)
  */
case class Playlist(
  pid: Int,
  name: String,
  numFollowers: Int
)

/** Membership of one track in one playlist.
  *
  * @param pid      id of the containing playlist
  * @param trackUri identifier of the contained track
  * @param pos      position of the track inside the playlist
  */
case class TrackInPlaylist(
  pid: Int,
  trackUri: String,
  pos: Int
)

/** An artist referenced by at least one track.
  *
  * @param uri  artist identifier
  * @param name artist display name
  */
case class Artist(
  uri: String,
  name: String
)



defined class Track
defined class Playlist
defined class TrackInPlaylist
defined class Artist


In [11]:
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import java.io.File


import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import java.io.File


In [12]:
// Path of the single JSON slice parsed in this cell
val pathToJson = path_to_datasets + "mpd.slice.0-999.json"

// Jackson mapper with the Scala module registered; the casts below rely on JSON
// objects/arrays being deserialized as Scala Map/List values.
val mapper = new ObjectMapper()
mapper.registerModule(DefaultScalaModule)

// Read the whole file into one string and always release the file handle afterwards
// (scala.io.Source keeps the underlying stream open until close() is called).
val jsonString = {
  val source = scala.io.Source.fromFile(new File(pathToJson))
  try source.getLines().mkString
  finally source.close()
}

val jsonData = mapper.readValue(jsonString, classOf[Map[String, Any]])

val playlistsRaw = jsonData("playlists").asInstanceOf[List[Map[String, Any]]]

// Raw track entries of one raw playlist (shared by the extractions below)
def rawTracksOf(playlist: Map[String, Any]): List[Map[String, Any]] =
  playlist("tracks").asInstanceOf[List[Map[String, Any]]]

// Convert the raw playlist data into Scala objects
val playlists = playlistsRaw.map { playlist =>
  Playlist(
    pid = playlist("pid").asInstanceOf[Int],
    name = playlist("name").toString,
    numFollowers = playlist("num_followers").asInstanceOf[Int]
  )
}

// Every track of every playlist, de-duplicated across playlists
val tracks = playlistsRaw.flatMap { playlist =>
  rawTracksOf(playlist).map { track =>
    Track(
      uri = track("track_uri").toString,
      name = track("track_name").toString,
      duration = track("duration_ms").asInstanceOf[Int],
      artistUri = track("artist_uri").toString,
      albumUri = track("album_uri").toString,
      albumName = track("album_name").toString
    )
  }
}.distinct

// One row per (playlist, track) pair; pos is the index of the track in the playlist's list
val trackInPlaylist = playlistsRaw.flatMap { playlist =>
  val pid = playlist("pid").asInstanceOf[Int]
  rawTracksOf(playlist).zipWithIndex.map { case (track, index) =>
    TrackInPlaylist(
      pid = pid,
      trackUri = track("track_uri").toString,
      pos = index
    )
  }
}

// Artists with their real names: every raw track carries an "artist_name" field next to
// "artist_uri", so there is no need for an "Unknown" placeholder. This matches the
// (uri, name) pairs produced by the Spark-based export of all slices. As there, an
// artist URI that appears with several name spellings yields one row per spelling.
val artists = playlistsRaw.flatMap { playlist =>
  rawTracksOf(playlist).map { track =>
    Artist(track("artist_uri").toString, track("artist_name").toString)
  }
}.distinct


pathToJson: String = ../../../dataset/spotify/data/mpd.slice.0-999.json
mapper: com.fasterxml.jackson.databind.ObjectMapper = com.fasterxml.jackson.databind.ObjectMapper@768f225d
jsonString: String = {    "info": {        "generated_on": "2017-12-03 08:41:42.057563",         "slice": "0-999",         "version": "v1"    },     "playlists": [        {            "name": "Throwbacks",             "collaborative": "false",             "pid": 0,             "modified_at": 1493424000,             "num_tracks": 52,             "num_albums": 47,             "num_followers": 1,             "tracks": [                {                    "pos": 0,                     "artist_name": "Missy Elliott",                     "track_uri": "spotify:track:0UaMYEvWZi0ZqiDOoHU3YI",                     "art...


In [13]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.DataFrame

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.DataFrame


In [20]:
// Local Spark session (all available cores) used for the CSV export.
val spark = {
  val sessionBuilder = SparkSession.builder()
    .appName("Preprocessing")
    .master("local[*]")

  // Turns off the output committer's "mark successful jobs" option. The original note
  // says this suppresses both _SUCCESS and .crc files - NOTE(review): the .crc part is
  // not evident from the option name; confirm.
  sessionBuilder
    .config("spark.hadoop.mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
    .getOrCreate()
}

import spark.implicits._

// Turn the in-memory lists of case-class instances into DataFrames
val playlistsDF = playlists.toDS().toDF()
val tracksDF = tracks.toDS().toDF()
val trackInPlaylistDF = trackInPlaylist.toDS().toDF()
val artistsDF = artists.toDS().toDF()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@56d213f9
import spark.implicits._
playlistsDF: org.apache.spark.sql.DataFrame = [pid: int, name: string ... 1 more field]
tracksDF: org.apache.spark.sql.DataFrame = [uri: string, name: string ... 4 more fields]
trackInPlaylistDF: org.apache.spark.sql.DataFrame = [pid: int, trackUri: string ... 1 more field]
artistsDF: org.apache.spark.sql.DataFrame = [uri: string, name: string]


In [21]:
// Export each DataFrame as a headered CSV; coalesce(1) collapses the data into a single
// partition before writing. Targets are written in the order listed.
Seq(
  playlistsDF -> "output/playlists.csv",
  tracksDF -> "output/tracks.csv",
  trackInPlaylistDF -> "output/track_in_playlist.csv",
  artistsDF -> "output/artists.csv"
).foreach { case (frame, target) =>
  frame.coalesce(1).write.option("header", "true").csv(target)
}

In [None]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import com.fasterxml.jackson.databind.ObjectMapper
import java.nio.file.{Files, Paths}
import scala.collection.JavaConverters._

// Folder containing the JSON slice files
val folderPath = "../../../dataset/spotify/data/"

// java.io.File.listFiles returns null when the path is not a listable directory;
// Option(...) turns that into an empty array instead of a NullPointerException.
// Sorting by name makes the processing order deterministic.
val files = Option(new java.io.File(folderPath).listFiles)
  .getOrElse(Array.empty[java.io.File])
  .filter(_.getName.endsWith(".json"))
  .sortBy(_.getName)

require(files.nonEmpty, s"No .json files found in $folderPath")

// Read every slice and flatten it to one row per playlist.
// Each file is a single JSON object of the form {"info": ..., "playlists": [...]}, so the
// DataFrame read from it has the top-level columns `info` and `playlists`; the playlist
// fields (pid, name, num_followers, tracks) only become columns after exploding `playlists`.
// Only the columns used below are kept, so every per-file DataFrame has the same column
// list and the positional `union` is safe.
val perFileDFs = files.toSeq.zipWithIndex.map { case (file, index) =>
  println(s"Processing file #${index + 1}: ${file.getName}")

  spark.read
    .option("multiline", "true")
    .json(file.getAbsolutePath)
    .select(explode($"playlists").as("playlist"))
    .select(
      $"playlist.pid",
      $"playlist.name",
      $"playlist.num_followers",
      $"playlist.tracks"
    )
}

// Combine all slices; `files` is non-empty (checked above), so reduce cannot fail on an
// empty sequence. No Spark job is triggered here, unlike an isEmpty check per iteration.
val combinedDF = perFileDFs.reduce(_ union _)

// Extract the individual tables
val playlistsDF2 = combinedDF.select($"pid", $"name", $"num_followers").distinct()

val tracksDF2 = combinedDF
  .select(explode($"tracks").as("track"))
  .select(
    $"track.track_uri".as("uri"),
    $"track.track_name".as("name"),
    $"track.duration_ms".as("duration"),
    $"track.artist_uri".as("artistUri"),
    $"track.album_uri".as("albumUri"),
    $"track.album_name".as("albumName")
  ).distinct()

val trackInPlaylistDF2 = combinedDF
  .select($"pid", explode($"tracks").as("track"))
  .select($"pid", $"track.track_uri".as("trackUri"), $"track.pos".as("pos"))

val artistsDF2 = combinedDF
  .select(explode($"tracks").as("track"))
  .select($"track.artist_uri".as("uri"), $"track.artist_name".as("name"))
  .distinct()

// Save the DataFrames as CSV.
// NOTE(review): no save mode is set, so Spark's default applies and a write fails when
// the target directory already exists. These are the same paths the single-slice export
// cell writes to - remove the old output (or choose an explicit mode) before running.
playlistsDF2.coalesce(1).write
  .option("header", "true")
  .csv("output/playlists.csv")

tracksDF2.coalesce(1).write
  .option("header", "true")
  .csv("output/tracks.csv")

trackInPlaylistDF2.coalesce(1).write
  .option("header", "true")
  .csv("output/track_in_playlist.csv")

artistsDF2.coalesce(1).write
  .option("header", "true")
  .csv("output/artists.csv")


Processing file #1: mpd.slice.0-999.json
Processing file #2: mpd.slice.1000-1999.json
