In [None]:

// Relative location of the directory holding the raw Spotify JSON slice files.
val path_to_datasets: String = "../../../datasets/spotify/data/"
/** One track record, as written to the tracks CSV.
  *
  * @param uri       track URI (read from the `track_uri` JSON field)
  * @param name      track title (read from `track_name`)
  * @param duration  track length (read from `duration_ms`)
  * @param artistUri URI of the track's artist (read from `artist_uri`)
  * @param albumUri  URI of the track's album (read from `album_uri`)
  * @param albumName album title (read from `album_name`)
  */
case class Track(
  uri: String,
  name: String,
  duration: Int,
  artistUri: String,
  albumUri: String,
  albumName: String
)

/** One playlist record, as written to the playlists CSV.
  *
  * @param pid          playlist identifier (read from the `pid` JSON field)
  * @param name         playlist title (read from `name`)
  * @param numFollowers follower count (read from `num_followers`)
  */
case class Playlist(
  pid: Int,
  name: String,
  numFollowers: Int
)

/** Membership of a track in a playlist (one row per playlist entry).
  *
  * @param pid      identifier of the containing playlist
  * @param trackUri URI of the track at this position
  * @param pos      position of the track within the playlist's track list
  */
case class TrackInPlaylist(
  pid: Int,
  trackUri: String,
  pos: Int
)

/** One artist record, as written to the artists CSV.
  *
  * @param uri  artist URI
  * @param name artist display name
  */
case class Artist(
  uri: String,
  name: String
)

// Directory that holds the raw JSON slice files to ingest.
val folderPath = "../../../datasets/spotify/data/"

// All `.json` files directly inside `folderPath`, ordered by file name.
//  - `File.listFiles` returns null (not an empty array) when the path does not exist or is
//    not a directory, so fail with a descriptive error instead of a bare NullPointerException.
//  - The order of `listFiles` is unspecified; sorting by name makes any later "take the first
//    N files" selection reproducible across runs and machines.
val files = Option(new java.io.File(folderPath).listFiles)
  .getOrElse(
    throw new java.io.FileNotFoundException(s"Dataset folder is missing or not a directory: $folderPath")
  )
  .filter(_.getName.endsWith(".json"))
  .sortBy(_.getName)

//val pathToJson = path_to_datasets + "mpd.slice.0-999.json"

// Jackson mapper with the Scala module registered; `registerModule` returns the mapper itself.
val mapper = new ObjectMapper().registerModule(DefaultScalaModule)

// Mutable accumulators, one per output table; they are filled while the JSON files are read.
val playlists = ListBuffer.empty[Playlist]
val tracks = ListBuffer.empty[Track]
val trackInPlaylist = ListBuffer.empty[TrackInPlaylist]
val artists = ListBuffer.empty[Artist]

// Number of files processed so far (used for progress output).
var counter: Int = 0

// Tracks collected so far. Used to de-duplicate across files as well as within a file:
// a per-file `.distinct` alone would store a track once for every file it appears in.
val seenTracks = scala.collection.mutable.HashSet.empty[Track]
seenTracks ++= tracks

// Ingest the first two JSON files, appending their records to the accumulator buffers.
for (file <- files.take(2)) {
  println(s"Processing file: ${counter}")
  // Increment the progress counter.
  counter += 1

  // Let Jackson read the file itself: this closes the underlying stream (the previous
  // `Source.fromFile(...)` was never closed) and decodes the JSON with Jackson's own
  // encoding detection instead of the platform default charset.
  val jsonData = mapper.readValue(file, classOf[Map[String, Any]])
  val playlistsRaw = jsonData("playlists").asInstanceOf[List[Map[String, Any]]]

  // Convert the playlist data.
  playlists ++= playlistsRaw.map { playlist =>
    Playlist(
      pid = playlist("pid").asInstanceOf[Int],
      name = playlist("name").toString,
      numFollowers = playlist("num_followers").asInstanceOf[Int]
    )
  }

  // Every track occurrence in this file, in playlist order.
  val fileTracks = playlistsRaw.flatMap { playlist =>
    playlist("tracks").asInstanceOf[List[Map[String, Any]]].map { track =>
      Track(
        uri = track("track_uri").toString,
        name = track("track_name").toString,
        duration = track("duration_ms").asInstanceOf[Int],
        artistUri = track("artist_uri").toString,
        albumUri = track("album_uri").toString,
        albumName = track("album_name").toString
      )
    }
  }
  // `add` returns true only the first time a track is seen, so each distinct track is kept
  // exactly once (first occurrence wins), regardless of which file it came from.
  tracks ++= fileTracks.filter(track => seenTracks.add(track))

  // One row per (playlist, position); `pos` is the index within the playlist's track list.
  trackInPlaylist ++= playlistsRaw.flatMap { playlist =>
    val pid = playlist("pid").asInstanceOf[Int]
    playlist("tracks").asInstanceOf[List[Map[String, Any]]].zipWithIndex.map { case (track, index) =>
      TrackInPlaylist(
        pid = pid,
        trackUri = track("track_uri").toString,
        pos = index
      )
    }
  }
}

// Derive the artists once, after all files are read. Doing this inside the loop over the
// cumulative `tracks` buffer re-appended the artists of earlier files on every iteration.
// NOTE(review): the artist name is a fixed placeholder; if the raw track records carry an
// artist name it could be used here instead -- confirm against the dataset schema.
val knownArtists = artists.toSet
artists ++= tracks.map(track => Artist(track.artistUri, "Unknown")).distinct.filterNot(knownArtists)

// Local Spark session (all available cores) used to serialise the collected records as CSV.
val spark = SparkSession
  .builder()
  .appName("Preprocessing")
  .master("local[*]")
  // Ask the Hadoop output committer not to mark successful jobs (the _SUCCESS marker file).
  .config("spark.hadoop.mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
  .getOrCreate()

import spark.implicits._

// Convert the accumulated lists to DataFrames.
val playlistsDF = playlists.toDF()
val tracksDF = tracks.toDF()
val trackInPlaylistDF = trackInPlaylist.toDF()
val artistsDF = artists.toDF()

// Write every table, in this order, as a single-partition CSV with a header row.
Seq(
  playlistsDF -> "../../../datasets/output/playlists.csv",
  tracksDF -> "../../../datasets/output/tracks.csv",
  trackInPlaylistDF -> "../../../datasets/output/track_in_playlist.csv",
  artistsDF -> "../../../datasets/output/artists.csv"
).foreach { case (frame, target) =>
  frame.coalesce(1).write.option("header", "true").csv(target)
}
