In [1]:
%use dataframe

In [2]:
// This file contains LastFM scrobble data
// from: Saturday, January 1, 2022 12:00:00 AM
// until: Wednesday, November 29, 2023 9:17:54 PM
// Later cells read its "uts" (epoch seconds), "track", "album" and "artist" columns.
val df = DataFrame.read("scrobbles-zubie7a-1701292674.csv")

In [3]:
// Preview the first 5 scrobbles to check that the file was read as expected.
df.head(5)

In [4]:
// Boundary epoch values for time range
// 1640995200: Saturday, January 1, 2022 12:00:00 AM
// 1668165071: November 11, 2022 11:11:11 AM
// 1672531200: Sunday, January 1, 2023 12:00:00 AM
// 1699701071: Saturday, November 11, 2023 11:11:11 AM
// Window boundaries in epoch seconds, grouped by kind (starts, then ends).
val start2022 = 1640995200
val start2023 = 1672531200
val end2022 = 1668165071
val end2023 = 1699701071

// "Wrapped" dataframes: only scrobbles strictly inside the January 1 to
// November 11 window of each year, to mimic Spotify's time range
// (sorry December!).
val dfWrapped2022 = df.filter { uts > start2022 && uts < end2022 }
val dfWrapped2023 = df.filter { uts > start2023 && uts < end2023 }

In [5]:
// Rank every track by number of plays in each window, most played first,
// so the two years can be compared. The full ranking is kept here; no
// top-10 cut is applied in this cell.

val dfMostListenedSongs2022 =
    dfWrapped2022.groupBy("track").count()
        .sortByDesc("count")
        .rename("count").into("countYear2022")
        .rename("track").into("track2022")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

val dfMostListenedSongs2023 =
    dfWrapped2023.groupBy("track").count()
        .sortByDesc("count")
        .rename("count").into("countYear2023")
        .rename("track").into("track2023")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

In [6]:
// Lookup table of 2022 ranks: only the rank and the track name are kept, and
// both are renamed so they can sit next to the 2023 ranking columns.
val dfRanksSongs2022 =
    dfMostListenedSongs2022
        .select { rank and track2022 }
        .rename("track2022").into("track")
        .rename("rank").into("track2023_rank2022")

In [7]:
// Left join keeps every 2023 track; a track that has no 2022 counterpart
// gets null in the columns coming from the 2022 table.
val dfMostListenedSongs2023WithPreviousYearRank =
    dfMostListenedSongs2023.leftJoin(dfRanksSongs2022) { track2023 match right.track }

// Displayed result: the 2022 ranking joined with the 2023 ranking, after the
// latter gets a copy of its own rank under "track2023_rank2023".
// NOTE(review): no join columns are given, so the match is presumably on the
// column names both frames share ("rank") — confirm.
dfMostListenedSongs2023WithPreviousYearRank
    .add("track2023_rank2023") { rank }
    .let { ranked2023 -> dfMostListenedSongs2022.join(ranked2023) }

In [8]:
// Rank every album by number of plays in each window, most played first,
// so the two years can be compared. The full ranking is kept here; no
// top-10 cut is applied in this cell.

val dfMostListenedAlbums2022 =
    dfWrapped2022.groupBy("album").count()
        .sortByDesc("count")
        .rename("count").into("countYear2022")
        .rename("album").into("album2022")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

val dfMostListenedAlbums2023 =
    dfWrapped2023.groupBy("album").count()
        .sortByDesc("count")
        .rename("count").into("countYear2023")
        .rename("album").into("album2023")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

In [9]:
// Lookup table of 2022 ranks: only the rank and the album name are kept, and
// both are renamed so they can sit next to the 2023 ranking columns.
val dfRanksAlbums2022 =
    dfMostListenedAlbums2022
        .select { rank and album2022 }
        .rename("album2022").into("album")
        .rename("rank").into("album2023_rank2022")

In [10]:
// Left join keeps every 2023 album; an album that has no 2022 counterpart
// gets null in the columns coming from the 2022 table.
val dfMostListenedAlbums2023WithPreviousYearRank =
    dfMostListenedAlbums2023.leftJoin(dfRanksAlbums2022) { album2023 match right.album }

// Displayed result: the 2022 ranking joined with the 2023 ranking, after the
// latter gets a copy of its own rank under "album2023_rank2023".
// NOTE(review): no join columns are given, so the match is presumably on the
// column names both frames share ("rank") — confirm.
dfMostListenedAlbums2023WithPreviousYearRank
    .add("album2023_rank2023") { rank }
    .let { ranked2023 -> dfMostListenedAlbums2022.join(ranked2023) }

In [11]:
// Rank every artist by number of plays in each window, most played first,
// so the two years can be compared. The full ranking is kept here; no
// top-10 cut is applied in this cell.

val dfMostListenedArtists2022 =
    dfWrapped2022.groupBy("artist").count()
        .sortByDesc("count")
        .rename("count").into("countYear2022")
        .rename("artist").into("artist2022")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

val dfMostListenedArtists2023 =
    dfWrapped2023.groupBy("artist").count()
        .sortByDesc("count")
        .rename("count").into("countYear2023")
        .rename("artist").into("artist2023")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

In [12]:
// Lookup table of 2022 ranks: only the rank and the artist name are kept, and
// both are renamed so they can sit next to the 2023 ranking columns.
val dfRanksArtists2022 =
    dfMostListenedArtists2022
        .select { rank and artist2022 }
        .rename("artist2022").into("artist")
        .rename("rank").into("artist2023_rank2022")

In [13]:
// Left join keeps every 2023 artist; an artist that has no 2022 counterpart
// gets null in the columns coming from the 2022 table.
val dfMostListenedArtists2023WithPreviousYearRank =
    dfMostListenedArtists2023.leftJoin(dfRanksArtists2022) { artist2023 match right.artist }

// Displayed result: the 2022 ranking joined with the 2023 ranking, after the
// latter gets a copy of its own rank under "artist2023_rank2023".
// NOTE(review): no join columns are given, so the match is presumably on the
// column names both frames share ("rank") — confirm.
dfMostListenedArtists2023WithPreviousYearRank
    .add("artist2023_rank2023") { rank }
    .let { ranked2023 -> dfMostListenedArtists2022.join(ranked2023) }

In [14]:
// Artist rankings by play count for both windows, most played first.
// NOTE: this cell declares the same two names, built from the same inputs in
// the same way, as the earlier artists ranking cell; running it rebinds them.

val dfMostListenedArtists2022 =
    dfWrapped2022.groupBy("artist").count()
        .sortByDesc("count")
        .rename("count").into("countYear2022")
        .rename("artist").into("artist2022")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

val dfMostListenedArtists2023 =
    dfWrapped2023.groupBy("artist").count()
        .sortByDesc("count")
        .rename("count").into("countYear2023")
        .rename("artist").into("artist2023")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

In [15]:
// Top songs listened to in 2023 never listened to in 2022:
// rows of the 2023 song ranking whose track has no match in the 2022 ranking.
dfMostListenedSongs2023
    .excludeJoin(dfMostListenedSongs2022) { track2023 match right.track2022 }

In [16]:
// Top albums listened to in 2023 never listened to in 2022:
// rows of the 2023 album ranking whose album has no match in the 2022 ranking.
dfMostListenedAlbums2023
    .excludeJoin(dfMostListenedAlbums2022) { album2023 match right.album2022 }

In [17]:
// Top artists listened to in 2023 never listened to in 2022:
// rows of the 2023 artist ranking whose artist has no match in the 2022 ranking.
dfMostListenedArtists2023
    .excludeJoin(dfMostListenedArtists2022) { artist2023 match right.artist2022 }

In [18]:
// Top songs listened to in 2022 never listened to in 2023:
// rows of the 2022 song ranking whose track has no match in the 2023 ranking.
dfMostListenedSongs2022
    .excludeJoin(dfMostListenedSongs2023) { track2022 match right.track2023 }

In [19]:
// Top albums listened to in 2022 never listened to in 2023:
// rows of the 2022 album ranking whose album has no match in the 2023 ranking.
dfMostListenedAlbums2022
    .excludeJoin(dfMostListenedAlbums2023) { album2022 match right.album2023 }

In [20]:
// Top artists listened to in 2022 never listened to in 2023:
// rows of the 2022 artist ranking whose artist has no match in the 2023 ranking.
dfMostListenedArtists2022
    .excludeJoin(dfMostListenedArtists2023) { artist2022 match right.artist2023 }

In [21]:
// Add columns for timestamps and date.
import java.sql.Timestamp

// "uts" is in epoch seconds, while Timestamp takes milliseconds, hence * 1000.
// Timestamp.toString() renders "yyyy-mm-dd hh:mm:ss.f..." in the JVM's default
// time zone, so the calendar date is exactly its first 10 characters.
// substring(0, 10) has an exclusive end index; the inclusive range 0..10 used
// before also kept the separating space, leaving a trailing blank in every
// "date" value.
val dfWrappedWithDate2022 = 
    dfWrapped2022
        .add("timestamp") { Timestamp("uts"<Long>() * 1000) }
        .add("date") { "timestamp"<Timestamp>().toString().substring(0, 10) }

val dfWrappedWithDate2023 = 
    dfWrapped2023
        .add("timestamp") { Timestamp("uts"<Long>() * 1000) }
        .add("date") { "timestamp"<Timestamp>().toString().substring(0, 10) }

In [22]:
// The 10 days with the most scrobbles in each window, side by side.

val dfDaysWithMostSongs2022 =
    dfWrappedWithDate2022.groupBy("date").count()
        .sortByDesc("count")
        .head(10)
        .rename("count").into("countDay2022")
        .rename("date").into("day2022")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

val dfDaysWithMostSongs2023 =
    dfWrappedWithDate2023.groupBy("date").count()
        .sortByDesc("count")
        .head(10)
        .rename("count").into("countDay2023")
        .rename("date").into("day2023")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

// Displayed result: both top-10 tables joined into one.
dfDaysWithMostSongs2022.join(dfDaysWithMostSongs2023)

In [23]:
// The 10 (track, day) pairs with the most plays in each window, side by side.

val dfSongsMostRepeatedInADay2022 =
    dfWrappedWithDate2022.groupBy { "track" and "date" }.count()
        .sortByDesc("count")
        .head(10)
        .rename("count").into("countDay2022")
        .rename("track").into("track2022")
        .rename("date").into("day2022")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

val dfSongsMostRepeatedInADay2023 =
    dfWrappedWithDate2023.groupBy { "track" and "date" }.count()
        .sortByDesc("count")
        .head(10)
        .rename("count").into("countDay2023")
        .rename("track").into("track2023")
        .rename("date").into("day2023")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

// Displayed result: both top-10 tables joined into one.
dfSongsMostRepeatedInADay2022.join(dfSongsMostRepeatedInADay2023)

In [24]:
// The 10 (album, day) pairs with the most plays in each window, side by side.

val dfAlbumsMostListenedInADay2022 =
    dfWrappedWithDate2022.groupBy { "album" and "date" }.count()
        .sortByDesc("count")
        .head(10)
        .rename("count").into("countDay2022")
        .rename("album").into("album2022")
        .rename("date").into("day2022")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

val dfAlbumsMostListenedInADay2023 =
    dfWrappedWithDate2023.groupBy { "album" and "date" }.count()
        .sortByDesc("count")
        .head(10)
        .rename("count").into("countDay2023")
        .rename("album").into("album2023")
        .rename("date").into("day2023")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

// Displayed result: both top-10 tables joined into one.
dfAlbumsMostListenedInADay2022.join(dfAlbumsMostListenedInADay2023)

In [25]:
// The 10 (artist, day) pairs with the most plays in each window, side by side.

val dfArtistsMostListenedInADay2022 =
    dfWrappedWithDate2022.groupBy { "artist" and "date" }.count()
        .sortByDesc("count")
        .head(10)
        .rename("count").into("countDay2022")
        .rename("artist").into("artist2022")
        .rename("date").into("day2022")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

val dfArtistsMostListenedInADay2023 =
    dfWrappedWithDate2023.groupBy { "artist" and "date" }.count()
        .sortByDesc("count")
        .head(10)
        .rename("count").into("countDay2023")
        .rename("artist").into("artist2023")
        .rename("date").into("day2023")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

// Displayed result: both top-10 tables joined into one.
dfArtistsMostListenedInADay2022.join(dfArtistsMostListenedInADay2023)

In [26]:
// Playing a song (or a whole album) on repeat within one day inflates its
// plain play count and pushes it up the ranking. To correct for that, count
// on how many distinct days of the 2022 window each song was played:
// reduce the data to unique (track, date) pairs, then count pairs per track.
dfWrappedWithDate2022
    .select { "track" and "date" }
    .distinct()
    .let { trackDays -> trackDays.groupBy("track").count() }
    .sortByDesc("count")

In [27]:
// Same distinct-days count for the 2023 window: unique (track, date) pairs,
// counted per track, most days first.
dfWrappedWithDate2023
    .select { "track" and "date" }
    .distinct()
    .let { trackDays -> trackDays.groupBy("track").count() }
    .sortByDesc("count")

In [28]:
// Number of distinct tracks in the 2022 window, kept as a Float so the
// year-over-year ratio computed later is a floating-point division.
val countUniqueSongs2022 =
    dfWrappedWithDate2022
        .groupBy("track")
        .count() // one row per track
        .count() // number of those rows
        .toFloat()

In [29]:
// Number of distinct tracks in the 2023 window, kept as a Float so the
// year-over-year ratio computed later is a floating-point division.
val countUniqueSongs2023 =
    dfWrappedWithDate2023
        .groupBy("track")
        .count() // one row per track
        .count() // number of those rows
        .toFloat()

In [30]:
// Display the number of distinct tracks in the 2022 window.
countUniqueSongs2022

7248.0

In [31]:
// Display the number of distinct tracks in the 2023 window.
countUniqueSongs2023

6771.0

In [32]:
// Distinct tracks in 2023 relative to 2022 (a value below 1 means fewer in 2023).
countUniqueSongs2023 / countUniqueSongs2022

0.9341887

In [33]:
// Number of distinct artists in the 2022 window, kept as a Float so the
// year-over-year ratio computed later is a floating-point division.
val countUniqueArtists2022 =
    dfWrappedWithDate2022
        .groupBy("artist")
        .count() // one row per artist
        .count() // number of those rows
        .toFloat()

In [34]:
// Number of distinct artists in the 2023 window, kept as a Float so the
// year-over-year ratio computed later is a floating-point division.
val countUniqueArtists2023 =
    dfWrappedWithDate2023
        .groupBy("artist")
        .count() // one row per artist
        .count() // number of those rows
        .toFloat()

In [35]:
// Display the number of distinct artists in the 2022 window.
countUniqueArtists2022

2844.0

In [36]:
// Display the number of distinct artists in the 2023 window.
countUniqueArtists2023

2953.0

In [37]:
// Distinct artists in 2023 relative to 2022 (a value above 1 means more in 2023).
countUniqueArtists2023 / countUniqueArtists2022

1.0383263

In [38]:
// Total number of scrobbles (rows) in the 2022 window, as a Float so the
// ratio against 2023 is a floating-point division.
val countTotalSongs2022 = dfWrapped2022.count().toFloat()

In [39]:
// Total number of scrobbles (rows) in the 2023 window, as a Float so the
// ratio against 2022 is a floating-point division.
val countTotalSongs2023 = dfWrapped2023.count().toFloat()

In [40]:
// Display the total number of scrobbles in the 2022 window.
countTotalSongs2022

25351.0

In [41]:
// Display the total number of scrobbles in the 2023 window.
countTotalSongs2023

17852.0

In [42]:
// Scrobbles in 2023 relative to 2022 (a value below 1 means fewer in 2023).
countTotalSongs2023 / countTotalSongs2022

0.7041931

In [43]:
%use lets-plot

In [44]:
// Histogram of 2022 scrobbles over the "date" column; the rows are sorted by
// date before being handed to lets-plot as a column map.
val plotData2022 = run {
    val scrobblesSortedByDate = dfWrappedWithDate2022.sortBy("date").toMap()
    letsPlot(scrobblesSortedByDate) { x = "date" } + ggsize(630, 500)
}

plotData2022 + geomHistogram(binWidth = 0.5)

In [45]:
// Histogram of 2023 scrobbles over the "date" column; the rows are sorted by
// date before being handed to lets-plot as a column map.
val plotData2023 = run {
    val scrobblesSortedByDate = dfWrappedWithDate2023.sortBy("date").toMap()
    letsPlot(scrobblesSortedByDate) { x = "date" } + ggsize(630, 500)
}

plotData2023 + geomHistogram(binWidth = 0.5)

In [46]:
// By hour of the day 2022.
// Timestamp.toString() has the form "yyyy-mm-dd hh:mm:ss.f...", so the hour is
// the text between the space and the first colon.
val dfWrappedWithDateWithHour2022 =
    dfWrappedWithDate2022.add("hour") {
        "timestamp"<Timestamp>()
            .toString()
            .substringAfter(" ")
            .substringBefore(":")
    }

val plotDataHour2022 = run {
    val scrobblesSortedByHour = dfWrappedWithDateWithHour2022.sortBy("hour").toMap()
    letsPlot(scrobblesSortedByHour) { x = "hour" } + ggsize(630, 500)
}

plotDataHour2022 + geomHistogram(binWidth = 0.5)

In [47]:
// By hour of the day 2023.
// Timestamp.toString() has the form "yyyy-mm-dd hh:mm:ss.f...", so the hour is
// the text between the space and the first colon.
val dfWrappedWithDateWithHour2023 =
    dfWrappedWithDate2023.add("hour") {
        "timestamp"<Timestamp>()
            .toString()
            .substringAfter(" ")
            .substringBefore(":")
    }

val plotDataHour2023 = run {
    val scrobblesSortedByHour = dfWrappedWithDateWithHour2023.sortBy("hour").toMap()
    letsPlot(scrobblesSortedByHour) { x = "hour" } + ggsize(630, 500)
}

plotDataHour2023 + geomHistogram(binWidth = 0.5)

In [48]:
import java.util.Calendar

/**
 * Returns the [Calendar.DAY_OF_WEEK] value of [ts] (1 = Sunday .. 7 = Saturday),
 * evaluated with the default calendar of the running JVM.
 */
fun getDayOfTheWeek(ts: Timestamp): Int =
    Calendar.getInstance()
        .apply { time = ts }
        .get(Calendar.DAY_OF_WEEK)

// By day of the week 2022: tag every scrobble with the day-of-week number
// returned by getDayOfTheWeek, then draw a histogram over it.
val dfWrappedWithDateWithDay2022 =
    dfWrappedWithDate2022.add("dayOfWeek") {
        val playedAt = "timestamp"<Timestamp>()
        getDayOfTheWeek(playedAt)
    }

val plotDataDay2022 = run {
    val scrobblesSortedByDay = dfWrappedWithDateWithDay2022.sortBy("dayOfWeek").toMap()
    letsPlot(scrobblesSortedByDay) { x = "dayOfWeek" } + ggsize(630, 500)
}

plotDataDay2022 + geomHistogram(binWidth = 0.5)

In [49]:
import java.util.Calendar

/**
 * Returns the [Calendar.DAY_OF_WEEK] value of [ts] (1 = Sunday .. 7 = Saturday),
 * evaluated with the default calendar of the running JVM.
 */
fun getDayOfTheWeek(ts: Timestamp): Int {
    val defaultCalendar = Calendar.getInstance().also { it.time = ts }
    return defaultCalendar.get(Calendar.DAY_OF_WEEK)
}

// By day of the week 2023: tag every scrobble with the day-of-week number
// returned by getDayOfTheWeek, then draw a histogram over it.
val dfWrappedWithDateWithDay2023 =
    dfWrappedWithDate2023.add("dayOfWeek") {
        val playedAt = "timestamp"<Timestamp>()
        getDayOfTheWeek(playedAt)
    }

val plotDataDay2023 = run {
    val scrobblesSortedByDay = dfWrappedWithDateWithDay2023.sortBy("dayOfWeek").toMap()
    letsPlot(scrobblesSortedByDay) { x = "dayOfWeek" } + ggsize(630, 500)
}

plotDataDay2023 + geomHistogram(binWidth = 0.5)

In [50]:
import java.util.Calendar

/**
 * Returns the [Calendar.MONTH] value of [ts], which is 0-based
 * (0 = January .. 11 = December), using the JVM's default calendar.
 */
fun getMonth(ts: Timestamp): Int =
    Calendar.getInstance()
        .apply { time = ts }
        .get(Calendar.MONTH)

// By month of year 2022. getMonth is 0-based, so 1 is added to get the usual
// 1..12 month numbers before plotting the histogram.
val dfWrappedWithDateWithMonth2022 =
    dfWrappedWithDate2022.add("month") {
        val playedAt = "timestamp"<Timestamp>()
        getMonth(playedAt) + 1
    }

val plotDataMonth2022 = run {
    val scrobblesSortedByMonth = dfWrappedWithDateWithMonth2022.sortBy("month").toMap()
    letsPlot(scrobblesSortedByMonth) { x = "month" } + ggsize(600, 500)
}

plotDataMonth2022 + geomHistogram(binWidth = 0.5)

In [51]:
import java.util.Calendar

/**
 * Returns the [Calendar.MONTH] value of [ts], which is 0-based
 * (0 = January .. 11 = December), using the JVM's default calendar.
 */
fun getMonth(ts: Timestamp): Int {
    val defaultCalendar = Calendar.getInstance().also { it.time = ts }
    return defaultCalendar.get(Calendar.MONTH)
}

// By month of year 2023. getMonth is 0-based, so 1 is added to get the usual
// 1..12 month numbers before plotting the histogram.
val dfWrappedWithDateWithMonth2023 =
    dfWrappedWithDate2023.add("month") {
        val playedAt = "timestamp"<Timestamp>()
        getMonth(playedAt) + 1
    }

val plotDataMonth2023 = run {
    val scrobblesSortedByMonth = dfWrappedWithDateWithMonth2023.sortBy("month").toMap()
    letsPlot(scrobblesSortedByMonth) { x = "month" } + ggsize(600, 500)
}

plotDataMonth2023 + geomHistogram(binWidth = 0.5)

In [52]:
import java.util.Calendar

/**
 * Returns the [Calendar.WEEK_OF_YEAR] value of [ts], using the JVM's default
 * calendar (whose locale settings decide how weeks are numbered).
 */
fun getWeek(ts: Timestamp): Int =
    Calendar.getInstance()
        .apply { time = ts }
        .get(Calendar.WEEK_OF_YEAR)

// By week of year 2022.
// Calendar.WEEK_OF_YEAR is already 1-based (the first week of the year has
// value 1), so, unlike the 0-based Calendar.MONTH, no "+ 1" is applied here.
val dfWrappedWithDateWithWeek2022 =
    dfWrappedWithDate2022
        .add("week") { 
            getWeek(
                "timestamp"<Timestamp>()
            )
        }

val plotDataWeek2022 = 
    letsPlot(
        dfWrappedWithDateWithWeek2022
        .sortBy("week")
        .toMap()
    ) { x = "week" } + ggsize(600, 500)
    
plotDataWeek2022 + geomHistogram(binWidth=0.5)

In [53]:
import java.util.Calendar

/**
 * Returns the [Calendar.WEEK_OF_YEAR] value of [ts], using the JVM's default
 * calendar (whose locale settings decide how weeks are numbered).
 */
fun getWeek(ts: Timestamp): Int {
    val defaultCalendar = Calendar.getInstance().also { it.time = ts }
    return defaultCalendar.get(Calendar.WEEK_OF_YEAR)
}

// By week of year 2023.
// Calendar.WEEK_OF_YEAR is already 1-based (the first week of the year has
// value 1), so, unlike the 0-based Calendar.MONTH, no "+ 1" is applied here.
val dfWrappedWithDateWithWeek2023 =
    dfWrappedWithDate2023
        .add("week") { 
            getWeek(
                "timestamp"<Timestamp>()
            )
        }

val plotDataWeek2023 = 
    letsPlot(
        dfWrappedWithDateWithWeek2023
        .sortBy("week")
        .toMap()
    ) { x = "week" } + ggsize(600, 500)
    
plotDataWeek2023 + geomHistogram(binWidth=0.5)

In [248]:
// Now I've got the Spotify streaming data! Let's see what can be done with it.

In [249]:
// Read the four playlist files of the Spotify export, one dataframe per file.
val dfpl1 = DataFrame.read("Playlist1.json")
val dfpl2 = DataFrame.read("Playlist2.json")
val dfpl3 = DataFrame.read("Playlist3.json")
val dfpl4 = DataFrame.read("Playlist4.json")

In [218]:
// Unnest each playlist dataframe twice and then flatten the result, applying
// the same steps to all four files so they can be concatenated afterwards.
// NOTE(review): presumably the two explode() calls unfold the list of
// playlists and then each playlist's items — confirm against the JSON layout.
val dfPlaylists1 = dfpl1.explode().explode().flatten()
val dfPlaylists2 = dfpl2.explode().explode().flatten()
val dfPlaylists3 = dfpl3.explode().explode().flatten()
val dfPlaylists4 = dfpl4.explode().explode().flatten()

In [220]:
// Stack the four flattened playlist frames into one, in file order.
val dfPlaylists =
    dfPlaylists1
        .concat(dfPlaylists2)
        .concat(dfPlaylists3)
        .concat(dfPlaylists4)

In [221]:
// Check that the playlists' data has been properly merged:
// number of rows per playlist "name", largest first.
dfPlaylists.groupBy("name").count().sortByDesc("count")

In [115]:
// Now read the streaming data sources: the three streaming-history files of
// the Spotify export, one dataframe per file.
val dfst1 = DataFrame.read("StreamingHistory0.json")
val dfst2 = DataFrame.read("StreamingHistory1.json")
val dfst3 = DataFrame.read("StreamingHistory2.json")

In [214]:
// Stack the three streaming-history frames into one and display its row count.
val dfStreaming = dfst1.concat(dfst2).concat(dfst3)
dfStreaming.count()

25314

In [250]:
// Restrict the streams to the period between January 1st and November 11th.
// endTime is compared as text against the two date strings.
// NOTE(review): if endTime carries a time part (the later split(" ") suggests
// it does), a value such as "2023-11-11 10:00" sorts after "2023-11-11", so
// streams on November 11th itself are left out — confirm this is intended.
val dfStreaming2023 =
    dfStreaming.filter { "2023-01-01" < endTime && "2023-11-11" > endTime }
dfStreaming2023.count()

22369

In [254]:
// LastFM scrobbles a song once it has played for more than 240 seconds or
// for at least half of its duration. The Spotify data has the time played
// but not the song duration. Most songs are around 4 minutes (240 seconds),
// so 120 seconds is picked as the play threshold.
val dfStreaming2023Min2Minutes =
    dfStreaming2023.filter { 120000 < msPlayed }
// The result is 17775, which is very close to the number reported by
// LastFM, 17852 !!!
dfStreaming2023Min2Minutes.count()

17775

In [255]:
// LastFM scrobble count for the 2023 window, shown for comparison with the
// Spotify count above.
countTotalSongs2023

17852.0

In [257]:
// Derive "date" and "hour" from the end time of each play. A song that ends
// in the hour or day after it started is attributed to the later one; that
// noise is accepted here.
val dfStreaming2023WithDate =
    dfStreaming2023Min2Minutes
        .add("date") { endTime.split(" ").first() }
        .add("hour") { endTime.split(" ")[1].split(":").first() }

In [258]:
// Spotify counterpart of the LastFM song ranking: plays per "trackName",
// most played first.
val dfMostStreamedSongs2023 =
    dfStreaming2023WithDate.groupBy("trackName").count()
        .sortByDesc("count")
        // Running per-row index in sorted order, used as the rank.
        .addId("rank")

In [260]:
// Check the data largely matches the LastFM data: display the Spotify song
// ranking so it can be compared by eye with the LastFM ranking.
dfMostStreamedSongs2023

In [264]:
// Total milliseconds played across all streams that passed the 2-minute filter.
// NOTE(review): if msPlayed is summed as a 32-bit Int the total could
// overflow — confirm the result type of sum() here.
dfStreaming2023Min2Minutes.select { msPlayed }.sum()

In [265]:
// sum() is called without naming any column; it seems to pick up by itself
// which columns can be summed (msPlayed is the one used below).
val dfMsPlayedByDate2023 = dfStreaming2023WithDate.groupBy("date").sum()

// Convert milliseconds to hours (3,600,000 ms per hour).
val dfTimePlayedByDate2023 =
    dfMsPlayedByDate2023.add("hoursPlayed") { msPlayed / 3_600_000.0 }

dfTimePlayedByDate2023

In [268]:
// Hours listened per day in 2023 according to Spotify, one point per date.
val plotDataHoursPlayedPerDaySpotify2023 = run {
    val hoursSortedByDate = dfTimePlayedByDate2023.sortBy("date").toMap()
    letsPlot(hoursSortedByDate) { x = "date"; y = "hoursPlayed" } + ggsize(630, 500)
}

plotDataHoursPlayedPerDaySpotify2023 + geomPoint()

In [243]:
// sum() is called without naming any column; it seems to pick up by itself
// which columns can be summed (msPlayed is the one used below).
val dfMsPlayedByHour2023 = dfStreaming2023WithDate.groupBy("hour").sum()

// Convert milliseconds to hours (3,600,000 ms per hour).
val dfTimePlayedByHour2023 =
    dfMsPlayedByHour2023.add("hoursPlayed") { "msPlayed"<Int>() / 3_600_000.0 }

dfTimePlayedByHour2023

In [267]:
// Total hours listened in each hour of the day in 2023 according to Spotify,
// one point per hour.
val plotDataHoursPlayedPerHourSpotify2023 = run {
    val hoursSortedByHour = dfTimePlayedByHour2023.sortBy("hour").toMap()
    letsPlot(hoursSortedByHour) { x = "hour"; y = "hoursPlayed" } + ggsize(630, 500)
}

plotDataHoursPlayedPerHourSpotify2023 + geomPoint()

In [222]:
// By hour of the day 2023 from Spotify: histogram of streams over "hour".
val plotDataHourSpotify2023 = run {
    val streamsSortedByHour = dfStreaming2023WithDate.sortBy("hour").toMap()
    letsPlot(streamsSortedByHour) { x = "hour" } + ggsize(630, 500)
}

plotDataHourSpotify2023 + geomHistogram(binWidth = 0.5)

In [213]:
// By hour of the day 2023 from LastFM, for comparison with the Spotify plot.
// This cell rebinds the same names as the earlier 2023 hour-of-day cell.
// Timestamp.toString() has the form "yyyy-mm-dd hh:mm:ss.f...", so the hour is
// the text between the space and the first colon.
val dfWrappedWithDateWithHour2023 =
    dfWrappedWithDate2023.add("hour") {
        "timestamp"<Timestamp>()
            .toString()
            .substringAfter(" ")
            .substringBefore(":")
    }

val plotDataHour2023 = run {
    val scrobblesSortedByHour = dfWrappedWithDateWithHour2023.sortBy("hour").toMap()
    letsPlot(scrobblesSortedByHour) { x = "hour" } + ggsize(630, 500)
}

plotDataHour2023 + geomHistogram(binWidth = 0.5)