In [1]:
import pandas as pd
import glob

In [2]:
# Load the data

In [3]:
data = glob.glob('./data/*')

In [4]:
# collect data
temp = []
for file in data:
    df = pd.read_json(file, date_unit="ms", convert_axes=True)
    # Store new frame in an array (Performance: https://stackoverflow.com/questions/36489576/why-does-concatenation-of-dataframes-get-exponentially-slower)
    temp.append(df)
spotifyData = pd.concat(temp)

In [7]:
spotifyData

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2020-05-29 15:22,Fynn Kliemann,Alles was ich hab,4026
1,2020-05-29 15:26,Fynn Kliemann,Warten,264288
2,2020-05-29 15:29,Fynn Kliemann,Die Hook,172363
3,2020-05-29 15:31,Fynn Kliemann,Liebster Wahnsinn,95118
4,2020-05-29 15:35,Fynn Kliemann,Regen,224003
...,...,...,...,...
6154,2020-11-08 15:04,minite,Be Your Home,151111
6155,2020-11-08 15:07,Nina Young,Sunsets & Birds,148211
6156,2020-11-08 15:10,AK,All Equal,222992
6157,2020-11-08 15:13,NLSN,Cloud Forest,133163


In [None]:
# What is our current date range?

In [12]:
f'Currently analyzing data between: {min(spotifyData.endTime)}, and {max(spotifyData.endTime)}'

'Currently analyzing data between: 2018-10-12 14:47, and 2021-03-22 17:41'

In [13]:
# How many unique values do we have? / Do we have duplicates?

In [27]:
f'There is a total number of {spotifyData.count().endTime} rows. {spotifyData.drop_duplicates().count().endTime} of which are unique' 

'There is a total number of 168643 rows. 94460 of which are unique'

In [66]:
spotifyData = spotifyData.drop_duplicates()

In [28]:
# Whats the most played songs by ms played? How long did I play them?

In [50]:
spotifyData.groupby(["trackName", "artistName"]).sum().sort_values("msPlayed", ascending=False).head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,msPlayed,secondsPlayed,minutesPlayed,hoursPlayed
trackName,artistName,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Nevers Road,Witt Lowry,66257260,662572.6,11042.876667,184.047944
I Don't Care,Our Last Night,43365446,433654.46,7227.574333,120.459572
Make A Promise (feat. Elle Vee),Culture Code,39671692,396716.92,6611.948667,110.199144
The New American Religion,Signals,37359431,373594.31,6226.571833,103.776197
Silhouettes,FYKE,36210699,362106.99,6035.1165,100.585275
Fingerprints,Dylan Owen,35527954,355279.54,5921.325667,98.688761
Solange du dich bewegst,Wilhelmine,33024115,330241.15,5504.019167,91.733653
Kindest Regards,Witt Lowry,32721138,327211.38,5453.523,90.89205
We’ll Always Have Paris,Capstan,32461650,324616.5,5410.275,90.17125
Zuhause,Fynn Kliemann,32174144,321741.44,5362.357333,89.372622


In [44]:
# Lets run this again, but instead make our data more readable by adding seconds/minutes/hours

In [48]:
spotifyData["secondsPlayed"] = spotifyData.msPlayed/100
spotifyData["minutesPlayed"] = spotifyData.secondsPlayed/60
spotifyData["hoursPlayed"] = spotifyData.minutesPlayed/60

In [91]:
spotifyData.groupby(["trackName", "artistName"]).agg(
    totalTimePlayedMs=('msPlayed', 'sum'),
    totalTimePlayedMinutes=('minutesPlayed', 'sum'),
    totalPlayCounts=('msPlayed', 'count')).sort_values("totalTimePlayedMs", ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,totalTimePlayedMs,totalTimePlayedMinutes,totalPlayCounts
trackName,artistName,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nevers Road,Witt Lowry,32565238,5427.539667,137
Fingerprints,Dylan Owen,26897092,4482.848667,127
Ghosts Revisited (feat. Kiah Victoria),Dylan Owen,21112214,3518.702333,82
Ghosts,Dylan Owen,18970573,3161.762167,85
Kindest Regards,Witt Lowry,18521294,3086.882333,76
...,...,...,...,...
Auffe abe,Ischgl 3,0,0.000000,1
Bonfire,The Hunna,0,0.000000,1
Facts,Moneybagg Yo,0,0.000000,1
Break My Heart,HEY LIFE,0,0.000000,1


In [53]:
# What are my all time most played artists?

In [70]:
spotifyData.groupby("artistName").sum().sort_values("msPlayed", ascending=False).head(20)

Unnamed: 0_level_0,msPlayed,secondsPlayed,minutesPlayed,hoursPlayed
artistName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dylan Owen,296111672,2961116.72,49351.945333,822.532422
Witt Lowry,241290048,2412900.48,40215.008,670.250133
Our Last Night,171507878,1715078.78,28584.646333,476.410772
EDEN,140251592,1402515.92,23375.265333,389.587756
Fynn Kliemann,126500494,1265004.94,21083.415667,351.390261
Sleeping At Last,93818209,938182.09,15636.368167,260.606136
William Ryan Key,92903690,929036.9,15483.948333,258.065806
Mayday Parade,86947073,869470.73,14491.178833,241.519647
NF,85507442,855074.42,14251.240333,237.520672
Linkin Park,77949612,779496.12,12991.602,216.5267


In [59]:
# Whats the total number of minutes that I listed to music?

In [67]:
totalTime = spotifyData.minutesPlayed.sum()

In [69]:
totalTime/60, totalTime/60/24

(42753.04795555556, 1781.3769981481482)

In [None]:
# First mistake :) - I forgot to actually use the data without duplicates. 