# Preprocessing

In [1]:
import json
import csv
import os
import shutil
from tqdm import tqdm
import pandas as pd

In [5]:
data_root = "../materials"
working_root = "../working"
scratch_root = "../scratch"

os.makedirs(working_root, exist_ok=True)
os.makedirs(scratch_root, exist_ok=True)

**Unzip `1.listens.zst`**

In [None]:
for i in range(12):
    src = os.path.join(data_root, f"{i + 1}.listens.zst")
    
    scratch_listen_1_compressed = shutil.copy2(src, scratch_root)
    !unzstd {scratch_listen_1_compressed}
    !rm {src}

## *User ID - MessyBrainz ID*

1. Collect `user_id` - `recording_msid` mappings from ListenBrainz dataset
2. Save to `userid-msid.csv`

**Write `userid-msid-1.csv`**

In [None]:
data = []

with open(os.path.join(scratch_root, "1.listens")) as fp:
    for line in tqdm(fp):
        j = json.loads(line)
        user_id = j['user_id']
        recording_msid = j['recording_msid']
        data.append([user_id, recording_msid])

with open(os.path.join(working_root, "userid-msid-1.csv"), "w", newline='', encoding='utf-8') as fp:
    w = csv.writer(fp)
    w.writerows(data)

In [3]:
json = None

**Read `userid-msid-1.csv`**

In [6]:
data = pd.read_csv(os.path.join(working_root, "userid-msid-1.csv"), header=None)
data.columns = ["userid", "msid"]
data.head()

Unnamed: 0,userid,msid
0,24848,92fc76b4-bda0-4c2c-82b9-1ef4d489c071
1,24076,002091de-47f9-49d0-8da9-9af50e28f06e
2,22845,a6877bfb-0256-4471-82ad-2e60a78329c7
3,2966,1f1eae6d-e858-4236-9ffa-f4c8bb15d9c5
4,31175,166e1331-aec6-4be8-83ee-bbce489d3c14


In [7]:
data.describe().loc['count']

userid    8839350.0
Name: count, dtype: float64

## Unique *MessyBrainz ID* mappings

1. Remove duplicates from `listenbrainz_msid_mapping.csv`
2. Save to `small_msid_mapping.csv`

**Unzip `listenbrainz_msid_mapping.csv.zst`**

In [None]:
src = os.path.join(data_root, "listenbrainz_msid_mapping.csv.zst")
listenbrainz_msid_mapping_fn = shutil.copy2(src, scratch_root)
!unzstd {listenbrainz_msid_mapping_fn}
!rm {src}

In [8]:
unique_msids = set(data['msid'])
print(f"Unique msids: {len(unique_msids)}")

Unique msids: 3013073


**Write `small_msid_mapping.csv`**

In [10]:
with open(os.path.join(scratch_root, "listenbrainz_msid_mapping.csv")) as r_fp, open(os.path.join(scratch_root, "small_msid_mapping.csv"), "w") as w_fp:
    r = csv.reader(r_fp)
    w = csv.writer(w_fp)
    header = next(r)
    w.writerow(header)
    for line in r:
        if line[0] in unique_msids and (line[2] == "exact_match" or line[2] == "high_quality"):
            w.writerow(line)

unique_msids = None

In [8]:
csv = None

**Read `small_msid_mapping.csv`**

In [9]:
smallmapping = pd.read_csv(os.path.join(scratch_root, "small_msid_mapping.csv"))
smallmapping = smallmapping.drop("match_type", axis=1)
smallmapping.set_index("recording_msid", inplace=True)
smallmapping.head()

Unnamed: 0_level_0,recording_mbid
recording_msid,Unnamed: 1_level_1
93b81537-8fd5-4cd7-9e76-58c2105225ef,38ee842b-0cb9-48de-aa5a-24cce06546ef
d1b3acbd-d409-405c-afb7-ea0a974004d6,74d1b1cf-224c-412c-b534-13a91852d5da
97c45bd5-d372-4502-add8-a8daba5e4c6e,834b5aa5-8ef0-465d-94d4-026c099d6e76
fbc0089e-34da-4caa-b899-095da8374522,4cffc0b3-27de-4ae4-afc2-6c2c71872e1f
33537acb-388f-4041-a2c4-a38b3865e87f,aca956f1-128b-48cd-8b3f-57e405a91fca


## Convert *User ID - MessyBrainz ID* into *User ID - MusicBrainz Recording ID*

In [10]:
recording_mbids = []
for index, row in tqdm(data.iterrows()):
    if row["msid"] in smallmapping.index:
        recording_mbid = smallmapping.loc[row["msid"]]["recording_mbid"]
    else:
        recording_mbid = None
    recording_mbids.append(recording_mbid)

smallmapping = None
data["mbid"] = recording_mbids
recording_mbids = None
data = data.drop("msid", axis=1)
data = data.dropna(subset=["mbid"])
data.head()

8839350it [06:50, 21545.64it/s] 


Unnamed: 0,userid,mbid
1,24076,9e37e7ff-d885-481b-aacc-a34ec0dc5c1b
2,22845,f8d186be-5bdf-4e2d-883a-0a3c4d7a9989
3,2966,bd2494aa-b5ec-4507-a507-cb1237e654b0
4,31175,c257ac23-1f8d-4e41-bb84-803f31804f9f
5,4942,d622c096-11e0-4339-adbc-62092315e660


In [11]:
data.describe().loc['count']

userid    6140859.0
Name: count, dtype: float64

## Find *canonical IDs*

**Unzip `canonical_recording_redirect.csv`**

In [None]:
src = os.path.join(data_root, "canonical_recording_redirect.csv.zst")
cannonical_msid_mapping_fn = shutil.copy2(src, scratch_root)
!unzstd {cannonical_msid_mapping_fn}
!rm {src}

**Read `canonical_recording_redirect.csv`**

In [12]:
cannonicalmapping = pd.read_csv(os.path.join(scratch_root, "canonical_recording_redirect.csv"))
cannonicalmapping = cannonicalmapping.drop("canonical_release_mbid", axis=1)
cannonicalmapping.set_index("recording_mbid", inplace=True)
cannonicalmapping.head()

Unnamed: 0_level_0,canonical_recording_mbid
recording_mbid,Unnamed: 1_level_1
f3f8a7b8-a376-450c-8139-934d2393d49a,ecb125d7-d23e-4d76-8282-745713563110
a20f4c73-1f7a-48e7-903b-a34721c13629,ecb125d7-d23e-4d76-8282-745713563110
17344c3d-d600-4bb8-ac2d-93cab18ced4e,ecb125d7-d23e-4d76-8282-745713563110
7bf54872-7e1b-450c-9af4-385bcba33b78,ecb125d7-d23e-4d76-8282-745713563110
910f5db3-9a25-44ba-8f07-9956123c8e00,ecb125d7-d23e-4d76-8282-745713563110


### Replace `recording_mbid` with `canonical_recording_redirect.csv`

In [13]:
canonical_recording_mbids = []

for index, row in tqdm(data.iterrows()):
    if row["mbid"] in cannonicalmapping.index:
        canonical_recording_mbid = cannonicalmapping.loc[row["mbid"]]["canonical_recording_mbid"]
    else:
        canonical_recording_mbid = row["mbid"]
    canonical_recording_mbids.append(canonical_recording_mbid)

cannonicalmapping = None
data["canonical_recording_mbid"] = canonical_recording_mbids
canonical_recording_mbids = None
data = data.drop("mbid", axis=1)
data.head()

6140859it [03:22, 30338.99it/s] 


Unnamed: 0,userid,canonical_recording_mbid
1,24076,9e37e7ff-d885-481b-aacc-a34ec0dc5c1b
2,22845,f8d186be-5bdf-4e2d-883a-0a3c4d7a9989
3,2966,bd2494aa-b5ec-4507-a507-cb1237e654b0
4,31175,c257ac23-1f8d-4e41-bb84-803f31804f9f
5,4942,d622c096-11e0-4339-adbc-62092315e660


In [14]:
data.describe().loc['count']['userid']

6140859.0

## Get *Artist MusicBrainz IDs*

**Unzip `canonical_musicbrainz_data.csv.zst`**

In [None]:
src = os.path.join(data_root, "canonical_musicbrainz_data.csv.zst")
cannonical_musicbrainz_data_fn = shutil.copy2(src, scratch_root)
!unzstd {cannonical_musicbrainz_data_fn}
!rm {src}

In [15]:
shutil = None

**Read `canonical_musicbrainz_data.csv.zst`**

In [16]:
musicbrainzdata = pd.read_csv(os.path.join(scratch_root, "canonical_musicbrainz_data.csv"), usecols=["artist_mbids", "recording_mbid"])
musicbrainzdata.set_index("recording_mbid", inplace=True)
musicbrainzdata.head()

Unnamed: 0_level_0,artist_mbids
recording_mbid,Unnamed: 1_level_1
00b1a29d-ad9e-4b64-aed6-281f69f628ae,89ad4ac3-39f7-470e-963a-56509c546377
0aeea6af-3f85-45f3-88ed-8ce2bdedc4c6,89ad4ac3-39f7-470e-963a-56509c546377
24f32cf2-127e-45ca-ad19-91ed3ec87409,89ad4ac3-39f7-470e-963a-56509c546377
28e2548b-9c6f-47b7-8ab5-b1735499f291,89ad4ac3-39f7-470e-963a-56509c546377
390a9ab5-89c6-4e25-8ebf-f16a39c8c9cb,89ad4ac3-39f7-470e-963a-56509c546377


In [17]:
artist_mbids = []

for index, row in tqdm(data.iterrows()):
    if row["canonical_recording_mbid"] in musicbrainzdata.index:
        artist_mbid = musicbrainzdata.loc[row["canonical_recording_mbid"]]["artist_mbids"]
        artist_mbid = artist_mbid.split(',')[0]
    else :
        artist_mbid = None
        
    artist_mbids.append(artist_mbid)

musicbrainzdata = None
data["artist_mbids"] = artist_mbids
artist_mbids = None
data = data.drop("canonical_recording_mbid", axis=1)
data = data.dropna(subset=["artist_mbids"])
data.head()

6140859it [05:58, 17114.50it/s] 


Unnamed: 0,userid,artist_mbids
1,24076,29266b3d-b5ae-4d09-b721-326246adf68f
2,22845,744b52c8-509b-4451-abfd-a17d18d4bd1d
3,2966,b7539c32-53e7-4908-bda3-81449c367da6
4,31175,875203e1-8e58-4b86-8dcb-7190faf411c5
5,4942,84825fb6-c98c-4b43-a184-c7f70619f355


In [28]:
%whos

Variable                    Type        Data/Info
-------------------------------------------------
artist_mbid                 str         f31ec2ac-9071-4306-acd1-d2c8321033b5
artist_mbids                NoneType    None
cannonicalmapping           NoneType    None
canonical_recording_mbid    str         6a4fc110-ae48-4abb-a1f4-0c295a791f78
canonical_recording_mbids   NoneType    None
csv                         NoneType    None
data                        NoneType    None
data_root                   str         materials
index                       int         8839349
json                        NoneType    None
musicbrainzdata             NoneType    None
os                          NoneType    None
pd                          NoneType    None
recording_mbid              str         b097d4c4-2839-4ef9-baac-4ca705f7bec4
recording_mbids             NoneType    None
row                         Series      userid                   <...>e: 8839349, dtype: object
scratch_root             

In [19]:
tqdm = None

In [20]:
data.describe().loc['count']

userid    6089272.0
Name: count, dtype: float64

## Get `User ID - Artist IDs` counts

In [21]:
data = data.groupby(["userid", "artist_mbids"]).size().reset_index(name="count")
data.head()

Unnamed: 0,userid,artist_mbids,count
0,1,002e9f6e-13af-4347-83c5-f5ace70e0ec4,1
1,1,01252145-c9e8-4de5-a480-9b2bed05450a,14
2,1,02a7de68-2681-4d7e-8c36-6f2fdb37c07d,1
3,1,0383dadf-2a4e-4d10-a46a-e9e041da8eb3,12
4,1,03f93de6-6d62-4710-bcc7-9b3d7c3d95f5,1


In [22]:
data.tail()

Unnamed: 0,userid,artist_mbids,count
1178596,39526,fa19ee38-c2a9-4ed1-9b24-a18100cf9db3,7
1178597,39526,fabb37f8-eb2a-4cc1-a72a-b56935bbb72d,1
1178598,39526,fbcd7b29-455f-49e6-9c4f-8249d20a055e,1
1178599,39526,fd429857-5ace-4609-ae54-1502c3bdac11,5
1178600,39526,fe125f58-6c39-42fa-85c8-3eeddda5ad21,1


In [25]:
data.to_csv(os.path.join(working_root, "userid-artist-counts.csv"), index=False)