In [3]:
# Cell 2: Setup and test API
import requests
import pandas as pd
import time
import random

# Project layout: DATA_DIR is derived from PROJECT_ROOT so only one path needs editing.
PROJECT_ROOT = '/Users/viktorsuter/Desktop/discogs'
DATA_DIR = f'{PROJECT_ROOT}/data'

# Authorization header carrying the Discogs token.
# NOTE: DISCOGS_TOKEN is not defined in this cell; it must already exist in the
# kernel namespace (presumably set by an earlier cell -- confirm).
HEADERS = {'Authorization': f'Discogs token={DISCOGS_TOKEN}'}

# Test API
# Always pass a timeout: requests has no default one, so a stalled connection
# would otherwise block this cell indefinitely.
response = requests.get(
    'https://api.discogs.com/releases/1',
    headers=HEADERS,
    timeout=30,
)
print(f"API Status: {response.status_code}")
print(f"Rate limit remaining: {response.headers.get('X-Discogs-Ratelimit-Remaining')}")

API Status: 200
Rate limit remaining: 60


In [4]:
# Cell 3: Peek at the XML structure
xml_path = '/Users/viktorsuter/Downloads/discogs_20260101_releases.xml'

# Number of characters to preview from the start of the dump.
preview_chars = 10000

# Read only the first `preview_chars` characters to see the structure;
# the file is opened as UTF-8 text and never loaded in full.
with open(xml_path, 'r', encoding='utf-8') as f:
    sample = f.read(preview_chars)

print(sample)

<releases>
<release id="1" status="Accepted"><artists><artist><id>1</id><name>The Persuader</name></artist></artists><title>Stockholm</title><labels><label name="Svek" catno="SK032" id="5"/></labels><extraartists><artist><id>507025</id><name>George Cutmaster General</name><anv>G Phrupmastergeneral</anv><role>Lacquer Cut By</role></artist><artist><id>239</id><name>Jesper Dahlbäck</name><role>Written-By [All Tracks By]</role></artist></extraartists><formats><format name="Vinyl" qty="2" text=""><descriptions><description>12"</description><description>33 ⅓ RPM</description></descriptions></format></formats><genres><genre>Electronic</genre></genres><styles><style>Deep House</style></styles><country>Sweden</country><released>1999-03-00</released><notes>The song titles are the names of six Stockholm districts.

Title on labels: - Stockholm -

[Labels:]
Recorded at the Globe studio, Stockholm

Fax: +46 8 679 64 53</notes><data_quality>Needs Vote</data_quality><master_id is_main_release="true">

In [6]:
# Cell 3: Random sample from XML (without status and catno)
import xml.etree.ElementTree as ET
import pandas as pd
import random

xml_path = '/Users/viktorsuter/Downloads/discogs_20260101_releases.xml'
output_path = '/Users/viktorsuter/Desktop/discogs/data/releases_random_sample.csv'

# Seed the RNG so the same releases are drawn on every run.
random.seed(42)

# First pass: count total releases
print("Counting total releases (this takes a few minutes)...")
total = 0
# 'start' events are requested so that the very first event hands us the root
# element. Clearing the root after each release drops the finished <release>
# subtree entirely; calling elem.clear() alone would leave one emptied element
# per release attached to the root, and memory would grow for the whole file.
context = ET.iterparse(xml_path, events=['start', 'end'])
_, root = next(context)
for event, elem in context:
    if event == 'end' and elem.tag == 'release':
        total += 1
        root.clear()
        if total % 500000 == 0:
            print(f"Counted {total:,}...")

print(f"Total releases: {total:,}")

# Generate random indices to keep.
# Clamp to `total` so a file with fewer releases than requested does not make
# random.sample raise ValueError.
sample_size = min(50000, total)
keep_indices = set(random.sample(range(total), sample_size))
print(f"Will sample {sample_size:,} random releases")

# Second pass: extract only sampled releases
releases = []
count = 0  # zero-based position of the current <release> in the file

print("Extracting random sample...")
context = ET.iterparse(xml_path, events=['start', 'end'])
_, root = next(context)
for event, elem in context:
    if event != 'end' or elem.tag != 'release':
        continue

    if count in keep_indices:
        # Only the first label / first format of a release is recorded.
        # Compare with `is not None`: an Element without children is falsy.
        first_label = elem.find('labels/label')
        first_format = elem.find('formats/format')

        artist_names = [a.findtext('name') for a in elem.findall('artists/artist')]

        if first_format is not None:
            format_desc = ', '.join(
                d.text for d in first_format.findall('descriptions/description') if d.text
            )
        else:
            format_desc = None

        # Every key is always present (None when the source element is missing),
        # so the CSV column order does not depend on which release is sampled first.
        releases.append({
            'id': elem.get('id'),
            'title': elem.findtext('title'),
            'country': elem.findtext('country'),
            'released': elem.findtext('released'),
            'notes': elem.findtext('notes'),
            'data_quality': elem.findtext('data_quality'),
            'master_id': elem.findtext('master_id'),
            'artists': ', '.join(name for name in artist_names if name),
            'label': first_label.get('name') if first_label is not None else None,
            'format': first_format.get('name') if first_format is not None else None,
            'format_qty': first_format.get('qty') if first_format is not None else None,
            'format_desc': format_desc,
            'genres': ', '.join(g.text for g in elem.findall('genres/genre') if g.text),
            'styles': ', '.join(s.text for s in elem.findall('styles/style') if s.text),
            'track_count': len(elem.findall('tracklist/track')),
        })

        if len(releases) % 10000 == 0:
            print(f"Sampled {len(releases):,} releases...")

    count += 1
    # Release the finished subtree (see the note on the first pass).
    root.clear()

df = pd.DataFrame(releases)
df.to_csv(output_path, index=False)
print(f"\nDone! Saved {len(df):,} random releases to {output_path}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

Counting total releases (this takes a few minutes)...
Counted 500,000...
Counted 1,000,000...
Counted 1,500,000...
Counted 2,000,000...
Counted 2,500,000...
Counted 3,000,000...
Counted 3,500,000...
Counted 4,000,000...
Counted 4,500,000...
Counted 5,000,000...
Counted 5,500,000...
Counted 6,000,000...
Counted 6,500,000...
Counted 7,000,000...
Counted 7,500,000...
Counted 8,000,000...
Counted 8,500,000...
Counted 9,000,000...
Counted 9,500,000...
Counted 10,000,000...
Counted 10,500,000...
Counted 11,000,000...
Counted 11,500,000...
Counted 12,000,000...
Counted 12,500,000...
Counted 13,000,000...
Counted 13,500,000...
Counted 14,000,000...
Counted 14,500,000...
Counted 15,000,000...
Counted 15,500,000...
Counted 16,000,000...
Counted 16,500,000...
Counted 17,000,000...
Counted 17,500,000...
Counted 18,000,000...
Counted 18,500,000...
Total releases: 18,797,505
Will sample 50,000 random releases
Extracting random sample...
Sampled 10,000 releases...
Sampled 20,000 releases...
Sampled 3

Unnamed: 0,id,title,country,released,notes,data_quality,master_id,artists,label,format,format_qty,format_desc,genres,styles,track_count
0,295,Skampler,UK,2001,"Compilation of tracks from [r=1926] (tracks 1,...",Needs Vote,13582,Various,Skam,CD,1,"Compilation, Reissue",Electronic,"Leftfield, IDM, Experimental, Ambient",15
1,360,EP 3 - Nonplays Love Songs,Germany,2000,Composed between 1997 and 2000.\n\nA3 features...,Correct,345343,Bernd Friedmann,Nonplace,Vinyl,1,"12"", EP, 45 RPM",Electronic,Downtempo,5
2,630,Closed Circuit,UK,1994,Packaged in a double jewel case with an illust...,Needs Vote,112616,Electronic Eye,Beyond,CD,2,"Album, Partially Mixed",Electronic,"Techno, Ambient",13
3,841,The Warning,UK,1997,The Warning plays at 45 RPM. \nStructure Of Re...,Correct,23423,Codename John,Metalheadz,Vinyl,1,"12"", 45 RPM, 33 ⅓ RPM",Electronic,Drum n Bass,2
4,1081,Cold Fresh Air / Listen Up,UK,1994,The first Liftin' Spirit track to have a full ...,Needs Vote,62620,Higher Sense,Liftin' Spirit Records,Vinyl,1,"12"", 33 ⅓ RPM",Electronic,"Drum n Bass, Jungle",4


In [7]:
# Cell 3: Random sample of 1 million releases
import xml.etree.ElementTree as ET
import pandas as pd
import random

xml_path = '/Users/viktorsuter/Downloads/discogs_20260101_releases.xml'
output_path = '/Users/viktorsuter/Desktop/discogs/data/releases_1m_sample.csv'

# Set seed for reproducibility
random.seed(42)

# Total releases, hard-coded to skip the counting pass.
# It is checked against the number of releases actually seen after extraction.
total = 18797505

# Generate 1 million unique random indices
sample_size = 1000000
keep_indices = set(random.sample(range(total), sample_size))
print(f"Sampling {sample_size:,} unique releases from {total:,} total")

# Extract sampled releases
releases = []
count = 0  # zero-based position of the current <release> in the file

print("Extracting random sample...")
# 'start' events are requested so that the very first event hands us the root
# element. Clearing the root after each release drops the finished <release>
# subtree entirely; calling elem.clear() alone would leave one emptied element
# per release attached to the root, and memory would grow for the whole file.
context = ET.iterparse(xml_path, events=['start', 'end'])
_, root = next(context)
for event, elem in context:
    if event != 'end' or elem.tag != 'release':
        continue

    if count in keep_indices:
        # Only the first label / first format of a release is recorded.
        # Compare with `is not None`: an Element without children is falsy.
        first_label = elem.find('labels/label')
        first_format = elem.find('formats/format')

        artist_names = [a.findtext('name') for a in elem.findall('artists/artist')]

        if first_format is not None:
            format_desc = ', '.join(
                d.text for d in first_format.findall('descriptions/description') if d.text
            )
        else:
            format_desc = None

        # Every key is always present (None when the source element is missing),
        # so the CSV column order does not depend on which release is sampled first.
        releases.append({
            'id': elem.get('id'),
            'title': elem.findtext('title'),
            'country': elem.findtext('country'),
            'released': elem.findtext('released'),
            'notes': elem.findtext('notes'),
            'data_quality': elem.findtext('data_quality'),
            'master_id': elem.findtext('master_id'),
            'artists': ', '.join(name for name in artist_names if name),
            'label': first_label.get('name') if first_label is not None else None,
            'format': first_format.get('name') if first_format is not None else None,
            'format_qty': first_format.get('qty') if first_format is not None else None,
            'format_desc': format_desc,
            'genres': ', '.join(g.text for g in elem.findall('genres/genre') if g.text),
            'styles': ', '.join(s.text for s in elem.findall('styles/style') if s.text),
            'track_count': len(elem.findall('tracklist/track')),
        })

        if len(releases) % 100000 == 0:
            print(f"Sampled {len(releases):,} releases...")

    count += 1
    # Release the finished subtree (see the note above the loop).
    root.clear()

# The indices were drawn from range(total); if the file holds a different number
# of releases, the sample is either short (fewer releases than assumed) or never
# covers the tail of the file (more releases than assumed).
if count != total:
    print(f"WARNING: found {count:,} releases in the file but total was set to {total:,}; "
          "the sample may be incomplete or biased")

df = pd.DataFrame(releases)
df.to_csv(output_path, index=False)

print(f"\nDone! Saved {len(df):,} releases to {output_path}")
print(f"Unique IDs: {df['id'].nunique():,} (should match {sample_size:,})")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

Sampling 1,000,000 unique releases from 18,797,505 total
Extracting random sample...
Sampled 100,000 releases...
Sampled 200,000 releases...
Sampled 300,000 releases...
Sampled 400,000 releases...
Sampled 500,000 releases...
Sampled 600,000 releases...
Sampled 700,000 releases...
Sampled 800,000 releases...
Sampled 900,000 releases...
Sampled 1,000,000 releases...

Done! Saved 1,000,000 releases to /Users/viktorsuter/Desktop/discogs/data/releases_1m_sample.csv
Unique IDs: 1,000,000 (should match 1,000,000)

Columns: ['id', 'title', 'country', 'released', 'notes', 'data_quality', 'master_id', 'artists', 'label', 'format', 'format_qty', 'format_desc', 'genres', 'styles', 'track_count']


Unnamed: 0,id,title,country,released,notes,data_quality,master_id,artists,label,format,format_qty,format_desc,genres,styles,track_count
0,26,Soul Searching Vol. 1,US,1998-01-30,Administered by Groovy Native Music (ASCAP).\r...,Needs Vote,35067,DJ Rasoul,Large Records,Vinyl,1,"12"", 33 ⅓ RPM",Electronic,"House, Deep House",5
1,51,Take Me To Paradise,US,2000-08-01,C&P Deluxe Tempo Music (ASCAP)/Systolic Music ...,Needs Vote,2103439,Miguel Migs,Naked Music Recordings,Vinyl,1,"12"", 33 ⅓ RPM",Electronic,"Deep House, House",4
2,55,Building Blocks (Volume 1),Canada,1995-00-00,Recorded & mixed live at Seven Grand Studios l...,Needs Vote,0,Terrence Parker,IntelliNET,CD,1,Mixed,Electronic,House,20
3,79,City Of Islands,Sweden,1998-12-00,"Recorded at the Globe Studio, Stockholm.",Needs Vote,4242,The Persuader,Svek,Vinyl,1,"12"", 33 ⅓ RPM",Electronic,"Deep House, Tech House",4
4,123,The Collected Works Of Kosmic Messenger,Belgium,1997,electronic//Poetry\nall/tracks/written/perform...,Needs Vote,21866,"Stacey Pullen, Kosmic Messenger",Elypsia,Vinyl,2,"LP, Compilation, Album",Electronic,Techno,10
