In [3]:
# Cell 2: Setup and test API
import requests
import pandas as pd
import time
import random

PROJECT_ROOT = '/Users/viktorsuter/Desktop/discogs'
DATA_DIR = f'{PROJECT_ROOT}/data'

# NOTE: DISCOGS_TOKEN is not defined in this cell; it must already exist in the
# kernel (presumably set by an earlier cell) or the next line raises NameError.
HEADERS = {'Authorization': f'Discogs token={DISCOGS_TOKEN}'}

# Test API: fetch one known release to confirm the token is accepted.
# A timeout is passed explicitly because requests waits forever by default,
# which would leave the notebook kernel hanging on a stalled connection.
response = requests.get(
    'https://api.discogs.com/releases/1',
    headers=HEADERS,
    timeout=30,
)
print(f"API Status: {response.status_code}")
# Remaining request budget, as reported by the API in its response headers.
print(f"Rate limit remaining: {response.headers.get('X-Discogs-Ratelimit-Remaining')}")

API Status: 200
Rate limit remaining: 60


In [4]:
# Cell 3: Peek at the XML structure
xml_path = '/Users/viktorsuter/Downloads/discogs_20260101_releases.xml'
peek_chars = 10000  # how many characters of the dump to preview

# Read only the first `peek_chars` characters to see the structure; the file is
# never loaded in full.
with open(xml_path, 'r', encoding='utf-8') as f:
    sample = f.read(peek_chars)

print(sample)

<releases>
<release id="1" status="Accepted"><artists><artist><id>1</id><name>The Persuader</name></artist></artists><title>Stockholm</title><labels><label name="Svek" catno="SK032" id="5"/></labels><extraartists><artist><id>507025</id><name>George Cutmaster General</name><anv>G Phrupmastergeneral</anv><role>Lacquer Cut By</role></artist><artist><id>239</id><name>Jesper Dahlbäck</name><role>Written-By [All Tracks By]</role></artist></extraartists><formats><format name="Vinyl" qty="2" text=""><descriptions><description>12"</description><description>33 ⅓ RPM</description></descriptions></format></formats><genres><genre>Electronic</genre></genres><styles><style>Deep House</style></styles><country>Sweden</country><released>1999-03-00</released><notes>The song titles are the names of six Stockholm districts.

Title on labels: - Stockholm -

[Labels:]
Recorded at the Globe studio, Stockholm

Fax: +46 8 679 64 53</notes><data_quality>Needs Vote</data_quality><master_id is_main_release="true">

In [None]:
# Cell 4: Random sample from XML (without status and catno)
import xml.etree.ElementTree as ET
import pandas as pd
import random

xml_path = '/Users/viktorsuter/Downloads/discogs_20260101_releases.xml'
output_path = '/Users/viktorsuter/Desktop/discogs/data/releases_random_sample.csv'
sample_size = 50000
sample_seed = 42  # fixed so every re-run draws the same sample; change it to redraw


def _iter_releases(path):
    """Stream the <release> elements of the XML file at `path`, one at a time.

    iterparse still attaches every parsed element to the document root, so
    clearing only the release itself would leave one empty element per release
    hanging off the root for the whole pass. The root is therefore captured
    from the first 'start' event and emptied after each release is consumed.

    The yielded element is only valid until the next iteration: it is cleared
    as soon as the caller asks for the following release.
    """
    parser = ET.iterparse(path, events=('start', 'end'))
    _, root = next(parser)  # first event is the start of the root element
    for event, elem in parser:
        if event == 'end' and elem.tag == 'release':
            yield elem
            elem.clear()
            root.clear()


def _joined_text(elements):
    """Return the non-empty text of `elements` joined with ', '."""
    return ', '.join(e.text for e in elements if e.text)


def _release_record(elem):
    """Flatten one <release> element into a dict with a fixed set of columns.

    Only the first label and the first format are kept. Columns whose source
    element is absent are set to None so every record has the same keys and
    the CSV column order does not depend on which release is sampled first.
    """
    artist_names = [a.findtext('name') for a in elem.findall('artists/artist')]
    # Compare against None explicitly: an Element without children is falsy.
    label = elem.find('labels/label')
    fmt = elem.find('formats/format')
    has_format = fmt is not None

    return {
        'id': elem.get('id'),
        'title': elem.findtext('title'),
        'country': elem.findtext('country'),
        'released': elem.findtext('released'),
        'notes': elem.findtext('notes'),
        'data_quality': elem.findtext('data_quality'),
        'master_id': elem.findtext('master_id'),
        'artists': ', '.join(name for name in artist_names if name),
        'label': label.get('name') if label is not None else None,
        'format': fmt.get('name') if has_format else None,
        'format_qty': fmt.get('qty') if has_format else None,
        'format_desc': (
            _joined_text(fmt.findall('descriptions/description')) if has_format else None
        ),
        'genres': _joined_text(elem.findall('genres/genre')),
        'styles': _joined_text(elem.findall('styles/style')),
        'track_count': len(elem.findall('tracklist/track')),
    }


# First pass: count total releases
print("Counting total releases (this takes a few minutes)...")
total = 0
for _ in _iter_releases(xml_path):
    total += 1
    if total % 500000 == 0:
        print(f"Counted {total:,}...")

print(f"Total releases: {total:,}")

# Generate random indices to keep. random.sample raises ValueError when asked
# for more items than the population holds, so cap the request at `total`.
sample_size = min(sample_size, total)
keep_indices = set(random.Random(sample_seed).sample(range(total), sample_size))
print(f"Will sample {sample_size:,} random releases")

# Second pass: extract only sampled releases
releases = []
count = 0

print("Extracting random sample...")
for elem in _iter_releases(xml_path):
    if count in keep_indices:
        releases.append(_release_record(elem))

        if len(releases) % 10000 == 0:
            print(f"Sampled {len(releases):,} releases...")

    count += 1

df = pd.DataFrame(releases)
df.to_csv(output_path, index=False)
print(f"\nDone! Saved {len(df):,} random releases to {output_path}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

Counting total releases (this takes a few minutes)...
Counted 500,000...
Counted 1,000,000...
