In [8]:
# Setup and test API
import os
import requests
from dotenv import load_dotenv

# Load token
load_dotenv()
DISCOGS_TOKEN = os.getenv('DISCOGS_TOKEN')

if not DISCOGS_TOKEN:
    raise ValueError("Create a .env file with: DISCOGS_TOKEN=your_token_here")

HEADERS = {'Authorization': f'Discogs token={DISCOGS_TOKEN}'}

# Test API
response = requests.get('https://api.discogs.com/releases/1', headers=HEADERS)
print(f"API Status: {response.status_code}")
print(f"Rate limit remaining: {response.headers.get('X-Discogs-Ratelimit-Remaining')}")

API Status: 200
Rate limit remaining: 60


In [None]:
# Load data from Dropbox
import pandas as pd

# Paste the prefered Dropbox link
url = 'https://www.dropbox.com/scl/fi/k4d8e47q9ky8hhd5hwwkn/releases_50k_random_sample.csv?rlkey=xnicb09k8ouji0qd6yqdc7xag&dl=1'
df = pd.read_csv(url)

print(f"Loaded {len(df):,} releases")
print(f"Columns: {df.columns.tolist()}")
df.head()

In [None]:
# Collect additional data through the Discogs API
import time
import os

def get_release_data(release_id):
    """Fetch community & market data for a release"""
    url = f'https://api.discogs.com/releases/{release_id}'
    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            data = response.json()
            community = data.get('community', {})
            rating = community.get('rating', {})
            return {
                'have': community.get('have'),
                'want': community.get('want'),
                'rating_avg': rating.get('average'),
                'rating_count': rating.get('count'),
                'num_for_sale': data.get('num_for_sale'),
                'lowest_price': data.get('lowest_price'),
                'discogs_url': f"https://www.discogs.com/release/{release_id}",
            }
        elif response.status_code == 429:
            print("Rate limited, waiting 60s...")
            time.sleep(60)
            return get_release_data(release_id)
    except Exception as e:
        print(f"Error for {release_id}: {e}")
    return {}

# Checkpoint file
checkpoint_path = 'api_checkpoint.csv'

# Check for previous progress
if os.path.exists(checkpoint_path):
    df_progress = pd.read_csv(checkpoint_path)
    completed_ids = set(df_progress['id'])
    print(f"Resuming: {len(completed_ids):,} already done")
else:
    df_progress = pd.DataFrame()
    completed_ids = set()

# IDs still to fetch
remaining = df[~df['id'].isin(completed_ids)]
print(f"Remaining: {len(remaining):,} releases")
print(f"Estimated time: {len(remaining)/60/60:.1f} hours")

# Fetch with rate limiting and checkpointing
results = []
start_time = time.time()

for i, row in remaining.iterrows():
    release_id = row['id']
    
    # Fetch data
    api_data = get_release_data(release_id)
    api_data['id'] = release_id
    results.append(api_data)
    
    # Progress every 100
    if len(results) % 100 == 0:
        elapsed = time.time() - start_time
        rate = len(results) / elapsed * 3600
        remaining_count = len(remaining) - len(results)
        eta_hours = remaining_count / rate if rate > 0 else 0
        print(f"Progress: {len(results):,}/{len(remaining):,} | Rate: {rate:.0f}/hr | ETA: {eta_hours:.1f} hrs")
    
    # Checkpoint every 500
    if len(results) % 500 == 0:
        checkpoint_df = pd.concat([df_progress, pd.DataFrame(results)], ignore_index=True)
        checkpoint_df.to_csv(checkpoint_path, index=False)
        print(f"Checkpoint saved: {len(checkpoint_df):,} total")
    
    # Rate limit: 60/min
    time.sleep(1.01)

# Final save
api_df = pd.concat([df_progress, pd.DataFrame(results)], ignore_index=True)
api_df.to_csv(checkpoint_path, index=False)

# Merge with original data
df_enriched = df.merge(api_df, on='id')
df_enriched.to_csv('releases_50k_enriched.csv', index=False)

print(f"\nDone! Saved {len(df_enriched):,} releases to releases_50k_enriched.csv")

Remaining: 50,000 releases
Estimated time: 13.9 hours
Progress: 100/50,000 | Rate: 2305/hr | ETA: 21.6 hrs
Progress: 200/50,000 | Rate: 2318/hr | ETA: 21.5 hrs
Progress: 300/50,000 | Rate: 2351/hr | ETA: 21.1 hrs
