Scraping Pitchfork

Scraping album reviews to determine the average score by genre, artist
Will feed into a dataviz project.

Pitchforks rating system explained
https://www.reddit.com/r/Music/comments/xuhw3y/pitchforks_ratings_system_explained_via_archived/?rdt=36515

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
def collect_review_links(base_url, start_page, end_page):
    review_links = []
    for page in range(start_page, end_page + 1):
        url = f"{base_url}?page={page}"
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Searching for links by partial class name if there is a common identifiable part
        links = soup.find_all('a', href=True)
        # Filter links specifically pointing to album reviews
        review_links.extend("https://pitchfork.com" + link['href'] for link in links if '/reviews/albums/' in link['href'])

    return review_links

base_url = 'https://pitchfork.com/reviews/albums'
review_links = collect_review_links(base_url, 1, 276)
print(f"Collected {len(review_links)} links.")


Collected 53412 links.


In [5]:
print(review_links[5:50])

['https://pitchfork.com/reviews/albums/big-brave-a-chaos-of-flowers/', 'https://pitchfork.com/reviews/albums/still-house-plants-if-i-dont-make-it-i-love-u/', 'https://pitchfork.com/reviews/albums/still-house-plants-if-i-dont-make-it-i-love-u/', 'https://pitchfork.com/reviews/albums/pearl-jam-dark-matter/', 'https://pitchfork.com/reviews/albums/pearl-jam-dark-matter/', 'https://pitchfork.com/reviews/albums/elyanna-woledto/', 'https://pitchfork.com/reviews/albums/elyanna-woledto/', 'https://pitchfork.com/reviews/albums/taylor-swift-the-tortured-poets-department-the-anthology/', 'https://pitchfork.com/reviews/albums/taylor-swift-the-tortured-poets-department-the-anthology/', 'https://pitchfork.com/reviews/albums/bbymutha-sleep-paralysis/', 'https://pitchfork.com/reviews/albums/bbymutha-sleep-paralysis/', 'https://pitchfork.com/reviews/albums/water-damage-in-e/', 'https://pitchfork.com/reviews/albums/water-damage-in-e/', 'https://pitchfork.com/reviews/albums/bark-psychosis-hex/', 'https://

In [6]:
def remove_duplicates(links):
    # Convert list to a set to remove duplicates, then convert it back to a list
    unique_links = list(set(links))
    return unique_links

# Assuming `review_links` is your list with duplicates
review_links = remove_duplicates(review_links)
print(f"Collected {len(review_links)} unique links.")


Collected 26431 unique links.


In [7]:
review_links_test = review_links[0:9]
print(review_links_test)

['https://pitchfork.com/reviews/albums/21703-yyy-ep/', 'https://pitchfork.com/reviews/albums/17811-the-delfonics-adrian-younge-presents-the-delfonics/', 'https://pitchfork.com/reviews/albums/yg-stay-dangerous/', 'https://pitchfork.com/reviews/albums/15239-too-young-to-be-in-love/', 'https://pitchfork.com/reviews/albums/17188-tragicomedies/', 'https://pitchfork.com/reviews/albums/6909-world-of-echo/', 'https://pitchfork.com/reviews/albums/2296-of-this-blood/', 'https://pitchfork.com/reviews/albums/584-believe-it-mammals/', 'https://pitchfork.com/reviews/albums/14366-mare/']


In [15]:
import requests
from bs4 import BeautifulSoup

def scrape_album_review(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Artist
    artist_div = soup.find('a', class_=lambda x: x and 'SplitScreenContentHeaderArtistLink' in x)
    artist_name = artist_div.div.text.strip() if artist_div and artist_div.div else 'No artist found'

    # Album
    album_h1 = soup.find('h1', {'data-testid': 'ContentHeaderHed'})
    album_name = album_h1.em.text.strip() if album_h1 and album_h1.em else 'No album name found'

# Genre, Label, Review Date
    info_slices = soup.find_all('div', class_=lambda x: x and 'InfoSliceItem' in x)
    genre = label = review_date = 'Not found'
    for item in info_slices:
        key = item.find('p', class_=lambda x: x and 'InfoSliceKey' in x)
        value = item.find('p', class_=lambda x: x and 'InfoSliceValue' in x)
        if key and value:
            key_text = key.text.strip()
            if 'Genre:' in key_text:
                genre = value.text.strip()
            elif 'Label:' in key_text:
                label = value.text.strip()
            elif 'Reviewed:' in key_text:
                review_date = value.text.strip()
    # Reviewer
    reviewer_a = soup.find('a', class_=lambda x: x and 'BylineLink' in x)
    reviewer = reviewer_a.text.strip() if reviewer_a else 'No reviewer found'

    # Score
    score_p = soup.find('p', class_=lambda x: x and 'Rating' in x)
    score = score_p.text.strip() if score_p else 'No score found'

    return {
        'artist_name': artist_name,
        'album_name': album_name,
        'genre': genre,
        'label': label,
        'review_date': review_date,
        'reviewer': reviewer,
        'score': score
    }



import time
import random

# Initialize an empty list to store all review data
all_reviews = []

# Loop through each URL and scrape the data with a delay
for index, url in enumerate (review_links):
    review_data = scrape_album_review(url)
    all_reviews.append(review_data)
    
    # Sleep for a random time between requests to mimic human browsing
    #time.sleep(random.uniform(1, 3))  # Random delay between 1 and 3 seconds

    # Print progress every 1000 reviews scraped
    if (index + 1) % 1000 == 0:
        print(f"Scraped {index + 1} reviews so far.")


KeyboardInterrupt: 

In [17]:
import pandas as pd

df = pd.DataFrame(all_reviews)
df.to_csv('pitchfork_album_reviews.csv', index=False)

In [19]:
import os
print(os.getcwd())

/Users/tomweatherburn/Library/CloudStorage/OneDrive-Personal/dev/tdubolyou.github.io
