In [1]:
import scrapy
import re # for text parsing
import logging

class ChartSpider(scrapy.Spider):
    name = 'usChartSpider'
    # page to scrape
    start_urls = ['https://www.billboard.com/charts/hot-100/1983-06-04/']
    # if you want to impose a delay between sucessive scrapes
#   download_delay = 1.0 

    def parse(self, response):
        self.logger.info('Scraping page: %s', response.url)
        chart_week = response.xpath('.//time/@datetime').extract_first()
        
        for num, (artist, track, lastweek, peak_pos, weeks_on_chart) in \
            enumerate(zip(response.css('.chart-row__artist::text').extract(), 
                                 response.css('.chart-row__song::text').extract(),
                                 response.css('.chart-row__rank .chart-row__last-week::text').extract(),
                                 response.css('.chart-row__top-spot .chart-row__value::text').extract(),
                                 response.css('.chart-row__weeks-on-chart .chart-row__value::text').extract())):
            yield {'chart_week': chart_week, 'chart_pos':num+1, 'track': track, 'artist': artist.strip(), 
                   'last_week':re.findall('\d+|$', lastweek)[0],
                  'peak_pos':re.findall('\d+|$', peak_pos)[0], 
                   'weeks_on_chart':re.findall('\d+|$', weeks_on_chart)[0]}

# move onto next page (if it exists)             
        for next_page in response.css('.chart-nav__link'):
            if next_page.css('a::attr(title)').extract_first() == 'Previous Week':
                yield response.follow(next_page, self.parse)

In [2]:
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({
'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
'FEED_FORMAT': 'json',
'FEED_URI': 'uk_charts.json'
})

# minimising the information presented on the scrapy log
logging.getLogger('scrapy').setLevel(logging.WARNING)
process.crawl(ChartSpider)
process.start()

2022-10-24 15:54:32 [scrapy.utils.log] INFO: Scrapy 2.7.0 started (bot: scrapybot)
2022-10-24 15:54:32 [scrapy.utils.log] INFO: Versions: lxml 4.7.1.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 2.0.1, Twisted 22.8.0, Python 3.9.15 (main, Oct 11 2022, 22:27:25) - [Clang 14.0.0 (clang-1400.0.29.102)], pyOpenSSL 22.1.0 (OpenSSL 3.0.5 5 Jul 2022), cryptography 38.0.1, Platform macOS-12.6-x86_64-i386-64bit


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

  exporter = cls(crawler)

2022-10-24 15:54:33 [usChartSpider] INFO: Scraping page: https://www.billboard.com/charts/hot-100/1983-06-04/


In [3]:
import pandas as pd
uk_charts = pd.read_json('https://raw.githubusercontent.com/dashee87/blogScripts/master/files/uk_charts.json')
# convert the date column to the correct date format
uk_charts = uk_charts.assign(chart_week=pd.to_datetime(uk_charts['chart_week']))
uk_charts.head(5)

Unnamed: 0,peak_pos,chart_week,weeks_on_chart,artist,track,label,artist_num,chart_pos,last_week
0,1,2017-12-08,30,ED SHEERAN,PERFECT,ASYLUM,6692,1,3.0
1,2,2017-12-08,1,RAK-SU FT WYCLEF/NAUGHTY BOY,DIMELO,SYCO MUSIC,52716,2,
2,2,2017-12-08,7,RITA ORA,ANYWHERE,ATLANTIC,7418,3,2.0
3,1,2017-12-08,18,CAMILA CABELLO FT YOUNG THUG,HAVANA,EPIC/SYCO MUSIC,51993,4,1.0
4,2,2017-12-08,83,MARIAH CAREY,ALL I WANT FOR CHRISTMAS IS YOU,COLUMBIA,25943,5,22.0


In [4]:
uk_charts.shape

(253198, 9)

In [5]:
import pandas as pd
uk_charts = pd.read_json('test.json')
# convert the date column to the correct date format
uk_charts = uk_charts.assign(chart_week=pd.to_datetime(uk_charts['chart_week']))

In [6]:
uk_charts.head(5)

Unnamed: 0,peak_pos,chart_week,weeks_on_chart,artist,track,label,artist_num,chart_pos,last_week
0,1,1970-01-01 00:25:12.691200,30,ED SHEERAN,PERFECT,ASYLUM,6692,1,3.0
1,2,1970-01-01 00:25:12.691200,1,RAK-SU FT WYCLEF/NAUGHTY BOY,DIMELO,SYCO MUSIC,52716,2,
2,2,1970-01-01 00:25:12.691200,7,RITA ORA,ANYWHERE,ATLANTIC,7418,3,2.0
3,1,1970-01-01 00:25:12.691200,18,CAMILA CABELLO FT YOUNG THUG,HAVANA,EPIC/SYCO MUSIC,51993,4,1.0
4,2,1970-01-01 00:25:12.691200,83,MARIAH CAREY,ALL I WANT FOR CHRISTMAS IS YOU,COLUMBIA,25943,5,22.0
