# In [1]:
import sys
from datetime import datetime
from typing import List

import bs4
from pydantic import BaseModel, Field, HttpUrl, ValidationError
from requests import get

TARGET_URL = 'https://www.billboard.com/charts/billboard-200/'


class BillBoardItem(BaseModel):
    """One chart entry scraped from the Billboard 200 page.

    Values are validated (and, where possible, coerced) by pydantic into the
    annotated field types when an instance is created.
    """

    # scraped_timestamp is of type datetime. If not supplied, it is filled in
    # per instance via default_factory. A plain ``= datetime.now()`` default
    # would be evaluated only once, when the class body runs, and every item
    # would then share that single import-time timestamp.
    scraped_timestamp: datetime = Field(default_factory=datetime.now)
    # Position of the entry on the chart this week.
    this_week_rank: int
    # url_photo is of type HttpUrl, which is a str holding a valid URL.
    url_photo: HttpUrl
    artist: str
    # awards is of type List[str], which is a list of strings,
    # i.e. ['Grammy', 'Oscar'].
    awards: List[str]
    track_name: str
    # Kept as a string; presumably the page can show a non-numeric
    # placeholder here -- TODO confirm.
    last_week_rank: str
    peak_position: int
    # Kept as a string, exactly as scraped.
    weeks_on_chart: str


def get_parser(text) -> bs4.BeautifulSoup:
    """Return a BeautifulSoup tree for *text*, built with 'html.parser'."""
    return bs4.BeautifulSoup(markup=text, features='html.parser')


def parse_result(container_row: bs4.Tag) -> BillBoardItem:
    """Build a BillBoardItem from the tag that wraps a single chart row.

    Args:
        container_row: Tag containing the labels, images, title and
            (optionally) the awards block of one chart entry.

    Returns:
        The validated BillBoardItem for the row.

    Raises:
        ValidationError: If the scraped values do not satisfy the model. The
            raw data dict is printed before the error is re-raised.
        ValueError: If the row holds fewer labels than are unpacked below.
    """
    # The last three direct-child labels carry the weekly statistics.
    stat_labels = container_row.select(
        'li.o-chart-results-list__item > span.c-label'
    )
    last_week_rank, peak_position, weeks_on_chart = [
        tag.text.strip() for tag in stat_labels[-3:]
    ]

    # The first four labels (any depth) start with the rank and end with the
    # artist. The two middle values are not used: the track name is read from
    # the title element below. The four-way unpack is kept so that a row with
    # too few labels still fails loudly.
    head_labels = container_row.select(
        'li.o-chart-results-list__item span.c-label'
    )
    this_week_rank, _unused_compare, _unused_title, artist_name = [
        tag.text.strip() for tag in head_labels[:4]
    ]

    image_urls = [
        img.attrs['data-lazy-src']
        for img in container_row.select('img.c-lazy-image__img')
    ]
    track_name = container_row.select_one('#title-of-a-story').text.strip()
    # Call strip(); without the parentheses the bound method itself, not the
    # label text, would be stored.
    label = container_row.select_one('h3.c-title ~ p.c-tagline').text.strip()

    # The awards block is optional; a row without it simply has no awards.
    awards_tag = container_row.select_one('div.o-chart-awards')
    if awards_tag is None:
        awards = []
    else:
        awards = [
            line for line in awards_tag.get_text().split('\n') if line != ''
        ]

    data = {
        'this_week_rank': this_week_rank,
        # The last lazy-loaded image of the row is used as the photo.
        'url_photo': image_urls[-1],
        # Passed along with the other values; whether it is kept depends on
        # the fields the model declares.
        'label': label,
        'track_name': track_name,
        'artist': artist_name,
        'awards': awards,
        'last_week_rank': last_week_rank,
        'peak_position': peak_position,
        'weeks_on_chart': weeks_on_chart,
    }
    try:
        return BillBoardItem(**data)
    except ValidationError:
        # Show the offending values, then propagate with the original traceback.
        print(data)
        raise
def parse_website(text: str) -> List[BillBoardItem]:
    """Parse the chart page HTML into a list of BillBoardItem objects.

    Args:
        text: HTML source of the chart page.

    Returns:
        One BillBoardItem per row container found, in document order.
    """
    parser = get_parser(text)
    row_containers = parser.find_all(
        'div', class_='o-chart-results-list-row-container'
    )
    return [parse_result(container_row=row) for row in row_containers]


def main(**kwargs):
    """Download the chart page and write every parsed item as one JSON line.

    Keyword Args:
        output: Writable text stream that receives the JSON lines.
            Defaults to ``sys.stdout``.
        timeout: Seconds to wait for the HTTP request. Defaults to 30.

    Raises:
        An HTTP error from ``raise_for_status()`` when the response status
        indicates failure, or a timeout error from the request itself.
    """
    output = kwargs.get('output', sys.stdout)
    # Without a timeout the request may block forever on a stalled connection.
    timeout = kwargs.get('timeout', 30)

    response = get(url=TARGET_URL, timeout=timeout)
    response.raise_for_status()
    for item in parse_website(response.text):
        print(item.json(), file=output)


# Run the scraper and store the result as JSON Lines. The encoding is set
# explicitly so the output does not depend on the platform's default encoding.
with open('billboard200wAwards.jsonl', 'w', encoding='utf-8') as f:
    main(output=f)
