# art contest leaderboard scrape

## setup

In [32]:
from bs4 import BeautifulSoup
import requests
import re

import pandas as pd

## params

In [None]:
submissions_url = lambda page: f'https://nouveaulabelcontest.com/submissions/page/{page}/'

## class building

In [12]:
class ArtContestScraper:
    def __init__(self):
        # attributes
        self.submissions_url = lambda page: f'https://nouveaulabelcontest.com/submissions/page/{page}/'
        self.data_dict_list = []
        
    

    def create_entry_list(self, soup):
        entry_list = list(set([
            item for sublist in
            [
                item.find_all_previous(
                    'a', 
                    href=re.compile(r'https://nouveaulabelcontest.com/[a-z-0-9]+/#(comments|respond)')
                )
                for item in soup.find_all('a', attrs={'class':'more-link style2-button'})
            ]
            for item in sublist
        ]))
        
        entry_list.sort(key=lambda x: str(x)) 
        print(f'entry list entry count: {len(entry_list)}')
        
        return entry_list
    
    def create_data_dict(self, soup):
        # get entry list
        entry_list = self.create_entry_list(soup)
        
        # build data dict
        data_dict = {
            # key
            re.findall(
                r'https://nouveaulabelcontest.com/([a-z-0-9]+)/',
                entry.attrs['href']
            )[0]: {
                
                # metadata subdict        
                'link': re.findall(
                    r'https://nouveaulabelcontest.com/[a-z-0-9]+/',
                    entry.attrs['href']
                )[0],
                
                'comments': re.findall(
                    r'(\d)\s+comments', 
                    entry.get_text()
                )[0],
                
                'artist': '',
                'title': ''
            }
            for entry in entry_list
        }
        
        print(f'data dict entry count: {len(data_dict.keys())}')
        assert len(entry_list) == len(data_dict.keys()), 'entry count mismatch'
        
        return data_dict    
    
    def get_page_data(self, url):
        # get page
        r = requests.get(url)
        print(f'status: {r.status_code}')
        
        # soupify
        soup = BeautifulSoup(r.content, 'lxml')    
        
        # build data dict
        return self.create_data_dict(soup)
    
    
    def get_entry_details(self, data_dict, key, link):
        r = requests.get(link)
        print(f'status: {r.status_code}')
    
        soup = BeautifulSoup(r.content, 'lxml')
        
        # get entry metadata
        entry_meta = soup.find_all('h4')
        
        # store in data dict
        try:
            data_dict[key]['artist'] = re.findall(r'Artist: ([A-Za-z ]+)', entry_meta[0].get_text())[0]
        except IndexError:
            data_dict[key]['artist'] = 'not found'
            
        try:
            data_dict[key]['title'] = re.findall(r'Title: ([A-Za-z ]+)', entry_meta[1].get_text())[0]
        except IndexError:
            data_dict[key]['title'] = 'not found'
                
        return data_dict
    
    def get_all_entry_details(self, data_dict):
        for key, subdict in data_dict.items():
            print(f'getting details for {key}')
            data_dict = self.get_entry_details(data_dict, key, subdict['link'])
            
        return data_dict
    
    def page_has_data(self, url):
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'lxml')
        
        if len(soup.find_all('a', attrs={'class':'more-link style2-button'})) > 0:
            return True
        return False
        
    
    def main(self):
        i = 1
        
        while True:
            if self.page_has_data(self.submissions_url(i)):
                print(f'getting page {i}')
        
                # get all details
                data_dict = self.get_all_entry_details(
                    self.get_page_data(
                        self.submissions_url(i)
                    )
                )
                
                self.data_dict_list.append(data_dict)
                
                i += 1
            
            else:
                print('scrape complete')
                break
        

        

In [10]:
ACS = ArtContestScraper()
ACS.main()

getting page 1
status: 200
entry list entry count: 10
data dict entry count: 10
getting details for finding-kore
status: 200
getting details for first-harvest
status: 200
getting details for gymnopedie
status: 200
getting details for playful-tyrst
status: 200
getting details for resuello-del-verano
status: 200
getting details for texas-sunrise
status: 200
getting details for titanias-promenade
status: 200
getting details for untitled-14
status: 200
getting details for untitled-15
status: 200
getting details for untitled-16
status: 200
getting page 2
status: 200
entry list entry count: 10
data dict entry count: 10
getting details for a-garden
status: 200
getting details for blue-collar-rose
status: 200
getting details for carried-on-the-current
status: 200
getting details for celebrating-the-harvest-moon
status: 200
getting details for grapes-at-sunrise
status: 200
getting details for harvest-jubilee
status: 200
getting details for ode-to
status: 200
getting details for untitled-11
stat

KeyboardInterrupt: 

In [14]:
len(ACS.data_dict_list)

56

In [18]:
ACS.data_dict_list[0]

{'finding-kore': {'link': 'https://nouveaulabelcontest.com/finding-kore/',
  'comments': '4',
  'artist': 'Samantha Lee',
  'title': 'Finding Kore'},
 'first-harvest': {'link': 'https://nouveaulabelcontest.com/first-harvest/',
  'comments': '3',
  'artist': 'Valerie Yoder',
  'title': 'First Harvest'},
 'gymnopedie': {'link': 'https://nouveaulabelcontest.com/gymnopedie/',
  'comments': '0',
  'artist': 'Samantha Lee',
  'title': 'Gymnopedie'},
 'playful-tyrst': {'link': 'https://nouveaulabelcontest.com/playful-tyrst/',
  'comments': '0',
  'artist': 'Adina Faye Karten',
  'title': 'Playful Tyrst'},
 'resuello-del-verano': {'link': 'https://nouveaulabelcontest.com/resuello-del-verano/',
  'comments': '0',
  'artist': 'Samantha Lee',
  'title': 'Resuello Del Verano'},
 'texas-sunrise': {'link': 'https://nouveaulabelcontest.com/texas-sunrise/',
  'comments': '2',
  'artist': 'Rina Kazavchinski',
  'title': 'Texas Sunrise'},
 'titanias-promenade': {'link': 'https://nouveaulabelcontest.com/

In [29]:
data_records = [
    {
        'key': key,
        'link': subdict['link'],
        'title': subdict['title'],
        'artist': subdict['artist'],
        'comments': subdict['comments']
    }
    for data in ACS.data_dict_list
    for key, subdict in data.items()
    
]

In [30]:
len(data_records)

560

In [37]:
df = (
    pd
    .DataFrame(data_records)
    [['artist','title','comments','key','link']]
)

df.head(10)

Unnamed: 0,artist,title,comments,key,link
0,Samantha Lee,Finding Kore,4,finding-kore,https://nouveaulabelcontest.com/finding-kore/
1,Valerie Yoder,First Harvest,3,first-harvest,https://nouveaulabelcontest.com/first-harvest/
2,Samantha Lee,Gymnopedie,0,gymnopedie,https://nouveaulabelcontest.com/gymnopedie/
3,Adina Faye Karten,Playful Tyrst,0,playful-tyrst,https://nouveaulabelcontest.com/playful-tyrst/
4,Samantha Lee,Resuello Del Verano,0,resuello-del-verano,https://nouveaulabelcontest.com/resuello-del-v...
5,Rina Kazavchinski,Texas Sunrise,2,texas-sunrise,https://nouveaulabelcontest.com/texas-sunrise/
6,Samantha Lee,Titania,3,titanias-promenade,https://nouveaulabelcontest.com/titanias-prome...
7,Lara Goff Parham,Untitled,0,untitled-14,https://nouveaulabelcontest.com/untitled-14/
8,Laura Goff Parham,Untitled,0,untitled-15,https://nouveaulabelcontest.com/untitled-15/
9,Laura Goff Parham,Untitled,0,untitled-16,https://nouveaulabelcontest.com/untitled-16/


In [38]:
%store df

Stored 'df' (DataFrame)
