In [130]:
import requests
import csv
import errno
import json
import os
import re
import pandas as pd
import unicodedata
from abc import ABCMeta
from bs4 import BeautifulSoup
from collections import Counter
from datetime import datetime, date, time, timedelta
from dateutil import relativedelta
from lxml import html
from pathlib import Path
from random import randint
from time import sleep
import wikiart
import auctionhouse 
#from christies import Christies
#from sothebys import Sothebys

In [184]:
class Artlytic(object):
    '''Main class to load data and analyse.

    Keeps five JSON-backed lists (search URLs, auctions, lot URLs, lots, artists)
    and drives the pipeline search URLs -> lot URLs -> local HTML files -> parsed lots.

    Parameters:
        data_path      -- directory holding the JSON files and the per auction house
                          HTML sub-directories (default 'data/').
        auction_houses -- optional mapping of auction house name (as stored in the
                          'auction_house' key of the URL entries) to the object that
                          scrapes/parses it. When omitted, Christies() and Sothebys()
                          are instantiated.
    '''

    def __init__(self, data_path='data/', auction_houses=None):
        # data paths
        self.data_path         = data_path

        self.search_URLs_path  = os.path.join(data_path, 'search_urls.json')
        self.lot_URLs_path     = os.path.join(data_path, 'lot_urls.json')
        self.lots_path         = os.path.join(data_path, 'lots.json')
        self.auctions_path     = os.path.join(data_path, 'auctions.json')
        self.artists_path      = os.path.join(data_path, 'wikiart/wikiart_artists_19th_century.json')
        # data lists
        self.search_URLs = []
        self.auctions    = []
        self.lot_URLs    = []
        self.artists     = []
        self.lots        = []
        # create objects and keep them on the instance; the methods below look them
        # up here instead of relying on notebook-level variables.
        # NOTE(review): the imports of Christies/Sothebys are commented out in the
        # import cell, so both names must already be in scope when no mapping is passed.
        if auction_houses is None:
            auction_houses = {"Christie's": Christies(),
                              "Sotheby's": Sothebys()}
        self.auction_houses = dict(auction_houses)
        # populate lists if data exists
        self.load_existing_data()


    @staticmethod
    def _read_json(path):
        '''Return the decoded content of the JSON file at path.'''
        with open(path) as fp:
            return json.load(fp)


    @staticmethod
    def _write_json(path, data):
        '''Write data to path as sorted, indented JSON.'''
        with open(path, 'w') as fp:
            json.dump(data, fp, sort_keys=True, indent=4, separators=(',', ': '))


    @staticmethod
    def _timestamp():
        '''Return the current time as ISO string with ':' and '.' replaced by '-'.'''
        return datetime.now().isoformat().replace(':', '-').replace('.', '-')


    def _unique_timestamp(self, taken):
        '''Return a timestamp that is not in the set taken and add it to that set.
        The timestamp is used as HTML file name and lot id, so two entries must never share one.'''
        timestamp = self._timestamp()
        while timestamp in taken:  # clock has not advanced yet, ask again
            timestamp = self._timestamp()
        taken.add(timestamp)
        return timestamp


    def _auction_house_for(self, name):
        '''Return the object registered for the auction house name, or None if there is none.'''
        house = self.auction_houses.get(name)
        if house is None and name == "Phillips":
            # The previous code sent "Phillips" entries to a module-level `phillips`
            # object that this class never creates; keep using it when it exists.
            house = globals().get('phillips')
        return house


    def _local_html_path(self, lot_URLs_entry):
        '''Return the path of the local HTML file that belongs to a lot_URLs entry:
        <data_path>/<auction house, lower case, without apostrophe>/<timestamp>.html'''
        directory = lot_URLs_entry['auction_house'].lower().replace("'", "")
        file_name = lot_URLs_entry['timestamp'].replace(':', '-').replace('.', '-') + '.html'
        return os.path.join(self.data_path, directory, file_name)


    def create_datafiles_upon_existence_check(self):
        '''Checks if there are any datafiles. If not, creates empty data files
        (each containing an empty JSON list), including missing parent directories.'''

        for file in [self.search_URLs_path,
                     self.auctions_path,
                     self.lot_URLs_path,
                     self.lots_path,
                     self.artists_path]:
            directory = os.path.dirname(file)
            if directory:
                os.makedirs(directory, exist_ok=True)
            try:
                # mode 'x' creates the file exclusively, so an existing file is never truncated
                with open(file, 'x') as file_obj:
                    file_obj.write("[]")
            except FileExistsError:
                pass  # file is already there, nothing to do


    def load_existing_data(self):
        '''Populates the lists for lots, search URLs, artists with existing data.
        If there's no data empty files are created.'''

        # check existence of data files
        self.create_datafiles_upon_existence_check()

        # load the data: (file, attribute to fill, wording used in the status message)
        sources = [
            (self.search_URLs_path, 'search_URLs', 'to search_URLs with'),
            (self.lot_URLs_path,    'lot_URLs',    'to self.lot_URLs_path'),
            (self.lots_path,        'lots',        'to self.lots_path with'),
            (self.artists_path,     'artists',     'to self.artists with'),
        ]
        for path, attribute, wording in sources:
            if os.stat(path).st_size > 0:  # a zero byte file is not valid JSON, skip it
                setattr(self, attribute, self._read_json(path))
                print('loaded', path, wording, len(getattr(self, attribute)), 'entries')


    def update_lotlist_from_search_url(self):
        '''Iterate the search_URLs, unless already scraped, fetch the content, parse the URLs,
        add them to the lot_URLs. Save the new list as file to self.lot_URLs_path.

        Both lists are written back even if a search page raises half-way, so that
        search URLs that were already processed are not fetched again.
        A search URL of an unknown auction house is reported and stays unscraped.'''

        # load self.search_URLs_path to self.search_URLs
        try:
            self.search_URLs = self._read_json(self.search_URLs_path)
            print('loaded', self.search_URLs_path, 'to search_URLs with', len(self.search_URLs), 'entries')
        except IOError:
            print('*** Could not read file:', self.search_URLs_path)
            return

        # load self.lot_URLs_path to self.lot_URLs
        try:
            self.lot_URLs = self._read_json(self.lot_URLs_path)
            print('loaded', self.lot_URLs_path, 'to lot_URLs with', len(self.lot_URLs), 'entries')
        except IOError:
            print('*** Could not read file:', self.lot_URLs_path)
            return

        print('Beginning to create lot_URLs entries from search_URLs list')
        print('Currently there are', len(self.lot_URLs),'entries in artlytic.lot_URLs')
        # a number of counters for outputing the status
        counter_all_newly_added_entries = 0
        skipped_search_url_entries_counter = 0
        processed_search_url_entry_number = 0
        search_URLs_list_size = len(self.search_URLs)
        # timestamps already in use; new entries must not repeat one of them
        known_timestamps = {entry.get('timestamp') for entry in self.lot_URLs}

        try:
            for search_url_entry in self.search_URLs:
                processed_search_url_entry_number += 1
                if search_url_entry['scraped']:
                    # search_URLs entry was processed already. Do nothing
                    skipped_search_url_entries_counter += 1
                    continue

                auction_house = search_url_entry['auction_house']
                house = self._auction_house_for(auction_house)
                if house is None:
                    # nothing was fetched, so do not flag the entry as scraped
                    print('Auction house not known:', auction_house)
                    continue

                # parse the page and add the returned lot URLs to lot_URLs
                counter = 0
                for url in house.parse_search_results_page(search_url_entry['url']):
                    self.lot_URLs.append({
                        'auction_house': auction_house,
                        'url': url,
                        'scraped': False,
                        'parsed': False,
                        'timestamp': self._unique_timestamp(known_timestamps),
                    })
                    counter += 1
                print(processed_search_url_entry_number,'/', search_URLs_list_size,
                      ': Added', counter,'entries for',
                      search_url_entry['auction_house'], search_url_entry['artist_name'])

                # update the search_URLs entry, so it doesn't get scraped again next time
                search_url_entry['scraped'] = True
                search_url_entry['timestamp'] = self._timestamp()
                counter_all_newly_added_entries += counter
                sleep(randint(0,2))
        finally:
            print('Updated lot_URLs with',counter_all_newly_added_entries,'new artworks. New size:', len(self.lot_URLs))
            print('Skipped', skipped_search_url_entries_counter, 'already processed search URLs' )
            self._save_url_lists()


    def _save_url_lists(self):
        '''Write lot_URLs and then search_URLs to their files. The search URLs (with their
        scraped flags) are only written when the lot URLs were written successfully.'''

        # write lot_URLs list to file
        try:
            self._write_json(self.lot_URLs_path, self.lot_URLs)
            print('Updated', self.lot_URLs_path, '. File contains now', len(self.lot_URLs), 'entries')
        except IOError:
            print('*** something went wrong. there was no', self.lot_URLs_path, '. Check why file is not there!')
            return

        # write search_URLs list to file
        try:
            self._write_json(self.search_URLs_path, self.search_URLs)
            print('Updated scraped status in', self.search_URLs_path)
        except IOError:
            print('*** something went wrong. there was no', self.search_URLs_path, '. Check why file is not there!')
            return


    def eliminate_duplicates_from_lot_URLs(self):
        '''Eliminate duplicates in the self.lot_URLs list and output the list again to self.lot_URLs_path.
        Because we try to gather a max. of lots of an artist sometimes we use more than one search URL.
        As a result, the chances are high, that a number of items are duplicates. Of all entries
        sharing the same 'url' only the first one is kept. Both the file and self.lot_URLs are updated,
        so a later save of self.lot_URLs does not bring the duplicates back.'''

        # load lot_URLs from file
        self.lot_URLs = self._read_json(self.lot_URLs_path)
        old_size = len(self.lot_URLs)
        print('Size of lot_URLs before eliminating duplicates:', old_size)

        # remove duplicates, keeping the first entry of every url and the original order
        seen_urls = set()
        unique_entries = []
        for entry in self.lot_URLs:
            if entry['url'] not in seen_urls:
                seen_urls.add(entry['url'])
                unique_entries.append(entry)
        self.lot_URLs = unique_entries
        print('Size of lot_URLs after eliminating duplicates:', len(unique_entries))
        print(old_size - len(unique_entries), 'duplicate urls from lot_URLs were removed.')

        # save to file self.lot_URLs_path
        self._write_json(self.lot_URLs_path, self.lot_URLs)
        print('saved list without duplicates to', self.lot_URLs_path)


    def get_lot_HTML(self):
        '''Iterate through the lot_URLs list, fetch the HTML code for each entry and store it as a local file.
        We're doing this to minimize the requests to an auction house website. We're fetching all data and
        parse the files later.

        An entry is only flagged as scraped when its HTML was actually stored. The list is
        written back to self.lot_URLs_path even if the loop is interrupted, because every
        lot is followed by a pause of about a minute and the progress would be lost otherwise.'''

        # open lot URLs file
        self.lot_URLs = self._read_json(self.lot_URLs_path)
        print('Loaded', len(self.lot_URLs),'lots from file', self.lot_URLs_path)

        # iterate through the list
        counter = 0
        try:
            for lot_URLs_entry in self.lot_URLs:
                if lot_URLs_entry['scraped']:
                    continue
                # get the html and store in file
                if self.save_single_lot_HTML_to_local_file(lot_URLs_entry):
                    counter += 1
                    lot_URLs_entry['scraped'] = True
                # time out to prevent to be blocked
                sleep(randint(61,67))
        finally:
            print('Loaded', counter,'lots and saved to', self.lots_path)
            # saving self.lot_URLs list to self.lot_URLs_path file
            self._write_json(self.lot_URLs_path, self.lot_URLs)
            print('Saved', len(self.lot_URLs),'lots to file', self.lot_URLs_path)


    def save_single_lot_HTML_to_local_file(self, lot_URLs_entry):
        '''Takes a lot_URLs list entry as input, requests the HTML code and stores it as a local .html file.

        Returns True when the file was written, False when the request failed
        (in that case nothing is written).'''

        # Build the path for the local HTML
        local_lot_HTML_path = self._local_html_path(lot_URLs_entry)

        # get the HTML code
        try:
            response = requests.get(lot_URLs_entry['url'])
            response.raise_for_status() # ensure we notice bad responses
        except (requests.HTTPError, requests.ConnectionError):
            print("*** HTTPError or ConnectionError while accessing", lot_URLs_entry['url'])
            # without a good response there is nothing worth storing
            return False

        # store as local file; the auction house directory may not exist yet
        os.makedirs(os.path.dirname(local_lot_HTML_path), exist_ok=True)
        with open(local_lot_HTML_path, 'w', encoding='utf-8') as fp:
            fp.write(response.text)
        print(local_lot_HTML_path, "saved")
        return True


    def parse_lot_HTMLs(self):
        '''Parse the local HTML file of every lot_URLs entry that is not parsed yet, append the
        resulting lot (with the entry's timestamp as 'id') to self.lots and flag the entry as parsed.
        Entries whose auction house is unknown or whose HTML file is missing are reported and
        left unparsed. Both lists are written back to their files, also when parsing raises.'''

        # populate lists with file data
        self.lots = self._read_json(self.lots_path)
        self.lot_URLs = self._read_json(self.lot_URLs_path)

        # iterate lot_URLs
        counter_added_lots = 0
        counter_skipped_lots = 0
        try:
            for lot_URLs_entry in self.lot_URLs:
                if lot_URLs_entry['parsed']:
                    # entry was already parsed
                    counter_skipped_lots += 1
                    continue

                house = self._auction_house_for(lot_URLs_entry['auction_house'])
                if house is None:
                    print('*** Auctionhouse', lot_URLs_entry['auction_house'],'in',lot_URLs_entry['timestamp'],'not found')
                    continue

                local_HTML_path = self._local_html_path(lot_URLs_entry)
                try:
                    with open(local_HTML_path, 'rb') as fp:
                        html_bytes = fp.read()
                except FileNotFoundError:
                    print('*** could not find file:', local_HTML_path)
                    continue

                lot = house.parse(html_bytes)
                lot['id'] = lot_URLs_entry['timestamp']
                self.lots.append(lot)
                lot_URLs_entry['parsed'] = True
                counter_added_lots += 1
                print(counter_added_lots, ":", lot_URLs_entry['timestamp'], "parsed and added to artlytic.lots")
        finally:
            # write lists to file
            self._write_json(self.lot_URLs_path, self.lot_URLs)
            print('updated parsed status of',counter_added_lots, 'lot_URLs eintries to True and saved to file', self.lot_URLs_path)
            print('skipped', counter_skipped_lots, 'lot_URLs entries because they were already parsed.' )

            self._write_json(self.lots_path, self.lots)
            print('added',counter_added_lots, 'new lots to artlytic.lots. New list size:', len(self.lots))

In [185]:
# Create the main object. The constructor creates any missing data files and
# loads the existing JSON lists, printing one status line per loaded file.
artlytic  = Artlytic()

loaded data/search_urls.json to search_URLs with 3427 entries
loaded data/lot_urls.json to self.lot_URLs_path 160065 entries
loaded data/lots.json to self.lots_path with 94364 entries
loaded data/wikiart/wikiart_artists_19th_century.json to self.artists with 857 entries


In [137]:
# Build the Sotheby's search URLs for the loaded artists, then persist the search URL list.
# NOTE(review): `sothebys` is a notebook-level object that no visible cell creates — confirm
# where it is defined before running this on a fresh kernel.
sothebys.build_search_URLs(artlytic.artists)
with open(artlytic.search_URLs_path, 'w') as search_urls_file:
    json.dump(
        obj=artlytic.search_URLs,
        fp=search_urls_file,
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
    )

Beginning with adding new search URLs. Current list size: 2570
Added to search_URLs 857 new entries. New size: 3427


In [173]:
# Remove entries with a repeated 'url' from the lot URL file (the first occurrence is kept).
artlytic.eliminate_duplicates_from_lot_URLs()

Size of lot_URLs before eliminating duplicates: 166476
Size of lot_URLs after eliminating duplicates: 160065
6411 duplicate urls from lot_URLs were removed.
saved list without duplicates to data/lot_urls.json


In [166]:
# Write the in-memory lot URL list back to its JSON file and report the number of entries.
with open(artlytic.lot_URLs_path, 'w') as lot_urls_file:
    json.dump(
        obj=artlytic.lot_URLs,
        fp=lot_urls_file,
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
    )
lot_url_count = len(artlytic.lot_URLs)
print('Updated', artlytic.lot_URLs_path, '. File contains now', lot_url_count, 'entries')

Updated data/lot_urls.json . File contains now 126740 entries


In [167]:
# Write the in-memory search URL list (including the scraped flags) back to its JSON file.
with open(artlytic.search_URLs_path, 'w') as search_urls_file:
    json.dump(
        obj=artlytic.search_URLs,
        fp=search_urls_file,
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
    )
print('Updated scraped status in', artlytic.search_URLs_path)

Updated scraped status in data/search_urls.json
