# Preparing Artcollect data for pre-processing
The Artcollect auction house data capturing process collects artwork information in JSON files.
For further pre-processing some additional actions need to be taken: 
- Unify the names of attributes, since they are not identical for each auction house
- Filter out non-art-related auctions, e.g. watches, cars etc. (in particular from Christie's)
- Set the new index to 'key'
- Save all data into one large file for further pre-processing

In [96]:
import csv
import os

import numpy as np
import pandas as pd

In [97]:
# Root directory of the Artcollect data files. It can be overridden with the
# ARTCOLLECT_DATAPATH environment variable; the original local path is kept as
# the fallback so existing runs behave exactly as before.
# os.path.join(..., "") guarantees a trailing separator, because the other
# cells build file names as DATAPATH + 'name'.
DATAPATH = os.path.join(
    os.environ.get("ARTCOLLECT_DATAPATH", "/Users/denizthemenace/dev/artcollect/data/"),
    "",
)

## Christie's clean-up
### Step 1: Concatenate Christie's datafiles to one DataFrame

In [98]:
# Christie's lots come in two JSON files; read both and stack them into a
# single raw DataFrame.
lots_christies_1, lots_christies_2 = (
    pd.read_json(DATAPATH + export_name)
    for export_name in ('christies_1998-2005.json', 'christies_2006-2017.json')
)
lots_christies_raw = pd.concat([lots_christies_1, lots_christies_2])

### Step 2: Filter out non-art-related auctions

In [100]:
# Read the file containing the names of the art related auctions.
# Only the first column of every row is used. Blank lines (which csv.reader
# yields as empty rows) are skipped instead of raising IndexError.
# newline='' is what the csv module expects for files handed to csv.reader.
with open(DATAPATH + 'christies_auctions_only_art_cleaned_up.csv', 'r', newline='') as f:
    christies_only_art_auctions = [row[0] for row in csv.reader(f) if row]

# The messages use double quotes: the original 'Christie''s' was two adjacent
# string literals, which Python concatenates to "Christies" (no apostrophe).
print("# of Christie's lots before filtering: ", len(lots_christies_raw))

# Keep only the lots whose sale title is in the cleaned-up auction list and
# index them by 'key'. set_index returns a new frame here, so the filtered
# selection of lots_christies_raw is never modified in place.
is_art_auction = lots_christies_raw['sale_title'].isin(christies_only_art_auctions)
lots_christies_filtered = lots_christies_raw[is_art_auction].set_index('key')
print("# of Christie's lots after  filtering: ", len(lots_christies_filtered))

# Build the final Christie's frame from the filtered lots without touching
# lots_christies_filtered:
#  - 'auction_house' holds the constant label 'christies' and takes the place
#    of 'auction_house_name'
#  - 'image_urls' carries the values of 'image_url', which is then removed
lots_christies = (
    lots_christies_filtered
    .assign(auction_house='christies',
            image_urls=lots_christies_filtered['image_url'])
    .drop(['auction_house_name', 'image_url'], axis=1)
)

# of Christies lots before filtering:  927643
# of Christies lots after  filtering:  232542


## Phillips & Sotheby's clean-up

In [101]:
def load_house_lots(filename, auction_house):
    """Read one auction house's lots file, index it by 'key' and label it.

    Parameters
    ----------
    filename : str
        Name of the JSON file; it is appended to DATAPATH.
    auction_house : str
        Label written to every row of the 'auction_house' column.

    Returns
    -------
    pandas.DataFrame
        The lots indexed by 'key', with the 'auction_house' column added and
        the 'auction_house_name' column removed.
    """
    lots = pd.read_json(DATAPATH + filename).set_index('key')
    lots['auction_house'] = auction_house
    return lots.drop('auction_house_name', axis=1)


# No auction filtering is applied to these two houses in this cell; the
# "after  filtering" wording of the messages is kept exactly as it was.
lots_phillips = load_house_lots('phillips_all_lots_from_auction_pages_1-40.json', 'phillips')
print("# of Phillips lots after  filtering: ", len(lots_phillips))

lots_sothebys = load_house_lots('sothebys.json', 'sothebys')
print("# of Sotheby's lots after  filtering: ", len(lots_sothebys))

# of Phillips lots after  filtering:  64000
# of Sotheby's lots after  filtering:  246362


## Create one DataFrame containing all lots

In [102]:
# Stack the per-house frames (Christie's, Phillips, Sotheby's) into one
# DataFrame and report how many lots it holds.
house_frames = [lots_christies, lots_phillips, lots_sothebys]
lots_not_preprocessed = pd.concat(house_frames)
print("Total # of lots fro pre-processing:", len(lots_not_preprocessed))

Total # of lots fro pre-processing: 542904
Loaded 542904 lots for pre-processing into /Users/denizthemenace/dev/artcollect/data/lots_not_preprocessed.json


## Currency conversion to USD

In [105]:
# Number of lots per value of the 'currency' column, most frequent first.
currency_counts = lots_not_preprocessed['currency'].value_counts()
currency_counts

GBP    188886
USD    153951
EUR     65877
HKD     43359
CHF      3381
NLG      2791
AUD       959
CAD       781
SGD       430
CNY       368
RMB       319
INR       128
Name: currency, dtype: int64

### Step 1: Convert prices to USD

In [107]:
# Conversion rate per currency code. The calling cell divides prices by these
# values to obtain USD, and 'USD' maps to 1.0. The table is defined once at
# module level so it is not rebuilt for every row.
# NOTE(review): 'ITL' (1.64331) looks about 1000x too small compared with the
# other pre-euro currencies in this table, and 'SKK' (8.17056) may actually be
# the SEK rate - verify both before relying on lots in those currencies.
_UNITS_PER_USD = {
    'GBP': 0.747620,
    'EUR': 0.848868,
    'USD': 1.0,
    'INR': 65.3090,
    'JPY': 112.651,
    'AUD': 1.27604,
    'CAD': 1.24811,
    'SGD': 1.35749,
    'CHF': 0.969412,
    'CNY': 6.65199,
    'RMB': 6.65199,
    'TWD': 30.3466,
    'HKD': 7.81040,
    'RUB': 57.5945,
    'SKK': 8.17056,
    'DKK': 6.31823,
    'NLG': 1.87082,
    'FRF': 5.56820,
    'ITL': 1.64331,
    'DEM': 1.65949,
    'ESP': 141.173,
}


def convert_to_usd_basic(row):
    '''Return the conversion rate for the currency of one lot.

    Despite its name, this function does not convert a price itself: it only
    looks up the rate for ``row['currency']``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(..., axis=1)``)
        Must support ``row['currency']``.

    Returns
    -------
    float or None
        The rate for the row's currency, or None when the row has no
        'currency' entry or the currency is not in the table (this includes
        missing values such as None/NaN).
    '''
    try:
        return _UNITS_PER_USD[row['currency']]
    except (KeyError, TypeError):
        # KeyError: no 'currency' entry, or an unknown / missing currency code.
        # TypeError: an unhashable value, or a row that cannot be indexed by name.
        # Anything else (e.g. KeyboardInterrupt during a long apply) is
        # deliberately allowed to propagate instead of being swallowed.
        return None

# Work on a copy so lots_not_preprocessed keeps the prices in their original
# currencies.
lots_not_preprocessed_usd = lots_not_preprocessed.copy()

# One conversion rate per lot, looked up from the lot's 'currency' value.
lots_not_preprocessed_usd['conversion_rate'] = lots_not_preprocessed_usd.apply(
    convert_to_usd_basic, axis=1
)

# Divide every price column by that rate.
# NOTE(review): the two estimate columns are converted with the rate of
# 'currency'; the 'estimate_currency' column is never read here - confirm that
# both columns always hold the same currency.
for price_column in ('price', 'max_estimated_price', 'min_estimated_price'):
    lots_not_preprocessed_usd[price_column] = (
        lots_not_preprocessed_usd[price_column]
        / lots_not_preprocessed_usd['conversion_rate']
    )

### Step 2: Drop currency columns because all data is in USD now

In [109]:
# Columns that are removed from the USD frame in this step.
columns_to_drop = [
    'currency',
    'estimate_currency',
    'conversion_rate',
]
lots_not_preprocessed_usd = lots_not_preprocessed_usd.drop(columns_to_drop, axis='columns')

## Save to file

In [113]:
# NOTE(review): the original wrote to 'lots_not_preprocessed.json' but its
# message reported 'lots_not_preprocessed_usd.json'. The written file name is
# kept unchanged so that anything reading it keeps working; the message now
# reports the path that is really written. Confirm which name is intended.
output_path = DATAPATH + 'lots_not_preprocessed.json'
lots_not_preprocessed_usd.to_json(output_path)
print('Saved', len(lots_not_preprocessed_usd), 'lots in USD for pre-processing to\n', output_path)

Saved 542904 lots in USD for pre-processing to
 /Users/denizthemenace/dev/artcollect/data/lots_not_preprocessed_usd.json
