# Data Clean Up processes

In [2]:
import re, numbers
import numpy as np
import pandas as pd
from functools import partial #currency -> USD?

#Converts Currencies
def currency_conv(curr):
    if   curr == "GBP": return 1.4
    elif curr == "EUR": return 1.24
    elif curr == "USD": return 1.24
    else: return np.NaN

# Mainly used for execution Yr Data
def ckInt(is_int, tf=False, dbg=False, btwnYrs=True):
    if is_int != is_int: return np.NaN
    elif isinstance(is_int, numbers.Number):
        if dbg: print("dbg: numeric")
        if tf: return True
        else:
            #year of execution Not ridiculous
            if btwnYrs and (is_int < 1800 or is_int > 2019): return np.NaN
            return is_int
    else:
        if dbg: print("dbg: Notnum")
        if tf: return False
        else:
            r = re.search("[1-9][0-9][0-9][0-9]", is_int)
            if not(r): return np.NaN
            else:
                nYr = float(r.group(0))
                if btwnYrs and (nYr < 1800 or nYr > 2019): return np.NaN
                return nYr
            
# Using partial functions for quick apply to columnal data
TFInt = partial(ckInt,True, True)
TFint = partial(ckInt,True, False)

In [4]:
# Reading the input Data in
#data = pd.read_csv('data.csv', encoding='iso-8859-1')
data = pd.read_csv('data.csv', encoding='latin-1')
print(data.shape)
data.info()

(107578, 18)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107578 entries, 0 to 107577
Data columns (total 18 columns):
artist_birth_year        107578 non-null int64
artist_death_year        71186 non-null float64
artist_name              107578 non-null object
artist_nationality       107578 non-null object
auction_date             107578 non-null object
category                 107578 non-null object
currency                 107578 non-null object
edition                  13376 non-null object
estimate_high            68404 non-null float64
estimate_low             68409 non-null float64
hammer_price             101254 non-null float64
location                 89733 non-null object
materials                105739 non-null object
measurement_depth_cm     107578 non-null int64
measurement_height_cm    107578 non-null int64
measurement_width_cm     107578 non-null int64
title                    107118 non-null object
year_of_execution        79818 non-null object
dtypes: float64(4)

In [13]:
print(data[['artist_birth_year', 'artist_death_year', 'artist_name','year_of_execution']].head())
print(data.year_of_execution.value_counts(dropna=False))
print(data.currency.value_counts(dropna=False)).head()
print(data.artist_nationality.value_counts(dropna=False))

   artist_birth_year  artist_death_year  artist_name year_of_execution
0               1902             1982.0  Wifredo Lam              1944
1               1902             1982.0  Wifredo Lam              1964
2               1902             1982.0  Wifredo Lam              1955
3               1902             1982.0  Wifredo Lam              1973
4               1902             1982.0  Wifredo Lam               NaN
NaN                                       27760
1969                                       2113
1989                                       1921
1982                                       1900
1973                                       1893
1988                                       1886
1971                                       1732
1980                                       1729
1970                                       1728
1966                                       1670
1967                                       1635
1984                                       160

AttributeError: 'NoneType' object has no attribute 'head'

In [7]:
# Adding currency coversion rates
data['CONVrate'] = data.currency.apply(currency_conv)
# Mark if work is unique == more $$$ than others
data['unique'] = data.edition == "unique"
#auction date post death? or near/at death = death increase price
data['auct_yr'] =  pd.DatetimeIndex(data.auction_date).year
data['exec_pmortem'] = data.auct_yr >= data.artist_death_year
data["yr_exec"] = data.year_of_execution.apply(ckInt)
print(data.year_of_execution.value_counts(dropna=False)) # check to see if conversion to int done right

NaN                                       27760
1969                                       2113
1989                                       1921
1982                                       1900
1973                                       1893
1988                                       1886
1971                                       1732
1980                                       1729
1970                                       1728
1966                                       1670
1967                                       1635
1984                                       1607
1974                                       1581
1986                                       1564
1968                                       1531
1983                                       1516
1985                                       1502
1990                                       1486
1981                                       1481
1965                                       1462
1972                                    

In [9]:
print("hammer < highEstm", np.sum(data.estimate_high < data.hammer_price))
print("hammer > low Estm", np.sum(data.estimate_low  > data.hammer_price))
data[['exec_pmortem', 'auct_yr', 'artist_death_year']].query("exec_pmortem == 1")

hammer < highEstm 18984
hammer > low Estm 24572


Unnamed: 0,exec_pmortem,auct_yr,artist_death_year
0,True,1987,1982.0
1,True,2005,1982.0
2,True,1994,1982.0
3,True,1987,1982.0
4,True,2002,1982.0
5,True,2009,1982.0
6,True,2015,1982.0
7,True,2006,1982.0
8,True,2008,1982.0
9,True,1990,1982.0


In [10]:
isinstance(data.year_of_execution.iloc[-1] , numbers.Number)
isinstance(np.NaN, numbers.Number)

True

In [11]:
#birth yr > death yr
print(np.sum(data.artist_birth_year > data.artist_death_year)) # 0 good
print(data.yr_exec.value_counts())

0
1969.0    2136
1989.0    1958
1982.0    1923
1973.0    1923
1988.0    1916
1970.0    1811
1980.0    1778
1971.0    1752
1966.0    1700
1967.0    1673
1984.0    1642
1974.0    1592
1968.0    1589
1986.0    1583
1983.0    1574
1985.0    1553
1990.0    1520
1965.0    1501
1981.0    1498
1972.0    1459
1987.0    1447
1964.0    1411
1960.0    1343
1961.0    1242
1978.0    1209
1999.0    1170
1979.0    1162
1976.0    1161
1962.0    1158
1975.0    1148
          ... 
1874.0      30
1868.0      29
1876.0      29
1864.0      28
1879.0      27
1869.0      25
1877.0      20
1860.0      17
1863.0      16
1856.0      14
1861.0      13
1858.0      12
1853.0      11
1850.0       9
1852.0       9
2016.0       7
1855.0       5
1854.0       5
1820.0       5
1851.0       3
1840.0       3
1857.0       2
1862.0       2
1842.0       2
1859.0       1
1848.0       1
1831.0       1
1821.0       1
1812.0       1
1806.0       1
Name: yr_exec, Length: 175, dtype: int64


In [12]:
def cnv_mean(x): return float(x.replace(',',''))
#Adding Dji Avg Data Given Acution years goes back to 80's. This will be used as asset value normalizer

dji = pd.read_csv("DJind.txt", sep="\t")
dji['DJlevel'] = dji.ValueClose.apply(cnv_mean)
DJi = dji.groupby(['Year'], as_index=False).agg({'DJlevel': 'mean'})
df = data.merge(DJi, how='left', left_on=['auct_yr'], right_on=['Year'])
del df['Year']
print(DJi.head())

   Year  DJlevel
0  1900   68.130
1  1901   70.440
2  1902   64.320
3  1903   56.855
4  1904   70.050


In [None]:
# Save Data to Pickle File
df.to_pickle('./cData.pickle')

In [None]:
# Some additional Data checking
"""
DATA Conversion & insight
year of execution -> numeric early, mid, late
attempt to id the artist by yr first 3 digits or by 50 year interval?
Long vs Short title?
check to see if market level (SPX) or volatility? (VIX not that long) has anything to do w/ price?

data.year_of_execution.head().apply(ckInt)
data.year_of_execution.apply(ckInt)
data[['year_of_execution', ]]
data.year_of_execution[data.year_of_execution.apply(TFint)]
data.year_of_execution.apply(TFint)
data.year_of_execution.iloc[-3:].apply(ckInt)
data.year_of_execution.iloc[-3:].apply(TFInt) # what the #$@#$!
data.year_of_execution.iloc[-3:]
ckInt(data.year_of_execution.iloc[-1], True, True)
"""