In [2]:
import pandas as pd
# Define the URL of the CSV file
csv_url = "https://media.githubusercontent.com/media/metmuseum/openaccess/master/MetObjects.csv"

# Download the CSV file using pandas and create a Pandas DataFrame
raw_df = pd.read_csv(csv_url,low_memory=False)

In [155]:
use_cols = [
    'Object ID', # Cleaned
    'Object Name', 
    'Country', # Cleaned
    'Culture', # Cleaned 
    'Period', # Cleanish - picked the first, broader description, if multiple present ex.: Min dynasty, dude 1 => ming dynasty
    'Dynasty', # removed
    'Reign', # removed
    'Medium', 
    'Is Highlight', # Cleaned
    'Gallery Number', # Cleaned
    'Department', # Cleared
    'Artist Display Name', 
    'Artist Gender', 
    'Artist Role'
]

# Define function to split by '|' and return unique countries
def get_unique_values(row, delim='|'):
    values = []
    if (row == 'nan') or pd.isna(row):
        return [None]
    for v in row.split(delim):
        if v.strip():
            values.append(v.strip())
            if v.strip() == 'nan':
                values.append(None)
        else: 
            values.append(None)
    # if len(list(values)) > 0:
    #     return list(values)
    return values


df = raw_df.loc[:, use_cols]

In [44]:
artist_roles = df_pandas['Artist Role'].values

In [None]:
df_pandas[]

In [346]:
df_pandas.isna().sum()

Object ID                   0
Object Name              1691
Country                402063
Period                 386872
Culture                270752
Dynasty                454571
Reign                  466578
Medium                   7121
Is Highlight                0
Gallery Number         426028
Department                  0
Artist Display Name         0
Artist Gender               0
Artist Role                 0
dtype: int64

In [45]:
df.isna().sum()

Object ID                   0
Object Name              1691
Country                402053
Period                 386848
Culture                270425
Dynasty                454571
Reign                  466578
Medium                   7120
Is Highlight                0
Gallery Number         426028
Department                  0
Artist Display Name    202269
Artist Gender          374743
Artist Role            204368
dtype: int64

In [134]:
# No point joining period & other values since the set difference is null
contains_dynasty = set(df['Dynasty'].dropna().index.values)
contains_reign = set(df['Reign'].dropna().index.values)
contains_period = set(df['Period'].dropna().index.values)
print(len(contains_period))
print(len(contains_reign))
print(len(contains_dynasty))
print(len(contains_reign - contains_period), len(contains_dynasty - contains_period))

90956
11226
23233
0 0


In [282]:
pd.DataFrame(one_culture.value_counts()).reset_index().rename(columns= {0: 'counts'}).query('counts > 10').sort_values(by='index').style

Unnamed: 0,index,counts
192,Abelam,41
68,Achaemenid,171
424,Acheulean,11
131,Aegean,66
126,Afghan,72
79,African,143
127,Akan,71
181,Akkadian,44
175,Alanic,45
213,Albanian,36


In [337]:
%%time
# df_pandas is clean data
df_pandas = df.copy()

# Artist Attributes
df_pandas['Artist Role'] = df_pandas['Artist Role'].astype(str)
df_pandas['Artist Role'] = df_pandas['Artist Role'].apply(get_unique_values)
df_pandas['Artist Display Name'] = df_pandas['Artist Display Name'].astype(str)
df_pandas['Artist Display Name'] = df_pandas['Artist Display Name'].apply(get_unique_values)
df_pandas['Artist Gender'] = df_pandas['Artist Gender'].astype(str)
df_pandas['Artist Gender'] = df_pandas['Artist Gender'].apply(get_unique_values)

# Gallery Number
df_pandas['Gallery Number'].replace("in Great Hall", 1001, inplace=True)
df_pandas['Gallery Number'].replace("Petrie Ct. Café", 1002, inplace=True)
df_pandas['Gallery Number'].replace("on Fifth Avenue", 1003, inplace=True)
df_pandas['Gallery Number'].replace("Watson Library", 1004, inplace=True)
df_pandas['Gallery Number'] = df_pandas['Gallery Number'].astype(float)
df_pandas['Gallery Number'] = df_pandas['Gallery Number'].astype("Int64")

# Period
# Join Period & Dynasty/Reign
df_pandas.loc[list(contains_reign - contains_period), 'Period'] = df_pandas.loc[list(contains_reign - contains_period), 'Reign']
df_pandas.loc[list(contains_dynasty - contains_period), 'Period'] = df_pandas.loc[list(contains_dynasty - contains_period), 'Dynasty']
df_pandas.drop(columns = ['Dynasty', 'Reign'])

# Culture
# regular expression pattern to remove elements within parentheses
pattern_parenthesis = r'\((.*?)\)'
cleaned_culture = df_pandas['Culture'].str.replace(pattern_parenthesis, '', regex=True).str.strip()
add_culture_info = df_pandas['Culture'].dropna().str.findall(pattern_parenthesis) # ignore this for scope of project
# Get only the first cuture ( more often than not, the secondary values are city-specific, which we don't want to worry about)
one_culture = cleaned_culture.apply(get_unique_values, delim= ',').apply(lambda x: x[0])
# remove filler words
one_culture = one_culture.str.replace(r'\b(possibly|probably|Possibly|Probably)\b', '', regex=True).str.strip()
one_culture = one_culture.str.replace(r'\b(peoples|people|People|Peoples)\b', '', regex=True).str.strip()
one_culture = one_culture.str.replace('?', '', regex=False).str.strip()
one_culture = one_culture.str.replace(r'\s+', ' ', regex=True).str.strip()
one_culture = one_culture.str.split(";", expand=True)[0]
one_culture = one_culture.str.split(":", expand=True)[0]
other_replace = {
    "Afghanistani": 'Afghan',
    'Afghanistan': 'Afghan',
    'India': 'Indian',
    'Argentinean':'Argentinian',
    'China': 'Chinese',
    'Columbia': 'Columbia', 
    'Guatemala': 'Guatemalan',
    'Indonesia': 'Indonesian',
    'Iran': 'Iranian',
    'Korea' : 'Korean',
    'Malayan': 'Malaysian',
    'Mexico': 'Mexican',
    'Nepal': 'Nepalese',
    'Nigeria' : 'Nigerian',
    'Netherlandish' : 'Dutch',
    'Peru': 'Peruvian',
    'Philippines': 'Philippine',
    'Papua New Guinea': 'Papua New Guinean',
    'Sri Lanka': 'Sri Lankan',
    'Sumatra': 'Sumatran',
    'Thailand': 'Thai',
    'Tibet':'Tibetan',
    'Unknown fabric': 'Unknown',
    'blade' : 'Unknown',
    'hilt': 'Unknown',
    'Sumba Island': 'Sumba' 
}
one_culture = one_culture.replace(other_replace)
df_pandas['Culture'] = one_culture

# Country
# regular expression pattern to remove elements within parentheses
cleaned_country = df_pandas['Country'].str.replace(pattern_parenthesis, '', regex=True).str.strip()
# Get only the first country ( more often than not, the secondary values are city-specific, which we don't want to worry about)
one_country = cleaned_country.apply(get_unique_values, delim= '|').apply(lambda x: x[0])

# remove filler words
one_country = one_country.str.replace(r'\b(possibly|probably|Possibly|Probably)\b', '', regex=True).str.strip()
one_country = one_country.str.replace(r'\b(present-day|Present-Day|Present-day|modern-day|perhaps)\b', '', regex=True).str.strip()
one_country = one_country.str.replace('from the', '', regex=False).str.strip()

one_country = one_country.str.replace('?', '', regex=False).str.strip()
one_country = one_country.str.replace(r'\s+', ' ', regex=True).str.strip()
one_country = one_country.str.split(";", expand=True)[0]
one_country = one_country.str.split(":", expand=True)[0]
other_replace_country = {
    "USA": "United States",
    "U.S.A.": "United States",
    "US": "United States",
    "United States of America": "United States",
    'UK':'United Kingdom',
    'The Netherlands':'Netherlands',
    'Unknown country': 'Unknown',
    'italy':'Italy', 
    'india': 'India', 
    'iran': 'Iran'
}
one_country = one_country.replace(other_replace_country)
df_pandas['Country'] = one_country

# Period

cleaned_period = df_pandas['Period'].str.replace(pattern_parenthesis, '', regex=True).str.strip()
# Get only the first period ( more often than not, the secondary values are too specific, which we don't want to worry about)
one_period = cleaned_period.apply(get_unique_values, delim= ',').apply(lambda x: x[0])

# remove filler words
one_period = one_period.str.replace(r'\b(possibly|probably|Possibly|Probably)\b', '', regex=True).str.strip()
# one_period = one_period.str.replace(r'\b(middle|late|early|)\b', '', regex=True).str.strip()
one_period = one_period.str.replace('from the', '', regex=False).str.strip()

one_period = one_period.str.replace('?', '', regex=False).str.strip()
one_period = one_period.str.replace(' /', '/', regex=False).str.strip()
one_period = one_period.str.replace('/ ', '/', regex=False).str.strip()
one_period = one_period.str.replace(r'\s+', ' ', regex=True).str.strip()
one_period = one_period.str.split(";", expand=True)[0]
one_period = one_period.str.split(":", expand=True)[0]
other_replace_period = {
    "Date being researched":"Unknown",
    "Dates being researched":"Unknown",
#     "Cypro-Archaic I" : "Cypro-Archaic",
#     "Cypro-Archaic II": "Cypro-Archaic",
#     "Cypro-Archaic I–II": "Cypro-Archaic",
#     "Cypro-Classical I":"Cypro-Classical",
#     "Cypro-Classical II":"Cypro-Classical",
#     "Cypro-Classical I or II": "Cypro-Classical",
#     "Cypro-Geometric I":"Cypro-Geometric",
#     "Cypro-Geometric II":"Cypro-Geometric",
#     "Cypro-Geometric III":"Cypro-Geometric"
}
one_period = one_period.replace(other_replace_period)
df_pandas['Period'] = one_period

# change other values to None
# df_pandas.replace([None], None)
df_pandas = df_pandas.replace('nan', None)
df_pandas = df_pandas.replace('Unknown', None)

Wall time: 42.6 s


Unnamed: 0,Object ID,Object Name,Country,Period,Culture,Dynasty,Reign,Medium,Is Highlight,Gallery Number,Department,Artist Display Name,Artist Gender,Artist Role
0,1,Coin,,,,,,Gold,False,,The American Wing,[James Barton Longacre],[None],[Maker]
1,2,Coin,,,,,,Gold,False,,The American Wing,[Christian Gobrecht],[None],[Maker]
2,3,Coin,,,,,,Gold,False,,The American Wing,[None],[None],[None]
3,4,Coin,,,,,,Gold,False,,The American Wing,[None],[None],[None]
4,5,Coin,,,,,,Gold,False,,The American Wing,[None],[None],[None]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477799,860869,Drawing,,,,,,"Graphite on cardstock adhered to canvas, perfo...",False,,Drawings and Prints,[Brewster & Co.],[None],[Manufacturer]
477800,860870,Drawing,,,,,,Graphite and ink,False,,Drawings and Prints,[Brewster & Co.],[None],[Manufacturer]
477801,860871,Print,,,,,,Etching,False,,Drawings and Prints,[Paul Sandby],[None],[Artist and engraver]
477802,860872,Drawing,,,,,,Graphite,False,,Drawings and Prints,[Brewster & Co.],[None],[Manufacturer]


In [362]:
df_pandas['Medium'].str.replace(pattern_parenthesis, '', regex=True).loc[:6832]

0            Gold
1            Gold
2            Gold
3            Gold
4            Gold
          ...    
6828    Porcelain
6829    Porcelain
6830    Porcelain
6831    Porcelain
6832    Porcelain
Name: Medium, Length: 6833, dtype: object

In [380]:
# Medium
# regular expression pattern to remove elements within parentheses

pattern_parenthesis = r'\((.*?)\)'
cleaned_medium = df_pandas['Medium'].str.replace(pattern_parenthesis, '', regex=True).str.strip()
one_medium = cleaned_medium.str.lower()

# remove filler words
one_medium = one_medium.str.replace(r'\b(possibly|probably|Possibly|Probably)\b', '', regex=True).str.strip()
one_medium = one_medium.str.replace('from the', '', regex=False).str.strip()

one_medium = one_medium.str.replace('?', '', regex=False).str.strip()
one_medium = one_medium.str.replace('a,b)', '', regex=False).str.strip()
one_medium = one_medium.str.replace('a, b)', '', regex=False).str.strip()
one_medium = one_medium.str.replace('a-c)', '', regex=False).str.strip()
one_medium = one_medium.str.replace('a)', '', regex=False).str.strip()
one_medium = one_medium.str.replace('b)', '', regex=False).str.strip()
# one_medium = one_medium.str.replace('alloy of', '', regex=False).str.strip()
# one_medium = one_medium.str.replace(' and ', ',', regex=False).str.strip()



one_medium = one_medium.str.replace('/ ', '/', regex=False).str.strip()
one_medium = one_medium.str.replace(r'\s+', ' ', regex=True).str.strip()
# one_medium = one_medium.str.split(";", expand=True)[0]
# one_medium = one_medium.str.split(":", expand=True)[0]
other_replace_medium = {
    "[medium not available]":"Unknown",
    "[no medium available]":"Unknown",

}
one_medium = one_medium.replace(other_replace_medium)

split_medium = cleaned_period.apply(get_unique_values, delim= ',')#.apply(lambda x: x[0])

# one_period = one_period.replace(other_replace_period)
# df_pandas['Period'] = one_period


In [381]:
type(df_pandas['Gallery Number'].values[0])

pandas._libs.missing.NAType

In [382]:
pd.DataFrame(one_medium.value_counts()).reset_index().rename(columns= {'Medium': 'counts'}).query('counts >10').sort_values(by='index').style

Unnamed: 0,index,counts
37,Unknown,1691
2059,acrylic,12
1172,acrylic and silkscreen on canvas,25
396,acrylic on canvas,99
1345,acrylic on paper,21
189,agate,236
755,"agate, banded",43
2316,agate; cut and drilled,11
342,alabaster,118
2020,alabaster with paint and gilding,13


In [366]:
one_medium[one_medium == 'a) rock crystal; b) bronze']

37952    a) rock crystal; b) bronze
37953    a) rock crystal; b) bronze
37954    a) rock crystal; b) bronze
Name: Medium, dtype: object

In [None]:
df_

In [367]:
df_pandas.loc[37950:37958, 'Medium']

37950                      Rock crystal
37951                      Rock crystal
37952    a) Rock crystal; b) Bronze\r\n
37953    a) Rock crystal; b) Bronze\r\n
37954        a) Rock crystal; b) Bronze
37955                      Rock crystal
37956    a) Rock crystal; b) Silver\r\n
37957                      Rock crystal
37958              Silk tapestry (kesi)
Name: Medium, dtype: object

In [321]:
cleaned_period.unique().shape

(1699,)

In [304]:
one_country.unique().shape

(584,)

## Scratch Space
----

In [335]:
var = 'Medium'
pd.DataFrame(df_pandas[var].value_counts()).reset_index().rename(columns= {var: 'counts'}).query('counts > 10').sort_values(by='index').style

Unnamed: 0,index,counts
2218,"(a, b) leather",12
2168,"(a, b) silk",12
1229,Acrylic and silkscreen on canvas,25
406,Acrylic on canvas,99
1408,Acrylic on paper,21
194,Agate,229
808,"Agate, banded",42
2405,Agate; cut and drilled,11
375,Alabaster,107
2086,Alabaster with paint and gilding,13


In [306]:
pd.DataFrame(one_country.value_counts()).reset_index().rename(columns= {0: 'counts'})#.query('counts > 10')#.sort_values(by='index').style

Unnamed: 0,index,counts
0,Egypt,31446
1,United States,9590
2,Iran,6595
3,Peru,3427
4,France,1966
...,...,...
578,Lower Austria,1
579,Cyprus or Turkey,1
580,Constantinople,1
581,South Italy,1
