### setup

In [285]:
import pandas as pd
import time
import datetime
import re
import matplotlib.pyplot as plt
import urllib.request
from PIL import Image

# Show full cell contents (no column-width truncation) and every column
# whenever a DataFrame is displayed.
pd.set_option('display.max_colwidth', None) 
pd.set_option('display.max_columns', None)  

# NOTE(review): a star import hides which names come from functions.py —
# consider importing the needed names explicitly.
from functions import *

## import MET Open Access CSV

The Metropolitan Museum of Art Open Access CSV <br>
https://github.com/metmuseum/openaccess

The CSV differs slightly from the data pulled directly from the MET Open Access [API page](https://metmuseum.github.io/).<br>
The API data is slightly smaller but has more features, so it is still worth pulling.

Pulling through the API is slow, so we can use the CSV to filter down to the object IDs we want and then pull only those objects from the API.

In [64]:
start_time = time.time()  # start the time counter

# Forward slash keeps the path portable: a backslash separator only resolves on
# Windows, and '\M' is an invalid string escape sequence.
# dtype='str' loads every column as text, so IDs are compared as strings later on.
df = pd.read_csv('openaccess/MetObjects.csv', dtype='str')

time_spent = round(time.time() - start_time)
print(f'{str(datetime.timedelta(seconds=time_spent))} slipped...')  # print out the time spent

0:00:05 slipped...


#### basic check

In [65]:
df.shape

(477804, 54)

In [66]:
df.head(3)

Unnamed: 0,Object Number,Is Highlight,Is Timeline Work,Is Public Domain,Object ID,Gallery Number,Department,AccessionYear,Object Name,Title,Culture,Period,Dynasty,Reign,Portfolio,Constituent ID,Artist Role,Artist Prefix,Artist Display Name,Artist Display Bio,Artist Suffix,Artist Alpha Sort,Artist Nationality,Artist Begin Date,Artist End Date,Artist Gender,Artist ULAN URL,Artist Wikidata URL,Object Date,Object Begin Date,Object End Date,Medium,Dimensions,Credit Line,Geography Type,City,State,County,Country,Region,Subregion,Locale,Locus,Excavation,River,Classification,Rights and Reproduction,Link Resource,Object Wikidata URL,Metadata Date,Repository,Tags,Tags AAT URL,Tags Wikidata URL
0,1979.486.1,False,False,False,1,,The American Wing,1979,Coin,One-dollar Liberty Head Coin,,,,,,16429.0,Maker,,James Barton Longacre,"American, Delaware County, Pennsylvania 1794–1869 Philadelphia, Pennsylvania",,"Longacre, James Barton",American,1794.0,1869.0,,http://vocab.getty.edu/page/ulan/500011409,https://www.wikidata.org/wiki/Q3806459,1853,1853,1853,Gold,Dimensions unavailable,"Gift of Heinz L. Stoppelmann, 1979",,,,,,,,,,,,,,http://www.metmuseum.org/art/collection/search/1,,,"Metropolitan Museum of Art, New York, NY",,,
1,1980.264.5,False,False,False,2,,The American Wing,1980,Coin,Ten-dollar Liberty Head Coin,,,,,,107.0,Maker,,Christian Gobrecht,1785–1844,,"Gobrecht, Christian",,1785.0,1844.0,,http://vocab.getty.edu/page/ulan/500077295,https://www.wikidata.org/wiki/Q5109648,1901,1901,1901,Gold,Dimensions unavailable,"Gift of Heinz L. Stoppelmann, 1980",,,,,,,,,,,,,,http://www.metmuseum.org/art/collection/search/2,,,"Metropolitan Museum of Art, New York, NY",,,
2,67.265.9,False,False,False,3,,The American Wing,1967,Coin,Two-and-a-Half Dollar Coin,,,,,,,,,,,,,,,,,,,1909–27,1909,1927,Gold,Diam. 11/16 in. (1.7 cm),"Gift of C. Ruxton Love Jr., 1967",,,,,,,,,,,,,,http://www.metmuseum.org/art/collection/search/3,,,"Metropolitan Museum of Art, New York, NY",,,


In [176]:
df[df['Object ID']=='45189']

Unnamed: 0,Object Number,Is Highlight,Is Timeline Work,Is Public Domain,Object ID,Gallery Number,Department,AccessionYear,Object Name,Title,Culture,Period,Dynasty,Reign,Portfolio,Constituent ID,Artist Role,Artist Prefix,Artist Display Name,Artist Display Bio,Artist Suffix,Artist Alpha Sort,Artist Nationality,Artist Begin Date,Artist End Date,Artist Gender,Artist ULAN URL,Artist Wikidata URL,Object Date,Object Begin Date,Object End Date,Medium,Dimensions,Credit Line,Geography Type,City,State,County,Country,Region,Subregion,Locale,Locus,Excavation,River,Classification,Rights and Reproduction,Link Resource,Object Wikidata URL,Metadata Date,Repository,Tags,Tags AAT URL,Tags Wikidata URL
38935,1975.283.1,False,False,True,45189,,Asian Art,1975,Robe,,Japan,,,,,,,,,,,,,,,,,,19th century,1800,1899,Silk satin with supplementary weft patterning in silk and metallic thread,Overall: 43 1/2 x 80 1/2 in. (110.5 x 204.5 cm),"Gift of Arthur M. Crocker and William R. Crocker, 1975",,,,,,,,,,,,Costumes,,http://www.metmuseum.org/art/collection/search/45189,,,"Metropolitan Museum of Art, New York, NY",Robes|Flowers,http://vocab.getty.edu/page/aat/300209852|http://vocab.getty.edu/page/aat/300132399,https://www.wikidata.org/wiki/Q345127|https://www.wikidata.org/wiki/Q506


In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 477804 entries, 0 to 477803
Data columns (total 54 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   Object Number            477804 non-null  object
 1   Is Highlight             477804 non-null  object
 2   Is Timeline Work         477804 non-null  object
 3   Is Public Domain         477804 non-null  object
 4   Object ID                477804 non-null  object
 5   Gallery Number           51776 non-null   object
 6   Department               477804 non-null  object
 7   AccessionYear            474248 non-null  object
 8   Object Name              476113 non-null  object
 9   Title                    448619 non-null  object
 10  Culture                  207379 non-null  object
 11  Period                   90956 non-null   object
 12  Dynasty                  23233 non-null   object
 13  Reign                    11226 non-null   object
 14  Portfolio           

#### check to pick a way to filter the clothing

In [68]:
df['Department'].value_counts(normalize=True)

Drawings and Prints                          0.349834
European Sculpture and Decorative Arts       0.089857
Photographs                                  0.077839
Asian Art                                    0.077055
Greek and Roman Art                          0.070636
Costume Institute                            0.065742
Egyptian Art                                 0.058522
The American Wing                            0.038535
Islamic Art                                  0.032384
Modern and Contemporary Art                  0.029805
Arms and Armor                               0.028487
Arts of Africa, Oceania, and the Americas    0.025816
Medieval Art                                 0.014964
Ancient Near Eastern Art                     0.013022
Musical Instruments                          0.010904
European Paintings                           0.005477
Robert Lehman Collection                     0.005412
The Cloisters                                0.004893
The Libraries               

In [82]:
df['Department'].nunique() # 19 department

19

In [71]:
df['Classification'].value_counts(normalize=True) # too many, not a good choice

Prints                           0.202656
Prints|Ephemera                  0.071041
Photographs                      0.067796
Drawings                         0.056850
Vases                            0.053341
                                   ...   
Ornament & Architecture|Books    0.000003
Albums|Books                     0.000003
Albums|Drawings|Sketchbooks      0.000003
Paper-Graphics-Inscribed         0.000003
Ephemera|Postcards               0.000003
Name: Classification, Length: 1212, dtype: float64

In [72]:
df['Object Name'].value_counts(normalize=True) # too many, not a good choice

Print                                           0.208856
Photograph                                      0.059772
Drawing                                         0.054164
Book                                            0.028130
Fragment                                        0.020092
                                                  ...   
Mastos                                          0.000002
Rhyton in the form of a cow's head, fragment    0.000002
Strainer for wine                               0.000002
Helmet attachment                               0.000002
Ephemera; postcard                              0.000002
Name: Object Name, Length: 28449, dtype: float64

#### use manually picked title keywords

In [107]:
df['Title'].isna().sum()

29185

In [113]:
df['Object Name'].isna().sum()

1691

#### test function

In [195]:
# keep only the rows that have a Title
df_title_pick = df[df['Title'].notna()]

In [212]:
# Filter straight from `df` so this cell does not rely on `df_object_pick`,
# which is only defined further down the notebook. na=False treats a missing
# Object Name as "no match", which selects the same rows as filtering the
# non-null subset first.
df_temp = df[df['Object Name'].str.contains('robe', case=False, na=False)]

In [204]:
df[df['Object ID']=='45189']

Unnamed: 0,Object Number,Is Highlight,Is Timeline Work,Is Public Domain,Object ID,Gallery Number,Department,AccessionYear,Object Name,Title,Culture,Period,Dynasty,Reign,Portfolio,Constituent ID,Artist Role,Artist Prefix,Artist Display Name,Artist Display Bio,Artist Suffix,Artist Alpha Sort,Artist Nationality,Artist Begin Date,Artist End Date,Artist Gender,Artist ULAN URL,Artist Wikidata URL,Object Date,Object Begin Date,Object End Date,Medium,Dimensions,Credit Line,Geography Type,City,State,County,Country,Region,Subregion,Locale,Locus,Excavation,River,Classification,Rights and Reproduction,Link Resource,Object Wikidata URL,Metadata Date,Repository,Tags,Tags AAT URL,Tags Wikidata URL
38935,1975.283.1,False,False,True,45189,,Asian Art,1975,Robe,,Japan,,,,,,,,,,,,,,,,,,19th century,1800,1899,Silk satin with supplementary weft patterning in silk and metallic thread,Overall: 43 1/2 x 80 1/2 in. (110.5 x 204.5 cm),"Gift of Arthur M. Crocker and William R. Crocker, 1975",,,,,,,,,,,,Costumes,,http://www.metmuseum.org/art/collection/search/45189,,,"Metropolitan Museum of Art, New York, NY",Robes|Flowers,http://vocab.getty.edu/page/aat/300209852|http://vocab.getty.edu/page/aat/300132399,https://www.wikidata.org/wiki/Q345127|https://www.wikidata.org/wiki/Q506


In [213]:
df_temp[df_temp['Object ID']=='45189']

Unnamed: 0,Object Number,Is Highlight,Is Timeline Work,Is Public Domain,Object ID,Gallery Number,Department,AccessionYear,Object Name,Title,Culture,Period,Dynasty,Reign,Portfolio,Constituent ID,Artist Role,Artist Prefix,Artist Display Name,Artist Display Bio,Artist Suffix,Artist Alpha Sort,Artist Nationality,Artist Begin Date,Artist End Date,Artist Gender,Artist ULAN URL,Artist Wikidata URL,Object Date,Object Begin Date,Object End Date,Medium,Dimensions,Credit Line,Geography Type,City,State,County,Country,Region,Subregion,Locale,Locus,Excavation,River,Classification,Rights and Reproduction,Link Resource,Object Wikidata URL,Metadata Date,Repository,Tags,Tags AAT URL,Tags Wikidata URL
38935,1975.283.1,False,False,True,45189,,Asian Art,1975,Robe,,Japan,,,,,,,,,,,,,,,,,,19th century,1800,1899,Silk satin with supplementary weft patterning in silk and metallic thread,Overall: 43 1/2 x 80 1/2 in. (110.5 x 204.5 cm),"Gift of Arthur M. Crocker and William R. Crocker, 1975",,,,,,,,,,,,Costumes,,http://www.metmuseum.org/art/collection/search/45189,,,"Metropolitan Museum of Art, New York, NY",Robes|Flowers,http://vocab.getty.edu/page/aat/300209852|http://vocab.getty.edu/page/aat/300132399,https://www.wikidata.org/wiki/Q345127|https://www.wikidata.org/wiki/Q506


##### def function

In [267]:
name = 'Bob'

my_str = fr'\b{name}\b'
print(my_str)

\bBob\b


In [253]:
chr(92)

'\\'

In [273]:
def print_num_object(keywords, dataframe, column):
    """Print how many rows of ``dataframe[column]`` contain each keyword.

    Every keyword is matched as a whole word (regex word boundaries) and
    case-insensitively. One line is printed per keyword, followed by the sum
    of all per-keyword counts.

    Note: a row that matches several keywords is counted once per keyword, so
    the printed total can be larger than the number of distinct rows.

    Parameters
    ----------
    keywords : iterable of str
        Words to look for; they are treated as literal text, not as regex.
    dataframe : pandas.DataFrame
        Frame to search.
    column : str
        Name of the text column to search in.

    Returns
    -------
    None
        The counts are only printed.
    """
    size = 0

    for k in keywords:
        # re.escape keeps regex metacharacters inside a keyword literal
        pattern = fr'\b{re.escape(k)}\b'
        # na=False makes missing values count as non-matches instead of
        # producing an unusable mask; summing the boolean mask counts the hits
        matches = dataframe[column].str.contains(pattern, case=False, regex=True, na=False)
        number = int(matches.sum())
        print(f'There are {number} {k}')
        size += number

    print('--------------------------')
    print(f'My data size will be {size}')


In [129]:
# manually picked garment keywords
# (rough first pass)

keywords = ['blouse', 'bodice', 'bolero', 'caftan', 'cape', 'chasuble', 'chemise', 'coat', 
 'dress', 'ensemble', 'gown', 'jacket', 'robe', 'shirt', 'shorts', 'skirt', 'suit']

In [134]:
# keep only the rows that have a Title, then show the resulting size
df_title_pick = df[df['Title'].notna()]
df_title_pick.shape

(448619, 54)

In [271]:
print_num_object(keywords, df_title_pick,'Title')

There are 333 blouse
There are 150 bodice
There are 25 bolero
There are 46 caftan
There are 475 cape
There are 157 chasuble
There are 61 chemise
There are 1509 coat
There are 6058 dress
There are 2967 ensemble
There are 316 gown
There are 752 jacket
There are 461 robe
There are 660 shirt
There are 50 shorts
There are 422 skirt
There are 1526 suit
--------------------------
My data size will be 15968


In [136]:
# keep only the rows that have an Object Name, then show the resulting size
df_object_pick = df[df['Object Name'].notna()]
df_object_pick.shape

(476113, 54)

In [274]:
print_num_object(keywords,df_object_pick,'Object Name')

There are 316 blouse
There are 77 bodice
There are 19 bolero
There are 45 caftan
There are 326 cape
There are 144 chasuble
There are 48 chemise
There are 1247 coat
There are 5507 dress
There are 3061 ensemble
There are 261 gown
There are 744 jacket
There are 724 robe
There are 597 shirt
There are 32 shorts
There are 377 skirt
There are 1379 suit
--------------------------
My data size will be 14904


#### create dataframe

##### def function

In [278]:
def dataframe_keywords(keywords, dataframe, column):
    """Collect the rows of ``dataframe`` whose ``column`` contains a keyword.

    Every keyword is matched as a whole word (regex word boundaries) and
    case-insensitively. The per-keyword matches are stacked on top of each
    other, with the matches of the LAST keyword first. A row that matches
    several keywords therefore appears once per matching keyword; the original
    index labels are kept.

    Parameters
    ----------
    keywords : iterable of str
        Words to look for; they are treated as literal text, not as regex.
    dataframe : pandas.DataFrame
        Frame to search.
    column : str
        Name of the text column to search in.

    Returns
    -------
    pandas.DataFrame
        The matching rows with the same columns as ``dataframe``; an empty
        frame with those columns when ``keywords`` is empty.
    """
    frames = []

    for k in keywords:
        # re.escape keeps regex metacharacters inside a keyword literal
        pattern = fr'\b{re.escape(k)}\b'
        # na=False makes missing values count as non-matches
        mask = dataframe[column].str.contains(pattern, case=False, regex=True, na=False)
        frames.append(dataframe[mask])

    if not frames:
        return pd.DataFrame(columns=dataframe.columns)

    # Concatenate once instead of growing a frame inside the loop; the list is
    # reversed so that later keywords come first in the result.
    return pd.concat(frames[::-1])

In [279]:
df_con1 = dataframe_keywords(keywords,df_title_pick,'Title')

In [280]:
df_con1.shape

(15968, 54)

In [281]:
df_con2 = dataframe_keywords(keywords,df_object_pick,'Object Name')

In [282]:
df_con2.shape

(14904, 54)

In [327]:
df_con = pd.concat([df_con1,df_con2])
df_con.shape

(30872, 54)

In [328]:
# rows found through both Title and Object Name (or through several keywords)
# appear more than once; keep a single copy of each
df_con = df_con.drop_duplicates()
df_con.shape

(16968, 54)

### simple clean up

In [332]:
# departments to keep in the selection
selector = ['Costume Institute', 'Asian Art', 'Islamic Art','The American Wing']

print(f'shape before: {df_con.shape}')

# keep only the rows whose Department is one of the selected departments
df_con = df_con[df_con['Department'].isin(selector)]

print(f'shape after: {df_con.shape}')

shape before: (16968, 54)
shape after: (14292, 54)


In [333]:
df_con['Department'].value_counts()

Costume Institute    13194
Asian Art              682
Islamic Art            345
The American Wing       71
Name: Department, dtype: int64

In [340]:
df_con.sample(1)

Unnamed: 0,Object Number,Is Highlight,Is Timeline Work,Is Public Domain,Object ID,Gallery Number,Department,AccessionYear,Object Name,Title,Culture,Period,Dynasty,Reign,Portfolio,Constituent ID,Artist Role,Artist Prefix,Artist Display Name,Artist Display Bio,Artist Suffix,Artist Alpha Sort,Artist Nationality,Artist Begin Date,Artist End Date,Artist Gender,Artist ULAN URL,Artist Wikidata URL,Object Date,Object Begin Date,Object End Date,Medium,Dimensions,Credit Line,Geography Type,City,State,County,Country,Region,Subregion,Locale,Locus,Excavation,River,Classification,Rights and Reproduction,Link Resource,Object Wikidata URL,Metadata Date,Repository,Tags,Tags AAT URL,Tags Wikidata URL
67979,"C.I.60.7a, b",False,False,False,81685,980,Costume Institute,1959,Suit,Suit,American,,,,,16561,Designer,,Gilbert Adrian,"American, Naugatuck, Connecticut 1903–1959 Hollywood, California",,"Adrian, Gilbert",American,1903,1959,,http://vocab.getty.edu/page/ulan/500401806,https://www.wikidata.org/wiki/Q1366554,1948,1948,1948,"Wool, silk",,"Gift of Janet Gaynor Adrian, 1960",,,,,,,,,,,,,,http://www.metmuseum.org/art/collection/search/81685,,,"Metropolitan Museum of Art, New York, NY",,,


In [352]:
df_con.to_csv('tran_file/df_full_selection.csv', index=False)

## prepare dataframe of Asian Art for scraping

In [344]:
df_asian = df_con[df_con['Department']=='Asian Art']

In [347]:
df_asian.shape

(682, 54)

In [348]:
df_asian.to_csv('tran_file/df_asian.csv',index=False)