# Step 3 :  `#6` "objects_constituents.csv" Cleaning

## 3.1: Load cleaned "objects.csv" in (as calibration reference)

In [65]:
import numpy as np
import pandas as pd

# Notebook display limits: cap the rendered rows/columns and widen text cells.
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_colwidth", 200)

# Cleaned objects table, read from CSV (the Excel export
# '01_2_objects_cleaned.xlsx' is not used).
df1 = pd.read_csv(
    "01_2_objects_cleaned.csv",
    encoding="utf-8",
    low_memory=False,
)

# (rows, columns) of the cleaned objects table
df1.shape

(81868, 18)

## 3.2: Load raw "objects_constituents.csv" in

In [66]:
# Object-constituent link table: 468364 raw object-constituent pairs.
df6 = pd.read_csv(
    "06_0_objects_constituents_clean.csv",
    encoding="utf-8",
    low_memory=False,
)
df6.columns

Index(['objectID', 'constituentID', 'displayOrder', 'roleType', 'role'], dtype='object')

Python loads `CSV` files 100 times faster than `Excel` files. 
* https://towardsdatascience.com/read-excel-files-with-python-1000x-faster-407d07ad0ed8

`Series.unique()` function to get the unique values in a column
* https://www.w3resource.com/pandas/series/series-unique.php

In [67]:
# Inner join on objectID: keeps only the constituent links whose object is
# present in the cleaned objects table.
df_1_6 = df1.merge(df6, how="inner", on="objectID")
df_1_6.shape  # 271165 object-constituent pairs for the objects of interest

(271165, 22)

In [68]:
# roleType takes only two values here, "donor" and "artist"
# (check with df_1_6["roleType"].unique()); only the artist of an artwork is wanted.
df_1_6.loc[:, ["objectID", "constituentID", "displayOrder", "roleType", "role"]]

Unnamed: 0,objectID,constituentID,displayOrder,roleType,role
0,0,427,1,donor,donor
1,0,427,2,donor,source
2,0,38613,1,artist,painter
3,1,427,1,donor,donor
4,1,427,2,donor,source
...,...,...,...,...,...
271160,32452,7174,2,donor,source
271161,32572,122,2,artist,artist after
271162,32572,5849,1,artist,artist
271163,32572,7174,1,donor,donor


In [69]:
# Restrict the joined table to artist links; donor rows are left out.
df_2D_6 = df_1_6.loc[df_1_6["roleType"].eq("artist")]
# 103256 object-artist pairs
df_2D_6.loc[:, ["objectID", "constituentID", "displayOrder", "roleType", "role"]]

Unnamed: 0,objectID,constituentID,displayOrder,roleType,role
2,0,38613,1,artist,painter
5,1,1204,1,artist,painter
8,2,1327,1,artist,painter
11,18,1689,1,artist,painter
14,19,1109,1,artist,painter
...,...,...,...,...,...
271154,222965,5441,1,artist,artist
271157,32452,122,2,artist,artist after
271158,32452,5849,1,artist,artist
271161,32572,122,2,artist,artist after


In [70]:
# Keep only the link-table columns (the object columns brought in by the merge
# are dropped). The distinct roles can be listed with df6_clean["role"].unique().
df6_clean = df_2D_6.loc[:, ["objectID", "constituentID", "displayOrder", "roleType", "role"]]

# Rows sharing the same (objectID, constituentID, role) triple. They are kept on
# purpose because their displayOrder values differ; keep=False flags every member
# of such a group. 44 rows were identified this way (df6_dup.shape).
# Adding "displayOrder" to the subset would look for fully identical links instead.
df6_dup = df6_clean.loc[
    df6_clean.duplicated(subset=["objectID", "constituentID", "role"], keep=False)
]

Use this URL format to check an artwork object (e.g. to see whether duplicated artists are shown):  
* https://www.nga.gov/collection/art-object-page.62654.html
* https://www.nga.gov/collection/art-object-page.213186.html
* https://www.nga.gov/collection/art-object-page.220552.html

Use this URL format to see who an artist is:
* https://www.nga.gov/collection/artist-info.8286.html (artist = "Italian 13th Century")

**Different types of artists**:  
* ['painter', 'artist', 'related artist', 'artist after', 'sculptor', 'architect', 'publisher', 'author', 'printer', 'artist/author','etcher', 'technical collaborator', 'edition production', 'editor','cabinetmaker', 'carver', 'lithographer','collaborator & supervisor', 'processing and proofing','typesetter', 'processor', 'collaborator', 'designer','printmaker', 'composer', 'artist/publisher', 'workshop printer','engraver', 'wood-engraver', 'aquatinter', 'calligrapher']

In [71]:
# ----- Keep the duplicated pairs, their displayOrder are distinct -----------
# Only links identical in all four key columns are dropped; pairs that repeat
# with a different displayOrder are kept. Row labels are renumbered from 0.
df6_clean = (
    df6_clean
    .drop_duplicates(subset=["objectID", "constituentID", "displayOrder", "role"])
    .reset_index(drop=True)
)
df6_clean  # 103256 rows, no duplicates

Unnamed: 0,objectID,constituentID,displayOrder,roleType,role
0,0,38613,1,artist,painter
1,1,1204,1,artist,painter
2,2,1327,1,artist,painter
3,18,1689,1,artist,painter
4,19,1109,1,artist,painter
...,...,...,...,...,...
103251,222965,5441,1,artist,artist
103252,32452,122,2,artist,artist after
103253,32452,5849,1,artist,artist
103254,32572,122,2,artist,artist after


In [72]:
# Number of distinct objects and distinct constituents among the cleaned links.
NumObject, NumConstituent = (
    df6_clean[column].nunique() for column in ("objectID", "constituentID")
)
print("#objectID=", NumObject, " #constituentID= ", NumConstituent)

#objectID= 81867  #constituentID=  9943


## 3.3: Output cleaned "objects_constituents.csv"

In [73]:
# Persist the cleaned object-artist links without the row labels.
# The written file was checked to be intact (103256 rows).
df6_clean.to_csv(
    "06_1_objects_constituents_cleaned.csv",
    encoding="UTF-8",
    index=False,
)

# Step 4:  `#8` "constituents.csv" Cleaning

## 4.1 : Load raw "constituents.csv" in

In [74]:
# Constituents table: 18281 raw constituents (df8.columns lists the attributes).
df8 = pd.read_csv(
    "08_0_constituents_clean.csv",
    encoding="utf-8",
    low_memory=False,
)
df8

Unnamed: 0,constituentID,preferredDisplayName,forwardDisplayName,lastName,displayDate,beginYear,endYear,visualBrowserTimeSpan,visualBrowserNationality
0,9,Anonymous,Anonymous,,Anonymous,,,,Other
1,11,"Baldung, Hans",Hans Baldung,Baldung,,,,,Other
2,13,Anonymous Artist,Anonymous Artist,Anonymous Artist,,,,,Other
3,14,"Evans, Thomas M., Mrs.",Mrs. Thomas M. Evans,Evans,"American, 1923 - 2013",1923.0,2013.0,1901 to 1925,American
4,15,"Haupt, Enid Annenberg",Enid Annenberg Haupt,Haupt,1906 - 2005,1906.0,2005.0,1901 to 1925,Other
...,...,...,...,...,...,...,...,...,...
18276,52213,The Spring Street Workshop,The Spring Street Workshop,,,,,,Other
18277,52222,Janus Press,Janus Press,,,,,,Other
18278,52225,"Miller-Brown, Andrew",Andrew Miller-Brown,Miller-Brown,American,,,,American
18279,52229,"Renard, Jean Augustin",Jean Augustin Renard,Renard,,1744.0,1807.0,1726 to 1750,French


Pandas equivalent for SQL Subquery 
* https://stackoverflow.com/questions/59989722/select-using-sub-query-in-pandas

In [75]:
# Distinct constituentIDs referenced by the cleaned object-artist links (9943 IDs).
uniqueArtist = df6_clean["constituentID"].drop_duplicates()

# Pandas equivalent of a SQL sub-query:
#   1) take the unique constituentIDs from the cleaned object-constituent pairs;
#   2) go through all constituents and keep only those whose ID is in that set
#      (i.e. the artists of our 2D artworks).
# visualBrowserNationality is normalised on the way: the data holds
# '\u206eItalian' (prefixed with the invisible character U+206E) and 'italian',
# both of which are folded into 'Italian' so that one nationality is not split
# over three labels in the exported file. Missing values stay missing.
df8_clean = df8[df8["constituentID"].isin(uniqueArtist)].assign(
    visualBrowserNationality=lambda artists: (
        artists["visualBrowserNationality"]
        .str.replace("\u206e", "", regex=False)
        .replace({"italian": "Italian"})
    )
)
# NOTE(review): the export step reports 9941 rows although 9943 IDs are looked
# up here, so two IDs apparently have no row in df8 -- confirm.
# The constituents that are NOT artists of our artworks can be listed with
# df8[~df8["constituentID"].isin(uniqueArtist)].
df8_clean

Unnamed: 0,constituentID,preferredDisplayName,forwardDisplayName,lastName,displayDate,beginYear,endYear,visualBrowserTimeSpan,visualBrowserNationality
1,11,"Baldung, Hans",Hans Baldung,Baldung,,,,,Other
2,13,Anonymous Artist,Anonymous Artist,Anonymous Artist,,,,,Other
10,22,"Abbott, Lemuel Francis",Lemuel Francis Abbott,Abbott,"British, c. 1755/1761 - 1802",1755.0,1802.0,1751 to 1775,British
12,25,"Pippin, Horace",Horace Pippin,Pippin,"American, 1888 - 1946",1888.0,1946.0,1876 to 1900,American
13,27,"Aelst, Willem van",Willem van Aelst,Aelst,"Dutch, 1627 - 1683",1627.0,1683.0,1601 to 1650,Dutch
...,...,...,...,...,...,...,...,...,...
18245,52019,"Pecis, Hilary",Hilary Pecis,Pecis,"American, born 1979",1979.0,,1976 to 2000,American
18252,52060,"Marcus, Peter",Peter Marcus,Marcus,"American, born 1939",1939.0,,1926 to 1950,American
18254,52071,"Inig, Ludovico",Ludovico Inig,Inig,"Italian, active late 18th century",1750.0,1800.0,1726 to 1750,Italian
18256,52092,"Adolfsz., Harmen",Harmen Adolfsz.,Adolfsz.,"Dutch, active 1603 - 1622",1603.0,1622.0,1601 to 1650,Dutch


In [76]:
# df8_clean["visualBrowserNationality"].unique()

Artist's nationality (values of `visualBrowserNationality`):
* 'Other', 'British', 'American', 'Dutch', 'German', 'Italian','French', 'Swiss', 'Netherlandish', 'Russian', 'Spanish','Flemish', 'Austrian', 'Scottish', 'English', 'Mexican', 'Belgian','Swedish', 'Japanese', 'Canadian', 'Czech', 'Bohemian', 'Danish','Chinese', 'Norwegian', '\u206eItalian', 'italian'
* Note: `'\u206eItalian'` (prefixed with the invisible character U+206E) and `'italian'` are dirty variants of `'Italian'`.

## 4.4 : Output cleaned "constituents.csv"

In [77]:
# Persist the constituents that are artists of our artworks, without row labels.
# The written file was checked to be intact (9941 rows).
df8_clean.to_csv(
    "08_1_constituents_cleaned.csv",
    encoding="UTF-8",
    index=False,
)

# Step 5:  `#2` "objects_terms.csv" Cleaning

## 5.1 : Load raw "objects_terms.csv" in

In [78]:
# Raw object-term table (df2.columns lists the attributes).
df2 = pd.read_csv(
    "02_0_objects_terms_clean.csv",
    encoding="utf-8",
    low_memory=False,
)
df2

Unnamed: 0,termID,objectID,termType,term,visualBrowserTheme
0,2027476,28342,Place Executed,Rhode Island,
1,2027476,28923,Place Executed,Rhode Island,
2,2027477,17753,Place Executed,Newport,
3,2027477,21914,Place Executed,Newport,
4,2027477,26565,Place Executed,Newport,
...,...,...,...,...,...
387228,2027455,29636,Place Executed,New York,
387229,2027455,29637,Place Executed,New York,
387230,2027455,29649,Place Executed,New York,
387231,2027458,12833,Place Executed,Georgia,


In [79]:
# Inner join of the 81868 cleaned objects with their terms on objectID;
# df_1_2.shape gives 187180 object-term pairs.
df_1_2 = df1.merge(df2, how="inner", on="objectID")

# Columns worth looking at in the joined table.
keyAtts = [
    "objectID",
    "title",
    "attribution",
    "termID",
    "termType",
    "term",
    "visualBrowserTheme",
]
df_1_2.loc[:, keyAtts]

Unnamed: 0,objectID,title,attribution,termID,termType,term,visualBrowserTheme
0,0,Saint James Major,Grifo di Tancredi,2034497,Technique,painted surface,
1,0,Saint James Major,Grifo di Tancredi,2055286,Keyword,James Major,
2,0,Saint James Major,Grifo di Tancredi,2034742,Theme,saints,religious
3,0,Saint James Major,Grifo di Tancredi,2034506,School,Florentine,
4,0,Saint James Major,Grifo di Tancredi,2034491,Style,Gothic,
...,...,...,...,...,...,...,...
187175,32452,American White Pelican,Robert Havell after John James Audubon,2034601,Technique,aquatint,
187176,32452,American White Pelican,Robert Havell after John James Audubon,2034525,School,American,
187177,32572,American Flamingo,Robert Havell after John James Audubon,2034787,Technique,engraving,
187178,32572,American Flamingo,Robert Havell after John James Audubon,2034601,Technique,aquatint,


In [81]:
# Term categories attached to our objects:
# 'Technique', 'Keyword', 'Theme', 'School', 'Style', 'Systematic Catalogue Volume', 'Place Executed'
df_1_2["termType"].unique()

array(['Technique', 'Keyword', 'Theme', 'School', 'Style',
       'Systematic Catalogue Volume', 'Place Executed'], dtype=object)

In [None]:
# Distinct objectIDs of the cleaned objects table (81868 objects).
uniqueObjects = df1["objectID"].drop_duplicates()

# Keep only the terms attached to objects of our collection. drop=True discards
# the old row labels; a plain reset_index() would turn them into an extra
# "index" column, which would then be written into the cleaned CSV.
df2_clean = df2[df2["objectID"].isin(uniqueObjects)].reset_index(drop=True)
df2_clean

In [None]:
# CHECK: number of distinct objects that have at least one term.
# 73607 objects have terms (out of 81868 objects in total).
df2_clean["objectID"].nunique(dropna=True)

## 5.3 : Output cleaned "objects_terms.csv"

In [None]:
# Persist the terms of our objects without the row labels.
# The written file was checked to be intact (187180 rows).
df2_clean.to_csv(
    "02_1_objects_terms_cleaned.csv",
    encoding="UTF-8",
    index=False,
)

# Step 6:  `#3` "objects_associations.csv" Cleaning

## 6.1 : Load raw "objects_associations.csv" in

In [None]:
# Parent-child object associations: 24655 pairs.
df3 = pd.read_csv(
    "03_0_objects_associations_clean.csv",
    encoding="utf-8",
    low_memory=False,
)
df3

## 6.2 : Check if both the `parent-object` AND the `child-object` exist in our `2D-objects collection`
Check if values exist using `isin()`:   
* https://thispointer.com/pandas-check-if-a-value-exists-in-a-dataframe-using-in-not-in-operator-isin/

In [None]:
# Associations whose child object belongs to our collection:
# 6457 childObjects (out of 81868 objects). reset_index() keeps the original
# df3 row labels in an "index" column.
df3_child = df3.loc[df3["childObjectID"].isin(uniqueObjects)].reset_index()
df3_child

In [None]:
# Associations whose parent object belongs to our collection (1091 parent objects).
# reset_index() keeps the original df3 row labels in an "index" column.
df3_parent = df3.loc[df3["parentObjectID"].isin(uniqueObjects)].reset_index()
df3_parent

# Cross-check against the parentID column of "objects.csv" (kept for reference):
#   df1["parentID"].nunique()                  -> 1149 unique parentIDs
#   df1.loc[df1["parentID"].notnull()]         -> 6453 objects have a parentID (i.e. a parent)
#   df1[df1["parentID"].isin(uniqueObjects)]   -> 911 parentIDs in our artwork-objects collection

In [None]:
# Keep an association only when BOTH its parent and its child object belong to
# our collection. drop=True discards the old row labels; a plain reset_index()
# would add an "index" column that ends up in the exported CSV.
df3_pc = df3[
    df3["parentObjectID"].isin(uniqueObjects)
    & df3["childObjectID"].isin(uniqueObjects)
].reset_index(drop=True)
df3_pc  # 915 pairs of parent-child associations

## 6.3 : Check if any duplicated pairs of parent-child exist

In [None]:
# Collapse repeated (parent, child) pairs, keeping the first occurrence.
# No duplicates were found in these pairs.
pair_key = ["parentObjectID", "childObjectID"]
df3_pc = df3_pc.drop_duplicates(subset=pair_key, keep="first")
df3_pc

In [None]:
# Look-up helper: shows the row of df1 for a given objectID (parent or child),
# or an empty frame when that object is not in our objects' collection.
#   not in our pool: 64671, 214689, 214991
#       in our pool: 214998
checkExist = df1["objectID"].eq(24520)
df1.loc[checkExist]

## 6.4 : Output cleaned "objects_associations.csv"

In [None]:
# Persist the parent-child associations of our collection without row labels.
# The written file was checked to be intact (915 rows).
df3_pc.to_csv(
    "03_1_objects_associations_cleaned.csv",
    encoding="UTF-8",
    index=False,
)

# Step 7: `#4` "objects_dimensions.csv" Cleaning

## 7.1 : Load raw "objects_dimensions.csv" in

In [None]:
# Object dimensions table: 207099 dimension rows in total.
df4 = pd.read_csv(
    "04_0_objects_dimensions_clean.csv",
    encoding="utf-8",
    low_memory=False,
)
df4.columns

In [None]:
# df4.nunique() reports 103189 distinct objectIDs, 7 dimensionTypes and 6 unitNames;
# df4["dimensionType"].unique() lists the dimension types.
df4.loc[:, "unitName"].unique()

#### `objects_dimensions.csv` table information:
* `7` `dimensionTypes`: `'width', 'height', 'depth', 'diameter', '(not specified)','weight', 'weight1'`
* `6` `unitName`: `'centimeters', 'inches', '(not specified)', 'pounds', 'grams','kilograms'`

## 7.2 : Extract  `dimensions` related to our 2D-Artworks collection

In [None]:
# Inner join on objectID: dimensions of the objects in our artworks' collection.
df_1_4 = df1.merge(df4, how="inner", on="objectID")

# Columns worth looking at in the joined table.
keyAtts = ["objectID", "title", "attribution", "dimensionType", "dimension", "unitName"]
df_1_4.loc[:, keyAtts]  # 113726 dimension rows belong to our collection

In [None]:
# ------------ Naughty Search Artwork by weight -------------
#df_14w = df_1_4[ (df_1_4["unitName"] == 'kilograms') | (df_1_4["unitName"] == 'pounds') | (df_1_4["unitName"] == 'grams')]
#df_14w[keyAtts]

In [None]:
# 56737 distinct objectIDs share the 113726 dimension rows: typically every
# artwork object has at least 2~3 dimension rows (height, width; sometimes depth
# and weight). Sorting helps to inspect the rows per object:
#   df_1_4.sort_values(by=["objectID"])[keyAtts]
df_1_4["objectID"].nunique(dropna=True)

### DataFrame.sort_values() function
* https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html

## 7.3: Check if any duplicated values

In [None]:
# Keep only the dimension columns (113726 dimension rows).
df4_clean = df_1_4.loc[:, ["objectID", "dimensionType", "dimension", "unitName"]]

# Duplicate check only: the de-duplicated frame is displayed, not assigned, so
# df4_clean itself is left unchanged. Still 113726 rows, i.e. no duplicates.
df4_clean.drop_duplicates(subset=["objectID", "dimensionType"], keep="first")

## 7.4 : output cleaned "objects_dimensions.csv" 

In [None]:
# Persist the dimensions of our objects without the row labels.
# The written file was checked to be intact (113726 rows).
df4_clean.to_csv(
    "04_1_objects_dimensions_cleaned.csv",
    encoding="UTF-8",
    index=False,
)

# Step 8: `#5` "objects_text_entries.csv" Cleaning

## 8.1 : Load raw "objects_text_entries.csv" in

In [None]:
# Object text-entries table: 208329 text entries in total.
df5 = pd.read_csv(
    "05_0_objects_text_entries_clean.csv",
    encoding="utf-8",
    low_memory=False,
)
df5.columns

In [None]:
# Inner join on objectID: text entries of the objects in our artworks' collection.
df_1_5 = df1.merge(df5, how="inner", on="objectID")

# Columns worth looking at in the joined table.
keyAtts = ["objectID", "title", "attribution", "text", "textType", "Year"]
df_1_5.loc[:, keyAtts]  # 148949 text entries belong to our collection

In [None]:
# 44631 objectIDs (of our 81868 objects) have some text entries; typically an
# artwork object has more than one (e.g. multiple bibliographies).
# df_1_5["textType"].unique() lists the text types in use.
df_1_5["objectID"].nunique(dropna=True)

### objects_text_entries.csv table information:
* 5 textTypes : `'bibliography', 'exhibition_history', 'inscription_footnote','exhibition_history_footnote', 'documentary_labels_inscriptions'`

In [None]:
# Keep only the text-entry columns (148949 entries for 44631 objectIDs), then
# drop entries identical in all of "objectID", "text", "textType" and "Year":
# 148942 distinct text entries remain.
df5_clean = (
    df_1_5.loc[:, ["objectID", "text", "textType", "Year"]]
    .drop_duplicates(subset=["objectID", "text", "textType", "Year"])
)
df5_clean

## 8.4 : output cleaned "objects_text_entries.csv"

In [None]:
# Persist the text entries of our objects without the row labels.
# The written file was checked to be intact (148942 rows).
df5_clean.to_csv(
    "05_1_objects_text_entries_cleaned.csv",
    encoding="UTF-8",
    index=False,
)

# Step 9: `#9` "media_items.csv" Cleaning

## 9.1 : Load raw "media_items.csv" in

In [None]:
# Disabled draft for loading the media items and keeping the columns of interest.
#df9 = pd.read_csv('09_0_media_items_raw.csv', encoding = 'utf-8', low_memory = False)
#df9 = df9[["mediaid", "mediatype", "title", "language", "playurl", "imageurl"]] #  total counts of media_item
# NOTE: DataFrame.rename returns a new frame, so the result has to be assigned back to take effect.
#df9 = df9.rename(columns={"mediaid":"mediaID", "mediatype": "mediaType", "playurl": "playURL", "imageurl": "imageURL"})
# df9.columns

raw "media_items.csv" columns:
* 'mediaid', 'mediatype', 'title', 'description', 'duration', 'language','thumbnailurl', 'playurl', 'downloadurl', 'keywords', 'tags','imageurl', 'presentationdate', 'releasedate', 'lastmodified'
* columns of interests: mediaID, mediaType, title, language, playURL, imageURL

#### Pandas Data Types
* https://pbpython.com/pandas_dtypes.html

#### rename column names
* https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html