# Step 3 :  `#6` "objects_constituents.csv" Cleaning

## 3.1: Load cleaned "objects.csv" in (as the calibration reference)

In [15]:
import numpy as np
import pandas as pd

# Display limits: enough rows/columns to inspect the wide objects table, and a
# generous column width so long titles and dimension strings are not cut short.
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_colwidth", 200)

# Load the cleaned objects table; the cells below filter every other table
# against its objectIDs.
# (The CSV export is read instead of the equivalent '01_2_objects_cleaned.xlsx'.)
df1 = pd.read_csv("01_2_objects_cleaned.csv", encoding="utf-8", low_memory=False)

# Quick overview of the table size: (81868, 15) when this notebook was last run.
df1.shape

(81868, 15)

## 3.2: Load raw "objects_constituents.csv" in

In [4]:
# Object-constituent link table: 468364 raw pairs when last run.
df6 = pd.read_csv(
    "06_0_objects_constituents_clean.csv",
    encoding="utf-8",
    low_memory=False,
)
# Show the available columns rather than the whole frame.
df6.columns

Index(['objectID', 'constituentID', 'displayOrder', 'roleType', 'role'], dtype='object')

pandas reads `CSV` files far faster than `Excel` files (the article linked below reports speed-ups of up to 1000×). 
* https://towardsdatascience.com/read-excel-files-with-python-1000x-faster-407d07ad0ed8

`Series.unique()` function to get the unique values in a column
* https://www.w3resource.com/pandas/series/series-unique.php

In [5]:
# Inner join on the shared "objectID" key: keeps only the object-constituent
# pairs whose object is present in the cleaned objects table.
df_1_6 = df1.merge(df6, how="inner", on="objectID")
# 271165 2D-object/constituent pairs (the ones of interest) when last run.
df_1_6.shape

(271165, 19)

In [6]:
# roleType only takes two values here, "donor" and "artist"
# (check with: df_1_6["roleType"].unique()).
# Only the artists of an artwork are wanted in the following steps.
df_1_6.loc[:, ['objectID', 'constituentID', 'displayOrder', 'roleType', 'role']]

Unnamed: 0,objectID,constituentID,displayOrder,roleType,role
0,0,427,1,donor,donor
1,0,427,2,donor,source
2,0,38613,1,artist,painter
3,1,427,1,donor,donor
4,1,427,2,donor,source
...,...,...,...,...,...
271160,32452,7174,2,donor,source
271161,32572,122,2,artist,artist after
271162,32572,5849,1,artist,artist
271163,32572,7174,1,donor,donor


In [7]:
# Keep only the rows whose constituent is an artist (donor rows are dropped).
df_2D_6 = df_1_6.loc[df_1_6["roleType"].eq("artist")]
# 103256 2D-object/artist pairs when last run.
df_2D_6.loc[:, ['objectID', 'constituentID', 'displayOrder', 'roleType', 'role']]

Unnamed: 0,objectID,constituentID,displayOrder,roleType,role
2,0,38613,1,artist,painter
5,1,1204,1,artist,painter
8,2,1327,1,artist,painter
11,18,1689,1,artist,painter
14,19,1109,1,artist,painter
...,...,...,...,...,...
271154,222965,5441,1,artist,artist
271157,32452,122,2,artist,artist after
271158,32452,5849,1,artist,artist
271161,32572,122,2,artist,artist after


In [8]:
# Narrow the artist rows down to the columns of the link table.
# (Inspect the artist roles with: df6_clean["role"].unique())
df6_clean = df_2D_6.loc[:, ['objectID', 'constituentID', 'displayOrder', 'roleType', 'role']]

# Identify (do not drop yet) every row that shares its
# (objectID, constituentID, role) with another row; keep=False flags all
# members of such a group. These repeated pairs differ in displayOrder:
# adding "displayOrder" to the subset is the stricter alternative check.
# An earlier run of df6_dup.shape reported 44 duplicates.
df6_dup = df6_clean[
    df6_clean.duplicated(subset=['objectID', 'constituentID', "role"], keep=False)
]

Use this URL format to check an artwork-object (i.e. if duplicated artists are shown):  
* https://www.nga.gov/collection/art-object-page.62654.html
* https://www.nga.gov/collection/art-object-page.213186.html
* https://www.nga.gov/collection/art-object-page.220552.html

Use this URL format to see who an artist is
* https://www.nga.gov/collection/artist-info.8286.html (artist = "Italian 13th Century")

**Different types of artists**:  
* ['painter', 'artist', 'related artist', 'artist after', 'sculptor', 'architect', 'publisher', 'author', 'printer', 'artist/author','etcher', 'technical collaborator', 'edition production', 'editor','cabinetmaker', 'carver', 'lithographer','collaborator & supervisor', 'processing and proofing','typesetter', 'processor', 'collaborator', 'designer','printmaker', 'composer', 'artist/publisher', 'workshop printer','engraver', 'wood-engraver', 'aquatinter', 'calligrapher']

In [11]:
# ----- De-duplicate the object-artist pairs -----------
# One row is kept per (objectID, constituentID); the first row encountered
# supplies displayOrder and role. Rows repeating a pair with a different
# displayOrder or role are therefore dropped here.
# With the stricter subset ["objectID", "constituentID", "displayOrder", "role"]
# nothing is removed (all 103256 tuples are kept).
df6_clean = (
    df6_clean
    .drop_duplicates(subset=["objectID", "constituentID"])
    .reset_index(drop=True)
)
df6_clean  # 103053 tuples without duplicates when last run

Unnamed: 0,objectID,constituentID,displayOrder,roleType,role
0,0,38613,1,artist,painter
1,1,1204,1,artist,painter
2,2,1327,1,artist,painter
3,18,1689,1,artist,painter
4,19,1109,1,artist,painter
...,...,...,...,...,...
103048,222965,5441,1,artist,artist
103049,32452,122,2,artist,artist after
103050,32452,5849,1,artist,artist
103051,32572,122,2,artist,artist after


In [12]:
# Count the distinct objects and distinct artists in the cleaned link table.
NumObject, NumConstituent = (
    df6_clean[column].nunique() for column in ("objectID", "constituentID")
)
print("#objectID=", NumObject, " #constituentID= ", NumConstituent)
# When last run: 81867 objectIDs --> 9943 constituentIDs

#objectID= 81867  #constituentID=  9943


## 3.3: Output cleaned "objects_constituents.csv"

In [13]:
# Working copy, written next to this notebook.
df6_clean.to_csv(
    path_or_buf="06_1_objects_constituents_cleaned.csv",
    index=False,
    encoding="UTF-8",
)

# Database-ready copy in the `Ready` folder.
df6_clean.to_csv(
    path_or_buf="../Ready/06_objects_constituents.csv",
    index=False,
    encoding='utf-8',
)
# The written .csv file was checked to be intact (i.e. 103053 tuples).

# Step 4:  `#8` "constituents.csv" Cleaning

## 4.1 : Load raw "constituents.csv" in

In [60]:
# Constituents table: one row per constituent (18281 raw rows when last run).
# Column names are available via df8.columns.
df8 = pd.read_csv(
    "08_0_constituents_clean.csv",
    encoding="utf-8",
    low_memory=False,
)
df8

Unnamed: 0,constituentID,preferredDisplayName,forwardDisplayName,lastName,displayDate,beginYear,endYear,visualBrowserTimeSpan,visualBrowserNationality
0,9,Anonymous,Anonymous,,Anonymous,,,,Other
1,11,"Baldung, Hans",Hans Baldung,Baldung,,,,,Other
2,13,Anonymous Artist,Anonymous Artist,Anonymous Artist,,,,,Other
3,14,"Evans, Thomas M., Mrs.",Mrs. Thomas M. Evans,Evans,"American, 1923 - 2013",1923.0,2013.0,1901 to 1925,American
4,15,"Haupt, Enid Annenberg",Enid Annenberg Haupt,Haupt,1906 - 2005,1906.0,2005.0,1901 to 1925,Other
...,...,...,...,...,...,...,...,...,...
18276,52213,The Spring Street Workshop,The Spring Street Workshop,,,,,,Other
18277,52222,Janus Press,Janus Press,,,,,,Other
18278,52225,"Miller-Brown, Andrew",Andrew Miller-Brown,Miller-Brown,American,,,,American
18279,52229,"Renard, Jean Augustin",Jean Augustin Renard,Renard,,1744.0,1807.0,1726 to 1750,French


Pandas equivalent for SQL Subquery 
* https://stackoverflow.com/questions/59989722/select-using-sub-query-in-pandas

In [61]:
# Pandas equivalent of a SQL sub-query:
#   1) use the cleaned object-constituent pairs to collect all the distinct
#      constituentIDs (9943 unique artists when last run);
#   2) go through all the constituents and keep only those whose ID is in that
#      set, i.e. the artists of our 2D artworks.
uniqueArtist = df6_clean["constituentID"].drop_duplicates()

df8_clean = df8.loc[df8["constituentID"].isin(uniqueArtist)]
df8_clean

# To inspect the constituents that are NOT artists of our artworks:
#   dfMA = df8[~df8["constituentID"].isin(uniqueArtist)]

Unnamed: 0,constituentID,preferredDisplayName,forwardDisplayName,lastName,displayDate,beginYear,endYear,visualBrowserTimeSpan,visualBrowserNationality
1,11,"Baldung, Hans",Hans Baldung,Baldung,,,,,Other
2,13,Anonymous Artist,Anonymous Artist,Anonymous Artist,,,,,Other
10,22,"Abbott, Lemuel Francis",Lemuel Francis Abbott,Abbott,"British, c. 1755/1761 - 1802",1755.0,1802.0,1751 to 1775,British
12,25,"Pippin, Horace",Horace Pippin,Pippin,"American, 1888 - 1946",1888.0,1946.0,1876 to 1900,American
13,27,"Aelst, Willem van",Willem van Aelst,Aelst,"Dutch, 1627 - 1683",1627.0,1683.0,1601 to 1650,Dutch
...,...,...,...,...,...,...,...,...,...
18245,52019,"Pecis, Hilary",Hilary Pecis,Pecis,"American, born 1979",1979.0,,1976 to 2000,American
18252,52060,"Marcus, Peter",Peter Marcus,Marcus,"American, born 1939",1939.0,,1926 to 1950,American
18254,52071,"Inig, Ludovico",Ludovico Inig,Inig,"Italian, active late 18th century",1750.0,1800.0,1726 to 1750,Italian
18256,52092,"Adolfsz., Harmen",Harmen Adolfsz.,Adolfsz.,"Dutch, active 1603 - 1622",1603.0,1622.0,1601 to 1650,Dutch


In [62]:
# df8_clean["visualBrowserNationality"].unique()

Artist's Nationality:
* 'Other', 'British', 'American', 'Dutch', 'German', 'Italian','French', 'Swiss', 'Netherlandish', 'Russian', 'Spanish','Flemish', 'Austrian', 'Scottish', 'English', 'Mexican', 'Belgian','Swedish', 'Japanese', 'Canadian', 'Czech', 'Bohemian', 'Danish','Chinese', 'Norwegian', '\u206eItalian', 'italian']

## 4.4 : Output cleaned "constituents.csv"

In [63]:
# Working copy, written next to this notebook.
df8_clean.to_csv(
    path_or_buf="08_1_constituents_cleaned.csv",
    index=False,
    encoding="UTF-8",
)

# Database-ready copy in the `Ready` folder.
df8_clean.to_csv(
    path_or_buf="../Ready/08_constituents.csv",
    index=False,
    encoding='utf-8',
)

# The written .csv file was checked to be intact (i.e. 9941 tuples).

# Step 5:  `#2` "objects_terms.csv" Cleaning

## 5.1 : Load raw "objects_terms.csv" in

In [64]:
# Object-term table (raw terms); column names are available via df2.columns.
df2 = pd.read_csv(
    "02_0_objects_terms_clean.csv",
    encoding="utf-8",
    low_memory=False,
)
df2

Unnamed: 0,termID,objectID,termType,term,visualBrowserTheme
0,2027476,28342,Place Executed,Rhode Island,
1,2027476,28923,Place Executed,Rhode Island,
2,2027477,17753,Place Executed,Newport,
3,2027477,21914,Place Executed,Newport,
4,2027477,26565,Place Executed,Newport,
...,...,...,...,...,...
387228,2027455,29636,Place Executed,New York,
387229,2027455,29637,Place Executed,New York,
387230,2027455,29649,Place Executed,New York,
387231,2027458,12833,Place Executed,Georgia,


In [65]:
# Inner join on "objectID": terms attached to the objects of our collection.
# When last run: 81868 objects, 187180 object-term pairs (see df_1_2.shape).
df_1_2 = df1.merge(df2, how="inner", on="objectID")

# Columns that are enough to eyeball which term belongs to which artwork.
keyAtts = ["objectID", "title","attribution","termID", "termType", "term", "visualBrowserTheme"]
df_1_2.loc[:, keyAtts]

Unnamed: 0,objectID,title,attribution,termID,termType,term,visualBrowserTheme
0,0,Saint James Major,Grifo di Tancredi,2034497,Technique,painted surface,
1,0,Saint James Major,Grifo di Tancredi,2055286,Keyword,James Major,
2,0,Saint James Major,Grifo di Tancredi,2034742,Theme,saints,religious
3,0,Saint James Major,Grifo di Tancredi,2034506,School,Florentine,
4,0,Saint James Major,Grifo di Tancredi,2034491,Style,Gothic,
...,...,...,...,...,...,...,...
187175,32452,American White Pelican,Robert Havell after John James Audubon,2034601,Technique,aquatint,
187176,32452,American White Pelican,Robert Havell after John James Audubon,2034525,School,American,
187177,32572,American Flamingo,Robert Havell after John James Audubon,2034787,Technique,engraving,
187178,32572,American Flamingo,Robert Havell after John James Audubon,2034601,Technique,aquatint,


In [66]:
# Distinct term types attached to our objects, in order of first appearance:
# 'Technique', 'Keyword', 'Theme', 'School', 'Style','Systematic Catalogue Volume', 'Place Executed'
pd.unique(df_1_2["termType"])

array(['Technique', 'Keyword', 'Theme', 'School', 'Style',
       'Systematic Catalogue Volume', 'Place Executed'], dtype=object)

In [67]:
# Distinct objectIDs of our collection (81868 when last run); reused below as
# the membership set for the associations table as well.
uniqueObjects = df1["objectID"].drop_duplicates()

# Keep only the terms attached to one of our objects and renumber the rows.
df2_clean = df2.loc[df2["objectID"].isin(uniqueObjects)].reset_index(drop=True)
df2_clean

Unnamed: 0,termID,objectID,termType,term,visualBrowserTheme
0,2027476,28342,Place Executed,Rhode Island,
1,2027476,28923,Place Executed,Rhode Island,
2,2027477,17753,Place Executed,Newport,
3,2027477,21914,Place Executed,Newport,
4,2027477,26565,Place Executed,Newport,
...,...,...,...,...,...
187175,2027455,29636,Place Executed,New York,
187176,2027455,29637,Place Executed,New York,
187177,2027455,29649,Place Executed,New York,
187178,2027458,12833,Place Executed,Georgia,


In [68]:
# ------------- CHECK: how many of our objects carry at least one term -------
# 73607 objects had terms when last run (out of 81868 objects in total).
df2_clean.loc[:, "objectID"].nunique()

73607

## 5.3 : Output cleaned "objects_terms.csv"

In [69]:
# Working copy, written next to this notebook.
df2_clean.to_csv(
    path_or_buf="02_1_objects_terms_cleaned.csv",
    index=False,
    encoding="UTF-8",
)

# Database-ready copy in the `Ready` folder.
df2_clean.to_csv(
    path_or_buf="../Ready/02_objects_terms.csv",
    index=False,
    encoding='utf-8',
)

# The written .csv file was checked to be intact (i.e. 187180 tuples).

# Step 6:  `#3` "objects_associations.csv" Cleaning

## 6.1 : Load raw "objects_associations.csv" in

In [70]:
# Parent/child object associations: 24655 pairs when last run.
df3 = pd.read_csv(
    "03_0_objects_associations_clean.csv",
    encoding="utf-8",
    low_memory=False,
)
df3

Unnamed: 0,parentObjectID,childObjectID,relationship
0,108011,108076,inseparable
1,54572,54587,separable
2,64668,64671,inseparable
3,81196,81197,inseparable
4,37154,37310,inseparable
...,...,...,...
24650,114898,114927,inseparable
24651,135456,158714,separable
24652,44350,44351,inseparable
24653,139948,159854,separable


## 6.2 : Check if both the `parent-object` AND the `child-object` exist in our `2D-objects collection`
Check if values exist using `isin()`:   
* https://thispointer.com/pandas-check-if-a-value-exists-in-a-dataframe-using-in-not-in-operator-isin/

In [71]:
# Associations whose CHILD object belongs to our collection
# (6457 child objects out of 81868 objects when last run).
df3_child = df3.loc[df3["childObjectID"].isin(uniqueObjects)].reset_index(drop=True)
df3_child

Unnamed: 0,parentObjectID,childObjectID,relationship
0,64668,64671,inseparable
1,74169,74170,inseparable
2,140079,139875,separable
3,214689,214695,inseparable
4,60910,164621,inseparable
...,...,...,...
6452,214991,214998,inseparable
6453,197353,206582,inseparable
6454,123113,123748,separable
6455,60639,60641,separable


In [72]:
# Associations whose PARENT object belongs to our collection
# (1091 parent objects when last run).
df3_parent = df3.loc[df3["parentObjectID"].isin(uniqueObjects)].reset_index(drop=True)
df3_parent

# -------------------------------------------------------------
# Cross-check against the parentID column of "objects.csv" (earlier run):
#   df1["parentID"].nunique()                    -> 1149 unique parentIDs
#   df1.loc[df1["parentID"].notnull()]           -> 6453 objects have a parent
#   df1[df1["parentID"].isin(uniqueObjects)]     -> 911 parentIDs in our collection
# -------------------------------------------------------------

Unnamed: 0,parentObjectID,childObjectID,relationship
0,74169,74170,inseparable
1,123152,124418,separable
2,46957,46958,inseparable
3,122970,144516,inseparable
4,164865,125320,separable
...,...,...,...
1086,56587,56588,inseparable
1087,142946,149642,inseparable
1088,45632,45634,inseparable
1089,57049,57050,inseparable


In [73]:
# Keep an association only when BOTH ends are objects of our collection.
df3_pc = df3.loc[
    df3["parentObjectID"].isin(uniqueObjects)
    & df3["childObjectID"].isin(uniqueObjects)
].reset_index(drop=True)
df3_pc  # 915 parent-child pairs when last run

Unnamed: 0,parentObjectID,childObjectID,relationship
0,74169,74170,inseparable
1,123152,124418,separable
2,46957,46958,inseparable
3,122970,144516,inseparable
4,164865,125320,separable
...,...,...,...
910,39979,39980,inseparable
911,56587,56588,inseparable
912,142946,149642,inseparable
913,57049,57050,inseparable


## 6.3 : Check if any duplicated pairs of parent-child exist

In [98]:
# Guarantee one row per (parent, child) pair; no duplicates were found when
# last run, so this left the frame unchanged.
df3_pc = df3_pc.drop_duplicates(subset=["parentObjectID", "childObjectID"])

# ------ how many pairs are inseparable -------------
# 712 inseparable parent-child pairs when last run. These associations should
# be shown as a single section on the single-object web page.
df3_pc.loc[df3_pc["relationship"].eq('inseparable')]

Unnamed: 0,parentObjectID,childObjectID,relationship
0,74169,74170,inseparable
2,46957,46958,inseparable
3,122970,144516,inseparable
5,68569,68570,inseparable
8,74188,74189,inseparable
...,...,...,...
910,39979,39980,inseparable
911,56587,56588,inseparable
912,142946,149642,inseparable
913,57049,57050,inseparable


In [100]:
# Attach the object details of each CHILD to its parent-child association
# (objects.objectID matched against associations.childObjectID).
dfpc = df1.merge(df3_pc, how="inner", left_on="objectID", right_on="childObjectID")
dfpc

Unnamed: 0,objectID,title,displayDate,beginYear,endYear,timeSpan,medium,dimensions,attributionInverted,attribution,classification,parentID,portfolio,series,volume,parentObjectID,childObjectID,relationship
0,0,Saint James Major,c. 1310,1310.0,1310.0,1300 to 1400,tempera on panel,painted surface (top of gilding): 62.2 × 34.8 cm (24 1/2 × 13 11/16 in.)\r\npainted surface (including painted border): 64.8 × 34.8 cm (25 1/2 × 13 11/16 in.)\r\noverall: 66.7 × 36.7 × 1.2 cm (26 ...,Grifo di Tancredi,Grifo di Tancredi,painting,34.0,,,,34,0,separable
1,2,Saint Andrew and Saint Benedict with the Archangel Gabriel [left panel],shortly before 1387,1387.0,1387.0,1300 to 1400,tempera on poplar panel,overall: 197 × 80 cm (77 9/16 × 31 1/2 in.),"Gaddi, Agnolo",Agnolo Gaddi,painting,206122.0,,,,206122,2,separable
2,29,"The Crucifixion with the Virgin, Saint John, Saint Jerome, and Saint Mary Magdalene [left panel]",c. 1482/1485,1482.0,1485.0,1401 to 1500,oil on panel transferred to canvas,left panel: 95 x 30.1 cm (37 3/8 x 11 7/8 in.)\r\nframed: 134 x 165.1 x 7.3 cm (52 3/4 x 65 x 2 7/8 in.),"Perugino, Pietro",Pietro Perugino,painting,206127.0,,,,206127,29,separable
3,30,"The Crucifixion with the Virgin, Saint John, Saint Jerome, and Saint Mary Magdalene [middle panel]",c. 1482/1485,1482.0,1485.0,1401 to 1500,oil on panel transferred to canvas,middle panel: 101.5 x 56.5 cm (39 15/16 x 22 1/4 in.)\r\nframed: 134 x 165.1 x 7.3 cm (52 3/4 x 65 x 2 7/8 in.),"Perugino, Pietro",Pietro Perugino,painting,206127.0,,,,206127,30,separable
4,272,The Baptism of Christ,c. 1335,1335.0,1335.0,1300 to 1400,tempera on panel,painted surface: 46.3 × 49 cm (18 1/4 × 19 5/16 in.)\r\noverall (original panel): 48.8 × 41.2 cm (19 3/16 × 16 1/4 in.)\r\noverall (including added strips): 49.6 × 42 cm (19 1/2 × 16 9/16 in.)\r\n...,"Baronzio, Giovanni",Giovanni Baronzio,painting,12143.0,,,,12143,272,separable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
910,134402,Napthali,c. 1585,1585.0,1585.0,1551 to 1600,hand-colored engraving on laid paper,overall (cut within platemark): 21.5 x 14.5 cm (8 7/16 x 5 11/16 in.),"Sadeler I, Jan after Crispen van den Broecke",Jan Sadeler I after Crispen van den Broecke,print,134401.0,The Twelve Sons of Jacob,The Twelve Sons of Jacob,,134401,134402,inseparable
911,134404,Benjamin,c. 1585,1585.0,1585.0,1551 to 1600,hand-colored engraving on laid paper,overall (cut within platemark): 21.8 x 14.5 cm (8 9/16 x 5 11/16 in.),"Sadeler I, Jan after Crispen van den Broecke",Jan Sadeler I after Crispen van den Broecke,print,134403.0,The Twelve Sons of Jacob,The Twelve Sons of Jacob,,134403,134404,inseparable
912,144443,The Laundry (La Lessive),1888,1888.0,1888.0,1876 to 1900,woodcut,,"Bernard, Emile",Emile Bernard,print,118734.0,,,,118734,144443,inseparable
913,210477,"The Night-Boats to The Hague, Delft, and Amsterdam",c. 1652/1654,1647.0,1657.0,1601 to 1650,etching with drypoint on laid paper,,"Nooms, called Zeeman, Reinier","Reinier Nooms, called Zeeman",print,206585.0,,Verscheÿde Schepen en Gesichten van Amstelredam (Various Ships and Views of Amsterdam): Part III: plate 5,,206585,210477,separable


In [99]:
# ------- Spot check: does an objectID (parent or child) exist in our objects' collection? ----
# Results from earlier look-ups:
#   not in our pool: 64671, 214689, 214991
#       in our pool: 214998
checkExist = df1["objectID"].eq(206582)
df1.loc[checkExist]

Unnamed: 0,objectID,title,displayDate,beginYear,endYear,timeSpan,medium,dimensions,attributionInverted,attribution,classification,parentID,portfolio,series,volume
11177,206582,Tropical Bay [verso],1936,1936.0,1936.0,1926 to 1950,graphite on wove paper,sheet: 28.89 × 36.67 cm (11 3/8 × 14 7/16 in.),"Cash, William Vaughn",William Vaughn Cash,drawing,197353.0,,,


## 6.4 : Output cleaned "objects_associations.csv"

In [76]:
# Working copy, written next to this notebook.
df3_pc.to_csv(
    path_or_buf="03_1_objects_associations_cleaned.csv",
    index=False,
    encoding="UTF-8",
)

# Database-ready copy in the `Ready` folder.
df3_pc.to_csv(
    path_or_buf="../Ready/03_objects_associations.csv",
    index=False,
    encoding='utf-8',
)

# The written .csv file was checked to be intact (i.e. 915 tuples).

# Step 7: `#4` "objects_dimensions.csv" Cleaning

## 7.1 : Load raw "objects_dimensions.csv" in

In [77]:
# Object dimensions table: 207099 dimension rows in total when last run.
df4 = pd.read_csv(
    "04_0_objects_dimensions_clean.csv",
    encoding="utf-8",
    low_memory=False,
)
df4.columns

Index(['objectID', 'dimensionType', 'dimension', 'unitName'], dtype='object')

In [78]:
# df4.nunique() reported, when last run: 103189 distinct objectIDs,
# 7 dimensionTypes and 6 unitNames.
# (List the dimension types with: df4["dimensionType"].unique())
pd.unique(df4["unitName"])

array(['centimeters', 'inches', '(not specified)', 'pounds', 'grams',
       'kilograms'], dtype=object)

#### `objects_dimensions.csv` table information:
* `7` `dimensionTypes`: `'width', 'height', 'depth', 'diameter', '(not specified)','weight', 'weight1'`
* `6` `unitName`: `'centimeters', 'inches', '(not specified)', 'pounds', 'grams','kilograms'`

## 7.2 : Extract  `dimensions` related to our 2D-Artworks collection

In [79]:
# Inner join on "objectID": dimensions of the objects in our collection
# (113726 dimension rows when last run).
df_1_4 = df1.merge(df4, how="inner", on="objectID")

# Columns that are enough to eyeball which dimension belongs to which artwork.
keyAtts = ['objectID', 'title', 'attribution', 'dimensionType', 'dimension', 'unitName']
df_1_4.loc[:, keyAtts]

Unnamed: 0,objectID,title,attribution,dimensionType,dimension,unitName
0,0,Saint James Major,Grifo di Tancredi,height,62.200000,centimeters
1,0,Saint James Major,Grifo di Tancredi,width,34.800000,centimeters
2,1,Saint Paul and a Group of Worshippers,Bernardo Daddi,height,224.800000,centimeters
3,1,Saint Paul and a Group of Worshippers,Bernardo Daddi,width,77.000000,centimeters
4,2,Saint Andrew and Saint Benedict with the Archangel Gabriel [left panel],Agnolo Gaddi,height,197.000000,centimeters
...,...,...,...,...,...,...
113721,222965,Caernarvon Castle (Night),Paul Sandby,width,31.500000,centimeters
113722,32452,American White Pelican,Robert Havell after John James Audubon,height,89.535179,centimeters
113723,32452,American White Pelican,Robert Havell after John James Audubon,width,60.007620,centimeters
113724,32572,American Flamingo,Robert Havell after John James Audubon,height,87.630175,centimeters


In [80]:
# ------------ Naughty Search Artwork by weight -------------
#df_14w = df_1_4[ (df_1_4["unitName"] == 'kilograms') | (df_1_4["unitName"] == 'pounds') | (df_1_4["unitName"] == 'grams')]
#df_14w[keyAtts]

In [81]:
# Number of distinct objects that have dimensions:
# 56737 objectIDs across 113726 dimension rows when last run.
# NOTE: typically every artwork-object has at least 2~3 dimension tuples
# (height, width; sometimes depth, and weight). To verify, sort by objectID:
#   df_1_4.sort_values(by=['objectID'])[keyAtts]
df_1_4.loc[:, "objectID"].nunique()

56737

### DataFrame.sort_values() function
* https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html

## 7.3: Check if any duplicated values

In [82]:
# Keep only the columns of the dimensions table (113726 dimension tuples when
# last run).
df4_clean = df_1_4[['objectID', 'dimensionType', 'dimension', 'unitName']]

# Enforce one row per (objectID, dimensionType). The de-duplicated frame is
# assigned back so that the exported table is guaranteed to respect that key;
# previously the result was only displayed and then discarded. When last run
# there were no duplicates (still 113726 tuples), so the data is unchanged.
df4_clean = df4_clean.drop_duplicates(subset=["objectID", "dimensionType"])
df4_clean

Unnamed: 0,objectID,dimensionType,dimension,unitName
0,0,height,62.200000,centimeters
1,0,width,34.800000,centimeters
2,1,height,224.800000,centimeters
3,1,width,77.000000,centimeters
4,2,height,197.000000,centimeters
...,...,...,...,...
113721,222965,width,31.500000,centimeters
113722,32452,height,89.535179,centimeters
113723,32452,width,60.007620,centimeters
113724,32572,height,87.630175,centimeters


## 7.4 : output cleaned "objects_dimensions.csv" 

In [83]:
# Working copy, written next to this notebook.
df4_clean.to_csv(
    path_or_buf="04_1_objects_dimensions_cleaned.csv",
    index=False,
    encoding="UTF-8",
)

# Database-ready copy in the `Ready` folder.
df4_clean.to_csv(
    path_or_buf="../Ready/04_objects_dimensions.csv",
    index=False,
    encoding='utf-8',
)

# The written .csv file was checked to be intact (i.e. 113726 tuples).

# Step 8: `#5` "objects_text_entries.csv" Cleaning

## 8.1 : Load raw "objects_text_entries.csv" in

In [16]:
# Object text entries table: 208329 text entries in total when last run.
df5 = pd.read_csv(
    "05_0_objects_text_entries_clean.csv",
    encoding="utf-8",
    low_memory=False,
)
df5.columns

Index(['objectID', 'text', 'textType', 'Year'], dtype='object')

In [17]:
# Inner join on "objectID": text entries of the objects in our collection
# (148949 text entries when last run).
df_1_5 = df1.merge(df5, how="inner", on="objectID")

# Columns that are enough to eyeball which text belongs to which artwork.
keyAtts = ['objectID', 'title', 'attribution', 'text', 'textType', 'Year']
df_1_5.loc[:, keyAtts]

Unnamed: 0,objectID,title,attribution,text,textType,Year
0,0,Saint James Major,Grifo di Tancredi,"_European Paintings: An Illustrated Catalogue_. National Gallery of Art, Washington, 1985: 90, repro.",bibliography,1985.0
1,0,Saint James Major,Grifo di Tancredi,"Duveen Brothers. _Duveen Pictures in Public Collections of America_. New York, 1941: no. 5, repro., as by Cimabue.",bibliography,1941.0
2,0,Saint James Major,Grifo di Tancredi,"_Preliminary Catalogue of Paintings and Sculpture_. National Gallery of Art, Washington, 1941: 41, no. 2, as by Cimabue.",bibliography,1941.0
3,0,Saint James Major,Grifo di Tancredi,"_Book of Illustrations_. National Gallery of Art, Washington, 1942: 239, repro. 85, as by Cimabue.",bibliography,1942.0
4,0,Saint James Major,Grifo di Tancredi,"_Paintings and Sculpture from the Mellon Collection_. National Gallery of Art, Washington, 1949 (reprinted 1953 and 1958): 5, repro.",bibliography,1949.0
...,...,...,...,...,...,...
148944,32452,American White Pelican,Robert Havell after John James Audubon,"Three Centuries of American Prints: from the National Gallery of Art, National Gallery of Art, Washington; National Gallery in Prague, Prague 1; Antiguo Colegio de San Ildefonso, Mexico City; Dall...",exhibition_history,2016.0
148945,32572,American Flamingo,Robert Havell after John James Audubon,"Audubon, John James. _The Birds of America_. London: 1827-1838.",bibliography,1827.0
148946,32572,American Flamingo,Robert Havell after John James Audubon,"National Gallery of Art. _Highlights from the National Gallery of Art, Washington_. Washington, 2016: 198, repro.",bibliography,2016.0
148947,32572,American Flamingo,Robert Havell after John James Audubon,"Audubon's Dream Realized: Selections from ""The Birds of America"". The National Gallery of Art, Washington, 2005.",exhibition_history,2005.0


In [18]:
# The distinct text types can be listed with: df_1_5['textType'].unique()
# 44631 objectIDs (of our 81868 objects) had some text entries when last run.
# Typically every artwork-object has more than one text entry
# (i.e. an artwork may have multiple bibliographies).
df_1_5.loc[:, 'objectID'].nunique()

44631

### objects_text_entries.csv table information:
* 5 textTypes : `'bibliography', 'exhibition_history', 'inscription_footnote','exhibition_history_footnote', 'documentary_labels_inscriptions'`

In [28]:
# Keep the text-entry columns and rename 'Year' -> 'year'.
# At this point: 148949 text entries for 44631 objectIDs (when last run).
df5_clean = df_1_5[['objectID', 'text', 'textType', 'Year']].rename(columns={'Year': "year"})

# (objectID, textType, year) is used as the primary key of this table, so only
# the first entry per key is kept: 136672 rows remain.
# For comparison, de-duplicating on (objectID, text, textType, year) instead
# would keep 148942 rows.
df5_clean = df5_clean.drop_duplicates(subset=["objectID", "textType", "year"])

# Keep only the two text types of interest.
dfbib = df5_clean[df5_clean["textType"] == "bibliography"]  # 95197 bibliography
dfhis = df5_clean[df5_clean["textType"] == "exhibition_history"]  # 41345 exhibition_history

# Stack bibliography rows first, then exhibition history, keeping the original
# row labels. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0; the result is the same frame.
df5_clean = pd.concat([dfbib, dfhis])  # 136542 text_entries

# Nullable integer dtype: years are stored as integers (1985 rather than
# 1985.0) while missing years remain missing.
df5_clean['year'] = df5_clean['year'].astype('Int64')
df5_clean

Unnamed: 0,objectID,text,textType,year
0,0,"_European Paintings: An Illustrated Catalogue_. National Gallery of Art, Washington, 1985: 90, repro.",bibliography,1985
1,0,"Duveen Brothers. _Duveen Pictures in Public Collections of America_. New York, 1941: no. 5, repro., as by Cimabue.",bibliography,1941
3,0,"_Book of Illustrations_. National Gallery of Art, Washington, 1942: 239, repro. 85, as by Cimabue.",bibliography,1942
4,0,"_Paintings and Sculpture from the Mellon Collection_. National Gallery of Art, Washington, 1949 (reprinted 1953 and 1958): 5, repro.",bibliography,1949
5,0,"Einstein, Lewis. _Looking at Italian Pictures in the National Gallery of Art_. Washington, 1951: 16-18, repro., as by Cimabue.",bibliography,1951
...,...,...,...,...
148941,222965,"Medieval to Modern: Recent Acquisitions of Drawings, Prints, and Illustrated Books, National Gallery of Art, Washington, 2008, no. 134 b.",exhibition_history,2008
148943,32452,"Audubon's Dream Realized: Selections from ""The Birds of America"". The National Gallery of Art, Washington, 2005.",exhibition_history,2005
148944,32452,"Three Centuries of American Prints: from the National Gallery of Art, National Gallery of Art, Washington; National Gallery in Prague, Prague 1; Antiguo Colegio de San Ildefonso, Mexico City; Dall...",exhibition_history,2016
148947,32572,"Audubon's Dream Realized: Selections from ""The Birds of America"". The National Gallery of Art, Washington, 2005.",exhibition_history,2005


## 8.4 : output cleaned "objects_text_entries.csv"

In [29]:
# Working copy, written next to this notebook.
df5_clean.to_csv(
    path_or_buf="05_1_objects_text_entries_cleaned.csv",
    index=False,
    encoding="UTF-8",
)

# Database-ready copy in the `Ready` folder.
df5_clean.to_csv(
    path_or_buf="../Ready/05_objects_text_entries.csv",
    index=False,
    encoding='utf-8',
)

# The written .csv file was checked to be intact.
# NOTE(review): the expected size should be 136542 tuples, the count recorded
# after keeping only bibliography and exhibition_history rows; the 136672
# figure previously quoted here is the count before that filter. Confirm.

# Step 9: `#9` "media_items.csv" Cleaning

## 9.1 : Load raw "media_items.csv" in

In [89]:
#df9 = pd.read_csv('09_0_media_items_raw.csv', encoding = 'utf-8', low_memory = False)
#df9 = df9[["mediaid", "mediatype", "title", "language", "playurl", "imageurl"]] #  total counts of media_item
#df9.rename(columns={"mediaid":"mediaID", "mediatype": "mediaType", "playurl": "playURL", "imageurl": "imageURL"})
# df9.columns

raw "media_items.csv" columns:
* 'mediaid', 'mediatype', 'title', 'description', 'duration', 'language','thumbnailurl', 'playurl', 'downloadurl', 'keywords', 'tags','imageurl', 'presentationdate', 'releasedate', 'lastmodified'
* columns of interest: mediaID, mediaType, title, language, playURL, imageURL

#### Pandas Data Types
* https://pbpython.com/pandas_dtypes.html

#### rename column names
* https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html