In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from textblob import TextBlob

In [2]:
# MET artwork records; later cells read its 'Object ID' and 'Medium' columns.
MetCity = pd.read_csv('MetCity.csv')
# Medium terms with their frequencies; also carries an 'Unnamed: 0' column that
# is dropped when the POS tags are joined on.
materials = pd.read_csv('Materials_freq.csv')

In [3]:
# Plain Python list of the medium terms, in row order, ready for POS tagging.
med = materials['Medium'].tolist()

In [4]:
# POS-tag every medium term with TextBlob and keep only the (word, tag) pair of
# its last tagged token; that tag is what separates methods from materials.
# NOTE(review): tags[-1] assumes each term yields at least one tagged token - confirm.
tags = [list(TextBlob(term).tags[-1]) for term in med]

# Column 0 holds the tagged word, column 1 its POS tag; rows follow the order of `med`.
tagsPD = pd.DataFrame(tags)

In [6]:
# Attach each term's POS tag to the materials table by row position (index-on-index merge).
matTag = pd.merge(materials, tagsPD, left_index=True, right_index=True)
# Drop the tagged word (column 0) and the leftover 'Unnamed: 0' column.
matTag = matTag.drop(columns=[0, 'Unnamed: 0'])
matTag.columns = ['Medium', 'Frequency', 'Tag']

# Distinct POS tags, in order of first appearance.
TG = tagsPD[1].unique().tolist()

# Index by tag so that all terms sharing a tag can be selected with .loc.
matTag = matTag.set_index('Tag')

See https://medium.com/@gianpaul.r/tokenization-and-parts-of-speech-pos-tagging-in-pythons-nltk-library-2d30f70af13b for an explanation of the POS tags. In short, terms tagged as verbs (VBG and VBN) are treated as artwork methods, and terms tagged as nouns (NN) are treated as materials.

In [8]:
# Display the distinct POS tags found across the medium terms (the unique values of tagsPD[1]).
TG

['VBG', 'NN', 'NNS', 'VBN', 'JJ', 'IN', 'RB', 'CD']

# Artwork Method Data

#### The code below matches the method/material tags with the MET dataset and creates columns indicating whether the artwork is made a certain way or of a certain material. This new dataframe is indexed by the Object ID and so can be referred back to the original dataset. Setting the data in this format prepares it for a clustering analysis or other kinds of analyses we wish to do.

In [9]:

# Extracting list of methods
# TextBlob misidentified a few methods as materials, so these are added back to
# the method list by hand.
# .copy() makes `method` an independent frame; enlarging a bare .loc slice is what
# raised pandas' SettingWithCopyWarning.
method = matTag.loc[['VBG', 'VBN']].copy()
method.loc['NN'] = ['woodcut', 1018]
method.loc['L'] = ['lithography', 834]

# Extracting list of materials
# Hand-picked subset of the noun ('NN') terms kept as materials; the order here is
# the row order of `mat`.
material_names = [
    'wood',
    'earthenware',
    'silver',
    'glass',
    'stucco',
    'gold',
    'ink',
    'copper',
    'steel',
    'glaze',
    'préaud',
    'paper',
    'silk',
    'brass',
    'paint',
    'porcelain',
    'book',
    'mahogany',
    'watercolor',
    'ivory',
    'stonepaste',
    'lithograph',
    'metal',
    'cotton',
    'alloy',
    'enamel',
    'pine',
    'bronze',
    'bone',
    'iron',
    'leather',
    'pochoir',
    'linen',
    'wool',
    'maple',
    'gilt',
    'palm',
    'walnut',
    'limestone',
    'thread',
    'pewter',
    'poplar',
    'fiber',
    'marble',
]
# The list selector [['NN']] always returns a DataFrame, even if only a single row
# carries the tag (a scalar selector would return a Series in that case).
mat = matTag.loc[['NN']].set_index('Medium').loc[material_names]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [22]:
#This function takes in the MET data and the method/materials list and produces a dataframe indicating whether
#the artwork incorporates a certain method/material; this is shown as either 1 (True) or 0 (False)

def mediumMatch(data, mediumItems):
    """Build a 0/1 table flagging which medium terms each artwork mentions.

    Parameters
    ----------
    data : pandas.DataFrame
        Artwork records; must provide an 'Object ID' and a 'Medium' column.
    mediumItems : pandas.DataFrame
        Terms to look for. After ``reset_index()`` it must expose a 'Medium'
        column, so the terms may live either in that column or in an index
        named 'Medium'.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``data``, indexed by 'Object ID', with one integer
        column per term: 1 when the term is a substring of the lower-cased
        medium text, otherwise 0. Terms are compared as given (not lower-cased).
        Rows with a missing medium are all 0.

    Side effects
    ------------
    Prints the per-term column sums.
    """
    terms = list(mediumItems.reset_index()['Medium'])

    # Lower-case the medium text once per artwork. Missing values become '' so
    # they cannot match anything (stringifying NaN would yield 'nan').
    medium_col = data['Medium']
    texts = [
        str(value).lower() if present else ''
        for value, present in zip(medium_col, medium_col.notna())
    ]

    # Build every column in one pass instead of assigning row by row through
    # .loc[ID]; that assignment overwrote all rows sharing the same Object ID.
    indicators = {term: [int(term in text) for text in texts] for term in terms}
    PD = pd.DataFrame(
        indicators,
        index=pd.Index(list(data['Object ID']), name='Object ID'),
        columns=terms,
    )

    print(PD.sum())
    return PD

## Final Output

In [17]:
# Per-artwork 0/1 flags for every term in `method`; the call also prints the per-term totals.
MethodsPD = mediumMatch(MetCity, method)

etching        4476
engraving      3163
carved         2092
painted        1838
printed        1296
glazed         1007
pressed         621
incised         577
engraved        437
molded          299
embroidered     210
woodcut        1147
lithography     808
dtype: int64


In [11]:
# Per-artwork 0/1 flags for every term in `mat`; the call also prints the per-term totals.
MaterialsPD = mediumMatch(MetCity, mat)

wood           4187
earthenware    2402
silver         2322
glass          1965
stucco         1732
gold           1593
ink            1569
copper         1165
steel          1070
glaze          2176
préaud         1044
paper           997
silk            934
brass           933
paint          2735
porcelain       846
book            676
mahogany        536
watercolor      592
ivory           558
stonepaste      548
lithograph     1649
metal           619
cotton          461
alloy           405
enamel          447
pine            400
bronze          398
bone            389
iron            384
leather         373
pochoir         371
linen           352
wool            322
maple           315
gilt            379
palm            279
walnut          242
limestone       248
thread          274
pewter          224
poplar          214
fiber           211
marble          192
dtype: int64


In [21]:
# Write the indicator tables and the term lists they were built from to CSV.
csv_outputs = {
    'Materials.csv': MaterialsPD,
    'Methods.csv': MethodsPD,
    'methodList.csv': method,
    'materialList.csv': mat,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name)