## SETUP

In [1]:
import pandas as pd
import numpy as np
import cv2
import os
import time
import matplotlib.pyplot as plt
from matplotlib import image as mpimg
from PIL import Image
import shutil


# show all dataframe
pd.set_option('display.max_colwidth', None) 
pd.set_option('display.max_columns', None)  

## IMPORT

### met

In [44]:
df_met_raw = pd.read_csv('scrapped_data\MET_full_selection_Keywords.csv')

In [3]:
df_met_raw.columns

Index(['objectID', 'isHighlight', 'accessionNumber', 'accessionYear',
       'isPublicDomain', 'primaryImage', 'primaryImageSmall',
       'additionalImages', 'constituents', 'department', 'objectName', 'title',
       'culture', 'period', 'dynasty', 'reign', 'portfolio', 'artistRole',
       'artistPrefix', 'artistDisplayName', 'artistDisplayBio', 'artistSuffix',
       'artistAlphaSort', 'artistNationality', 'artistBeginDate',
       'artistEndDate', 'artistGender', 'artistWikidata_URL', 'artistULAN_URL',
       'objectDate', 'objectBeginDate', 'objectEndDate', 'medium',
       'dimensions', 'measurements', 'creditLine', 'geographyType', 'city',
       'state', 'county', 'country', 'region', 'subregion', 'locale', 'locus',
       'excavation', 'river', 'classification', 'rightsAndReproduction',
       'linkResource', 'metadataDate', 'repository', 'objectURL', 'tags',
       'objectWikidata_URL', 'isTimelineWork', 'GalleryNumber'],
      dtype='object')

In [45]:
# select some columns for easier management
col_sel = ['objectID','isPublicDomain', 'primaryImage' ,'department','objectName', 
            'title', 'culture']

df_met = df_met_raw[col_sel]

In [46]:
# only take the ones in public domain
# because the first batch is public domain images only
df_met = df_met[df_met['isPublicDomain']==True]

In [47]:
# get rid of the one without primary image link
df_met = df_met[df_met['primaryImage'].notna()==True]

In [48]:
df_met.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3017 entries, 13 to 14289
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   objectID        3017 non-null   int64 
 1   isPublicDomain  3017 non-null   bool  
 2   primaryImage    3017 non-null   object
 3   department      3017 non-null   object
 4   objectName      3017 non-null   object
 5   title           3016 non-null   object
 6   culture         2903 non-null   object
dtypes: bool(1), int64(1), object(5)
memory usage: 167.9+ KB


### va

In [49]:
df_va_raw = pd.read_csv('scrapped_data/VA_all_sel.csv')
df_va_raw.head(1)

Unnamed: 0,systemNumber,accessionNumber,objectType,_currentLocation,_primaryTitle,_primaryMaker,_primaryImageId,_primaryDate,_primaryPlace,_warningTypes,_images,image_value,image_base
0,O1314778,S.1658-2015,costume,"{'id': 'THES49318', 'displayName': 'In store', 'type': 'storage', 'site': 'BH', 'onDisplay': False, 'detail': {'free': '', 'case': '', 'shelf': '', 'box': ''}}",Costume,"{'name': 'Strassner, Joe', 'association': 'designers'}",2015HV9075,1936,Great Britain,[],"{'_primary_thumbnail': 'https://framemark.vam.ac.uk/collections/2015HV9075/full/!100,100/0/default.jpg', '_iiif_image_base_url': 'https://framemark.vam.ac.uk/collections/2015HV9075/', '_iiif_presentation_url': None, 'imageResolution': 'low'}","dict_values(['https://framemark.vam.ac.uk/collections/2015HV9075/full/!100,100/0/default.jpg', 'https://framemark.vam.ac.uk/collections/2015HV9075/', None, 'low'])",https://framemark.vam.ac.uk/collections/2015HV9075/


In [50]:
df_va_raw.columns

Index(['systemNumber', 'accessionNumber', 'objectType', '_currentLocation',
       '_primaryTitle', '_primaryMaker', '_primaryImageId', '_primaryDate',
       'image_base'],
      dtype='object')

In [51]:
col_sel = ['systemNumber', 'objectType', '_primaryTitle', '_primaryDate', '_primaryPlace', 'image_base']
df_va = df_va_raw[col_sel]

In [52]:
df_va.head(1)

Unnamed: 0,systemNumber,objectType,_primaryTitle,_primaryDate,_primaryPlace,image_base
0,O1314778,costume,Costume,1936,Great Britain,https://framemark.vam.ac.uk/collections/2015HV9075/


In [25]:
df_va.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7096 entries, 0 to 7095
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   systemNumber   7096 non-null   object
 1   objectType     7095 non-null   object
 2   _primaryTitle  1167 non-null   object
 3   _primaryDate   6779 non-null   object
 4   _primaryPlace  6104 non-null   object
 5   image_base     7096 non-null   object
dtypes: object(6)
memory usage: 332.8+ KB


In [53]:
df_va.nunique()

systemNumber     7096
objectType        564
_primaryTitle     790
_primaryDate     1603
_primaryPlace     349
image_base       7096
dtype: int64

## cleanup culture

### combine two dataset

In [123]:
df_va.columns

Index(['systemNumber', 'objectType', '_primaryTitle', '_primaryDate',
       'culture', 'image_base'],
      dtype='object')

In [124]:
df_va.rename(columns={'_primaryPlace': 'culture','objectType': 'objectName'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_va.rename(columns={'_primaryPlace': 'culture','objectType': 'objectName'}, inplace=True)


In [125]:
df = pd.concat([df_met,df_va])

In [57]:
df.culture.nunique()

422

In [58]:
df.culture.unique()

array(['British', 'American', 'French', 'Italian', 'Spanish',
       'probably British', 'probably French', 'European',
       'probably American', 'American or European', 'probably Mexican',
       'China', nan, 'Greek', 'Russian', 'French (Breton)', 'Romanian',
       'Hungarian', 'Danish', 'Slovak', 'Bulgarian',
       'Crow, Native American', 'Japan', 'Japanese', 'probably Italian',
       'Greek (Attic)', 'probably Scottish', 'Chinese',
       'Arapaho, Native American', 'German', 'Irish', 'Belgian',
       'probably Russian', 'Czech (Moravian)', 'probably Japanese',
       'probably European', 'Philippine', 'Macedonian', 'Croatian',
       'Albanian', 'Albanian (Malissori)', 'Greek (Corinthian)',
       'Scottish', 'European, Eastern', 'Innu/Naskapi, Native American',
       'Peruvian', 'Montenegrin', 'Norwegian', 'Czech (Hanáci)',
       'Portuguese', 'British or French', 'probably Spanish', 'Austrian',
       'Dutch', 'probably Swiss', 'probably German', 'Serbian',
       'Wasc

### create culture label

fill culture with department name

In [79]:
df['culture_Label'] = df['culture']

In [80]:
df_temp = df[df['culture_Label'].isna()==True]

In [81]:
df_temp['department'].value_counts()

Islamic Art    114
Name: department, dtype: int64

In [82]:
cond1 = (df['culture_Label'].isna() == True)

df.loc[cond1,'culture_Label'] = 'Islamic'

In [83]:
df['culture_Label'].isna().sum()

0

### combine cultures

In [87]:
# title all labels
df['culture_Label'].nunique()

423

In [88]:
df['culture_Label'] = df['culture_Label'].map(lambda x: x.title())

In [90]:
df['culture_Label'].nunique()

412

In [118]:
# remove 'probably' and possibly
df['culture_Label'] = df['culture_Label'].str.replace('Probably ','')
df['culture_Label'] = df['culture_Label'].str.replace(', Possibly','')


In [92]:
df['culture_Label'].unique()

array(['British', 'American', 'French', 'Italian', 'Spanish',
       'Probably British', 'Probably French', 'European',
       'Probably American', 'American Or European', 'Probably Mexican',
       'China', 'Islamic', 'Greek', 'Russian', 'French (Breton)',
       'Romanian', 'Hungarian', 'Danish', 'Slovak', 'Bulgarian',
       'Crow, Native American', 'Japan', 'Japanese', 'Probably Italian',
       'Greek (Attic)', 'Probably Scottish', 'Chinese',
       'Arapaho, Native American', 'German', 'Irish', 'Belgian',
       'Probably Russian', 'Czech (Moravian)', 'Probably Japanese',
       'Probably European', 'Philippine', 'Macedonian', 'Croatian',
       'Albanian', 'Albanian (Malissori)', 'Greek (Corinthian)',
       'Scottish', 'European, Eastern', 'Innu/Naskapi, Native American',
       'Peruvian', 'Montenegrin', 'Norwegian', 'Czech (Hanáci)',
       'Portuguese', 'British Or French', 'Probably Spanish', 'Austrian',
       'Dutch', 'Probably Swiss', 'Probably German', 'Serbian',
      

In [93]:
# delete content in ()
df['culture_Label'] = df['culture_Label'].str.replace(r"\(.*\)","")

  df['culture_Label'] = df['culture_Label'].str.replace(r"\(.*\)","")


In [94]:
df['culture_Label'].unique()

array(['British', 'American', 'French', 'Italian', 'Spanish',
       'Probably British', 'Probably French', 'European',
       'Probably American', 'American Or European', 'Probably Mexican',
       'China', 'Islamic', 'Greek', 'Russian', 'French ', 'Romanian',
       'Hungarian', 'Danish', 'Slovak', 'Bulgarian',
       'Crow, Native American', 'Japan', 'Japanese', 'Probably Italian',
       'Greek ', 'Probably Scottish', 'Chinese',
       'Arapaho, Native American', 'German', 'Irish', 'Belgian',
       'Probably Russian', 'Czech ', 'Probably Japanese',
       'Probably European', 'Philippine', 'Macedonian', 'Croatian',
       'Albanian', 'Albanian ', 'Scottish', 'European, Eastern',
       'Innu/Naskapi, Native American', 'Peruvian', 'Montenegrin',
       'Norwegian', 'Portuguese', 'British Or French', 'Probably Spanish',
       'Austrian', 'Dutch', 'Probably Swiss', 'Probably German',
       'Serbian', 'Wasco, Native American',
       'Lakota/ Teton Sioux, Native American', 'Latvian'

In [95]:
# delete white space
df['culture_Label'] = df['culture_Label'].str.strip()

In [96]:
df['culture_Label'].unique()

array(['British', 'American', 'French', 'Italian', 'Spanish',
       'Probably British', 'Probably French', 'European',
       'Probably American', 'American Or European', 'Probably Mexican',
       'China', 'Islamic', 'Greek', 'Russian', 'Romanian', 'Hungarian',
       'Danish', 'Slovak', 'Bulgarian', 'Crow, Native American', 'Japan',
       'Japanese', 'Probably Italian', 'Probably Scottish', 'Chinese',
       'Arapaho, Native American', 'German', 'Irish', 'Belgian',
       'Probably Russian', 'Czech', 'Probably Japanese',
       'Probably European', 'Philippine', 'Macedonian', 'Croatian',
       'Albanian', 'Scottish', 'European, Eastern',
       'Innu/Naskapi, Native American', 'Peruvian', 'Montenegrin',
       'Norwegian', 'Portuguese', 'British Or French', 'Probably Spanish',
       'Austrian', 'Dutch', 'Probably Swiss', 'Probably German',
       'Serbian', 'Wasco, Native American',
       'Lakota/ Teton Sioux, Native American', 'Latvian',
       'Probably Greek', 'Innu/ Naskapi,

In [97]:
# combine Native American culture
cond = (df['culture_Label'].str.contains('Native American'))
df.loc[cond,'culture_Label'] = 'Native American'

In [98]:
df['culture_Label'].unique()

array(['British', 'American', 'French', 'Italian', 'Spanish',
       'Probably British', 'Probably French', 'European',
       'Probably American', 'American Or European', 'Probably Mexican',
       'China', 'Islamic', 'Greek', 'Russian', 'Romanian', 'Hungarian',
       'Danish', 'Slovak', 'Bulgarian', 'Native American', 'Japan',
       'Japanese', 'Probably Italian', 'Probably Scottish', 'Chinese',
       'German', 'Irish', 'Belgian', 'Probably Russian', 'Czech',
       'Probably Japanese', 'Probably European', 'Philippine',
       'Macedonian', 'Croatian', 'Albanian', 'Scottish',
       'European, Eastern', 'Peruvian', 'Montenegrin', 'Norwegian',
       'Portuguese', 'British Or French', 'Probably Spanish', 'Austrian',
       'Dutch', 'Probably Swiss', 'Probably German', 'Serbian', 'Latvian',
       'Probably Greek', 'Indian', 'Myanmar', 'Cambodia, Possibly',
       'India', 'Tibet', 'Japan Or Netherlands', 'China, Turkestan',
       'Burma', 'Great Britain', 'United States Of Americ

In [107]:
# mixed culture --> fusion
sel = ['American Or European','European, Eastern','British Or French', 'Japan Or Netherlands', 'China, Turkestan']

for s in sel:
    cond = df['culture_Label']==s
    df.loc[cond,'culture_Label']='fusion'

In [108]:
# checkout labels sorted
ls = df['culture_Label'].unique()
ls.sort()
print(ls)

['Abuja' 'Accra' 'Aceh' 'Afghanistan' 'Africa' 'Ahmedabad' 'Al-Khobar'
 'Albania' 'Albanian' 'Algeria' 'America' 'American' 'Amritsar'
 'Amsterdam' 'Antwerp' 'Assam' 'Athens' 'Attica' 'Australia' 'Austria'
 'Austrian' 'Ayrshire' 'Bahrain' 'Baluchistan' 'Bamako' 'Bangladesh'
 'Bannu' 'Bath' 'Bedfordshire' 'Beijing' 'Belgian' 'Belgium' 'Bengal'
 'Bergen' 'Berlin' 'Berne' 'Bethlehem' 'Bharatpur' 'Bhopal' 'Bhutan'
 'Bikaner' 'Binche' 'Birmingham' 'Bitolj' 'Blackpool' 'Bombay' 'Borneo'
 'Bosnia' 'Brighton' 'Bristol' 'Britain' 'British' 'Brittany' 'Bromley'
 'Brussels' 'Buenos Aires' 'Bulgaria' 'Bulgarian' 'Burhanpur' 'Burma'
 'Calcutta' 'California' 'Cambodia, Possibly' 'Cambridge' 'Cambridgeshire'
 'Canada' 'Carrickmacross' 'Catterall' 'Cesena' 'Chanderi' 'Chelsea'
 'Chennai' 'Cheshire' 'China' 'Cornwall' 'Coromandel Coast' 'Crete'
 'Croatia' 'Croatian' 'Czech' 'Czech Republic' 'Dakar' 'Dalmatia' 'Danish'
 'Deccan' 'Delhi' 'Denmark' 'Dera Ghazi Khan' 'Derby' 'Dhaka' 'Dharwar'
 'Dodecanese'

In [120]:
# replace *ese
ese_dict = {'Chinese':'China','Japanese':'Japan','Indian':'India','French':'France'}

df = df.replace({'culture_Label': ese_dict})

In [110]:
ls = df['culture_Label'].unique()
ls.sort()
print(ls)

['Abuja' 'Accra' 'Aceh' 'Afghanistan' 'Africa' 'Ahmedabad' 'Al-Khobar'
 'Albania' 'Albanian' 'Algeria' 'America' 'American' 'Amritsar'
 'Amsterdam' 'Antwerp' 'Assam' 'Athens' 'Attica' 'Australia' 'Austria'
 'Austrian' 'Ayrshire' 'Bahrain' 'Baluchistan' 'Bamako' 'Bangladesh'
 'Bannu' 'Bath' 'Bedfordshire' 'Beijing' 'Belgian' 'Belgium' 'Bengal'
 'Bergen' 'Berlin' 'Berne' 'Bethlehem' 'Bharatpur' 'Bhopal' 'Bhutan'
 'Bikaner' 'Binche' 'Birmingham' 'Bitolj' 'Blackpool' 'Bombay' 'Borneo'
 'Bosnia' 'Brighton' 'Bristol' 'Britain' 'British' 'Brittany' 'Bromley'
 'Brussels' 'Buenos Aires' 'Bulgaria' 'Bulgarian' 'Burhanpur' 'Burma'
 'Calcutta' 'California' 'Cambodia, Possibly' 'Cambridge' 'Cambridgeshire'
 'Canada' 'Carrickmacross' 'Catterall' 'Cesena' 'Chanderi' 'Chelsea'
 'Chennai' 'Cheshire' 'China' 'Cornwall' 'Coromandel Coast' 'Crete'
 'Croatia' 'Croatian' 'Czech' 'Czech Republic' 'Dakar' 'Dalmatia' 'Danish'
 'Deccan' 'Delhi' 'Denmark' 'Dera Ghazi Khan' 'Derby' 'Dhaka' 'Dharwar'
 'Dodecanese'

In [111]:
df['culture_Label'].nunique()
# still too many

376

In [116]:
# unitify usa
map_dict = {'United States Of America':'America','Usa':'America','American':'America','United States':'America'}

df = df.replace({'culture_Label': map_dict})

In [121]:
df['culture_Label'].value_counts().head(20)

America           1377
Great Britain     1225
Islamic           1106
London             920
France             913
England            665
Paris              636
China              414
Japan              316
British            267
Italy              233
fusion             176
United Kingdom     104
Britain             91
American            79
European            64
Manchester          61
New York            60
India               57
Hong Kong           45
Name: culture_Label, dtype: int64

In [372]:
# group the bottoms ones to 'minor'
label_ls = df['culture_Label'].value_counts(normalize=True)

In [373]:
type(label_ls)

pandas.core.series.Series

In [374]:
# only keep the top 21
# why 21? because the no. 21 is native american
# I'd love to keep it
minor = label_ls.index[20:]
minor

Index(['Scottish', 'Irish', 'Slovak', 'Danish', 'Belgian', 'Dutch', 'Mexican',
       'Philippine', 'Czech', 'Swiss', 'Cambodia', 'Myanmar', 'Latvian',
       'Serbian', 'Macedonian', 'Croatian', 'Portuguese', 'Norwegian',
       'Montenegrin', 'Peruvian', 'Bulgarian', 'Burma'],
      dtype='object')

In [375]:
df_met['culture_Label'] = df_met['culture_Label'].map(lambda x: 'minor' if x in set(minor) else x)

In [376]:
df_met['culture_Label'].unique()

array(['British', 'American', 'French', 'Italian', 'Spanish', 'European',
       'fusion', 'minor', 'China', 'Islamic', 'Greek', 'Russian',
       'Romanian', 'Hungarian', 'Native American', 'Japan', 'German',
       'Albanian', 'Austrian', 'India', 'Tibet'], dtype=object)

In [377]:
df_met['culture_Label'].nunique()

21

In [378]:
df_met['culture_Label'].value_counts()

American           1214
French              499
British             267
China               248
Japan               214
fusion              176
Islamic             114
European             64
minor                43
Italian              35
Spanish              29
Russian              20
Greek                19
Romanian             13
Hungarian            11
Tibet                11
Albanian             10
German                9
Austrian              8
India                 7
Native American       6
Name: culture_Label, dtype: int64

### va

In [29]:
df_va._primaryPlace.value_counts()

Great Britain    1224
London            918
England           658
Paris             633
France            414
                 ... 
Cornwall            1
Guernsey            1
Malton              1
Swansea             1
Taiwan              1
Name: _primaryPlace, Length: 349, dtype: int64

## move images to their folders

### create folders

In [379]:
df_met['culture_Label'].unique()

array(['British', 'American', 'French', 'Italian', 'Spanish', 'European',
       'fusion', 'minor', 'China', 'Islamic', 'Greek', 'Russian',
       'Romanian', 'Hungarian', 'Native American', 'Japan', 'German',
       'Albanian', 'Austrian', 'India', 'Tibet'], dtype=object)

In [380]:
df_met['culture_Label'].nunique()

21

In [381]:
for n in df_met['culture_Label'].unique():
    newpath = r'images_tf/' + n

    if not os.path.exists(newpath):
        os.makedirs(newpath)

### move file to its folder

### prototype

In [213]:
# Source path
source = "images_proc/full_selection_sel/42112.jpg"

# Destination path
destination = "images_tf/Albanian/14099.jpg"

shutil.copy(source, destination)

'images_tf/Albanian/14099.jpg'

In [247]:
file_name = '42112.jpg'

In [248]:
file_name[:-4]

'42112'

In [258]:
src_file_name = '81263.jpg'
source = 'images_proc/full_selection_sel/' + src_file_name # Source path

name = src_file_name[:-4]
des_folder = df_met.loc[df['Object ID']==int(name),'culture_Label'].values[0]
destination = 'images_tf/' + des_folder +'/' + file_name # Destination path

shutil.copy(source, destination)

'images_tf/French/42112.jpg'

In [277]:
src_path = 'test'
des_path = 'images_tf/'

for filename in os.scandir(src_path):
    src_file_name = filename.name
    source = filename.path # Source path

    name = src_file_name[:-4]
    des_folder = df_met.loc[df_met['Object ID']==int(name),'culture_Label'].values[0]

    destination = des_path + des_folder +'/' + src_file_name # Destination path

    shutil.copy(source, destination)


### define function

In [383]:
def move_file(src_path = 'test',des_path = 'images_tf/'):

    for filename in os.scandir(src_path):
        src_file_name = filename.name
        source = filename.path # Source path

        name = src_file_name[:-4]
        # print(name)
        des_folder = df_met.loc[df_met['objectID']==int(name),'culture_Label'].values[0]

        destination = des_path + des_folder +'/' + src_file_name # Destination path

        shutil.copy(source, destination)


### test run

In [282]:
move_file()

### run on full

In [384]:
move_file('images_proc/full_selection_sel','images_tf/')

## (ongoing) Object type

In [132]:
keywords = ['costume', 'robe', 'dress', 'shirt','ensemble','jacket','coat','suit','trousers']

AttributeError: 'float' object has no attribute 'lower'

In [128]:
df['objectName'].nunique()

889

In [129]:
df['objectName'].value_counts()

dress                    1141
Dress                     759
evening dress             638
jacket                    552
shirt                     437
                         ... 
Robe, Presentation          1
wedding dress skirt         1
paper dress fabric          1
Robe, Boy's                 1
architectural drawing       1
Name: objectName, Length: 889, dtype: int64