## SETUP

In [203]:
import pandas as pd
import numpy as np
import cv2
import os
import time
import matplotlib.pyplot as plt
from matplotlib import image as mpimg
from PIL import Image
import shutil


# show all dataframe
pd.set_option('display.max_colwidth', None) 
pd.set_option('display.max_columns', None)  

## IMPORT

In [299]:
df_raw = pd.read_csv('scrapped_data\MET_full_selection_Keywords.csv')

In [303]:
df_raw.columns

Index(['objectID', 'isHighlight', 'accessionNumber', 'accessionYear',
       'isPublicDomain', 'primaryImage', 'primaryImageSmall',
       'additionalImages', 'constituents', 'department', 'objectName', 'title',
       'culture', 'period', 'dynasty', 'reign', 'portfolio', 'artistRole',
       'artistPrefix', 'artistDisplayName', 'artistDisplayBio', 'artistSuffix',
       'artistAlphaSort', 'artistNationality', 'artistBeginDate',
       'artistEndDate', 'artistGender', 'artistWikidata_URL', 'artistULAN_URL',
       'objectDate', 'objectBeginDate', 'objectEndDate', 'medium',
       'dimensions', 'measurements', 'creditLine', 'geographyType', 'city',
       'state', 'county', 'country', 'region', 'subregion', 'locale', 'locus',
       'excavation', 'river', 'classification', 'rightsAndReproduction',
       'linkResource', 'metadataDate', 'repository', 'objectURL', 'tags',
       'objectWikidata_URL', 'isTimelineWork', 'GalleryNumber'],
      dtype='object')

In [348]:
# select some columns for easier management
col_sel = ['objectID','isPublicDomain', 'primaryImage' ,'department','objectName', 
            'title', 'culture', 'period']

df = df_raw[col_sel]

In [349]:
# only take the ones in public domain
# because the first batch is public domain images only
df = df[df['isPublicDomain']==True]

In [350]:
# get rid of the one without primary image link
df = df[df['primaryImage'].notna()==True]

In [351]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3017 entries, 13 to 14289
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   objectID        3017 non-null   int64 
 1   isPublicDomain  3017 non-null   bool  
 2   primaryImage    3017 non-null   object
 3   department      3017 non-null   object
 4   objectName      3017 non-null   object
 5   title           3016 non-null   object
 6   culture         2903 non-null   object
 7   period          389 non-null    object
dtypes: bool(1), int64(1), object(6)
memory usage: 191.5+ KB


## cleanup culture

In [352]:
df.culture.nunique()

77

In [353]:
df.culture.unique()

array(['British', 'American', 'French', 'Italian', 'Spanish',
       'probably British', 'probably French', 'European',
       'probably American', 'American or European', 'probably Mexican',
       'China', nan, 'Greek', 'Russian', 'French (Breton)', 'Romanian',
       'Hungarian', 'Danish', 'Slovak', 'Bulgarian',
       'Crow, Native American', 'Japan', 'Japanese', 'probably Italian',
       'Greek (Attic)', 'probably Scottish', 'Chinese',
       'Arapaho, Native American', 'German', 'Irish', 'Belgian',
       'probably Russian', 'Czech (Moravian)', 'probably Japanese',
       'probably European', 'Philippine', 'Macedonian', 'Croatian',
       'Albanian', 'Albanian (Malissori)', 'Greek (Corinthian)',
       'Scottish', 'European, Eastern', 'Innu/Naskapi, Native American',
       'Peruvian', 'Montenegrin', 'Norwegian', 'Czech (Hanáci)',
       'Portuguese', 'British or French', 'probably Spanish', 'Austrian',
       'Dutch', 'probably Swiss', 'probably German', 'Serbian',
       'Wasc

### create culture label

fill culture with department name

In [354]:
df['culture_Label'] = df['culture']

In [355]:
df_temp = df[df['culture_Label'].isna()==True]

In [356]:
df_temp['department'].value_counts()

Islamic Art    114
Name: department, dtype: int64

In [357]:
cond1 = (df['culture_Label'].isna() == True)

df.loc[cond1,'culture_Label'] = 'Islamic'

In [358]:
df['culture_Label'].isna().sum()

0

### combine cultures

In [359]:
# remove 'probably' and possibly
df['culture_Label'] = df['culture_Label'].str.replace('probably ','')
df['culture_Label'] = df['culture_Label'].str.replace(', possibly','')


In [360]:
df['culture_Label'].unique()

array(['British', 'American', 'French', 'Italian', 'Spanish', 'European',
       'American or European', 'Mexican', 'China', 'Islamic', 'Greek',
       'Russian', 'French (Breton)', 'Romanian', 'Hungarian', 'Danish',
       'Slovak', 'Bulgarian', 'Crow, Native American', 'Japan',
       'Japanese', 'Greek (Attic)', 'Scottish', 'Chinese',
       'Arapaho, Native American', 'German', 'Irish', 'Belgian',
       'Czech (Moravian)', 'Philippine', 'Macedonian', 'Croatian',
       'Albanian', 'Albanian (Malissori)', 'Greek (Corinthian)',
       'European, Eastern', 'Innu/Naskapi, Native American', 'Peruvian',
       'Montenegrin', 'Norwegian', 'Czech (Hanáci)', 'Portuguese',
       'British or French', 'Austrian', 'Dutch', 'Swiss', 'Serbian',
       'Wasco, Native American', 'Lakota/ Teton Sioux, Native American',
       'Latvian', 'Innu/ Naskapi, Native American', 'Indian', 'Myanmar',
       'Cambodia', 'India (for Thai market)',
       'Japan (Okinawa, Ryūkyū Islands)', 'Japan (Ainu)',
    

In [361]:
# delete content in ()
df['culture_Label'] = df['culture_Label'].str.replace(r"\(.*\)","")

  df['culture_Label'] = df['culture_Label'].str.replace(r"\(.*\)","")


In [362]:
df['culture_Label'].unique()

array(['British', 'American', 'French', 'Italian', 'Spanish', 'European',
       'American or European', 'Mexican', 'China', 'Islamic', 'Greek',
       'Russian', 'French ', 'Romanian', 'Hungarian', 'Danish', 'Slovak',
       'Bulgarian', 'Crow, Native American', 'Japan', 'Japanese',
       'Greek ', 'Scottish', 'Chinese', 'Arapaho, Native American',
       'German', 'Irish', 'Belgian', 'Czech ', 'Philippine', 'Macedonian',
       'Croatian', 'Albanian', 'Albanian ', 'European, Eastern',
       'Innu/Naskapi, Native American', 'Peruvian', 'Montenegrin',
       'Norwegian', 'Portuguese', 'British or French', 'Austrian',
       'Dutch', 'Swiss', 'Serbian', 'Wasco, Native American',
       'Lakota/ Teton Sioux, Native American', 'Latvian',
       'Innu/ Naskapi, Native American', 'Indian', 'Myanmar', 'Cambodia',
       'India ', 'Japan ', 'Tibet', 'Tibet ', 'Japan or Netherlands',
       'China, Turkestan', 'China ', 'Burma ', 'India'], dtype=object)

In [363]:
# delete white space
df['culture_Label'] = df['culture_Label'].str.strip()

In [364]:
df['culture_Label'].unique()

array(['British', 'American', 'French', 'Italian', 'Spanish', 'European',
       'American or European', 'Mexican', 'China', 'Islamic', 'Greek',
       'Russian', 'Romanian', 'Hungarian', 'Danish', 'Slovak',
       'Bulgarian', 'Crow, Native American', 'Japan', 'Japanese',
       'Scottish', 'Chinese', 'Arapaho, Native American', 'German',
       'Irish', 'Belgian', 'Czech', 'Philippine', 'Macedonian',
       'Croatian', 'Albanian', 'European, Eastern',
       'Innu/Naskapi, Native American', 'Peruvian', 'Montenegrin',
       'Norwegian', 'Portuguese', 'British or French', 'Austrian',
       'Dutch', 'Swiss', 'Serbian', 'Wasco, Native American',
       'Lakota/ Teton Sioux, Native American', 'Latvian',
       'Innu/ Naskapi, Native American', 'Indian', 'Myanmar', 'Cambodia',
       'India', 'Tibet', 'Japan or Netherlands', 'China, Turkestan',
       'Burma'], dtype=object)

In [365]:
# combine Native American culture
cond = (df['culture_Label'].str.contains('Native American'))
df.loc[cond,'culture_Label'] = 'Native American'

In [366]:
df['culture_Label'].unique()

array(['British', 'American', 'French', 'Italian', 'Spanish', 'European',
       'American or European', 'Mexican', 'China', 'Islamic', 'Greek',
       'Russian', 'Romanian', 'Hungarian', 'Danish', 'Slovak',
       'Bulgarian', 'Native American', 'Japan', 'Japanese', 'Scottish',
       'Chinese', 'German', 'Irish', 'Belgian', 'Czech', 'Philippine',
       'Macedonian', 'Croatian', 'Albanian', 'European, Eastern',
       'Peruvian', 'Montenegrin', 'Norwegian', 'Portuguese',
       'British or French', 'Austrian', 'Dutch', 'Swiss', 'Serbian',
       'Latvian', 'Indian', 'Myanmar', 'Cambodia', 'India', 'Tibet',
       'Japan or Netherlands', 'China, Turkestan', 'Burma'], dtype=object)

In [367]:
# mixed culture --> fusion
sel = ['American or European','European, Eastern','British or French', 'Japan or Netherlands', 'China, Turkestan']

for s in sel:
    cond = df['culture_Label']==s
    df.loc[cond,'culture_Label']='fusion'

In [368]:
# checkout labels sorted
ls = df['culture_Label'].unique()
ls.sort()
print(ls)

['Albanian' 'American' 'Austrian' 'Belgian' 'British' 'Bulgarian' 'Burma'
 'Cambodia' 'China' 'Chinese' 'Croatian' 'Czech' 'Danish' 'Dutch'
 'European' 'French' 'German' 'Greek' 'Hungarian' 'India' 'Indian' 'Irish'
 'Islamic' 'Italian' 'Japan' 'Japanese' 'Latvian' 'Macedonian' 'Mexican'
 'Montenegrin' 'Myanmar' 'Native American' 'Norwegian' 'Peruvian'
 'Philippine' 'Portuguese' 'Romanian' 'Russian' 'Scottish' 'Serbian'
 'Slovak' 'Spanish' 'Swiss' 'Tibet' 'fusion']


In [369]:
# replace *ese
ese_dict = {'Chinese':'China','Japanese':'Japan','Indian':'India'}

df = df.replace({'culture_Label': ese_dict})

In [370]:
ls = df['culture_Label'].unique()
ls.sort()
print(ls)

['Albanian' 'American' 'Austrian' 'Belgian' 'British' 'Bulgarian' 'Burma'
 'Cambodia' 'China' 'Croatian' 'Czech' 'Danish' 'Dutch' 'European'
 'French' 'German' 'Greek' 'Hungarian' 'India' 'Irish' 'Islamic' 'Italian'
 'Japan' 'Latvian' 'Macedonian' 'Mexican' 'Montenegrin' 'Myanmar'
 'Native American' 'Norwegian' 'Peruvian' 'Philippine' 'Portuguese'
 'Romanian' 'Russian' 'Scottish' 'Serbian' 'Slovak' 'Spanish' 'Swiss'
 'Tibet' 'fusion']


In [371]:
df['culture_Label'].nunique()
# still too many

42

In [372]:
# group the bottoms ones to 'minor'
label_ls = df['culture_Label'].value_counts(normalize=True)

In [373]:
type(label_ls)

pandas.core.series.Series

In [374]:
# only keep the top 21
# why 21? because the no. 21 is native american
# I'd love to keep it
minor = label_ls.index[20:]
minor

Index(['Scottish', 'Irish', 'Slovak', 'Danish', 'Belgian', 'Dutch', 'Mexican',
       'Philippine', 'Czech', 'Swiss', 'Cambodia', 'Myanmar', 'Latvian',
       'Serbian', 'Macedonian', 'Croatian', 'Portuguese', 'Norwegian',
       'Montenegrin', 'Peruvian', 'Bulgarian', 'Burma'],
      dtype='object')

In [375]:
df['culture_Label'] = df['culture_Label'].map(lambda x: 'minor' if x in set(minor) else x)

In [376]:
df['culture_Label'].unique()

array(['British', 'American', 'French', 'Italian', 'Spanish', 'European',
       'fusion', 'minor', 'China', 'Islamic', 'Greek', 'Russian',
       'Romanian', 'Hungarian', 'Native American', 'Japan', 'German',
       'Albanian', 'Austrian', 'India', 'Tibet'], dtype=object)

In [377]:
df['culture_Label'].nunique()

21

In [378]:
df['culture_Label'].value_counts()

American           1214
French              499
British             267
China               248
Japan               214
fusion              176
Islamic             114
European             64
minor                43
Italian              35
Spanish              29
Russian              20
Greek                19
Romanian             13
Hungarian            11
Tibet                11
Albanian             10
German                9
Austrian              8
India                 7
Native American       6
Name: culture_Label, dtype: int64

## move images to their folders

### create folders

In [379]:
df['culture_Label'].unique()

array(['British', 'American', 'French', 'Italian', 'Spanish', 'European',
       'fusion', 'minor', 'China', 'Islamic', 'Greek', 'Russian',
       'Romanian', 'Hungarian', 'Native American', 'Japan', 'German',
       'Albanian', 'Austrian', 'India', 'Tibet'], dtype=object)

In [380]:
df['culture_Label'].nunique()

21

In [381]:
for n in df['culture_Label'].unique():
    newpath = r'images_tf/' + n

    if not os.path.exists(newpath):
        os.makedirs(newpath)

### move file to its folder

### prototype

In [213]:
# Source path
source = "images_proc/full_selection_sel/42112.jpg"

# Destination path
destination = "images_tf/Albanian/14099.jpg"

shutil.copy(source, destination)

'images_tf/Albanian/14099.jpg'

In [247]:
file_name = '42112.jpg'

In [248]:
file_name[:-4]

'42112'

In [258]:
src_file_name = '81263.jpg'
source = 'images_proc/full_selection_sel/' + src_file_name # Source path

name = src_file_name[:-4]
des_folder = df.loc[df['Object ID']==int(name),'culture_Label'].values[0]
destination = 'images_tf/' + des_folder +'/' + file_name # Destination path

shutil.copy(source, destination)

'images_tf/French/42112.jpg'

In [277]:
src_path = 'test'
des_path = 'images_tf/'

for filename in os.scandir(src_path):
    src_file_name = filename.name
    source = filename.path # Source path

    name = src_file_name[:-4]
    des_folder = df.loc[df['Object ID']==int(name),'culture_Label'].values[0]

    destination = des_path + des_folder +'/' + src_file_name # Destination path

    shutil.copy(source, destination)


### define function

In [383]:
def move_file(src_path = 'test',des_path = 'images_tf/'):

    for filename in os.scandir(src_path):
        src_file_name = filename.name
        source = filename.path # Source path

        name = src_file_name[:-4]
        # print(name)
        des_folder = df.loc[df['objectID']==int(name),'culture_Label'].values[0]

        destination = des_path + des_folder +'/' + src_file_name # Destination path

        shutil.copy(source, destination)


### test run

In [282]:
move_file()

### run on full

In [384]:
move_file('images_proc/full_selection_sel','images_tf/')