We need to rescale images to a square. However, if an image is far from square (either height >> width or width >> height), rescaling distorts its content, and we cannot know which patterns the model will pick up from the distorted image. On the other hand, if width / height is close to 1, the image can be rescaled to a square with little distortion of the content.

We'll look at **d=height/width**.

We want images that are roughly square, so we keep every image whose ratio d lies between 6.5/10 and 10/6.5 (bounds included), i.e. approximately 0.65 ≤ d ≤ 1.54.

In [1]:
#=====================================================
# IMPORT
#=====================================================

import os
import pandas as pd
import numpy as np
from help_functions import artist_name_cleaner, title_cleaner, split_checker
from sklearn.model_selection import train_test_split

#=====================================================
# PUTTING TOGETHER THE 4 CSV
#=====================================================

# The CSVs are looked for in <cwd>/temp2_txt; if that folder does not exist
# we stay where we are and read the CSVs found in the current directory.
cwd = os.getcwd()
folder = "/temp2_txt"
if os.path.exists(cwd+folder):
    os.chdir(cwd+folder)

# NOTE(review): os.listdir() returns names in arbitrary order, so the row
# order of df (and with it the seeded split performed later) can differ
# between machines -- consider sorting `files` if that matters.
files = [file for file in os.listdir() if file.endswith(".csv")]
if not files:
    raise FileNotFoundError("No .csv files found in " + os.getcwd())

# Read every CSV without its first column (presumably the index saved by
# to_csv -- confirm) and stack them in one go with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration, hence pd.concat.
df = pd.concat(
    [pd.read_csv(file).iloc[:, 1:] for file in files],
    ignore_index=True,
)

# Some links have no image behind them, so their height and width are 0.
# Keep those links in a list for inspection before the rows are removed.
no_size = df.height.eq(0.0) | df.width.eq(0.0)
list_of_wrong_items = df.loc[no_size, 'image_link'].tolist()

#=====================================================
# ELIMINATING THE WRONG ITEMS
#=====================================================

# Missing values are treated as 0; a row survives only if its jpeg, height
# and width are all different from 0.
required = ['jpeg', 'height', 'width']
df[required] = df[required].fillna(0)
df = df[(df[required] != 0).all(axis=1)]
print("Correctly scraped data are:", len(df))

#=====================================================
# FILTERING FOR HEIGHT/WIDTH RATIO
#=====================================================

# dist = height / width; a value of 1 means a perfectly square image.
df['dist'] = df['height'].div(df['width'])

# Keep the images whose ratio lies in [6.5/10, 10/6.5] (bounds included)
# and renumber the surviving rows from 0.
roughly_square = df['dist'].between(6.5/10, 10/6.5)
df2 = df[roughly_square].reset_index(drop=True)

print("Number of images after filter:", len(df2))

#=====================================================
# DROPPING SMALL CATEGORIES
#=====================================================

# Minimum number of images an art movement needs in order to be kept.
min_images_per_movement = 2000

# count() gives the non-null entries per column for each art movement; the
# 'jpeg' column is used as the number of images.
temp = df2.groupby("art_movement").count()
small_movements = temp[temp.jpeg < min_images_per_movement].index.tolist()
print("We drop", ", ".join(small_movements),
      "because they are less than " + str(min_images_per_movement) + ".")

# Drop exactly the movements reported above rather than a hand-typed copy
# of the list, so the message and the filter cannot drift apart.
df3 = df2[~df2.art_movement.isin(small_movements)].reset_index(drop=True)

#=====================================================
# CLEANING ARTIST NAME
#=====================================================

# One cleaned artist string per row, in row order.
df3['artist_clean'] = list(map(artist_name_cleaner, df3['artists']))

# Missing titles become "untitled"; every title is then cast to str.
df3['title'] = df3['title'].fillna("untitled").astype(str)

df3['title_clean'] = list(map(title_cleaner, df3['title']))
# If cleaning leaves an empty string (not verified that it ever does),
# fall back to "untitled" as well.
df3['title_clean'] = df3['title_clean'].replace("", "untitled")


#=====================================================
# ASSIGNING RANDOMLY OBSERVATIONS TO TRAIN-VAL-TEST
#=====================================================
# Every row starts out unassigned; only the sampled rows get a split name.
df3['folder'] = "sorry, you're not part of the team"

# Images sampled per art movement for each split, and the seed shared by
# both sampling steps.
n_train, n_val, n_test = 1400, 400, 200
seed = 118

cats = df3.art_movement.unique().tolist()
failed_cats = []
for cat in cats:
    # Keep df3's own index labels (no reset) so the sampled rows can be
    # written back into df3 below.
    df_cat = df3[df3.art_movement == cat]

    # First carve out the train set plus a temporary hold-out, then split
    # the hold-out into validation and test.
    train, test_temp = train_test_split(
        df_cat, train_size=n_train, test_size=n_val + n_test,
        shuffle=True, random_state=seed)
    validation, test = train_test_split(
        test_temp, train_size=n_val, test_size=n_test,
        shuffle=True, random_state=seed)

    idx_train = train.index.tolist()
    idx_val = validation.index.tolist()
    idx_test = test.index.tolist()

    # split_checker is expected to return True when the three index lists
    # are fine (exact checks live in help_functions -- confirm there).
    if not split_checker(idx_train, idx_val, idx_test):
        # Previously a failed check was skipped silently and the success
        # message was printed anyway; remember the category instead.
        failed_cats.append(cat)
        continue

    df3.loc[idx_train, 'folder'] = "train"
    df3.loc[idx_val, 'folder'] = "validation"
    df3.loc[idx_test, 'folder'] = "test"

if failed_cats:
    print("Split check failed, rows left unassigned for:",
          ", ".join(map(str, failed_cats)))
else:
    print("Succesfully splitted!")


#=====================================================
# CREATING THE NAME OF THE IMAGE -HOW I WILL SAVE IT-
# AND DEALING WITH OVERLAPPING NAMES
#=====================================================
# Names are "<artist>-<title>", cut to 60 characters to keep paths short.
full_names = df3['artist_clean'] + "-" + df3['title_clean']
df3['image_name'] = [name[:60] for name in full_names]

# Every name that occurs more than once (all occurrences, not only the
# repeats) gets a running "_<n>" suffix. The affected row labels are
# collected up front, before any name is changed.
clashing = df3.index[df3.duplicated(subset="image_name", keep=False)].tolist()
for suffix, label in enumerate(clashing, start=1):
    df3.loc[label, 'image_name'] = df3.loc[label, 'image_name'] + "_" + str(suffix)

# Sanity check: after suffixing, all names should be distinct.
if not df3['image_name'].is_unique:
    print("Houston we got a problem")

# Go back to the starting directory and save the table there.
os.chdir(cwd)
df3.to_csv("images_to_scrape.csv")

Correctly scraped data are: 29255
Number of images after filter: 25102
We drop Arte Figurativa astratta, Arte Vittoriana, Arte astratta figurativa because they are less than 2000.
Succesfully splitted!
