<a href="https://colab.research.google.com/github/viralcmehta/artistsFromArt/blob/main/ArtistFromArt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Detect Artist from Art
The goal of this project is to identify the artist from a picture of an art piece.


# New Section

## Steps to take
1. Download the data from Kaggle
1. Understand the data and clean it up a bit
1. Perform some data augmentation
1. Write down the explicit parameters for my test, validation, and training set
1. Use transfer learning with a ResNet to get some initial predictions
1. Once it seems to work, then extract the model
1. Host it somewhere and access it from my phone
1. Show it off




In [1]:
#Setup imports

!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

from fastbook import *
from fastai.vision.widgets import *
from google.colab import files
from fastai.vision import *

!pip install -qq pandas==1.1.0
!pip install -qq ipywidgets

[K     |████████████████████████████████| 727kB 14.7MB/s 
[K     |████████████████████████████████| 204kB 31.2MB/s 
[K     |████████████████████████████████| 1.2MB 24.2MB/s 
[K     |████████████████████████████████| 51kB 7.3MB/s 
[K     |████████████████████████████████| 61kB 9.0MB/s 
[K     |████████████████████████████████| 61kB 8.2MB/s 
[?25hMounted at /content/gdrive
[K     |████████████████████████████████| 10.5MB 9.8MB/s 
[?25h

In [2]:
# Feature switches: each flag gates one or more later cells behind an `if`.
downloadDataset = False   # download the Kaggle archive into Google Drive
setupKaggle = False       # install the kaggle CLI and copy its config folder from Drive
retrainModel = False      # unzip the images, train and export; when False the exported model is loaded instead
downloadTestSet = False   # build a test set from Bing image search (only consulted when runTestSet is True)
cleanTestSet = False      # delete non-image / unverifiable files from the test set (only consulted when runTestSet is True)
runTestSet = False        # run the learner over the test set and review the misses
createUi = True           # build the upload-and-classify widget UI

#setup paths
# Local working directories.
path_base = Path("/content/datasets/artworks/")
path_img = path_base/ "images" / "images"      # per-artist folders are looked up here as path_img / fname
path_resized = path_base / "resized"           # deleted right after unzipping when retraining
path_base.mkdir(parents=True, exist_ok=True)   # make sure the local dataset root exists
path_test = path_base / "test"                 # local copy of the Drive test set

# Locations under the Google Drive folder.
path_gdrive = Path("/content/gdrive/MyDrive/")
path_g_datasets = path_gdrive / "datasets/"
path_g_artworks = path_g_datasets / "artworks/"
path_data_zip = path_g_artworks / "best-artworks-of-all-time.zip"   # the downloaded Kaggle archive
path_g_artworks_test = path_g_artworks / "test/"                    # test set kept on Drive

# Kaggle config: where it is read from (local) and where it is stored (Drive).
path_kaggle = Path("/root/.kaggle")
path_g_kaggle = path_g_datasets / ".kaggle"


In [3]:
import shutil
from distutils.dir_util import copy_tree


def _copy(self, dest, dir=False):
  """Copy this path to ``dest``; installed on ``Path`` as ``Path.copy`` below.

  Args:
    dest: Destination ``Path``. For a directory copy it is the directory that
      receives the contents of ``self``; it is created (with parents) when
      missing and merged into when it already exists. For a file copy it is
      passed straight to ``shutil.copy``.
    dir: ``True`` to copy a whole directory tree, ``False`` (default) to copy
      a single file. The name shadows the builtin but is kept because callers
      pass it by keyword.

  Raises:
    AssertionError: if ``self`` is not of the kind selected by ``dir``, or if
      ``dest`` exists but is not a directory during a directory copy.
  """
  if dir:
    assert self.is_dir(), f"{self} is not a directory"
    if dest.exists():
      assert dest.is_dir(), f"{dest} exists and is not a directory"

    # shutil.copytree(dirs_exist_ok=True) (Python 3.8+) replaces
    # distutils.dir_util.copy_tree. distutils remembers every directory it
    # has created in a process-wide cache, so copying a second time after the
    # destination was deleted (e.g. `rm -rf` followed by another copy in the
    # same kernel) fails; distutils is also gone from Python 3.12 onwards.
    shutil.copytree(str(self), str(dest.resolve()), dirs_exist_ok=True)

  else:
    assert self.is_file(), f"{self} is not a file"
    shutil.copy(self, dest)

# Expose the helper as a method on every Path object.
Path.copy = _copy

In [4]:
#Set kaggle up
# Installs the kaggle CLI and copies the Kaggle config folder stored on Drive
# (path_g_kaggle) to the local path_kaggle.
if setupKaggle:
  !pip install --upgrade -qq kaggle
  path_g_kaggle.copy(path_kaggle, dir=True)
  # 0o600 = read/write for the owner only.
  (path_kaggle / 'kaggle.json').chmod(0o600)

In [5]:
#Download the Kaggle dataset to gdrive (unzipping happens in the next cell)
if downloadDataset:
  # Make sure the Drive target folder exists before the CLI writes into it.
  path_g_artworks.mkdir(parents=True, exist_ok=True)
  !kaggle datasets download ikarus777/best-artworks-of-all-time -p $path_g_artworks


In [6]:
if retrainModel:
  #Unzip the dataset for use
  !unzip -o -q $path_data_zip  -d $path_base

  #fix issues with this one artist
  !mv $path_img"/Albrecht_DuтХа├кrer" $path_img"/Albrecht_Durer"
  !mv $path_img"/Albrecht_Du╠Иrer/*" $path_img"/Albrecht_Durer"
  !rm -rf $path_img"/Albrecht_Du╠Иrer

  #we don't need resized folder. get rid of it
  !rm -rf $path_resized

else:
  !unzip -j -q -o $path_data_zip 'artists.csv' -d $path_base 


In [7]:
import pandas as pd
# artists.csv is extracted into path_base by the unzip cell; `dfo` keeps the
# unmodified table.
dfo = pd.read_csv(path_base / "artists.csv")

In [8]:
# Artists with more than 200 paintings, most prolific first.
# Built as one chain so no column is ever assigned onto a boolean-filtered
# slice of `dfo` (the pattern that triggers pandas' SettingWithCopyWarning).
df = (
  dfo[["name", "paintings"]]
  .sort_values("paintings", ascending=False)
  .loc[lambda d: d["paintings"] > 200]
  # Use the ASCII spelling so the name lines up with the "Albrecht_Durer"
  # image folder.
  .assign(name=lambda d: d["name"].replace(['Albrecht Dürer'], 'Albrecht Durer'))
  # `fname` is the per-artist folder name: spaces become underscores.
  # regex=False because " " is a literal, not a pattern.
  .assign(fname=lambda d: d["name"].str.replace(" ", "_", regex=False))
)


In [9]:
# Show the filtered artist table (name, painting count, folder name).
df

Unnamed: 0,name,paintings,fname
8,Vincent van Gogh,877,Vincent_van_Gogh
30,Edgar Degas,702,Edgar_Degas
13,Pablo Picasso,439,Pablo_Picasso
15,Pierre-Auguste Renoir,336,Pierre-Auguste_Renoir
19,Albrecht Durer,328,Albrecht_Durer
46,Paul Gauguin,311,Paul_Gauguin
16,Francisco Goya,291,Francisco_Goya
31,Rembrandt,262,Rembrandt
20,Alfred Sisley,259,Alfred_Sisley
32,Titian,255,Titian


In [10]:
if retrainModel:
  #Data visualization
  # Open one known training image and display it as a thumbnail (256 is the
  # size argument given to to_thumb) as a quick sanity check.
  im = Image.open(path_img / "Vincent_van_Gogh/Vincent_van_Gogh_167.jpg")
  display(im.to_thumb(256))

In [11]:
if retrainModel:
  #Check if the image files are correct and clean
  fns = get_image_files(path_img)
  # `failed` holds whatever verify_images reports; each of those files is
  # deleted from disk.
  failed = verify_images(fns)
  failed.map(Path.unlink)

In [12]:
if retrainModel:
  # One folder per selected artist, named after the `fname` column.
  artistlist = [path_img / folder for folder in df["fname"]]
  # Image files found in each artist folder, still grouped per artist.
  flist = list(map(get_image_files, artistlist))
  # Every image path in a single flat list, in artist order.
  flist_flat = [img_path for artist_files in flist for img_path in artist_files]
  

In [13]:
if retrainModel:
  # DataBlock describing how image paths become (image, artist) samples.
  art = DataBlock(
    blocks = (ImageBlock, CategoryBlock),
    # 20% of the items go to validation; fixed seed keeps the split repeatable.
    splitter=RandomSplitter(valid_pct=0.2, seed=353),
    # The label is the name of the folder containing the image.
    get_y = parent_label,
    item_tfms=RandomResizedCrop(256, min_scale=0.8), batch_tfms=aug_transforms(mult=2))
  


In [14]:
if retrainModel:
  # Build the dataloaders from the flat list of image paths.
  dls = art.dataloaders(flist_flat)
  # Preview the training batch (8 images in 2 rows, unique=True) to eyeball
  # the augmentations.
  dls.train.show_batch(max_n=8, nrows=2, unique=True)

In [15]:
if retrainModel:
  # resnet50-based learner, tracking error rate on the validation set.
  learn = cnn_learner(dls, resnet50, metrics=error_rate)
  # 20 is the epoch count passed to fine_tune.
  learn.fine_tune(20)

In [16]:
if retrainModel:
  # Export the trained learner, then copy it to Drive as export2.pkl - the
  # file the later cells load via path_g_artworks / "export2.pkl".
  # NOTE(review): the cp assumes export() wrote export.pkl into the current
  # working directory - confirm.
  learn.export()
  !cp export.pkl /content/gdrive/MyDrive/datasets/artworks/export2.pkl

In [17]:
if retrainModel:
  # Build the interpretation object from the trained learner and plot the
  # confusion matrix; `interp` is reused by the next cell.
  interp = ClassificationInterpretation.from_learner(learn)
  interp.plot_confusion_matrix()

In [18]:
if retrainModel:
  # Plot the 9 highest-loss items (requires `interp` from the previous cell).
  interp.plot_top_losses(9, nrows=9)

In [19]:
if not retrainModel:
  # No training in this run: load the previously exported learner from Drive.
  path_model = path_g_artworks / "export2.pkl"
  learn = load_learner(path_model)

In [20]:
if runTestSet:
  #Create a test set from bing image search
  if downloadTestSet:
    # The Bing API key is read from a text file on Drive.
    # NOTE(review): the key is used exactly as read, including any trailing
    # newline in the file - confirm the file has none.
    keyFile = (path_g_datasets / "BingKey.txt").open()
    BingKey = keyFile.read()
    keyFile.close()

    artist_search_terms = df["name"].tolist()
    artist_fnames = df["fname"].tolist()

    #path_testset = Path(path_g_artworks_test)

    # One search per artist; results are downloaded into a folder named after
    # the artist's `fname`, so the folder name can serve as the label later.
    # NOTE(review): `results` is reused for a different kind of value (a list
    # of prediction tuples) by the test-run cell.
    for search, fname in zip(artist_search_terms, artist_fnames):
      results = search_images_bing(BingKey, f'{search} original paintings', max_images=30)
      dest = path_g_artworks_test / fname
      dest.mkdir(exist_ok=True, parents=True)
      download_images(f'{dest}/', urls=results.attrgot('contentUrl'))

  if downloadTestSet or cleanTestSet:
    #get rid of all files that are not openable images
    import glob
    import numpy as np
    # Paths that get_image_files recognises as images, as strings.
    imgs = get_image_files(path_g_artworks_test)
    imgs = [str(img) for img in imgs]

    # Everything matched two levels below the test root.
    # NOTE(review): glob is called without recursive=True, so "**" matches
    # like "*" here - deeper nesting is not listed.
    allf = glob.glob(f'{path_g_artworks_test}/**/*')
    allf.sort()

    # Entries on disk that are not in the image list.
    failed = list(np.setdiff1d(allf, imgs, assume_unique=True))
    failed = L([Path(fn) for fn in failed])
    print(failed)

    def remFile(pth):
      """Print and delete ``pth``; return whether it still exists afterwards."""
      print(f'Deleting {pth}')
      pth.unlink()
      return pth.exists()

    #Check if the image files are correct, clean, and openable
    fns = get_image_files(path_g_artworks_test)
    failed.extend(verify_images(fns))
    failed.map(remFile)

  # Replace the local test folder with a fresh copy of the Drive test set.
  !rm -rf $path_test
  #!cp -r $path_g_artworks_test $path_base
  path_g_artworks_test.copy(path_test, dir=True)



In [21]:
if runTestSet:
  from tqdm.notebook import tqdm
  imgs = get_image_files(path_test)

  # One tuple per test image:
  # (label taken from the parent folder, *learn.predict output, image path).
  results = []
  for img in tqdm(imgs, total=len(imgs)):
    res = learn.predict(img)
    results.append((parent_label(img), *res, str(img)))



In [22]:
if runTestSet:
  # Collect every wrong prediction as (running miss index, actual, predicted, path).
  missed = []
  i = 0
  for res in results:
    actual, predicted, img_path = res[0], res[1], res[-1]
    if actual == predicted:
      # Correct prediction - nothing to review.
      continue
    missed.append((i, actual, predicted, img_path))
    i += 1



In [23]:
if runTestSet:
  #Lets analyze the missed
  print(f"{len(missed)}/{len(results)} : {100*len(missed)/len(results)}% failure")

  import matplotlib.pyplot as plt
  %matplotlib inline
  plt.rcParams.update({'figure.max_open_warning': 0})

  missed_imgs = [(i, act, pred, pth, Image.open(pth)) for i, act, pred, pth in missed]

  for i, act, pred, pth, img in missed_imgs:
    #print(f"{i}, {act}, {pred}, {imgpth}")
    #plt.subplot(45, 2, i+1)
    plt.figure()
    plt.imshow(img)
    plt.title(f'{act}/{pred}/{Path(pth).name}')


In [57]:
if createUi:
  import ipywidgets as widgets
  from PIL import Image

  # Upload button restricted to a single image file.
  fu = widgets.FileUpload(description="Gimme an image", accept='image/*', multiple=False)

  # Grey placeholder shown until something is uploaded; `img` holds the image
  # that will be classified and starts out as a placeholder too.
  defaultImg = Image.new('RGB', size = (256, 256), color = (153,153,153))
  img = Image.new('RGB', size = (256, 256), color = (153,153,255))
  img_widget = widgets.Image(value = defaultImg.to_bytes_format(), format = 'png', width=256, height=256)

  # Status / result line; created before the callbacks that write to it.
  lbl_pred = widgets.Label(value="Upload a file.")

  def onFileUpload(value):
    """Store the uploaded image in `img` and show it in the preview widget."""
    global img
    if not fu.data:
      # The widget value changed but holds no file: nothing to show.
      return
    img = PILImage.create(fu.data[0])
    img_widget.value = fu.data[0]
    lbl_pred.value="Click classify for answers."

  fu.observe(onFileUpload, names='value')

  # Load the exported learner here as well so this cell works on its own.
  path_model = path_g_artworks / "export2.pkl"
  learn = load_learner(path_model)

  def onClassifyClick(_):
    """Predict the artist for the uploaded image and write it to the label."""
    if not fu.data:
      # Nothing uploaded yet: do not run the model on the placeholder image.
      lbl_pred.value = "Upload a file."
      return
    pred,pred_idx,probs = learn.predict(img)
    lbl_pred.value = f'Prediction: {pred}; Probability: {probs[pred_idx]*100:.0f}%'

  classify = widgets.Button(description = 'Classify Image')
  classify.on_click(onClassifyClick)


In [58]:
if createUi:
  # Layout: upload and classify buttons side by side, then the image preview,
  # then the status/result label.
  topBox = widgets.HBox([fu, classify])
  display(widgets.VBox([topBox, img_widget, lbl_pred]))


VBox(children=(HBox(children=(FileUpload(value={}, accept='image/*', description='Gimme an image'), Button(des…