In [84]:
from bs4 import BeautifulSoup
import os
import requests
import pandas as pd
import csv

import cv2
import numpy as np

In [None]:
### PARTS FOR SCRAPING DATA

In [None]:
## Configuration: output directory and the URL templates used by the scrapers.

# Root folder (relative to the working directory) for everything downloaded.
DATA_DIR = 'data/'

## Henri Matisse: gallery section pages and the base for image paths.
HENRI_ARTIST_URL = 'http://www.henri-matisse.net/paintingssection{section_num}.html'
HENRI_PAINTING_URL = 'http://www.henri-matisse.net/{painting_source}'

## Pablo Picasso: listing pages and the base for image paths.
PABLO_URL = 'https://www.pablo-ruiz-picasso.net/{where_to_scrape}.php'
PABLO_PAINTING_URL = 'https://www.pablo-ruiz-picasso.net/{painting_source}'

## Create the data directory if needed. exist_ok avoids the check-then-create
## race and keeps the cell safe to re-run.
os.makedirs(DATA_DIR, exist_ok=True)

In [None]:
# Metadata tables filled in as a side effect of the scraping functions.
# Key: image src path with '/' replaced by '-'; value: (name, year) tuple.
NAME_YEAR_HENRI_DIC = dict()
NAME_YEAR_PABLO_DIC = dict()

In [None]:
# Collect the thumbnail image paths (and name/year metadata) for one Matisse section.
def scrape_arts_henri(sec_num, min_year=1900, max_year=1942):
    """Scrape one henri-matisse.net gallery section for paintings in a year range.

    Parameters
    ----------
    sec_num : str
        Section suffix substituted into HENRI_ARTIST_URL (e.g. 'one').
    min_year, max_year : int, optional
        Inclusive year window; defaults keep the original 1900-1942 filter.

    Returns
    -------
    list of str
        The ``src`` attribute of every thumbnail whose year is in range.

    Side effects
    ------------
    Stores ``(name, year)`` in NAME_YEAR_HENRI_DIC, keyed by the ``src`` with
    '/' replaced by '-'.

    Raises
    ------
    requests.exceptions.HTTPError
        If the section page responds with an error status.
    """
    url_query = HENRI_ARTIST_URL.format(section_num=sec_num)
    # A timeout keeps an unresponsive server from hanging the notebook forever.
    artist_page = requests.get(url_query, timeout=30)

    # check for request error
    try:
        artist_page.raise_for_status()
    except requests.exceptions.HTTPError:
        print("Error trying to retrieve {}".format(artist_page.url))
        raise

    soup = BeautifulSoup(artist_page.text, 'lxml')

    painting_paths = []
    for thumb in soup.find_all('div', {'class': 'thmbnlspaintingselectbis'}):
        caption = thumb.find('p')
        if caption is not None:
            lines = caption.getText().split('\n')
            name = lines[0].strip()
        else:
            # No <p> caption: fall back to the div's own text so that `name`
            # is always bound for this thumbnail (never stale or undefined).
            lines = thumb.getText().split('\n')
            name = next((ln.strip() for ln in lines if ln.strip()), '')

        # The last caption line holds the year; for a range such as
        # "1905-1906" only the final year is used.
        year_text = lines[-1].strip().split('-')[-1]
        if len(year_text) != 4:
            continue
        try:
            year = int(year_text)
        except ValueError:
            # Not a number: skip instead of comparing a string with ints.
            continue
        if not (min_year <= year <= max_year):
            continue

        img = thumb.find('img')
        url_append = img.get('src') if img is not None else None
        if not url_append:
            # Nothing to download for this thumbnail.
            continue

        dict_name = url_append.replace('/', '-')
        NAME_YEAR_HENRI_DIC[dict_name] = (name, year)
        painting_paths.append(url_append)
    return painting_paths

In [None]:
# Scrape the three gallery sections and flatten the results into one list of paths.
HENRI_ARTS = [
    painting_path
    for section in ['one', 'two', 'three']
    for painting_path in scrape_arts_henri(section)
]
HENRI_ARTS

In [None]:
# Collect the image paths (and name/year metadata) from one Picasso listing page.
def scrape_arts_pablo(here_to_scrape, min_year=1900, max_year=1942):
    """Scrape one pablo-ruiz-picasso.net listing page for works in a year range.

    Parameters
    ----------
    here_to_scrape : str
        Page name substituted into PABLO_URL (e.g. 'topviews').
    min_year, max_year : int, optional
        Inclusive year window; defaults keep the original 1900-1942 filter.

    Returns
    -------
    list of str
        The ``src`` attribute of every image whose year is in range.

    Side effects
    ------------
    Prints the page URL and stores ``(name, year)`` in NAME_YEAR_PABLO_DIC,
    keyed by the ``src`` with '/' replaced by '-'.

    Raises
    ------
    requests.exceptions.HTTPError
        If the listing page responds with an error status.
    AttributeError
        If the page has no ``<div id="main">`` container.
    """
    PABLO_WEB = PABLO_URL.format(where_to_scrape=here_to_scrape)
    print(PABLO_WEB)
    # A timeout keeps an unresponsive server from hanging the notebook forever.
    artist_page = requests.get(PABLO_WEB, timeout=30)
    try:
        artist_page.raise_for_status()
    except requests.exceptions.HTTPError:
        print("Error trying to retrieve {}".format(artist_page.url))
        raise

    soup = BeautifulSoup(artist_page.text, 'lxml')

    painting_paths = []
    # The attribute filter is called with None for divs that have no style
    # attribute, so guard before testing for 'width'.
    pics = soup.find('div', {'id': 'main'}).find_all(
        'div', style=lambda value: value is not None and 'width' in value
    )

    for pic in pics:
        anchors = pic.find_all('a')
        img = pic.find('img')
        if not anchors or img is None:
            # Styled div that is not a painting entry.
            continue
        name = anchors[-1].text

        # The text after the last comma holds the year; for a range such as
        # "1901-1902" only the final year is used.
        year_text = pic.text.split(',')[-1].strip().split('-')[-1]
        try:
            year = int(year_text)
        except ValueError:
            # Caption does not end in a year: skip rather than abort the scrape.
            continue
        if not (min_year <= year <= max_year):
            continue

        url_append = img.get('src')
        if not url_append:
            continue
        dict_name = url_append.replace('/', '-')
        NAME_YEAR_PABLO_DIC[dict_name] = (name, year)
        painting_paths.append(url_append)
    return painting_paths

In [None]:
# Scrape the three listing pages in order and chain their image paths into one list.
PABLO_ARTS = [
    painting_path
    for page_name in ('topviews', 'topshared', 'topexpensive')
    for painting_path in scrape_arts_pablo(page_name)
]
PABLO_ARTS

In [None]:
# Inspect the (name, year) metadata recorded by scrape_arts_pablo.
NAME_YEAR_PABLO_DIC

In [None]:
def download_and_save(artist_name):
    """Download every scraped painting image for one artist into DATA_DIR.

    Parameters
    ----------
    artist_name : str
        'henri-matisse' selects HENRI_ARTS / HENRI_PAINTING_URL; any other
        value selects PABLO_ARTS / PABLO_PAINTING_URL. The value is also used
        as the sub-folder name under DATA_DIR.

    Side effects
    ------------
    Creates ``<cwd>/<DATA_DIR>/<artist_name>`` if needed, displays that path,
    and writes one file per image, named after the image path with '/'
    replaced by '-'. Files that already exist are not downloaded again.
    Failed downloads are reported and skipped; nothing is written for them.
    """
    if artist_name == 'henri-matisse':
        painting_urls = HENRI_ARTS
        PAINTING_URL = HENRI_PAINTING_URL
    else:
        painting_urls = PABLO_ARTS
        PAINTING_URL = PABLO_PAINTING_URL

    IMAGE_DIR = os.path.join(os.getcwd(), DATA_DIR, artist_name)
    os.makedirs(IMAGE_DIR, exist_ok=True)
    display(IMAGE_DIR)

    for url in painting_urls:
        outfile = os.path.join(IMAGE_DIR, url.replace('/', '-'))
        if os.path.exists(outfile):
            continue

        download_url = PAINTING_URL.format(painting_source=url)
        print("downloading: {}".format(url))
        try:
            r_painting_page = requests.get(download_url, timeout=30)
            # Check the status BEFORE writing: otherwise an HTTP error page is
            # saved as the "image", and the exists-check above would stop it
            # from ever being fetched again.
            r_painting_page.raise_for_status()
        except requests.exceptions.RequestException as e:
            print("Error trying to retrieve {}: {}".format(download_url, e))
            continue

        with open(outfile, 'wb') as f:
            f.write(r_painting_page.content)

In [None]:
# Download the Matisse images listed in HENRI_ARTS.
download_and_save('henri-matisse')

In [None]:
# Download the Picasso images listed in PABLO_ARTS.
download_and_save('picasso')

In [None]:
### PARTS FOR CALCULATING RMS and Entropy of images:

Calculating RMS contrast (the root mean square of the deviation of normalized pixel brightness from its mean). References: https://forum.processing.org/one/topic/calculate-image-contrast-using-root-mean-square-rms.html#25080000001971367 and https://github.com/jeffThompson/ProcessingTeachingSketches/blob/master/ImageProcessingAndOpenCV/MeasureImageBrightnessAndContrast/MeasureImageBrightnessAndContrast.pde

In [122]:
def RMS_CONTRAST(imagefile):
    """Return the RMS contrast of an image file.

    Each pixel's brightness is the weighted sum
    ``0.2126*R + 0.7152*G + 0.0722*B`` scaled to [0, 1]; the result is the
    root mean square of the deviation of brightness from its mean over all
    pixels.

    Parameters
    ----------
    imagefile : str
        Path of the image to load with ``cv2.imread``.

    Returns
    -------
    float
        RMS contrast; 0 for a uniform image.

    Raises
    ------
    IOError
        If ``cv2.imread`` cannot load the file (it returns None).

    Side effects
    ------------
    Prints ``imagefile`` as a progress trace.
    """
    print(imagefile)
    img = cv2.imread(imagefile)
    if img is None:
        # cv2.imread signals a missing/unreadable file with None, not an exception.
        raise IOError("could not read image file: {}".format(imagefile))

    # Flatten to one row per pixel; float64 avoids any uint8 arithmetic.
    pixels = img.reshape(-1, 3).astype(np.float64)

    # cv2.imread returns channels in B, G, R order, so the weights are listed
    # blue-first to line up with the pixel layout.
    bgr_weights = np.array([0.0722, 0.7152, 0.2126])
    brightness = pixels.dot(bgr_weights) / 255  # normalized to [0, 1]

    deviation = brightness - brightness.mean()
    return np.sqrt(np.mean(deviation ** 2))

In [124]:
# Folder holding the downloaded Picasso images.
image_folder = os.path.join(os.getcwd(), 'data', 'picasso')
# Read relative to the working directory instead of via '../<repo-name>/',
# which only resolves when the checkout directory keeps that exact name.
picasso_women = pd.read_csv(os.path.join('data', 'picasso_women.csv'))
# 'Picturesource' is joined onto image_folder, so it is expected to hold the
# image file name; map each one to its RMS contrast.
picasso_women['RMS_contrast'] = picasso_women['Picturesource'].map(
    lambda source: RMS_CONTRAST(os.path.join(image_folder, source))
)

/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-3437_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-56_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-1513_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-3570_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-1459_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-151_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-1885_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-3040_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-102_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-128_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-1958

/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-911_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-2190_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-1722_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-1526_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-3010_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-2702_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-1483_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-2449_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-2310_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-1706_s.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso/images-works-

In [125]:
# Folder holding the downloaded Matisse images.
image_folder = os.path.join(os.getcwd(), 'data', 'henri-matisse')
# Read relative to the working directory instead of via '../<repo-name>/',
# which only resolves when the checkout directory keeps that exact name.
matisse_women = pd.read_csv(os.path.join('data', 'matisse_women.csv'))
# 'Picturesource' is joined onto image_folder, so it is expected to hold the
# image file name; map each one to its RMS contrast.
matisse_women['RMS_contrast'] = matisse_women['Picturesource'].map(
    lambda source: RMS_CONTRAST(os.path.join(image_folder, source))
)

/Users/michelle/Documents/dsc160-midterm-404-not-found/data/henri-matisse/paintings-thmbnls150-bb.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/henri-matisse/paintings-thmbnls150-ba.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/henri-matisse/paintings-thmbnls150-bd.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/henri-matisse/paintings-thmbnls150-bc.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/henri-matisse/paintings-thmbnls150-cs.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/henri-matisse/paintings-thmbnls150-dr.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/henri-matisse/paintings-thmbnls150-zd.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/henri-matisse/paintings-thmbnls150-hma308.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/henri-matisse/paintings-thmbnls150-by.jpg
/Users/michelle/Documents/dsc160-midterm-404-not-found/data/henri-matisse/pain

calculate entropy