## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from bs4 import BeautifulSoup
import requests
import urllib.request
from urllib.parse import urlparse
import os
import nltk
import spacy
import locationtagger
from colorthief import ColorThief
from geopy import geocoders, Nominatim
import json
import colorsys

In [2]:
# Nominatim geocoder client. In this notebook it is referenced only by the
# commented-out geocoding steps (the `gn.geocode(...)` calls).
# NOTE(review): "Your_Name" looks like a placeholder user agent - set a real
# identifier before re-enabling those steps.
gn = Nominatim(user_agent="Your_Name")

## Web scraping
Retrieve the latest website data

In [3]:
# Read every table on the Wikipedia list page, keep tables 1 through 6 and
# stack them into one dataframe with a fresh 0..n-1 index
url1 = 'http://en.wikipedia.org/wiki/List_of_works_by_Vincent_van_Gogh'
tables = pd.read_html(url1)

# Columns that are dropped from each table before stacking
unused_columns = ['#', 'Image', 'Medium,Dimensions', 'Catalogue No.']
phases = [phase.drop(columns=unused_columns) for phase in tables[1:7]]

vgph = pd.concat(phases, ignore_index=True)

# Set workbook: later cells add columns to this copy, `vgph` stays as scraped
vgph1 = vgph.copy()

## Pre-processing
Map the dates to seasons and extract the cities in which the works are currently located

In [4]:
# Establish patterns to search for in pre-processing
#    Month data: the twelve month names followed by the four season names
months_mapping = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December',
    'Spring', 'Summer', 'Autumn', 'Winter',
]
# Regex alternation that matches any one of the names above
month_pattern = '|'.join(months_mapping)

#    Ownership data
# Note: regex alternation tries the alternatives left to right, so 'Private'
# wins over the longer 'Private Collection(s)' entries that follow it.
owner_pattern = [
    'Private',
    'Private Collection',
    'Private Collections',
    'Unknown',
    'Stolen',
    'Location',
]
ownership_pattern = '|'.join(owner_pattern)

In [5]:
# Helper functions to map 'Date' to seasons
def seasonal_search(search_str:str, search_list:str):
    """Return the first part of ``search_str`` that matches ``search_list``.

    ``search_list`` is handed to ``re.search`` as the pattern, so it is a
    regular expression (for example names joined with ``|``).

    Returns the matched text, or an empty string when nothing matches.
    """
    found = re.search(search_list, search_str)
    return found.group(0) if found else ''

def season_map(month):
    """Translate a month name or season name into its season.

    Returns 'Spring', 'Summer', 'Autumn' or 'Winter'. Any other value,
    including the empty string, yields None.
    """
    season_members = (
        ('Spring', ('March', 'April', 'May', 'Spring')),
        ('Summer', ('June', 'July', 'August', 'Summer')),
        ('Autumn', ('September', 'October', 'November', 'Autumn')),
        ('Winter', ('December', 'January', 'February', 'Winter')),
    )
    for season, members in season_members:
        if month in members:
            return season
    return None
    
# Helper function to map 'Current location' to an ownership pattern
def ownership_search(search_str:str, search_list:str):
    """Return the ownership keyword found in ``search_str``.

    ``search_list`` is handed to ``re.search`` as the pattern. The matched
    text is returned; when the pattern does not match at all the result is
    the fallback 'Museum'.
    """
    found = re.search(search_list, search_str)
    if found is None:
        return 'Museum'
    return found.group(0)

# Helper function to retrieve the stripped citynames
def loc_map(loc):
    """Replace the tags 'Villa' and 'Clark' by a city name.

    'Villa' becomes 'Winterthur' and 'Clark' becomes 'Williamstown'; every
    other value is returned unchanged.
    """
    replacements = (('Villa', 'Winterthur'), ('Clark', 'Williamstown'))
    for tag, city in replacements:
        if loc == tag:
            return city
    return loc

In [6]:
# Pre-processing
#     Correct for date/time information
#     'Year': strip every non-digit from 'Date'; when the remaining digits start
#     with '18' keep the first four, otherwise keep the last four.
vgph1['Year'] = (
    vgph1['Date']
    .str.replace(r'\D', '', regex=True)
    .map(str)
    .apply(lambda val: val[:4] if val[:2] == '18' else val[-4:])
)

#     'Month': strip digits and non-word characters from 'Date', then keep the
#     first month/season name found (empty string when there is none).
#     The patterns are raw strings: '\W' in a plain string literal is an invalid
#     escape sequence and triggers a warning on current Python versions.
vgph1['Month'] = (
    vgph1['Date']
    .str.replace(r'\d+', '', regex=True)
    .str.replace(r'\W+', '', regex=True)
    .apply(lambda x: seasonal_search(search_str=x, search_list=month_pattern))
)

#     Draw 'Season' from 'Month' (None when 'Month' is empty)
vgph1['Season'] = vgph1['Month'].apply(season_map)

#    Draw ownership from 'Current Location' ('Museum' when no keyword matches)
vgph1['Ownership'] = vgph1['Current location'].apply(
    lambda x: ownership_search(search_str=x, search_list=ownership_pattern)
)

#     Draw 'Place_Name' from 'Current Location' (time-consuming: run only once and store in separate .csv)
# df_temp = pd.DataFrame()
# df_temp2 = pd.DataFrame()
# df_temp['Extracted location'] = vgph1['Current location'].apply(lambda x: locationtagger.find_locations(text = x))
# df_temp['Tagged city'] = df_temp['Extracted location'].apply(lambda x: x.cities).astype(str)
# df_temp['Tagged city'] = df_temp['Tagged city'].str.replace('[','', regex=True).replace(']','', regex=True).replace('Van','', regex=True).replace("'", '', regex=True)
# df_temp2 = df_temp['Tagged city'].str.split(',', expand=True)
# vgph1['Current city'] = df_temp2[0].apply(lambda x: loc_map(x))

#    Retrieve latitude and longitude from city names (time-consuming: run only once and store in separate .csv)
# latitude, longitude = [], []
# for loc in vgph1['Current city']:
#     if (loc==''):
#         latitude.append('')
#         longitude.append('')
#     else:
#         location = gn.geocode(loc)
#         latitude.append(location.latitude)
#         longitude.append(location.longitude)

# vgph1['Current x'] = latitude
# vgph1['Current y'] = longitude

## Image scraping
Scrape images and information from the website and run the color classification

In [7]:
# Obtain the title and url of the images with BeautifulSoup
img_titles = []
img_urls = []

# A timeout keeps the cell from hanging forever, and raise_for_status stops the
# run on an HTTP error instead of parsing an error page.
r = requests.get('http://en.wikipedia.org/wiki/List_of_works_by_Vincent_van_Gogh', timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

for item in soup.find_all('img'):
    src = item.get('src')
    if src is None:
        # Nothing to download for this tag (item['src'] would raise KeyError)
        continue
    # A missing alt text becomes '' so that titles and urls stay aligned
    img_titles.append(item.get('alt', ''))
    img_urls.append('https:' + src)

# Add the file name and url to the dataframe
# Omit the first img featured (Vincent van Gogh portrait)
# Rows and images are matched purely by position, so make sure there are enough images
n_works = len(vgph1)
if len(img_urls) < n_works + 1:
    raise ValueError(
        "Expected at least " + str(n_works + 1) + " images on the page, found " + str(len(img_urls))
    )
vgph1['.jpg name'] = img_titles[1:n_works + 1]
vgph1['.jpg url'] = img_urls[1:n_works + 1]

In [8]:
# Download images and store in local folder (time-consuming: run only once and store in separate .csv)

# def imagedown(df, url, folder):
    
#     try:
#         os.mkdir(os.path.join(os.getcwd(), folder))
#     except:
#         pass
#     os.chdir(os.path.join(os.getcwd(), folder))
    
#     r = requests.get(url)
#     soup = BeautifulSoup(r.text, 'html.parser')
#     images = soup.find_all('img')
#     images_lic = images[1:len(df)+1]
    
#     for i, image in enumerate(images_lic):
#         name = image['alt']
#         link = 'https:' + image['src']
#         index = 'vg{:0>3}_'.format(i)
#         with open(index + name.replace('jpeg', '').replace('JPG', '').replace('jpg', '').replace('"', '').replace('?', '').replace(':', '-').replace('*', '').replace('<', '').replace('>', '').replace('|', '-').replace('/', '-').replace('(', '-').replace(')', '-').replace(' ', '-') + 'jpg', 'wb') as f:
#             im = requests.get(link, headers={'Host': 'upload.wikimedia.org'})
#             f.write(im.content)

# imagedown(vgph1, 'http://en.wikipedia.org/wiki/List_of_works_by_Vincent_van_Gogh', 'van_gogh_paintings')

In [9]:
# Helper function to retrieve and save dominant colors and color palette from the paintings
def colorscrape(df, directory):
    """Add dominant-color and palette columns to ``df`` from the images in ``directory``.

    Every regular file in ``directory`` is opened with ``ColorThief``; its
    dominant color (``quality=3``) and a palette requested with
    ``color_count=5`` are collected. The results are written to the columns
    'R', 'G', 'B', 'Dominant Color' and 'Color Palette' of ``df``.

    Parameters
    ----------
    df : object supporting ``df[column] = list``
        Modified in place. Values are assigned by position, so the i-th file
        (in sorted name order) must belong to the i-th row.
    directory : str
        Folder that holds the image files; sub-directories are skipped.

    Returns
    -------
    None
    """
    dominant_rgb, palette = [], []

    # os.listdir returns names in arbitrary order. Sorting makes the result
    # deterministic, so file i lines up with row i when the file names sort
    # in row order (e.g. names starting with a zero-padded index).
    for filename in sorted(os.listdir(directory)):
        f = os.path.join(directory, filename)
        if not os.path.isfile(f):
            continue
        color_thief = ColorThief(f)
        dominant_rgb.append(color_thief.get_color(quality=3))
        palette.append(color_thief.get_palette(color_count=5))

    df['R'] = [rgb[0] for rgb in dominant_rgb]
    df['G'] = [rgb[1] for rgb in dominant_rgb]
    df['B'] = [rgb[2] for rgb in dominant_rgb]
    df['Dominant Color'] = dominant_rgb
    df['Color Palette'] = palette

In [10]:
# Retrieve colors and color palette from local folder (time-consuming: run only once and store in separate .csv)
#colorscrape(vgph1, r'C:\Users\s164386\VolVis\epds\life_in_color\van_gogh_paintings')

In [11]:
# Separate the time-consuming code for a faster website (last update: 14/01/2022)

#vg_place_color = vgph2[['Current city', 'Current x', 'Current y', '.jpg name', '.jpg url', 'Dominant Color', 'Color Palette', 'R', 'G', 'B', 'HLS', 'H', 'L', 'S']]
#vg_place_color.to_csv(r'C:\Users\s164386\VolVis\epds\life_in_color\datasets\vg_place_color.csv', index = False)

## Dataset configuration
Join live data with pre-classified data (city, color) <br>

In [17]:
# Load local data and check for changes
vg_place_color = pd.read_csv('https://raw.githubusercontent.com/wieswies/lifeincolor/main/data/vg_place_color.csv')

# Compare the size of the pre-classified data with the freshly scraped table
n_classified, n_live = len(vg_place_color), len(vgph1)
if n_classified != n_live:
    print("The WikiPedia table might have been manipulated.")
    if n_classified < n_live:
        print("It seems that some of his paintings have been added to the Wikipedia page.")
    else:
        print("It seems that some of his paintings have been removed from the Wikipedia page.")
print("The number of artworks that were downloaded and colorcoded: " + str(n_classified))
print("The number of artworks that are currently featured on the WikiPedia page: " + str(n_live))

# Configure dataset with an inner join to ensure correctness of colormapping and live-data
# (merge defaults: inner join on the columns both frames have in common)
vgph2 = vgph1.merge(vg_place_color)

# Back-up csv
vgph2.to_csv(r'C:\Users\s164386\VolVis\epds\lifeincolor\data\vg_data.csv', index = False)

The WikiPedia table might have been manipulated.
It seems that some of his paintings have been removed from the Wikipedia page.
The number of artworks that were downloaded and colorcoded: 870
The number of artworks that are currently featured on the WikiPedia page: 868


## Workcount statistics
Retrieve relevant statistics about the artist's life

In [22]:
# Extract from how many works the season can be determined
# (rows without a 'Season' value are dropped)
vgph_temp = vgph1.dropna(subset=['Season'])
print("From ", len(vgph_temp), " out of ", len(vgph1), " artworks, the season in which they were made is known.", sep="")

From 635 out of 868 artworks, the season in which they were made is known.


In [23]:
# Prepare .csv for d3 stacked barchart about seasonality
# Number of works per ('Created in', 'Season') combination
vgph_temp2 = (
    vgph_temp.groupby(['Created in', 'Season'], dropna=False)
    .size()
    .reset_index(name='Count')
)

# One output row per distinct 'Created in' value, in order of first appearance
vgph_barchart = pd.DataFrame({'place': list(vgph1['Created in'].unique())})

n_places = len(vgph_barchart)
autumn, winter, spring, summer = (np.zeros(n_places) for _ in range(4))
season_arrays = {"Autumn": autumn, "Winter": winter, "Spring": spring, "Summer": summer}

for index, place in enumerate(vgph_barchart['place']):
    # The == comparison never matches a missing place, so such a row keeps its zeros
    rows_for_place = vgph_temp2[vgph_temp2['Created in'] == place]
    for season, count in zip(rows_for_place['Season'], rows_for_place['Count']):
        if season in season_arrays:
            season_arrays[season][index] = count

vgph_barchart['autumn'] = autumn
vgph_barchart['winter'] = winter
vgph_barchart['spring'] = spring
vgph_barchart['summer'] = summer
vgph_barchart.to_csv(r'C:\Users\s164386\VolVis\epds\lifeincolor\data\statistics\vgph_barchart.csv', index = False)

In [30]:
# Prepare .csv for d3 barchart about yearly productivity
# One row per 'Year' value (missing years kept as their own group) with the number of works
works_per_year = vgph2.groupby(['Year'], dropna=False).size()
vgph_temp3 = works_per_year.reset_index(name='count')
vgph_temp3.to_csv(r'C:\Users\s164386\VolVis\epds\lifeincolor\data\statistics\barchart_year.csv', index = False)

## Color gradients

In [25]:
# Create the array to draw the gradient - group colors by 'Created in'
# One list of "rgb(...)" strings per colormap, filled in dataframe row order
colormap_thehague = []
colormap_amsterdam = []
colormap_drenthe = []
colormap_nuenen = []
colormap_antwerp = []
colormap_paris = []
colormap_arles = []
colormap_saintremy = []
colormap_auverssuroise = []

# 'Created in' value -> list that receives the row's dominant color.
# Rows with any other 'Created in' value are skipped.
# NOTE(review): 'Nieuw-Amsterdam' feeds the Amsterdam colormap - confirm this
# is intended rather than the Drenthe colormap.
colormap_by_place = {
    "The Hague": colormap_thehague,
    "Scheveningen": colormap_thehague,
    "Amsterdam": colormap_amsterdam,
    "Nieuw-Amsterdam": colormap_amsterdam,
    "Drenthe": colormap_drenthe,
    "Nuenen": colormap_nuenen,
    "Antwerp": colormap_antwerp,
    "Paris": colormap_paris,
    "Arles": colormap_arles,
    "Saint-Rémy": colormap_saintremy,
    "Auvers-sur-Oise": colormap_auverssuroise,
}

# Store the colors in grouped arrays
for place, dominant_color in zip(vgph2['Created in'], vgph2['Dominant Color']):
    target = colormap_by_place.get(place)
    # Compare against None: an empty list is falsy but still a valid target
    if target is not None:
        target.append("rgb" + dominant_color)

# Write the .txt files to feature them in .css
colormap_files = [
    (r'C:\Users\s164386\VolVis\epds\lifeincolor\data\colormaps\cm_thehague.txt', colormap_thehague),
    (r'C:\Users\s164386\VolVis\epds\lifeincolor\data\colormaps\cm_amsterdam.txt', colormap_amsterdam),
    (r'C:\Users\s164386\VolVis\epds\lifeincolor\data\colormaps\cm_drenthe.txt', colormap_drenthe),
    (r'C:\Users\s164386\VolVis\epds\lifeincolor\data\colormaps\cm_nuenen.txt', colormap_nuenen),
    (r'C:\Users\s164386\VolVis\epds\lifeincolor\data\colormaps\cm_antwerp.txt', colormap_antwerp),
    (r'C:\Users\s164386\VolVis\epds\lifeincolor\data\colormaps\cm_paris.txt', colormap_paris),
    (r'C:\Users\s164386\VolVis\epds\lifeincolor\data\colormaps\cm_arles.txt', colormap_arles),
    (r'C:\Users\s164386\VolVis\epds\lifeincolor\data\colormaps\cm_saintremy.txt', colormap_saintremy),
    (r'C:\Users\s164386\VolVis\epds\lifeincolor\data\colormaps\cm_auverssuroise.txt', colormap_auverssuroise),
]
for path, colors in colormap_files:
    with open(path, "w") as txt_file:
        # Every color is followed by a comma, including the last one
        txt_file.write("".join(color + "," for color in colors))

## Connection coordinates

In [27]:
# Information about Van Gogh's living pattern, retrieved from Van Gogh Museum website.
# The list itself is cheap to build; the "time-consuming: run only once and store
# in separate .csv" remark refers to the commented-out geocoding of these names.
# "The Hague" and "Paris" each appear twice in the list.
# NOTE(review): the order looks chronological - confirm against the source.
vg_places = ["Zundert", "Zevenbergen", "Tilburg",
            "The Hague", "London", "Paris", "Dordrecht",
            "Amsterdam", "Borinage", "Brussels", "Etten",
            "The Hague", "Drenthe", "Nuenen", "Antwerp",
            "Paris", "Arles", "Saint-Remy", "Auvers-sur-Oise"]

# vg_lat, vg_lon = [], []
# for place in vg_places:
#     location = gn.geocode(place)
#     vg_lat.append(location.latitude)
#     vg_lon.append(location.longitude)

# vg_locations = pd.DataFrame()
# vg_locations['place'] = vg_places
# vg_locations['lat'] = vg_lat
# vg_locations['lon'] = vg_lon
# vg_locations.to_csv(r'C:\Users\s164386\VolVis\epds\lifeincolor\data\map\vg_latlon_names.csv', index = False)

In [28]:
# Prepare .csv for connection map
# vg_latlon = pd.DataFrame()
# vg_lat1 = vg_lat[:len(vg_lat)-1]
# vg_lat2 = vg_lat[1:]
# vg_lon1 = vg_lon[:len(vg_lon)-1]
# vg_lon2 = vg_lon[1:]
# vg_latlon['long1'] = vg_lon1
# vg_latlon['long2'] = vg_lon2
# vg_latlon['lat1'] = vg_lat1
# vg_latlon['lat2'] = vg_lat2
# vg_latlon.to_csv(r'C:\Users\s164386\VolVis\epds\lifeincolor\data\map\vg_latlon.csv', index = False)