## Web Scraper DPchallenge.com

### Goal
Since the AVA dataset consists of ~250,000 images, it is too large to train the decision tree that was developed. Additionally, the dataset includes heavily edited/artistic images and image subjects that are not relevant to the purpose of the decision tree: detecting the quality of images taken during social events. Therefore I decided to create a new dataset containing only images with a relevant subject: sports, concerts and formal events. 

### Script
For each of the three themes I composed a list of search terms that can be used to search for relevant images on https://dpchallenge.com/. For each search request the script collects the result pages on which the images can be found, and subsequently collects the image links and scores from those pages. 
This results in a final dataframe of image IDs, image links, image scores and image themes. 

### Additional Remarks
Be aware that web scraping can overload the servers of dpchallenge. Please include some delays between requests.  

In [None]:
import requests
from requests_html import HTMLSession
from bs4 import BeautifulSoup as bs
from skimage import io
import matplotlib.pyplot as plt
import re
import numpy as np
import pandas as pd

In [None]:
# http://jonathansoma.com/lede/foundations-2017/classes/adv-scraping/advanced-scraping-form-submission/ 
def get_pages_search(term, timeout=30):
    """Return the URLs of all result pages for a DPChallenge photo search.

    Posts the search form for ``term``, reads the pagination links of the
    returned page to find the highest page number, and builds one URL per
    page from 1 up to that number.

    Args:
        term: Search phrase submitted as the ``q`` form field.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error, so a stalled connection cannot hang the scrape.

    Returns:
        A list of result-page URLs. The list is empty when the response
        contains no usable pagination links, because the search id needed
        to build the page URLs is only read from those links.
    """
    form_data = {
        'MATCH': 'bool',
        'search_title': '1',
        'search_keywords': '1',
        'search_description': '1',
        'filter_challenge': '1',
        'order': 'r',
        'q': term,
    }

    url = 'https://www.dpchallenge.com/photo_search.php'
    response = requests.post(url, data=form_data, timeout=timeout)
    doc = bs(response.text, 'html.parser')

    base_url = None
    number_pages = []
    for anchor in doc.find_all("a", {"class": "u"}, href=True):
        # The link wrapping the "next" arrow image is not a numbered page link.
        if "next.gif" in str(anchor):
            continue
        href = anchor['href']
        if "/photo_search.php?IMAGE_SEARCH_ID" not in href:
            continue
        # Read the page number from the href itself rather than from the
        # surrounding markup, so the parse does not depend on the <img> tag.
        match = re.search(r'page=(\d+)', href)
        if match is None:
            continue
        # Everything up to "page=" (including the trailing "&") is shared by
        # all result pages of this search.
        base_url = href[:match.start()]
        number_pages.append(int(match.group(1)))

    # Without pagination links there is no search id to build URLs from.
    if not number_pages:
        return []

    max_page = max(number_pages)
    return [
        'https://www.dpchallenge.com' + base_url + "page=" + str(i)
        for i in range(1, max_page + 1)
    ]

In [None]:
def image_links(pages, timeout=30):
    """Collect the image-page URLs listed on each search result page.

    Args:
        pages: Iterable of result-page URLs (as built by the search step).
        timeout: Seconds to wait for each page before ``requests`` raises a
            timeout error.

    Returns:
        A list of absolute ``https://www.dpchallenge.com...`` URLs, one per
        table cell from which a link could be extracted, in page order.
    """
    links = []
    for page_url in pages:
        page = requests.get(page_url, timeout=timeout)
        soup = bs(page.content, 'html.parser')
        image_table = soup.find(
            'table', {'cellpadding': '3', 'width': '100%', 'cellspacing': '0'}
        )
        # A page without the results table would otherwise raise
        # AttributeError and abort the whole scrape.
        if image_table is None:
            continue
        for column in image_table.find_all('td'):
            parts = re.split('href="|">', str(column))
            # Cells that do not yield a third piece carry no link in the
            # position this parse relies on; skip them instead of raising
            # IndexError.
            if len(parts) < 3:
                continue
            links.append('https://www.dpchallenge.com' + parts[2])
    return links

In [None]:
def image_by_link(link, theme, proxies=None, timeout=30):
    """Scrape one DPChallenge image page for its image URL and average score.

    Args:
        link: URL of the image page; the text after ``IMAGE_ID=`` is used as
            the image id.
        theme: Label returned unchanged as the last element of the result.
        proxies: Proxy mapping passed to ``requests.get``. ``None`` (the
            default) keeps the proxy that was previously hard-coded here;
            pass an empty dict to send the request without it.
        timeout: Seconds to wait for the page before the request is
            abandoned (which is then reported as a failure, see below).

    Returns:
        ``(image_id, img_link, img_score, theme)`` on success, where
        ``img_score`` is the text following "Avg (all users):" on the page.
        Returns ``0`` when the page cannot be fetched or parsed; callers
        compare the result against ``0`` to skip such images.
    """
    if proxies is None:
        proxies = {'http': '181.209.82.154:23500', 'https': '181.209.82.154:23500'}
    try:
        image_id = link.split('IMAGE_ID=')[1]
        page = requests.get(link, proxies=proxies, timeout=timeout)
        soup = bs(page.content, 'html.parser')
        images = soup.find('td', {'id': 'img_container'})
        # The second src= attribute inside the container is taken as the image.
        found = str(images).split("src=")[2].split('"')[1]
        img_link = str("https:" + found)
        print(img_link)
        statistics = soup.find_all('table', {'width': '750'})[1]
        img_score = str(statistics).split('Avg (all users):</b> ')[1]
        img_score = img_score.split('<br/>')[0]
        return (image_id, img_link, img_score, theme)
    except Exception:
        # Deliberate best-effort: any fetch/parse failure skips this image.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long scrape can still be stopped.
        return 0

In [None]:
def get_images_theme(theme):
    """Scrape every image found for the search term ``theme``.

    Looks up the result pages for ``theme``, gathers the image-page links on
    them, and scrapes each link. Every record that was scraped successfully
    (i.e. is not the ``0`` failure marker) is appended to the module-level
    list ``data_dpchallenge``. Nothing is returned.
    """
    result_pages = get_pages_search(theme)
    for page_link in image_links(result_pages):
        record = image_by_link(page_link, theme)
        if record == 0:
            # Failed scrape: nothing to store for this image.
            continue
        data_dpchallenge.append(record)

In [None]:
# Search terms grouped by the three themes of interest.
themes = ['sports', 'concert', 'formal']
sports = ['soccer', 'football', 'basketball', 'cycling', 'racing', 'stadium']
concert = ['festival', 'concert', 'music', 'disco', 'dancing', 'nightlife']
formal = ['exhibition', 'expo', 'wedding', 'conversation']

# Explicit theme -> search-terms mapping (instead of looking the lists up by
# name through globals(), which breaks silently if a list is renamed).
dict_themes = dict(zip(themes, (sports, concert, formal)))

# Accumulator that get_images_theme() appends scraped records to. It has to
# exist before the first call, otherwise the first append raises NameError.
data_dpchallenge = []

# NOTE(review): each search term (not its theme key) is passed on, so the
# 'theme' column below holds e.g. 'soccer' rather than 'sports' - confirm
# that this is intended.
for terms in dict_themes.values():
    for word in terms:
        get_images_theme(word)

data_df = pd.DataFrame(data_dpchallenge, columns=['id', 'link', 'score', 'theme'])