In [1]:
from bs4 import BeautifulSoup
import requests
import urllib.request
from selenium import webdriver
import os
import json
import pandas as pd
import time

In [2]:
class Project:
    def __init__(self,project_name, project_url):
        self.project_name = project_name
        self.project_url = project_url
        self.get_first_image_url()
        
    def get_first_image_url(self):
        """
        input: project_url
        output: all image urls
        """
        driver.get(self.project_url)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        x = soup.find('a',{'class':'js-image-size__link'})
        image_url = 'https://www.archdaily.com/' + x.attrs['href']
        self.first_image_url = image_url

    def down_load_all_images(self):
        print(f'downing all images from project {self.project_name}')
        driver.get(self.first_image_url)

        soup = BeautifulSoup(driver.page_source, 'lxml')
        
        directory = f'../data/{self.project_name}'
        if not os.path.exists(directory):
            os.makedirs(directory)

        download_urls = json.loads(soup.find('div', {'id':'gallery-items'}).attrs['data-images'])

        for i in range(len(download_urls)):
            response = requests.get(download_urls[i]['url_slideshow'])
            file = open(f"{directory}/{i}.png", "wb")
            file.write(response.content)
            file.close()
            
        df = pd.read_csv('../data/projects.csv')
        df_tmp = pd.DataFrame([[self.project_name, self.project_url,len(download_urls),directory]],columns=['project_name','project_url','number_of_images','folder'])
        df = df.append(df_tmp)
        df.to_csv('../data/projects.csv',index=False)

In [3]:
class Page:
    def __init__(self, page_name, page_url):
        self.page_name = page_name
        self.page_url = page_url
        self.projects = []
        self.get_all_projects(URL)
        
    def get_all_projects(self,URL):
        """
        input: a page url
        return: list of projects_titles in this page
                list of projects_urls in this page
        """
        project_urls = []
        project_titles = []
        driver.get(URL)
        soup = BeautifulSoup(driver.page_source, 'lxml')

        gridview__content = soup.find_all('a', {'class':'gridview__content'})

        for element in gridview__content:
            project_url = element.attrs['href']
            project_name = element.find('h3',{'class':"gridview__entry-title"}).text
            project_name = project_name.replace('/','_')
            project = Project(project_name,project_url)
            self.projects.append(project)

In [4]:
for page_num in range(15,20):
    driver = webdriver.Chrome('../chromedriver')
    print(f'waked up, scraping page {page_num}...')
    URL = f'https://www.archdaily.com/search/projects?page={page_num}'

    page = Page(f'{page_num}',URL)

    for project in page.projects:
        project.down_load_all_images()
        
    driver.quit()
    print(f'done scrape page {page_num}, going to sleep for 2min')
    time.sleep(120)

waked up, scraping page 15...
downing all images from project Beaumont Quarter _ Studio Woodroffe Papa
downing all images from project Architectural Classics: Residencial San Felipe in Lima _ Enrique Ciriani + Mario Bernuy
downing all images from project BIT Sports Center _ Atelier Alter Architects
downing all images from project Read and Rest Hotel _ OFFICE AIO
downing all images from project Aizumi Base _ FujiwaraMuro Architects
downing all images from project Small Hotel _ CAPD
downing all images from project WM Plenary Hall _ Bgnr Architects
downing all images from project Nildo José Office  _ Nildo José
downing all images from project Arouca's House _ Ana de Bastos + Filipe Xavier Oliveira
downing all images from project Home^Dome House _ Idoia otegui_arquitectura
downing all images from project Oslo Residential Complex _ Reiulf Ramstad Architects
downing all images from project Spyder Flagship Store Gangnam _ Jo Nagasaka + Schemata Architects
downing all images from project 45 Ho

In [5]:
df = pd.read_csv('../data/projects.csv')

In [6]:
df.number_of_images.sum()

9357

In [9]:
df.head()

Unnamed: 0,project_name,project_url,number_of_images,folder
0,Eyes Wide Open Apartment _ PRUSTA LTD,https://www.archdaily.com/947748/eyes-wide-ope...,36,../data/Eyes Wide Open Apartment _ PRUSTA LTD
1,Eyes Wide Open Apartment _ PRUSTA LTD,https://www.archdaily.com/947748/eyes-wide-ope...,36,../data/Eyes Wide Open Apartment _ PRUSTA LTD
2,The Quayside Mix Use Development _ CL3 Architects,https://www.archdaily.com/947950/the-quayside-...,20,../data/The Quayside Mix Use Development _ CL3...
3,Bar Beach House _ Bourne Blue Architecture,https://www.archdaily.com/948120/bar-beach-hou...,26,../data/Bar Beach House _ Bourne Blue Architec...
4,LAN-4 Installation _ oe architect,https://www.archdaily.com/948143/lan-4-install...,19,../data/LAN-4 Installation _ oe architect


In [10]:
df.shape

(343, 4)