In [1]:
from bs4 import BeautifulSoup
import requests
import urllib.request
from selenium import webdriver
import os
import json
import pandas as pd
import time
import psycopg2
import config as creds
import random
import string
from PIL import Image
from io import BytesIO
import numpy as np

In [2]:
image_data_dir = '/Users/chizhang/AWS-s3/archdaily'

In [3]:
def get_random_alphanumeric_string(length):
    letters_and_digits = string.ascii_letters + string.digits
    result_str = ''.join((random.choice(letters_and_digits) for i in range(length)))
    return result_str

def is_true_image(url):
    response = requests.get(url)
    img = Image.open(BytesIO(response.content))
    img = np.array(img)
    if len(img.shape) != 3:
        return False
    if img.shape[2] != 3:
        return False
    if np.sum(img == 255)/img.shape[0]/img.shape[1]/img.shape[2] > 0.1:
        return False
    return True

In [4]:
def connect():
    
    # Set up a connection to the postgres server.
    conn_string = "host="+ creds.PGHOST +" port="+ "5432" +" dbname="+ creds.PGDATABASE +" user=" + creds.PGUSER \
                  +" password="+ creds.PGPASSWORD
    
    conn = psycopg2.connect(conn_string)
    print("Connected to AWS DataBase!")

    # Create a cursor object
    cursor = conn.cursor()
    
    return conn, cursor

In [5]:
class Project:
    def __init__(self,project_name, project_url):
        self.project_name = project_name
        self.project_url = project_url
        driver.get(self.project_url)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        self.get_first_image_url(soup)
        self.get_other_info(soup)
        
    def get_first_image_url(self,soup):
        """
        input: project_url
        output: all image urls
        """
        x = soup.find('a',{'class':'js-image-size__link'})
        image_url = 'https://www.archdaily.com/' + x.attrs['href']
        self.first_image_url = image_url
        
    def get_other_info(self,soup):
        if soup.find('div',"afd-specs__header-category"):
            self.categories = soup.find('div',"afd-specs__header-category").text.replace("'", "")

        if soup.find('div',"afd-specs__header-location"):
            self.location = soup.find('div',"afd-specs__header-location").text.replace("'", "")

        if soup.find('div',"afd-specs__architects"):
            self.architects = soup.find('div',"afd-specs__architects").text[13:].replace("'", "")
        
    def update_projects_table(self):
        cursor.execute(f'INSERT INTO public."Projects" (url, categories, architects, location, name) \
        VALUES (\'{self.project_url}\', \'{self.categories}\', \'{self.architects}\', \'{self.location}\', \'{self.project_name}\');')

        conn.commit()
        print(f'added {self.project_name} in db!')
        
    def update_images_table(self, name, url, path):
        url = url.replace("'", "")
        cursor.execute(f'INSERT INTO public."Images" (name, url, project_name, path) \
        VALUES (\'{name}\', \'{url}\', \'{self.project_name}\', \'{path}\');')

        conn.commit()
        
    def down_load_all_images(self):
        print(f'downing all images from project {self.project_name}')
        driver.get(self.first_image_url)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        
        directory = f'{image_data_dir}/{self.project_name}'
        if not os.path.exists(directory):
            os.makedirs(directory)

        download_urls = json.loads(soup.find('div', {'id':'gallery-items'}).attrs['data-images'])

        for i in range(len(download_urls)):
            response = requests.get(download_urls[i]['url_slideshow'])
            
            if is_true_image(download_urls[i]['url_slideshow']):
                filename = get_random_alphanumeric_string(10)
                file = open(f"{directory}/{i}_{filename}.png", "wb")

                self.update_images_table(f'{i}_{filename}.png',download_urls[i]['url_slideshow'],f"{directory}/{i}_{filename}.png")

                file.write(response.content)
                file.close()
                
        self.update_projects_table()

In [6]:
class Page:
    def __init__(self, page_name, page_url):
        self.page_name = page_name
        self.page_url = page_url
        self.projects = []
        self.get_all_projects(URL)
        
    def get_all_projects(self,URL):
        """
        input: a page url
        return: list of projects_titles in this page
                list of projects_urls in this page
        """
        project_urls = []
        project_titles = []
        driver.get(URL)
        soup = BeautifulSoup(driver.page_source, 'lxml')

        gridview__content = soup.find_all('a', {'class':'gridview__content'})

        for element in gridview__content:
            project_url = element.attrs['href']
            project_name = element.find('h3',{'class':"gridview__entry-title"}).text
            project_name = project_name.replace('/','_').replace("'", "")
            cursor.execute(f'SELECT * FROM public."Projects" where name = \'{project_name}\'')
            if cursor.rowcount == 0:
                project = Project(project_name,project_url)
                self.projects.append(project)

In [None]:
conn, cursor = connect()

for page_num in range(300,600):
    driver = webdriver.Chrome('../chromedriver')
    print(f'------------scraping page {page_num}...------------------')
    URL = f'https://www.archdaily.com/search/projects?page={page_num}'

    page_readed = False
    for i in range(5):
        try:
            page = Page(f'{page_num}',URL)
            page_readed = True
        except:
            continue
        break
        
    if not page_readed:
        driver.quit()
        continue

    for project in page.projects:
        try:
            project.down_load_all_images()
        except:
            pass
        
    driver.quit()
    print(f'done scrape page {page_num}')
    
conn.close()

Connected to AWS DataBase!
------------scraping page 300...------------------
done scrape page 300
------------scraping page 301...------------------
done scrape page 301
------------scraping page 302...------------------
done scrape page 302
------------scraping page 303...------------------
downing all images from project Hotel Säntispark _ Carlos Martinez Architekten
added Hotel Säntispark _ Carlos Martinez Architekten in db!
downing all images from project Santé Publique France Headquarters Near Paris  _ Atelier du Pont
added Santé Publique France Headquarters Near Paris  _ Atelier du Pont in db!
downing all images from project Jolly Gastro Lab Bar _ Laje 54 Arquitetura
added Jolly Gastro Lab Bar _ Laje 54 Arquitetura in db!
downing all images from project Colorado Lodge  _ Terremoto Landscape + EARL + SheetrockLA
added Colorado Lodge  _ Terremoto Landscape + EARL + SheetrockLA in db!
downing all images from project Cycling through the Trees _ Burolandschap
added Cycling through th

added ZERO Box Lodge _ Gonçalo Queirós Carvalho Architecto in db!
done scrape page 313
------------scraping page 314...------------------
done scrape page 314
------------scraping page 315...------------------
done scrape page 315
------------scraping page 316...------------------
done scrape page 316
------------scraping page 317...------------------
done scrape page 317
------------scraping page 318...------------------
done scrape page 318
------------scraping page 319...------------------
done scrape page 319
------------scraping page 320...------------------
done scrape page 320
------------scraping page 321...------------------
done scrape page 321
------------scraping page 322...------------------
done scrape page 322
------------scraping page 323...------------------
done scrape page 323
------------scraping page 324...------------------
downing all images from project The Elms Boutique Apartments _ Chan Architecture
added The Elms Boutique Apartments _ Chan Architecture in db!

added Donado 2325 Building _ Estudio NDG + Lautaro Malnatti in db!
downing all images from project Generali Tower _ Zaha Hadid Architects
added Generali Tower _ Zaha Hadid Architects in db!
downing all images from project Coshocton Ray Trace Installation _ Behin Ha
added Coshocton Ray Trace Installation _ Behin Ha in db!
done scrape page 332
------------scraping page 333...------------------
done scrape page 333
------------scraping page 334...------------------
done scrape page 334
------------scraping page 335...------------------
done scrape page 335
------------scraping page 336...------------------
done scrape page 336
------------scraping page 337...------------------
done scrape page 337
------------scraping page 338...------------------
done scrape page 338
------------scraping page 339...------------------
done scrape page 339
------------scraping page 340...------------------
done scrape page 340
------------scraping page 341...------------------
done scrape page 341
--------

added Ambrosia Tea Shop _ Biasol in db!
done scrape page 351
------------scraping page 352...------------------
done scrape page 352
------------scraping page 353...------------------
done scrape page 353
------------scraping page 354...------------------
done scrape page 354
------------scraping page 355...------------------
downing all images from project Monceau Apartment _ JCPCDR Architecture
added Monceau Apartment _ JCPCDR Architecture in db!
downing all images from project 195 Clarkson _ AB Architekten
added 195 Clarkson _ AB Architekten in db!
downing all images from project Koller + Koller am Waagplatz Restaurant _ BEHF Architects
added Koller + Koller am Waagplatz Restaurant _ BEHF Architects in db!
downing all images from project 100 Steuben Street _ AB Architekten
added 100 Steuben Street _ AB Architekten in db!
downing all images from project MÉCA Cultural Center _ BIG
added MÉCA Cultural Center _ BIG in db!
downing all images from project Phoenix House _ Sebastian Marisca

added BT House _ Taguá Arquitetura in db!
downing all images from project Pilevneli Gallery _ Emre Arolat Architecture
added Pilevneli Gallery _ Emre Arolat Architecture in db!
downing all images from project Pillars of Dreams Pavilion _ MARC FORNES  _ THEVERYMANY
added Pillars of Dreams Pavilion _ MARC FORNES  _ THEVERYMANY in db!
downing all images from project House in the Fields _ Estudio Acta
added House in the Fields _ Estudio Acta in db!
downing all images from project Prismatic Installation _ Hou de Sousa
added Prismatic Installation _ Hou de Sousa in db!
downing all images from project Jiangsu Beisha Kindergarten _ Crossboundaries
added Jiangsu Beisha Kindergarten _ Crossboundaries in db!
downing all images from project Omakase Restaurant _ Shanghai Hip-pop Design Team
added Omakase Restaurant _ Shanghai Hip-pop Design Team in db!
downing all images from project Antara Gallery _ EDI Architects
added Antara Gallery _ EDI Architects in db!
done scrape page 377
------------scrapi

added OpaslyTom Restaurant _ BUCK.STUDIO in db!
downing all images from project Villa Baronessa _ Walter Angonese + Schiefer Tschöll Architektur
added Villa Baronessa _ Walter Angonese + Schiefer Tschöll Architektur in db!
downing all images from project Park Legend Kindergarten _ CHALLENGE DESIGN
added Park Legend Kindergarten _ CHALLENGE DESIGN in db!
downing all images from project Extension of Oyamagaoka _ Aki Hamada Architects
added Extension of Oyamagaoka _ Aki Hamada Architects in db!
downing all images from project Courtyard-Wayuan _ Basic Architecture Studio
added Courtyard-Wayuan _ Basic Architecture Studio in db!
downing all images from project Five Gardens House _ David Boyle Architect
done scrape page 391
------------scraping page 392...------------------
done scrape page 392
------------scraping page 393...------------------
done scrape page 393
------------scraping page 394...------------------
done scrape page 394
------------scraping page 395...------------------
done 

added Clemente Dental Clinic _ LANDÍNEZ+REY | equipo L2G arquitectos  in db!
downing all images from project Palatial House on Liberdade 191-193 _ Contacto Atlântico
added Palatial House on Liberdade 191-193 _ Contacto Atlântico in db!
downing all images from project El Cabanyal Residential Renovation _ David Estal + Arturo Sanz
added El Cabanyal Residential Renovation _ David Estal + Arturo Sanz in db!
downing all images from project 106 · Øki Apartment _ elii
added 106 · Øki Apartment _ elii in db!
downing all images from project T House _ Olalquiaga Arquitectos
downing all images from project Colonias Viladoms Houses _ OAB
added Colonias Viladoms Houses _ OAB in db!
downing all images from project X.Ø House  _ BETA.ø architecture office
added X.Ø House  _ BETA.ø architecture office in db!
downing all images from project Koto Muutama Prefabricated Cabin _ Koto Design
added Koto Muutama Prefabricated Cabin _ Koto Design in db!
downing all images from project Aranya Café _ odd
added Ar

done scrape page 438
------------scraping page 439...------------------
done scrape page 439
------------scraping page 440...------------------
done scrape page 440
------------scraping page 441...------------------
done scrape page 441
------------scraping page 442...------------------
done scrape page 442
------------scraping page 443...------------------
done scrape page 443
------------scraping page 444...------------------
done scrape page 444
------------scraping page 445...------------------
done scrape page 445
------------scraping page 446...------------------
done scrape page 446
------------scraping page 447...------------------
done scrape page 447
------------scraping page 448...------------------
done scrape page 448
------------scraping page 449...------------------
done scrape page 449
------------scraping page 450...------------------
done scrape page 450
------------scraping page 451...------------------
done scrape page 451
------------scraping page 452...-----------

downing all images from project Tacoma Art Museum Benaroya Wing _ Olson Kundig
added Tacoma Art Museum Benaroya Wing _ Olson Kundig in db!
downing all images from project Pike Place MarketFront _ The Miller Hull Partnership
added Pike Place MarketFront _ The Miller Hull Partnership in db!
downing all images from project PD House _ WAATAA
done scrape page 463
------------scraping page 464...------------------
done scrape page 464
------------scraping page 465...------------------
done scrape page 465
------------scraping page 466...------------------
done scrape page 466
------------scraping page 467...------------------
done scrape page 467
------------scraping page 468...------------------
done scrape page 468
------------scraping page 469...------------------
done scrape page 469
------------scraping page 470...------------------
done scrape page 470
------------scraping page 471...------------------
done scrape page 471
------------scraping page 472...------------------
done scrape 

added Cecil St House  _ Chan Architecture in db!
downing all images from project The Renovation of JiJiaDun Village Center _ Yzscape
added The Renovation of JiJiaDun Village Center _ Yzscape in db!
downing all images from project The Ilma _ LABOTORY
added The Ilma _ LABOTORY in db!
downing all images from project AD Classics: Kuwait National Assembly Building _ Jørn Utzon
done scrape page 480
------------scraping page 481...------------------
done scrape page 481
------------scraping page 482...------------------
downing all images from project City of Saints Bryant Park _ Only If 
added City of Saints Bryant Park _ Only If  in db!
downing all images from project Torre Diana _ Colonnier Arquitectos
added Torre Diana _ Colonnier Arquitectos in db!
downing all images from project Le Hideout _ Ménard Dworkind architecture & design
added Le Hideout _ Ménard Dworkind architecture & design in db!
downing all images from project Avenues The Word School _ aflalo_gasperini arquitetos
added Aven

done scrape page 495
------------scraping page 496...------------------
downing all images from project Creative Imagination Space for Youth _ BLACKhome
added Creative Imagination Space for Youth _ BLACKhome in db!
downing all images from project Guaíba Orla Urban Park _ Jaime Lerner Arquitetos Associados
added Guaíba Orla Urban Park _ Jaime Lerner Arquitetos Associados in db!
downing all images from project Unitarian Universalist Society _ Neumann Monson Architects
added Unitarian Universalist Society _ Neumann Monson Architects in db!
downing all images from project Narrow Door House _ Alberto Craveiro
added Narrow Door House _ Alberto Craveiro in db!
downing all images from project Earth Memorial _ Gitai Architects
added Earth Memorial _ Gitai Architects in db!
downing all images from project Songs Chinese Cuisine _ Republican Metropolis Architecture
added Songs Chinese Cuisine _ Republican Metropolis Architecture in db!
downing all images from project Wejherowo _ PB STUDIO
added We