### This script scrapes images from archdaily.com and saves them into an AWS S3 bucket.
    


In [None]:
from bs4 import BeautifulSoup
import requests
import urllib.request
from selenium import webdriver
import os
import json
import pandas as pd
import time
import psycopg2
import config as creds
import random
import string
from PIL import Image
from io import BytesIO
import numpy as np
from webapp.utils import *

In [None]:
def get_random_alphanumeric_string(length):
    """
    Build a random name made of ASCII letters and digits (used to name scraped images).
    input:
        length: int
            how many characters to generate; zero or a negative value yields ''
    output:
        string
            the randomly generated characters, joined together
    """
    alphabet = string.ascii_letters + string.digits
    picked = []
    # One independent draw per character, in order.
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

def is_true_image(url):
    """
    Return whether the url serves a usable 3-channel image.
    input:
        url: string
            the url address of the image
    output:
        boolean
            False when the server answers with an error status, when the body
            cannot be decoded as an image, when the decoded array is not
            (height, width, 3), or when more than 10% of its channel values
            equal 255; True otherwise
    Connection-level failures (timeouts, DNS errors, ...) still propagate as
    requests exceptions.
    """
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(url, timeout=30)
    if not response.ok:
        return False

    try:
        # The context manager releases the decoder; np.array forces the pixel
        # data to load, so truncated files are caught here as well.
        with Image.open(BytesIO(response.content)) as img:
            pixels = np.array(img)
    except OSError:
        # PIL signals "not an image" / "corrupt image" with OSError subclasses.
        return False

    # Grayscale or palette images decode to 2-D arrays; RGBA/CMYK have 4 channels.
    if pixels.ndim != 3 or pixels.shape[2] != 3:
        return False

    # Images dominated by saturated (255) values are rejected
    # (presumably drawings/diagrams on a white background - TODO confirm intent).
    if np.mean(pixels == 255) > 0.1:
        return False
    return True

In [None]:
class Project:
    """
    This class is created for each project.

    It reads the module-level globals `driver` (selenium webdriver),
    `conn` / `cursor` (database connection and cursor) and `image_data_dir`
    (root directory for downloaded images); they must exist before an
    instance is built.

    Properties:
        project_name: string
        project_url: string
        first_image_url: string
        categories: string ('' when the page has none)
        location: string ('' when the page has none)
        architects: string ('' when the page has none)

    Methods:
        get_first_image_url
        get_other_info
        update_projects_table
        update_images_table
        down_load_all_images
    """
    def __init__(self, project_name, project_url):
        self.project_name = project_name
        self.project_url = project_url
        driver.get(self.project_url)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        self.get_first_image_url(soup)
        self.get_other_info(soup)

    def get_first_image_url(self, soup):
        """
        Store the url of the project's first gallery image in self.first_image_url.
        input:
            soup: the parsed project page
        raises:
            ValueError when the page has no gallery link
        """
        from urllib.parse import urljoin

        link = soup.find('a', {'class': 'js-image-size__link'})
        if link is None or 'href' not in link.attrs:
            raise ValueError(f'no gallery link found on {self.project_url}')
        # urljoin copes with root-relative, relative and absolute hrefs without
        # producing a doubled slash.
        self.first_image_url = urljoin('https://www.archdaily.com/', link.attrs['href'])

    @staticmethod
    def _spec_text(soup, css_class, skip=0):
        """Return the text of the div with `css_class` (quotes removed), or '' if absent."""
        node = soup.find('div', css_class)
        if node is None:
            return ''
        return node.text[skip:].replace("'", "")

    def get_other_info(self, soup):
        """
        get all other related information, such as categories, location, architects
        input:
            soup: the parsed project page
        Every attribute is always set; a missing section becomes '' so that
        update_projects_table never hits an undefined attribute.
        """
        self.categories = self._spec_text(soup, "afd-specs__header-category")
        self.location = self._spec_text(soup, "afd-specs__header-location")
        # The first 13 characters are dropped (presumably a leading
        # "Architects:" label plus whitespace - TODO confirm against the page).
        self.architects = self._spec_text(soup, "afd-specs__architects", skip=13)

    def update_projects_table(self):
        """
        update the project table in AWS RDS
        """
        # Values are passed as query parameters so scraped text can never be
        # interpreted as SQL (assumes a psycopg2-style cursor with %s placeholders).
        cursor.execute(
            'INSERT INTO public."Projects" (url, categories, architects, location, name) '
            'VALUES (%s, %s, %s, %s, %s);',
            (self.project_url, self.categories, self.architects,
             self.location, self.project_name),
        )

        conn.commit()
        print(f'added {self.project_name} in db!')

    def update_images_table(self, name, url, path):
        """
        update the images table in AWS RDS
        input:
            name: string, file name of the saved image
            url: string, address the image was downloaded from
            path: string, where the image was written
        """
        # Quote stripping is kept so new rows match rows written by earlier runs.
        url = url.replace("'", "")
        cursor.execute(
            'INSERT INTO public."Images" (name, url, project_name, path) '
            'VALUES (%s, %s, %s, %s);',
            (name, url, self.project_name, path),
        )

        conn.commit()

    def down_load_all_images(self):
        """
        down load all images contained in the project, save them under
        image_data_dir/<project_name>, and record them in the database
        raises:
            ValueError when the gallery page has no image list;
            requests exceptions when a download fails
        """
        print(f'downing all images from project {self.project_name}')
        driver.get(self.first_image_url)
        soup = BeautifulSoup(driver.page_source, 'lxml')

        directory = f'{image_data_dir}/{self.project_name}'
        os.makedirs(directory, exist_ok=True)

        gallery = soup.find('div', {'id': 'gallery-items'})
        if gallery is None:
            raise ValueError(f'no gallery items found on {self.first_image_url}')
        download_urls = json.loads(gallery.attrs['data-images'])

        for i, item in enumerate(download_urls):
            image_url = item['url_slideshow']
            # Validate first so rejected images are not downloaded a second time.
            if not is_true_image(image_url):
                continue

            response = requests.get(image_url, timeout=30)
            response.raise_for_status()

            # NOTE: the file is always named .png, whatever the real image format is.
            filename = f'{i}_{get_random_alphanumeric_string(10)}.png'
            path = f'{directory}/{filename}'
            with open(path, 'wb') as file:
                file.write(response.content)

            # Record the row only after the file exists on disk.
            self.update_images_table(filename, image_url, path)

        self.update_projects_table()

In [None]:
class Page:
    """
    This class is created for each search-result page of archdaily.

    It reads the module-level globals `driver` (selenium webdriver) and
    `cursor` (database cursor); they must exist before an instance is built.

    Properties:
        page_name: string
        page_url: string
        projects: list[Project]
            projects on this page that are not yet in the Projects table

    Methods:
        get_all_projects
    """

    def __init__(self, page_name, page_url):
        self.page_name = page_name
        self.page_url = page_url
        self.projects = []
        # Use the url this page was built with rather than a module-level global.
        self.get_all_projects(self.page_url)

    def get_all_projects(self, URL):
        """
        Load a search-result page and append a Project to self.projects for
        every listed project whose name is not yet in the Projects table.
        input:
            URL: string
                the page url
        output:
            None (results are accumulated in self.projects)
        """
        from urllib.parse import urljoin

        driver.get(URL)
        soup = BeautifulSoup(driver.page_source, 'lxml')

        for card in soup.find_all('a', {'class': 'gridview__content'}):
            title = card.find('h3', {'class': "gridview__entry-title"})
            if title is None or 'href' not in card.attrs:
                # Not a regular project card; nothing to scrape.
                continue

            # Absolute hrefs pass through unchanged; relative ones are resolved
            # against the page url.
            project_url = urljoin(URL, card.attrs['href'])
            # '/' would break the on-disk directory name; quotes are stripped to
            # stay consistent with names stored by earlier runs.
            project_name = title.text.replace('/', '_').replace("'", "")

            # Parameterized lookup (assumes a psycopg2-style cursor with %s placeholders).
            cursor.execute(
                'SELECT 1 FROM public."Projects" WHERE name = %s LIMIT 1',
                (project_name,),
            )
            if cursor.fetchone() is None:
                self.projects.append(Project(project_name, project_url))

In [None]:
# Driver script: walk the archdaily search pages, and for every project not yet
# in the database download its images and record them.
# `conn`, `cursor`, `image_data_dir`, `driver` and `URL` are deliberately
# module-level names because the Page and Project classes read them as globals.
conn, cursor = connect()
image_data_dir = '/Users/chizhang/AWS-s3/archdaily'
try:
    for page_num in range(0, 600):
        # Loop over page number; each page gets its own browser session.
        driver = webdriver.Chrome('../chromedriver')
        try:
            print(f'------------scraping page {page_num}...------------------')
            URL = f'https://www.archdaily.com/search/projects?page={page_num}'

            # Try to read the page up to 5 times before giving up on it.
            page = None
            for attempt in range(5):
                try:
                    page = Page(f'{page_num}', URL)
                except Exception as err:
                    # `except Exception` leaves Ctrl-C (KeyboardInterrupt) working.
                    print(f'reading page {page_num} failed (attempt {attempt + 1}): {err!r}')
                    # A failed statement leaves the transaction aborted; roll back
                    # so later queries can run (assumes a DB-API connection).
                    conn.rollback()
                else:
                    break

            if page is None:
                # The finally clause below still closes the browser.
                continue

            for project in page.projects:
                # Loop over all projects; one failing project must not stop the rest.
                try:
                    project.down_load_all_images()
                except Exception as err:
                    print(f'project {project.project_name} failed: {err!r}')
                    conn.rollback()
        finally:
            driver.quit()
        print(f'done scrape page {page_num}')
finally:
    conn.close()