In [None]:
# !pip install scrapy
!pip install requests-html

In [None]:
from requests_html import HTMLSession
import requests
import mimetypes
# One shared HTMLSession, reused by find_entries() and scrape_images() below
# for every page request.
session = HTMLSession()

def get_url(url):
    """Return the absolute houseplans.com URL for a site-relative path.

    The site root is prepended to ``url`` verbatim: no separator is inserted
    and no normalisation is performed, so ``url`` is expected to start with
    a slash (e.g. ``'/plan/123'``).
    """
    site_root = 'https://www.houseplans.com'
    return f'{site_root}{url}'

def find_entries(url, page_number):
    """Collect the plan links listed on one page of a collection.

    Args:
        url: Site-relative collection path, e.g.
            ``'/collection/modern-house-plans'``.
        page_number: Value sent as the ``page`` query parameter.

    Returns:
        A sorted list of the links found on the page that start with
        ``'/plan/'``. An empty list is returned when the page contains the
        "No plans matched your search criteria." message.
    """
    r = session.get(get_url('{}?page={}'.format(url, page_number)))

    # Membership test instead of ``find(...) > 0``: the latter silently
    # misses a match located at offset 0 of the page text.
    if 'No plans matched your search criteria.' in r.html.text:
        return []

    # ``r.html.links`` is an unordered set; sorting makes the result (and the
    # counter-based file names derived from it) reproducible between runs.
    return sorted(link for link in r.html.links if link.startswith('/plan/'))
        
def scrape_images(entry_link):
    """Return the thumbnail image URLs found on one plan page.

    Args:
        entry_link: Site-relative link of the plan page, as returned by
            ``find_entries``.

    Returns:
        A list, in document order, of the ``src`` values of every ``<img>``
        element whose ``src`` contains the marker ``'w180x120'``. Images
        without a ``src`` attribute are skipped.
    """
    print(entry_link)
    r = session.get(get_url(entry_link))

    # ``in`` rather than ``find(...) > 0`` so that a ``src`` that *starts*
    # with the marker is not dropped.
    return [
        img.attrs['src']
        for img in r.html.find('img')
        if 'src' in img.attrs and 'w180x120' in img.attrs['src']
    ]

def store_image(url, path):
    """Download ``url`` and save it as ``path`` plus a file extension.

    The extension is derived from the response's ``Content-Type`` header,
    ignoring any parameters such as ``; charset=...``. When the header is
    missing or the type is unknown, the extension of the URL's path is used
    instead, and ``.bin`` as a last resort. (Previously an unrecognised
    header produced files literally named ``<path>None``.)

    Args:
        url: Absolute URL of the image to download.
        path: Destination path without a file extension.

    Returns:
        The full path of the file that was written.
    """
    # Local imports keep this cell self-contained.
    from os.path import splitext
    from urllib.parse import urlparse

    r = requests.get(url)
    img_data = r.content

    # Keep only the bare media type: "image/jpeg; charset=binary" -> "image/jpeg".
    mime_type = r.headers.get('content-type', '').split(';', 1)[0].strip().lower()
    ext = mimetypes.guess_extension(mime_type) if mime_type else None
    if not ext:
        # guess_extension() returns None for unknown types; fall back to the URL.
        ext = splitext(urlparse(url).path)[1] or '.bin'
    full_path = '{}{}'.format(path, ext)

    with open(full_path, 'wb') as handler:
        handler.write(img_data)

    return full_path

In [None]:
from IPython.display import Image
from tqdm import tqdm
import os

# Maps an architectural style name to the site-relative path of its
# collection page. The scraping loop below uses each key as the output
# sub-directory name and file-name prefix, and passes each value to
# find_entries().
collection_links = {
    'contemporary': '/collection/contemporary-house-plans',
    'european': '/collection/european-house-plans',
    'beach': '/collection/beach-house-plans',
    'bungalow': '/collection/bungalow-house-plans',
    'classical': '/collection/classical-house-plans',
    'colonial': '/collection/colonial-house-plans',
    'country': '/collection/country-house-plans',
    'southern': '/collection/southern-house-plans',
    'modern': '/collection/modern-house-plans',
    'victorian': '/collection/victorian-house-plans',
    'mediterranean': '/collection/mediterranean-house-plans',
    'craftsman': '/collection/craftsman-house-plans'
}

# For every configured style: create its output directory, gather the plan
# links from the collection listing, then download each plan's thumbnails to
# data/raw_house_plans/<style>/<style>-<page>-<image><ext>.
for (style, link) in collection_links.items():
    print('Starting style {}'.format(style))
    dir_name = 'data/raw_house_plans/{}'.format(style)
    # exist_ok=True tolerates re-runs, while real failures (e.g. permission
    # denied) are raised here instead of being silently swallowed.
    os.makedirs(dir_name, exist_ok=True)

    entry_links = []
    # Only the listing page with ?page=0 is fetched; widen this range to
    # crawl further pages of a collection.
    for page_number in range(0, 1):
        entry_links.extend(find_entries(link, page_number))

    print('Scraping {} pages'.format(len(entry_links)))
    for counter, entry_link in enumerate(entry_links, start=1):
        print('Page {} of {}'.format(counter, len(entry_links)))
        images = scrape_images(entry_link)

        for img_counter, image in enumerate(images, start=1):
            print('Downloading image {} of {}'.format(img_counter, len(images)))
            store_image(image, '{}/{}-{}-{}'.format(dir_name, style, counter, img_counter))

        
    
            
            
            