In [12]:
# Mount Google Drive at /content/drive; later cells read and write image
# folders below /content/drive/MyDrive. (google.colab is only available on Colab.)
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import json
from urllib.parse import urlencode
import requests
import random
from bs4 import BeautifulSoup
from lxml import etree
import re
import os
from io import BytesIO
from PIL import Image
import pickle

In [50]:
!pip install tldextract
!pip install jmd_imagescraper
from tldextract import extract

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tldextract
  Downloading tldextract-3.4.0-py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 2.2 MB/s 
Collecting requests-file>=1.4
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-1.5.1 tldextract-3.4.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jmd_imagescraper
  Downloading jmd_imagescraper-1.0.2-py3-none-any.whl (12 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 22.2 MB/s 
Installing collected packages: jedi, jmd-imagescraper
Successfully installed jedi-0.18.1 jmd-imagescraper-1.0.2


In [None]:
import signal
from contextlib import contextmanager

class TimeoutException(Exception):
    """Raised inside a ``time_limit`` block when the allotted time has elapsed."""


@contextmanager
def time_limit(seconds):
    """Context manager that interrupts the enclosed block after ``seconds``.

    A SIGALRM-based timer is armed on entry; when it fires, the installed
    handler raises ``TimeoutException`` in the running code. On exit the timer
    is cancelled and the SIGALRM handler that was active before entry is put
    back, so the handler does not leak into unrelated code.

    Args:
        seconds: Time allowance. Whole or fractional seconds are accepted;
            0 arms no timer.

    Raises:
        TimeoutException: from within the ``with`` body when time runs out.

    Note:
        Relies on ``signal.SIGALRM``, so it only works where that signal exists
        and where ``signal.signal`` may be called (the main thread).
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    # setitimer drives the same SIGALRM as alarm() but also takes fractions.
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)
        # signal.signal() returns None when the earlier handler was not set
        # from Python; None cannot be re-installed, so skip restoring then.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)

In [None]:
def random_headers():
    """Build HTTP request headers with a randomly chosen browser User-Agent.

    Returns:
        dict: a one-entry mapping ``{'User-Agent': <string>}`` where the string
        is picked with ``random.choice`` from five fixed candidates.
    """
    user_agents = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
        'Mozilla/5.0 (Windows NT 10.0; Win 64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    )
    return {'User-Agent': random.choice(user_agents)}

In [None]:
def get_google_results(query, num_pages):
    """Scrape outbound links from Google result pages for ``query``.

    Result pages are requested ten results apart (``start`` = 0, 10, 20, ...).
    Every ``<a href>`` found is kept when it contains none of the unwanted
    substrings and contains ``'http'``; everything else is noted in the logs.

    Args:
        query: search text sent as the ``q`` parameter.
        num_pages: number of result pages to fetch.

    Returns:
        tuple: ``(links, logs)`` where ``links`` is a list of href strings
        (duplicates possible) and ``logs`` maps an href to the list of reasons
        it was filtered.
    """
    logs = {}
    links = []

    domain = 'com'
    payload = {
        'q': query,
        # Location hint sent with every request (appears to encode
        # "London, England, United Kingdom" -- TODO confirm).
        'uule': 'w+CAIQICIfTG9uZG9uLCBFbmdsYW5kLCBVbml0ZWQgS2luZ2RvbQ'
    }

    for first_result in range(0, num_pages * 10, 10):
        # One page is equal to 10 google results.
        payload['start'] = first_result
        url = f'https://www.google.{domain}/search?{urlencode(payload)}'

        # Scrape.
        response = requests.get(url=url, headers=random_headers())
        soup = BeautifulSoup(response.content, 'html.parser')

        unwanted_list = ['google', 'search', '#', 'facebook', 'instagram']

        for anchor in set(soup.findAll('a')):
            href = anchor.get('href')
            if type(href) is not str:
                continue

            # Log every unwanted fragment the link contains, not just the first.
            matched = [unwanted for unwanted in unwanted_list if unwanted in href]
            for unwanted in matched:
                dict_append_or_create(href, f'Filtered: On {unwanted}', logs)
            if matched:
                continue

            if 'http' in href:
                links.append(href)
            else:
                dict_append_or_create(href, 'Filtered: http', logs)

    return links, logs

In [None]:
def url_rank(url):
    """Sort key for candidate links: smaller values sort first.

    The rank is the URL length, plus a flat 200 when the URL does not contain
    ``'about'`` (case-insensitive).
    """
    penalty = 0 if 'about' in url.lower() else 200
    return penalty + len(url)

In [None]:
def get_domain(url):
    """Reduce ``url`` to ``scheme://host`` using tldextract's host breakdown.

    The host is rebuilt from the subdomain, domain and suffix reported by
    ``extract``; empty parts are skipped so no stray dots appear (e.g. for a
    host without a public suffix). The scheme is ``https`` only when the URL
    itself starts with ``https://`` -- an ``https`` that merely occurs later in
    the path or query string no longer counts -- and ``http`` otherwise.

    Args:
        url: URL string to reduce.

    Returns:
        str: for example ``'https://www.example.co.uk'``.
    """
    # Attribute access works whether the result is a named tuple or a plain
    # object, whereas positional unpacking requires it to be a 3-tuple.
    parts = extract(url)
    host = '.'.join(
        piece for piece in (parts.subdomain, parts.domain, parts.suffix) if piece
    )
    scheme = 'https' if url.strip().lower().startswith('https://') else 'http'
    return f'{scheme}://{host}'

In [None]:
def dict_append_or_create(key, value, d):
    """Append ``value`` to the list stored under ``d[key]``.

    A new one-element list is created when ``key`` is not present yet.
    ``d`` is modified in place; nothing is returned.
    """
    d.setdefault(key, []).append(value)

In [None]:
def find_about(url):
    """Look for an "about" page linked from the home page of ``url``'s site.

    The site root (``get_domain(url)``) is downloaded and every ``<a href>``
    that contains both the site root and ``'about'`` (case-insensitive) is a
    candidate. The candidate with the lowest ``url_rank`` wins.

    Args:
        url: any URL on the site of interest.

    Returns:
        tuple: ``(link, logs)``; ``link`` is the chosen href, or ``False`` when
        no candidate was found or the download/parsing failed. ``logs`` maps
        ``url`` to a list of messages describing what was filtered or found.
    """
    logs = {}
    domain_url = get_domain(url)

    try:
        response = requests.get(url=domain_url, headers=random_headers())
        soup = BeautifulSoup(response.content, 'html.parser')

        possible_links = []

        for ref in set(soup.find_all('a')):
            href = ref.get('href')
            if isinstance(href, str):
                if domain_url in href and 'about' in href.lower():
                    possible_links.append(href)
                else:
                    dict_append_or_create(url, f'Filtered: {href} No about or domain ', logs)

        possible_links.sort(key=url_rank)

        if possible_links:
            dict_append_or_create(url, f'Found: {possible_links[0]}', logs)
            return possible_links[0], logs
    except Exception:
        # Best effort: any failure while fetching or parsing is logged and
        # reported as "not found". KeyboardInterrupt/SystemExit are no longer
        # swallowed. NOTE: TimeoutException derives from Exception, so a
        # time_limit expiry inside this block is also reported this way.
        dict_append_or_create(url, f'Failed: Generic, possibly SSL', logs)
    return False, logs

In [None]:
def google_search_about(url):
    """Search Google for an "about us" page belonging to ``url``'s site.

    The query is ``about us <host>`` where ``<host>`` is the site root without
    its scheme. Links on the result page are kept when they contain none of the
    unwanted substrings, contain the site root, and contain ``'about'``
    (case-insensitive). The candidate with the lowest ``url_rank`` is returned.

    Args:
        url: any URL on the site of interest.

    Returns:
        tuple: ``(link, logs)``; ``link`` is the chosen href or ``False`` when
        nothing qualified. ``logs`` maps ``url`` to a list of messages.

    Note:
        The HTTP request is not guarded here; network errors propagate to the
        caller.
    """
    logs = {}
    domain_url = get_domain(url)

    # Drop the scheme so the query contains just the host name.
    domain_url_trimmed = domain_url.replace('https://', '').replace('http://', '')

    query = f'about us {domain_url_trimmed}'
    dict_append_or_create(url, f'Query: {query}', logs)

    domain = 'com'
    payload = {
        'q': query,
        'uule': 'w+CAIQICIfTG9uZG9uLCBFbmdsYW5kLCBVbml0ZWQgS2luZ2RvbQ'
    }

    params = urlencode(payload)
    search_url = f'https://www.google.{domain}/search?{params}'

    response = requests.get(url=search_url, headers=random_headers())
    soup = BeautifulSoup(response.content, 'html.parser')

    unwanted_list = ['google', 'search', '#', 'facebook', 'instagram']

    possible_links = []

    for ref in set(soup.find_all('a')):
        href = ref.get('href')
        if not isinstance(href, str):
            continue

        good_link = True
        for unwanted in unwanted_list:
            if unwanted in href:
                good_link = False
                dict_append_or_create(url, f'Filtered: {href} on {unwanted}', logs)

        if good_link:
            if domain_url in href and 'about' in href.lower():
                possible_links.append(href)
            else:
                dict_append_or_create(url, f'Filtered: {href} not in domain/about', logs)

    possible_links.sort(key=url_rank)

    if possible_links:
        dict_append_or_create(url, f'Found: {possible_links[0]} on via google', logs)
        return possible_links[0], logs

    dict_append_or_create(url, f'None found:', logs)
    return False, logs

In [None]:
def get_image_urls(url):
    """Collect absolute png/jpg/gif URLs referenced by ``<img>`` tags on a page.

    For each ``<img>`` the first available attribute among ``data-srcset``,
    ``data-src``, ``src``, ``data-lazy-src`` and ``data-fallback-src`` is used;
    if none is present, an image-looking path is searched for in the tag's
    markup. Non-absolute locations are resolved against ``url``. Locations
    containing an unwanted substring are dropped, and the remainder is trimmed
    to the first ``http(s)://...png|jpg|gif`` match and de-duplicated.

    Args:
        url: address of the page to scan.

    Returns:
        tuple: ``(img_urls, logs)``; ``img_urls`` is a list of unique image URL
        strings in document order and ``logs`` maps ``url`` to a list of
        messages describing each decision.
    """
    # Local import: the notebook's import cell only brings in urlencode.
    from urllib.parse import urljoin

    logs = {}
    img_urls = []
    # Attributes that may carry the image location, in the order they are tried.
    source_attrs = ('data-srcset', 'data-src', 'src', 'data-lazy-src', 'data-fallback-src')
    unwanted_list = ['google', 'icon', 'logo', 'banner', 'facebook']

    try:
        response = requests.get(url, headers=random_headers())

        soup = BeautifulSoup(response.text, 'html.parser')
        img_tags = soup.find_all('img')

        dict_append_or_create(url, f'{len(img_tags)} found', logs)

        for img in img_tags:
            img_url = next(
                (img[attr] for attr in source_attrs if img.has_attr(attr)), None
            )
            if img_url is None:
                # Last resort: pull anything path-like ending in an image
                # extension out of the raw tag markup.
                fallback = re.search(r'\/\S+?(png|jpg|gif)', str(img))
                if fallback is None:
                    dict_append_or_create(url, f'No soucre found for: {img}', logs)
                    continue
                img_url = fallback.group(0)

            if not img_url.lower().startswith(('http://', 'https://')):
                # urljoin copes with root-relative, page-relative and
                # protocol-relative locations; plain concatenation with the
                # site root only worked for paths starting with a single "/".
                img_url = urljoin(url, img_url)
                dict_append_or_create(url, f'Relative path fixed for {img}', logs)

            good_link = True
            for unwanted in unwanted_list:
                if unwanted in img_url:
                    good_link = False
                    dict_append_or_create(url, f'Filtered {img_url} on {unwanted}', logs)

            if not good_link:
                continue

            re_search = re.search(r'https?:\/\/\S+?(png|jpg|gif)', img_url)
            if re_search and re_search.group(0) not in img_urls:
                img_urls.append(re_search.group(0))
                dict_append_or_create(url, f'{re_search.group(0)} found for {img}', logs)
            else:
                # Logged only for the case the message describes (previously it
                # was emitted for links already rejected by the unwanted list).
                dict_append_or_create(url, f'Filtered {img} on no re match or duplicate', logs)
    except Exception:
        # Best effort: a failed download/parse yields whatever was collected so
        # far. KeyboardInterrupt/SystemExit are no longer swallowed. NOTE:
        # TimeoutException derives from Exception and is also handled here.
        dict_append_or_create(url, f'Failed: Generic, possibly SSL', logs)

    return img_urls, logs

In [None]:
def save_from_url_list(img_urls, save_dir, threshold_dim, filename_prfx):
    """Download images and store those larger than a minimum size.

    Each URL is fetched; responses whose bytes decode as UTF-8 are treated as
    non-image content and skipped. Remaining responses are opened with PIL and
    written unchanged to ``{save_dir}/{filename}.jpg`` when both width and
    height exceed ``threshold_dim``. File names are ``filename_prfx`` followed
    by a 5-digit counter that advances only after a successful save.

    Args:
        img_urls: iterable of image URL strings.
        save_dir: existing directory to write into.
        threshold_dim: minimum width and height (exclusive), in pixels.
        filename_prfx: string prepended to the running counter.

    Returns:
        tuple: ``(filename_dict, logs)``; ``filename_dict`` maps each file name
        that was actually written (without extension) to its source URL, and
        ``logs`` maps each URL to a list of messages.

    Note:
        Files always get a ``.jpg`` extension regardless of the real format of
        the downloaded bytes.
    """
    count = 1
    filename_dict = {}

    logs = {}

    for url in img_urls:

        filename = filename_prfx + str(count).zfill(5)

        try:
            response = requests.get(url, headers=random_headers())
            response_content = response.content

            # Bytes that decode cleanly as UTF-8 are text (e.g. an HTML error
            # page), not image data.
            try:
                str(response_content, 'utf-8')
            except UnicodeDecodeError:
                pass
            else:
                dict_append_or_create(url, f'Unknown try clause enacted', logs)
                continue

            with Image.open(BytesIO(response_content)) as image:
                width, height = image.size

            if width > threshold_dim and height > threshold_dim:
                target = f"{save_dir}/{filename}.jpg"
                with open(target, "wb") as f:
                    f.write(response_content)
                # Record the mapping only once the file is written, so the
                # catalogue never points at a file that does not exist or at a
                # URL that merely reused the same counter value.
                filename_dict[filename] = url
                count += 1
                dict_append_or_create(url, f'Write attempted as {filename}', logs)
                if os.path.exists(target):
                    dict_append_or_create(url, f'Write success as {filename}', logs)
            else:
                dict_append_or_create(url, f'Undersize: {url}', logs)
        except Exception:
            # Best effort per URL; KeyboardInterrupt/SystemExit now propagate.
            # NOTE: TimeoutException derives from Exception, so a time_limit
            # expiry is logged here and the loop carries on with the next URL.
            dict_append_or_create(url, f'Failed to get content: {url}', logs)

    return filename_dict, logs

In [None]:
# Search Google for "plumber <location>" in each city and pool the result links.
locations = ['London UK', 'Manchester UK', 'Birmingham UK','New York US', 
             'Los Angeles US', 'Sydney Australia', 'Toronto CA']

search_results = []
search_logs = {}
for loc in locations:
    query = f'plumber {loc}'
    num_pages = 3
    found_links, found_logs = get_google_results(query, num_pages)
    search_results += found_links
    search_logs.update(found_logs)

# Drop links that came back for more than one query/page.
search_results = list(set(search_results))

In [None]:
# For every search hit, try to locate the site's "about" page: first by crawling
# the site's home page (max. 10 s), then by asking Google if that found nothing.
about_results = []
about_logs = {}
timeouts = []


def _merge_logs(target, new_logs):
    """Extend the message lists in ``target`` instead of replacing them."""
    for key, messages in new_logs.items():
        target.setdefault(key, []).extend(messages)


for url in search_results:
    # Reset per URL so a timeout can never reuse the previous site's result.
    about_link = False
    try:
        with time_limit(10):
            about_link, crawl_logs = find_about(url)
        _merge_logs(about_logs, crawl_logs)
    except TimeoutException:
        timeouts.append(url)

    if not about_link:
        about_link, google_logs = google_search_about(url)
        # Both helpers key their logs by ``url``; merging keeps the crawl
        # messages that a plain dict.update() would overwrite.
        _merge_logs(about_logs, google_logs)

    if about_link:
        about_results.append(about_link)

In [None]:
# Gather image URLs from every search hit and about page (max. 10 s per page).
all_results = list(set(search_results + about_results))

images_logs = {}
images_urls = {}
get_img_timeouts = []
for url in all_results:
    try:
        with time_limit(10):
            page_images, page_logs = get_image_urls(url)
    except TimeoutException:
        get_img_timeouts.append(url)
    else:
        images_urls[url] = page_images
        images_logs.update(page_logs)

In [None]:
# Total number of image URLs collected across all pages.
no_images = sum(len(page_images) for page_images in images_urls.values())

no_images

3815

In [None]:
# Download the images of every page into save_dir (max. 20 s per page).
# File names are "<page counter>_<image counter>"; the page counter advances
# whenever a page produced a save log.
save_logs = {}
catalogue = {}
save_dir = '/content/drive/MyDrive/Projects/Plumbers_with_Vehicles/Images'
threshold_dim = 100
count = 1
save_img_timeouts = []

# Without the target folder every write inside save_from_url_list would fail
# and only show up as a log entry.
os.makedirs(save_dir, exist_ok=True)

for page_url in list(images_urls.keys()):

    save_logs_len = len(save_logs)
    filename_prfx = str(count).zfill(5) + '_'
    try:
        with time_limit(20):
            catalogue[page_url], save_logs[page_url] = save_from_url_list(
                images_urls[page_url], save_dir, threshold_dim, filename_prfx)

    except TimeoutException:
        # Record the page that timed out in this cell's own list (previously a
        # stale ``url`` from an earlier cell went into ``get_img_timeouts``).
        save_img_timeouts.append(page_url)

    if len(save_logs) > save_logs_len:
        count +=1

In [None]:
# Dump the save log: one section per page, one line per image URL.
for page_log, url_messages in save_logs.items():
    print('\n\n', page_log)
    for sublog, messages in url_messages.items():
        print(sublog, messages)

In [None]:
# Persist the filename -> source URL catalogue next to the images and read it
# back to confirm the round trip.
filename = save_dir + '/Catalogue/' + 'catalogue.pickle'

# open(..., 'wb') does not create missing parent folders.
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, 'wb') as handle:
    pickle.dump(catalogue, handle, protocol=pickle.HIGHEST_PROTOCOL)

# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# this notebook wrote itself.
with open(filename, 'rb') as handle:
    test = pickle.load(handle)

print(catalogue == test)

True


In [21]:
# Number of files in the NoVehicle and Vehicle folders, in that order.
save_dir = '/content/drive/MyDrive/Projects/Plumbers_with_Vehicles/Images/'
tuple(len(os.listdir(save_dir + label)) for label in ('NoVehicle', 'Vehicle'))

(2372, 134)

In [30]:
# Trial conversion of one saved image to an RGBA PNG in the Test folder.
source_path = '/content/drive/MyDrive/Projects/Plumbers_with_Vehicles/Images/NoVehicle/00419_00013.jpg'
target_path = '/content/drive/MyDrive/Projects/Plumbers_with_Vehicles/Test/00419_00013.png'

im = Image.open(source_path)
rgba_image = im.convert("RGBA")
rgba_image.save(target_path, format = 'PNG')

In [45]:
# Survey which (file format, colour mode) combinations occur in the Vehicle folder.
types = []
# Named so that it does not shadow the builtin dir().
vehicle_images_dir = '/content/drive/MyDrive/Projects/Plumbers_with_Vehicles/Images/Vehicle/'
for image_name in os.listdir(vehicle_images_dir):
    # The context manager releases each file handle as soon as the header is read.
    with Image.open(vehicle_images_dir + image_name) as im:
        types.append((im.format, im.mode))
set(types)

{('PNG', 'RGBA')}

In [47]:
# Number of files in the additional "plumber_fleet" image folder.
len(os.listdir(os.path.join(
    '/content/drive/MyDrive/Projects/Plumbers_with_Vehicles/Images_Additional',
    'plumber_fleet',
)))

32

In [48]:
# Number of files in the additional "plumber_van" image folder.
len(os.listdir(os.path.join(
    '/content/drive/MyDrive/Projects/Plumbers_with_Vehicles/Images_Additional',
    'plumber_van',
)))

106

In [49]:
# Number of files in the additional "plumber_vehicle" image folder.
len(os.listdir(os.path.join(
    '/content/drive/MyDrive/Projects/Plumbers_with_Vehicles/Images_Additional',
    'plumber_vehicle',
)))

54

In [None]:
# Optional extra scrape: DuckDuckGo image searches for vehicle-related terms.
# Set run_additional to True to download; the folder names below are defined
# either way because the conversion cell that follows relies on them.
run_additional = False

base_dir = '/content/drive/MyDrive/Projects/Plumbers_with_Vehicles/Images_Additional/'
searches = ['plumber van', 'plumber vehicle', 'plumber fleet']
# One sub-folder per search term, e.g. 'plumber van' -> 'plumber_van'.
sub_dirs = [search.replace(' ', '_') for search in searches]

if run_additional:
    from jmd_imagescraper.core import duckduckgo_search

    for search, sub_dir in zip(searches, sub_dirs):
        duckduckgo_search(base_dir, sub_dir, search, max_results=300)

In [56]:
# Convert every additional image to an RGBA PNG inside the Vehicle folder,
# named "<sub_dir>_<5-digit running number>.png".
vehicle_dir = '/content/drive/MyDrive/Projects/Plumbers_with_Vehicles/Images/Vehicle/'
for sub_dir in sub_dirs:
    sub_path = base_dir + sub_dir + '/'
    for position, file in enumerate(os.listdir(sub_path), start=1):
        target_name = f'{vehicle_dir}{sub_dir}_{str(position).zfill(5)}.png'
        im = Image.open(sub_path + file)
        im.convert("RGBA").save(target_name, format = 'PNG')

In [57]:
# Show the file names currently present in the Vehicle image folder.
os.listdir(vehicle_dir)

['00001_00003.png',
 '00007_00005.png',
 '00008_00001.png',
 '00010_00004.png',
 '00037_00001.png',
 '00042_00001.png',
 '00045_00010.png',
 '00047_00002.png',
 '00052_00001.png',
 '00055_00002.png',
 '00055_00005.png',
 '00061_00008.png',
 '00086_00001.png',
 '00086_00002.png',
 '00088_00004.png',
 '00096_00009.png',
 '00097_00003.png',
 '00097_00004.png',
 '00097_00011.png',
 '00098_00013.png',
 '00117_00004.png',
 '00124_00001.png',
 '00133_00012.png',
 '00135_00001.png',
 '00137_00001.png',
 '00144_00001.png',
 '00146_00008.png',
 '00146_00009.png',
 '00153_00001.png',
 '00153_00003.png',
 '00154_00001.png',
 '00155_00001.png',
 '00155_00008.png',
 '00163_00010.png',
 '00169_00002.png',
 '00172_00005.png',
 '00173_00002.png',
 '00178_00001.png',
 '00178_00002.png',
 '00182_00011.png',
 '00182_00012.png',
 '00182_00018.png',
 '00182_00023.png',
 '00182_00024.png',
 '00183_00007.png',
 '00183_00010.png',
 '00196_00001.png',
 '00196_00005.png',
 '00196_00010.png',
 '00207_00001.png',
