In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Download the Wikipedia page whose table is scraped below.
wikipage = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_by_continent_(data_file)"
# A timeout keeps the cell from hanging forever on a stalled connection.
result = requests.get(wikipage, timeout=30)

# Fail loudly on an HTTP error. Previously a non-200 response silently skipped
# the parsing step, so the next line crashed with a NameError on `soup` (or
# reused a stale `soup` left in the kernel by an earlier run).
result.raise_for_status()

# Parse the download into a BeautifulSoup object, which allows easy manipulation.
soup = BeautifulSoup(result.content, "html.parser")

# Find the first table carrying the HTML class "wikitable sortable".
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    raise ValueError(f"No 'wikitable sortable' table found on {wikipage}")

# Skip the first row (assumed to be the header) and collect the text of every
# <td> cell of each remaining row.
new_table = []
for row in table.find_all('tr')[1:]:
    columns = row.find_all('td')
    new_table.append([column.get_text() for column in columns])
    

In [3]:

# Turn the scraped rows into a labelled DataFrame and strip the newline
# characters from the Name column, then display the frame.
df = (
    pd.DataFrame(new_table, columns=['ContinentCode','Alpha2','Alpha3','PhoneCode','Name'])
    .assign(Name=lambda frame: frame['Name'].str.replace('\n',''))
)
df

Unnamed: 0,ContinentCode,Alpha2,Alpha3,PhoneCode,Name
0,AS,AF,AFG,004,"Afghanistan, Islamic Republic of"
1,EU,AL,ALB,008,"Albania, Republic of"
2,AN,AQ,ATA,010,Antarctica (the territory South of 60 deg S)
3,AF,DZ,DZA,012,"Algeria, People's Democratic Republic of"
4,OC,AS,ASM,016,American Samoa
...,...,...,...,...,...
257,AS,YE,YEM,887,Yemen
258,AF,ZM,ZMB,894,"Zambia, Republic of"
259,AS,XD,,,United Nations Neutral Zone
260,AS,XS,,,Spratly Islands


In [4]:
# Alternative to the manual scrape: let pandas parse the HTML tables found at
# the same URL. The result is a list of DataFrames, displayed below.
res = pd.read_html(io=wikipage)
res

[                0                                                  1
 0  Type of format                               Document file format
 1        Standard  ISO 3166-1 alpha-2; ISO 3166-1 alpha-3; ISO 31...,
                                                    0  \
 0  .mw-parser-output .hidden-begin{box-sizing:bor...   
 1  .mw-parser-output .hidden-begin{box-sizing:bor...   
 
                                    1                                    2  \
 0                       Country code                         Country code   
 1  Two-letterISO 3166-1 alpha-2(a-2)  Three-letterISO 3166-1 alpha-3(a-3)   
 
                                    3                            4  
 0                     Country number  English country name (Name)  
 1  Three-digitISO 3166-1 numeric (#)  English country name (Name)  ,
      CC a-2  a-3      #                                          Name
 0    AS  AF  AFG    4.0              Afghanistan, Islamic Republic of
 1    EU  AL  ALB    8.0      

In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

import os

# Location of the chromedriver binary. It can be overridden with the
# CHROMEDRIVER_PATH environment variable so the notebook is not tied to a
# single machine; the original hard-coded path remains the default.
DRIVER_PATH = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"C:\Users\surya\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe",
)
service = Service(executable_path=DRIVER_PATH)
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

# Smoke test: open Google and type a query into the search box.
driver.get('https://google.com')
# find_element returns the first match and raises NoSuchElementException when
# the box is missing, instead of the bare IndexError from find_elements(...)[0].
search_box = driver.find_element(By.CSS_SELECTOR, 'textarea.gLFyf')
search_box.send_keys('Dogs')
# The browser stays open after this cell; call driver.quit() to close it.
# driver.quit()

In [19]:
import time
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:"webdriver.Remote", sleep_between_interactions:float=1):
    """Collect image URLs from a Google Images search driven through Selenium.

    The search page is loaded, scrolled to the bottom, and every thumbnail not
    yet visited is clicked; after each click the ``src`` of every matching
    ``<img>`` that contains ``http`` is added to the result set.

    Args:
        query: Search phrase. It is URL-encoded before being placed in the
            search URL, so spaces and characters such as ``&`` are safe.
        max_links_to_fetch: Stop once at least this many distinct URLs have
            been collected.
        wd: Selenium WebDriver used to load and interact with the page.
        sleep_between_interactions: Seconds to wait after each scroll and
            after each thumbnail click.

    Returns:
        A set of image URL strings. It may hold fewer than
        ``max_links_to_fetch`` entries when the page stops producing new
        thumbnails, and more when a single click reveals several URLs.
    """
    from urllib.parse import quote_plus

    print('fetching starts')

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page (the query is encoded so it cannot break the URL)
    wd.get(search_url.format(q=quote_plus(query)))

    image_urls = set()
    results_start = 0
    # Scrolling again when no new thumbnails appear used to spin forever.
    # Give the page a few chances to load more, then give up.
    stalled_scrolls = 0
    max_stalled_scrolls = 3
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        # NOTE(review): these CSS class names are specific to Google's markup
        # and presumably change over time - verify against the live page.
        thumbnail_results = wd.find_elements(By.CSS_SELECTOR, "img.rg_i.Q4LuWd")
        number_results = len(thumbnail_results)

        if number_results <= results_start:
            stalled_scrolls += 1
            if stalled_scrolls >= max_stalled_scrolls:
                print(f"No new search results after {stalled_scrolls} scrolls, "
                      f"stopping with {len(image_urls)} image links.")
                break
            continue
        stalled_scrolls = 0

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls; read `src` once per element because every
            # get_attribute call is a round trip to the browser
            for actual_image in wd.find_elements(By.CSS_SELECTOR, "img.Q4LuWd"):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
            else:
                print("Found:", len(image_urls), "image links, looking for more ...")
                # fixed pause, independent of sleep_between_interactions
                time.sleep(1)

        # move the result startpoint further down
        results_start = number_results

    print(image_urls)
    return image_urls

In [20]:
import hashlib
import io
import os
from PIL import Image

# Raw bytes of the most recent download attempt (None when it failed). Kept at
# module level so it can be inspected from other cells after a call.
image_content=None
def persist_image(folder_path:str,url:str):
    """Download one image and store it as a JPEG inside ``folder_path``.

    The file name is the first 10 hex characters of the SHA-1 of the
    downloaded bytes plus ``.jpg``. Failures are reported with ``print`` and
    never raised.

    Args:
        folder_path: Existing directory that receives the file.
        url: Address of the image to download.

    Returns:
        None.
    """
    global image_content
    # Reset before every attempt. Without this, a failed download left the
    # bytes of the PREVIOUS call in the global, which were then saved again
    # and reported as a success for this URL.
    image_content = None
    try:
        # Timeout so one dead host cannot hang the whole batch; HTTP error
        # responses are treated as failed downloads.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        return

    if not image_content:
        print(f"ERROR - Could not download {url} - empty response")
        return

    try:
        image_file = io.BytesIO(image_content)
        # Convert to RGB because JPEG cannot store an alpha channel or palette.
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")


In [21]:
import os
def search_and_download(search_term:str,driver_path:str,target_path='./images',number_images=5):
    """Search for images of ``search_term`` and save them below ``target_path``.

    Args:
        search_term: Query passed to ``fetch_image_urls``. Lower-cased with
            spaces replaced by underscores, it also names the sub-folder.
        driver_path: Path of the chromedriver executable to launch.
        target_path: Parent directory for the per-term sub-folder.
        number_images: Number of image links requested from
            ``fetch_image_urls``.

    Returns:
        The collection of URLs returned by ``fetch_image_urls``.
    """
    target_folder = os.path.join(target_path,'_'.join(search_term.lower().split(' ')))

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(target_folder, exist_ok=True)

    # Use the caller-supplied path; the argument used to be ignored in favour
    # of the module-level DRIVER_PATH.
    service = Service(executable_path=driver_path)
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=service, options=options)

    # NOTE(review): the result is also published as the module-level `res`,
    # which overwrites any other notebook variable of that name. Kept for
    # backward compatibility; prefer the return value.
    global res
    # The driver is used as a context manager so it is released when the block exits.
    with driver as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.01)

    for elem in res:
        persist_image(target_folder,elem)

    return res

In [26]:
# Number of image links requested per person.
number_images=150
billionaires = [
    "Elon Musk",
    "Jeff Bezos",
    "Bernard Arnault",
    "Gautam Adani",
    "Bill Gates",
    "Mukesh Ambani",
    "Mark Zuckerberg",
    "Warren Buffett",
]

# Explicit switch replacing the bare `continue` that used to be the first
# statement of the loop and made the download call unreachable. Left off so
# that re-running the notebook does not start the scrape; set it to True to
# download the images.
RUN_DOWNLOADS = False

if RUN_DOWNLOADS:
    for search_term in billionaires:
        search_and_download(
            search_term=search_term+' face',
            driver_path=DRIVER_PATH,
            number_images=number_images)

fetching starts
Found: 100 search results. Extracting links from 0:100
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, looking for more ...
Found: 9 image links, look