In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import csv
import time
import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")

# Output file
filename = "yellowpages_banks_uz.csv"

# Listing URL template; the placeholder is the 1-based page number.
base_url = "https://www.yellowpages.uz/en/rubric/banks?pagenumber={}&pagesize=100"


def _extract_address(card):
    """Return the text following "Address:" in a listing card, or "" if absent.

    A <p> whose only child is the address string is tried first; otherwise
    every <p> in the card is scanned by its full text, which also covers
    paragraphs that mix the label with nested tags.
    """
    # `string=` is the current name of the deprecated `text=` keyword, which
    # emitted a DeprecationWarning on every card.
    address_tag = card.find("p", string=lambda s: s and "Address:" in s)
    if address_tag:
        return address_tag.text.replace("Address:", "").strip()
    for paragraph in card.find_all("p"):
        if "Address:" in paragraph.text:
            return paragraph.text.replace("Address:", "").strip()
    return ""


def _parse_bank_card(card):
    """Build one CSV row from a `div.custom-card` element.

    Returns:
        list: [bank_name, region, address, website, link]; any field that is
        missing from the card is an empty string.
    """
    # Bank name and profile link both come from the card heading anchor.
    name_tag = card.select_one("h4 a")
    bank_name = name_tag.text.strip() if name_tag else ""
    href = name_tag.get("href") if name_tag else None
    link = "https://www.yellowpages.uz" + href if href else ""

    # Website (rel="nofollow noindex")
    website_tag = card.select_one('a[rel="nofollow noindex"]')
    website = website_tag.text.strip() if website_tag else ""

    # Region is the first address link on the card.
    region_tags = card.select("a.addr_link")
    region = region_tags[0].text.strip() if region_tags else ""

    return [bank_name, region, _extract_address(card), website, link]


# Setup Chrome options
options = Options()
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0")
# options.add_argument("--headless")  # Uncomment if you don't want a browser window
driver = webdriver.Chrome(options=options)

try:
    # The context manager closes the CSV even if a page fails mid-scrape; the
    # outer `finally` makes sure the browser is shut down even if the file
    # cannot be opened at all.
    with open(filename, "w", newline="", encoding="utf-8-sig") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Bank Name", "Region", "Address", "Website", "Page Link"])

        # Loop through pages
        for page_number in range(1, 8):  # Pages 1 to 7
            url = base_url.format(page_number)
            logging.info(f"Scraping page {page_number}: {url}")
            driver.get(url)
            time.sleep(5)  # Wait for full content load

            soup = BeautifulSoup(driver.page_source, "html.parser")
            bank_cards = soup.select("div.custom-card")

            for card in bank_cards:
                writer.writerow(_parse_bank_card(card))

            logging.info(f"✅ Page {page_number} done. Banks scraped: {len(bank_cards)}")
            time.sleep(3)

    # Reached only when every page was processed, so the message is no longer
    # printed after a failure or an interrupt.
    logging.info(f"✅ Scraping complete. File saved: {filename}")
finally:
    driver.quit()


2025-07-02 17:00:05,041 - Scraping page 1: https://www.yellowpages.uz/en/rubric/banks?pagenumber=1&pagesize=100
  address_tag = card.find("p", text=lambda x: x and "Address:" in x)
2025-07-02 17:00:18,179 - ✅ Page 1 done. Banks scraped: 106
2025-07-02 17:00:21,181 - Scraping page 2: https://www.yellowpages.uz/en/rubric/banks?pagenumber=2&pagesize=100
2025-07-02 17:00:33,530 - ✅ Page 2 done. Banks scraped: 106
2025-07-02 17:00:36,531 - Scraping page 3: https://www.yellowpages.uz/en/rubric/banks?pagenumber=3&pagesize=100
2025-07-02 17:00:45,833 - ✅ Page 3 done. Banks scraped: 106
2025-07-02 17:00:48,834 - Scraping page 4: https://www.yellowpages.uz/en/rubric/banks?pagenumber=4&pagesize=100
2025-07-02 17:00:59,269 - ✅ Page 4 done. Banks scraped: 106
2025-07-02 17:01:02,271 - Scraping page 5: https://www.yellowpages.uz/en/rubric/banks?pagenumber=5&pagesize=100
2025-07-02 17:01:13,375 - ✅ Page 5 done. Banks scraped: 106
2025-07-02 17:01:16,377 - Scraping page 6: https://www.yellowpages.uz/e

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import csv
import time
import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")

# Setup Chrome
options = Options()
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0")
# options.add_argument("--headless")  # Optional
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(options=options)

# Output CSV. utf-8-sig writes a BOM at the start of the file.
filename = "yellowpages_banks_detailed.csv"
try:
    csvfile = open(filename, "w", newline="", encoding="utf-8-sig")
except OSError:
    # The browser is already running at this point; without this guard a
    # failing open() (e.g. the CSV is locked by another program) would leave
    # an orphaned Chrome process behind.
    driver.quit()
    raise
writer = csv.writer(csvfile)
writer.writerow([
    "Bank Name", "Brand Name", "Region", "Address", "Website", "Page Link",
    "Phones", "Detail Page Website", "Telegram", "Facebook", "Instagram"
])

# Listing URL template; the placeholder is the 1-based page number.
base_url = "https://www.yellowpages.uz/en/rubric/banks?pagenumber={}&pagesize=100"

def extract_social_link(soup, label):
    """Return the link text found under the <h3> heading that mentions `label`.

    Args:
        soup: Parsed detail page (anything exposing ``find_all("h3")``).
        label: Heading text to look for, e.g. "Website" or "Telegram".

    Returns:
        str: Stripped text of the first ``<a>`` inside the first matching
        heading that contains one, or "" when no such heading/link exists.

    Headings are matched on their full text rather than with
    ``find("h3", string=...)``: that filter compares against ``tag.string``,
    which is ``None`` as soon as the heading holds more than one child
    (label text plus an ``<a>``), so such headings could never match.
    """
    for heading in soup.find_all("h3"):
        if label not in heading.get_text():
            continue
        anchor = heading.find("a")
        if anchor is not None:
            return anchor.text.strip()
        # Heading mentions the label but carries no link; keep looking.
    return ""

def _parse_listing_card(card_soup):
    """Extract the listing-level fields from one standalone card soup.

    Returns:
        tuple: (bank_name, brand_name, region, address, website, page_link);
        any field missing from the card is an empty string.
    """
    # Bank Name and Page Link
    name_tag = card_soup.select_one("h4 a")
    bank_name = name_tag.text.strip() if name_tag else ""
    # .get() avoids a KeyError (which would abort the whole run) when the
    # heading anchor has no href.
    href = name_tag.get("href") if name_tag else None
    page_link = "https://www.yellowpages.uz" + href if href else ""

    # Brand Name
    # NOTE(review): string= only matches a <p> whose sole child is the label
    # text; if the label shares the <p> with nested tags this never matches --
    # confirm against the live markup.
    brand_tag = card_soup.find("p", string=lambda x: x and "Brand name:" in x)
    brand_span = brand_tag.find_next("span") if brand_tag else None
    brand_name = brand_span.text.strip() if brand_span else ""

    # Region
    region_tags = card_soup.select("a.addr_link")
    region = region_tags[0].text.strip() if region_tags else ""

    # Address
    address = ""
    for paragraph in card_soup.find_all("p"):
        if "Address:" in paragraph.text:
            address = paragraph.text.replace("Address:", "").strip()
            break

    # Website on card
    website_tag = card_soup.select_one('a[rel="nofollow noindex"]')
    website = website_tag.text.strip() if website_tag else ""

    return bank_name, brand_name, region, address, website, page_link


def _scrape_detail_page(page_link, bank_name):
    """Visit a bank's detail page with the shared `driver` and collect contacts.

    Best effort: any failure is logged with the exception type and the fields
    gathered so far are returned, so one bad page never stops the run.

    Returns:
        tuple: (phones, detail_website, telegram, facebook, instagram), each
        "" when unavailable. `phones` is a "; "-joined string.
    """
    phones = detail_website = telegram = facebook = instagram = ""

    if not page_link:
        # Nothing to visit; driver.get("") would only raise.
        logging.warning(f"⚠️ No detail page link for: {bank_name}")
        return phones, detail_website, telegram, facebook, instagram

    try:
        driver.get(page_link)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.ID, "contacts"))
        )
        time.sleep(2)

        # Click the "Call" button to reveal phone numbers
        phones_revealed = False
        try:
            call_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Call")]'))
            )
            # JS click rather than call_btn.click().
            driver.execute_script("arguments[0].click();", call_btn)
            time.sleep(1)
            phones_revealed = True
        except Exception as exc:
            logging.warning(
                f"⚠️ Could not click/reveal phones for: {bank_name} ({type(exc).__name__})"
            )

        # Parse once, after the click attempt, and reuse for phones and links.
        detail_soup = BeautifulSoup(driver.page_source, "html.parser")
        if phones_revealed:
            phone_links = detail_soup.select("ul li a[href^='tel']")
            phones = "; ".join(a.text.strip() for a in phone_links if a.text.strip())

        detail_website = extract_social_link(detail_soup, "Website")
        telegram = extract_social_link(detail_soup, "Telegram")
        facebook = extract_social_link(detail_soup, "Facebook")
        instagram = extract_social_link(detail_soup, "Instagram")

    except Exception as exc:
        logging.warning(
            f"⚠️ Could not open detail page for: {bank_name} ({type(exc).__name__})"
        )

    return phones, detail_website, telegram, facebook, instagram


scrape_completed = False
try:
    for page_number in range(1, 8):
        url = base_url.format(page_number)
        logging.info(f"📄 Scraping page {page_number} → {url}")
        driver.get(url)
        WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "custom-card"))
        )
        time.sleep(3)

        # The listing is parsed up front, so navigating away to detail pages
        # below does not invalidate the cards being iterated.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        card_html_blocks = soup.select("div.custom-card")

        for card_block in card_html_blocks:
            # Re-parse the card on its own so document-order searches such as
            # find_next() cannot run past the card into the following ones.
            card_soup = BeautifulSoup(str(card_block), "html.parser")
            bank_name, brand_name, region, address, website, page_link = (
                _parse_listing_card(card_soup)
            )

            # === Detail Page Visit ===
            phones, detail_website, telegram, facebook, instagram = (
                _scrape_detail_page(page_link, bank_name)
            )

            writer.writerow([
                bank_name, brand_name, region, address, website, page_link,
                phones, detail_website, telegram, facebook, instagram
            ])
            # The run takes several seconds per bank; flushing per row keeps
            # finished rows on disk if the kernel dies mid-run.
            csvfile.flush()
            logging.info(f"✅ Done: {bank_name}")

        time.sleep(2)

    scrape_completed = True

finally:
    csvfile.close()
    driver.quit()
    if scrape_completed:
        logging.info(f"✅ All scraping complete. File saved to: {filename}")
    else:
        # Reached on errors and on KeyboardInterrupt: do not claim success.
        logging.warning(f"⚠️ Scraping stopped early. Partial results saved to: {filename}")


2025-07-02 17:33:34,126 - 📄 Scraping page 1 → https://www.yellowpages.uz/en/rubric/banks?pagenumber=1&pagesize=100
2025-07-02 17:34:03,637 - ⚠️ Could not open detail page for: 1. Garant Bank" head office pro
2025-07-02 17:34:03,639 - ✅ Done: 1. Garant Bank" head office pro
2025-07-02 17:34:08,436 - ✅ Done: 2. AGROBANK STOCK COMMERCE BANK BUKHARA DEPARTMENT OF BUKHARA AREA
2025-07-02 17:34:12,981 - ✅ Done: 3. AGROBANK STOCK COMMERCE BANK STATE DEPARMENT OF DZHIZAK AREA
2025-07-02 17:34:17,025 - ✅ Done: 4. AGROBANK STOCK COMMERCE BANK STATE DEPARTMENT OF KASHKADARYA AREA
2025-07-02 17:34:21,437 - ✅ Done: 5. AGROBANK STOCK COMMERCE BANK STATE DEPARTMENT OF REOUBLIC OF KARAKALPAKSTAN
2025-07-02 17:34:26,062 - ✅ Done: 6. AGROBANK STOCK COMMERCE BANK STATE DEPARTMENT OF REOUBLIC OF UZBEKISTAN
2025-07-02 17:34:30,104 - ✅ Done: 7. AGROBANK STOCK COMMERCE BANK STATE DEARTMENT OF SIRDARYA AREA
2025-07-02 17:34:35,636 - ✅ Done: 8. AGROBANK STOCK COMMERCE BANK STATE DEPARTMENT OF FERGANA AREA
2025

KeyboardInterrupt: 