<a href="https://colab.research.google.com/github/yavuzim/beautifulsoup/blob/main/beautifulsoup2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [68]:
!pip install requests beautifulsoup4 aiohttp fake-useragent



In [69]:
import requests
from bs4 import BeautifulSoup
import logging
import time
import random
from typing import Optional, List, Dict, Any
from dataclasses import dataclass
from urllib.parse import urljoin
import concurrent.futures
import aiohttp
import asyncio
import nest_asyncio
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter

In [70]:
@dataclass
class ScrapingConfig:
    """Settings shared by every request a scraper issues.

    Raises:
        ValueError: if ``delay_range`` is not a pair of non-negative numbers.
    """

    # Root URL of the target site.
    base_url: str
    # Base request headers; the scraper copies these and adds a User-Agent.
    headers: Dict[str, str]
    # Optional proxy mapping handed to ``requests`` as ``proxies=``.
    proxies: Optional[Dict[str, str]] = None
    # Per-request timeout passed to the HTTP client.
    timeout: int = 30
    # Total retry budget used when building the retrying session.
    max_retries: int = 3
    # Bounds handed to ``random.uniform`` to pick the sleep before a request.
    delay_range: tuple = (1, 3)

    def __post_init__(self):
        """Reject a malformed ``delay_range`` at construction time.

        Without this check a bad range only surfaces inside the fetch path,
        where it is logged as a per-URL fetch error and every page silently
        fails.
        """
        bounds = tuple(self.delay_range)
        if len(bounds) != 2 or any(bound < 0 for bound in bounds):
            raise ValueError(
                "delay_range must be a pair of non-negative numbers, "
                f"got {self.delay_range!r}"
            )

class WebScraper:
    """HTTP scraper with a retrying session, random User-Agents and file logging.

    Offers a blocking API built on ``requests`` and a coroutine API built on
    ``aiohttp``; both hand the downloaded HTML to BeautifulSoup for parsing.
    """

    def __init__(self, config: ScrapingConfig):
        """Store *config* and build the session, logger and User-Agent source."""
        self.config = config
        self.session = self._create_session()
        self.logger = self._setup_logger()
        self.user_agent = UserAgent()

    def _setup_logger(self) -> logging.Logger:
        """Return the shared ``WebScraper`` logger, writing to ``scraping.log``.

        ``logging.getLogger`` returns the same object on every call, so the
        file handler is attached only when the logger has none yet. Otherwise
        each new scraper (e.g. re-running a notebook cell) would add another
        handler and every message would be written multiple times.
        """
        logger = logging.getLogger('WebScraper')
        logger.setLevel(logging.INFO)

        if not logger.handlers:
            handler = logging.FileHandler('scraping.log')
            handler.setFormatter(
                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            )
            logger.addHandler(handler)

        return logger

    def _create_session(self) -> requests.Session:
        """Create a ``requests`` session that retries on 5xx responses."""
        # Imported locally: the file's import block never brings ``Retry``
        # into scope. urllib3 is a hard dependency of requests, so it is
        # always installed alongside it.
        from urllib3.util.retry import Retry

        session = requests.Session()

        retry_strategy = Retry(
            total=self.config.max_retries,
            backoff_factor=0.5,
            status_forcelist=[500, 502, 503, 504],
        )

        # One adapter serves both schemes so the retry policy is identical.
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        return session

    def _get_headers(self) -> Dict[str, str]:
        """Return a copy of the configured headers with a random User-Agent."""
        headers = self.config.headers.copy()
        headers['User-Agent'] = self.user_agent.random
        return headers

    def _random_delay(self):
        """Sleep for a random duration drawn from ``config.delay_range``."""
        time.sleep(random.uniform(*self.config.delay_range))

    def get_page_content(self, url: str) -> Optional[BeautifulSoup]:
        """Fetch *url* synchronously and parse it.

        Returns:
            The parsed document, or ``None`` when the request fails or the
            server answers with an error status (the error is logged).
        """
        try:
            self._random_delay()

            response = self.session.get(
                url,
                headers=self._get_headers(),
                proxies=self.config.proxies,
                timeout=self.config.timeout,
            )
            response.raise_for_status()

            return BeautifulSoup(response.content, 'html.parser')

        except Exception as e:
            # Best-effort boundary: one bad page must not abort a batch.
            self.logger.error(f"Error fetching {url}: {str(e)}")
            return None

    async def async_get_page(self, url: str) -> Optional[str]:
        """Fetch *url* with aiohttp and return the response body as text.

        Mirrors ``get_page_content``: the configured timeout is applied and
        HTTP error statuses count as failures instead of being returned as
        page content.

        Returns:
            The decoded body, or ``None`` on any failure (the error is logged).
        """
        # NOTE: ``config.proxies`` and the random delay are not applied on
        # this path.
        try:
            timeout = aiohttp.ClientTimeout(total=self.config.timeout)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(
                    url,
                    headers=self._get_headers(),
                ) as response:
                    response.raise_for_status()
                    return await response.text()

        except Exception as e:
            self.logger.error(f"Error in async fetch {url}: {str(e)}")
            return None

    def parse_content(self, soup: BeautifulSoup, selectors: Dict[str, str]) -> Dict[str, Any]:
        """Extract the text of the first match of each CSS selector.

        Args:
            soup: Parsed document to query.
            selectors: Mapping of result key to CSS selector.

        Returns:
            A dict with every key of *selectors*; the value is the stripped
            text of the first matching element, or ``None`` when nothing
            matches or the lookup raises.
        """
        result = {}

        for key, selector in selectors.items():
            try:
                element = soup.select_one(selector)
                if element is not None:
                    result[key] = element.get_text(strip=True)
                else:
                    result[key] = None
                    self.logger.warning(f"Selector '{selector}' not found for key '{key}'")

            except Exception as e:
                self.logger.error(f"Error parsing {key}: {str(e)}")
                result[key] = None

        return result

    def scrape_multiple_pages(self, urls: List[str], selectors: Dict[str, str]) -> List[Dict[str, Any]]:
        """Fetch *urls* in a thread pool and parse each page.

        Returns:
            One dict per successfully fetched page (the ``parse_content``
            result plus a ``'url'`` key), in completion order rather than
            input order.
        """
        results = []

        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_url = {
                executor.submit(self.get_page_content, url): url
                for url in urls
            }

            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    soup = future.result()
                    if soup is not None:
                        data = self.parse_content(soup, selectors)
                        data['url'] = url
                        results.append(data)

                except Exception as e:
                    self.logger.error(f"Error processing {url}: {str(e)}")

        return results

    async def async_scrape_multiple_pages(self, urls: List[str], selectors: Dict[str, str]) -> List[Dict[str, Any]]:
        """Fetch *urls* concurrently with asyncio and parse each page.

        Returns:
            One dict per successfully fetched page (the ``parse_content``
            result plus a ``'url'`` key), in the order of *urls*.
        """
        tasks = [self.async_get_page(url) for url in urls]
        # gather() preserves input order, so pages line up with urls below.
        pages = await asyncio.gather(*tasks)

        results = []
        for url, html in zip(urls, pages):
            # Only a failed fetch (None) is skipped.
            if html is not None:
                soup = BeautifulSoup(html, 'html.parser')
                data = self.parse_content(soup, selectors)
                data['url'] = url
                results.append(data)

        return results


In [72]:
# Asynchronous multi-page scraping
async def main():
    """Scrape every URL in ``urls`` concurrently and return the results.

    Reads the module-level ``scraper``, ``urls`` and ``selectors`` that the
    ``__main__`` section below defines, so it must be run after them.

    Returns:
        Whatever ``scraper.async_scrape_multiple_pages`` returns for them.
    """
    return await scraper.async_scrape_multiple_pages(urls, selectors)


def extract_articles(soup: BeautifulSoup, selectors: Dict[str, str]) -> List[Dict[str, str]]:
    """Collect every article on a listing page.

    Args:
        soup: Parsed listing page.
        selectors: CSS selectors under the keys ``'gundem'``, ``'baslik'``
            and ``'ozet'``.

    Returns:
        One dict per article with the stripped text of each field. The three
        match lists are paired positionally and truncated to the shortest.
    """
    # NOTE(review): positional pairing assumes every article has all three
    # elements; a post missing one would shift later rows - confirm against
    # the site markup.
    columns = [soup.select(selectors[key]) for key in ('gundem', 'baslik', 'ozet')]
    return [
        {
            'gundem': gundem.text.strip(),
            'baslik': baslik.text.strip(),
            'ozet': ozet.text.strip(),
        }
        for gundem, baslik, ozet in zip(*columns)
    ]


if __name__ == "__main__":
    config = ScrapingConfig(
        base_url="https://example.com",
        headers={
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        }
    )

    scraper = WebScraper(config)
    number = 1

    # Single-page scraping
    url = "https://www.dunyahalleri.com/category/teknoloji-bilim/page/"+str(number)

    selectors = {
        'gundem': 'div[class*="taxonomy-category"] a',
        'baslik': 'h2.wp-block-post-title',
        'ozet': 'div[class*="wp-block-post-excerpt"] p'
    }

    soup = scraper.get_page_content(url)
    if soup is not None:
        # Pull every article on the page, not just the first match.
        data = extract_articles(soup, selectors)
        print("data : ", data)

    # Multi-page scraping
    urls = [
        "https://www.dunyahalleri.com/category/haftanin-ozeti/page/"+str(number),
        "https://www.dunyahalleri.com/category/genel-gundem/page/"+str(number),
        "https://www.dunyahalleri.com/category/teknoloji-bilim/page/"+str(number),
        "https://www.dunyahalleri.com/category/internet-girisimler/page/"+str(number),
        "https://www.dunyahalleri.com/category/tasarim-inovasyon/page/"+str(number),
        "https://www.dunyahalleri.com/category/kultur-sanat/page/"+str(number),
    ]

    # Synchronous multi-page scraping, one page at a time
    for url in urls:
        print("url : ",url)
        soup = scraper.get_page_content(url)
        if soup is not None:
            gundem_count = len(soup.select(selectors['gundem']))
            print(f"gundem öğeleri sayısı: {gundem_count}")
            results = extract_articles(soup, selectors)
            print("->>> ",results)

    # Asynchronous multi-page scraping. A notebook already runs an event
    # loop, so asyncio.run() needs nest_asyncio's patch to be re-entrant.
    nest_asyncio.apply()
    async_results = asyncio.run(main())
    print("async results : ", async_results)


data :  [{'gundem': 'Teknoloji / Bilim', 'baslik': 'Yapay zeka, işlevini kaybedecek böbreği iki gün önceden belirleyebiliyor', 'ozet': 'DeepMind adlı yapay zeka şirketi, akut böbrek yetmezliğini 48 saat önceden tahmin edebilen bir yapay zeka sistemi üzerinde çalışıyor.'}, {'gundem': 'Teknoloji / Bilim', 'baslik': 'Hoverboard’la Manş Denizi’ni geçme denemesi tekrarlanacak', 'ozet': 'Geçen hafta hoverboard üzerinde Manş Denizi’ni geçmeye çalışıp başarısız olan Franky Zapata, şansını bir kez daha deneyecek.'}, {'gundem': 'Teknoloji / Bilim', 'baslik': 'Capital One bankası 106 milyon müşterisinin verilerini çaldırdı', 'ozet': 'ABD’nin en büyük bankalarından Capital One, 22-23 Mart tarihlerinde bankada büyük çaplı bir veri ihlali yaşandığını açıkladı.'}, {'gundem': 'Teknoloji / Bilim', 'baslik': 'Fransa uydularına lazerli savunma sistemleri yerleştirmek istiyor', 'ozet': 'Uzay lazerlerini öz savunma amaçlı kullanmak istediğini açıklayan Fransa, uydularına silah ve lazerler ekleyecek.'}, {'g