In [None]:
import requests
from bs4 import BeautifulSoup
from lxml import html
import csv
import time
import re


# Request headers that make the scraper look like a regular desktop browser.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
    # "br" is deliberately not advertised: requests only decodes Brotli bodies
    # when an optional brotli package is installed, otherwise the HTML would
    # reach the parser still compressed.
    'Accept-Encoding': 'gzip, deflate',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Referer': 'https://www.imdb.com/',
    'Connection': 'keep-alive',
}

# Search URL template; ``{}`` receives the ``start`` offset (the parsers pass
# 1, 101, 201, ...). ``count`` is 100 so that it equals the step between two
# consecutive ``start`` values and the requested windows do not overlap.
BASE_URL = "https://www.imdb.com/search/title/?groups=top_1000&count=100&start={}"


def clean_title(title_text):
    """Remove a leading list number such as ``"12. "`` and trim the result.

    Args:
        title_text: raw title string, possibly starting with digits, a dot
            and optional whitespace.

    Returns:
        The title without that numeric prefix and without surrounding
        whitespace.
    """
    # The pattern is anchored at the start, so at most one prefix can match.
    prefix = re.match(r'^\d+\.\s*', title_text)
    remainder = title_text[prefix.end():] if prefix else title_text
    return remainder.strip()

def parse_with_beautifulsoup():
    """Scrape the IMDb "top 1000" search listing using BeautifulSoup.

    Requests ten listing pages (``start`` = 1, 101, ..., 901), sleeping one
    second after each page, and builds one record per list item found.

    Returns:
        list[dict]: one dict per list item with the string fields ``title``,
        ``year``, ``duration`` and ``rating``. A field whose element is not
        present in the markup is stored as an empty string.

    Raises:
        requests.HTTPError: if a listing page answers with a 4xx/5xx status.
    """
    def text_of(node):
        # ``find`` returns None when nothing matches; map that to ''.
        return node.text.strip() if node is not None else ''

    page_count = 10
    page_step = 100
    all_movies = []
    for page in range(page_count):
        start = page * page_step + 1
        url = BASE_URL.format(start)
        print(f"BeautifulSoup: обработка страницы {page + 1}/{page_count} (start={start})")

        response = requests.get(url, headers=HEADERS, timeout=10)
        # Fail loudly instead of silently parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        movie_containers = soup.find_all('li', class_=lambda x: x and 'ipc-metadata-list-summary-item' in x)

        for movie in movie_containers:
            # Match on the stable class tokens only; the "sc-... cCsint" parts
            # of the class attribute look like generated hashes.
            title_node = movie.find('h3', class_='ipc-title__text')
            rating_node = movie.find('span', class_='ipc-rating-star--rating')
            # Year and duration share one CSS class, so they can only be told
            # apart by position: first item is the year, second the duration.
            metadata = movie.find_all('span', class_='dli-title-metadata-item')
            year_node = metadata[0] if len(metadata) > 0 else None
            duration_node = metadata[1] if len(metadata) > 1 else None

            all_movies.append({
                'title': clean_title(text_of(title_node)),
                'year': text_of(year_node),
                'duration': text_of(duration_node),
                'rating': text_of(rating_node)
            })

        # Be polite to the server between page requests.
        time.sleep(1)

    return all_movies

def parse_with_lxml():
    """Scrape the IMDb "top 1000" search listing using lxml and XPath.

    Requests ten listing pages (``start`` = 1, 101, ..., 901), sleeping one
    second after each page, and builds one record per list item found.

    Returns:
        list[dict]: one dict per list item with the string fields ``title``,
        ``year``, ``duration`` and ``rating``. A field whose element is not
        present in the markup is stored as an empty string.

    Raises:
        requests.HTTPError: if a listing page answers with a 4xx/5xx status.
    """
    def has_class(name):
        # XPath 1.0 predicate: the class attribute contains ``name`` as a
        # whole whitespace-separated token.
        return f'contains(concat(" ", normalize-space(@class), " "), " {name} ")'

    # Substring test on the <li> class, mirroring the BeautifulSoup parser; an
    # exact ``@class = "..."`` comparison fails as soon as the element carries
    # any additional class.
    item_xpath = '//li[contains(@class, "ipc-metadata-list-summary-item")]'
    # ``string(node-set)`` yields the text of the first matching node, or ''
    # when nothing matches, so a missing field cannot raise IndexError.
    metadata_path = f'.//span[{has_class("dli-title-metadata-item")}]'
    title_xpath = f'string(.//h3[{has_class("ipc-title__text")}])'
    year_xpath = f'string(({metadata_path})[1])'
    duration_xpath = f'string(({metadata_path})[2])'
    rating_xpath = f'string(.//span[{has_class("ipc-rating-star--rating")}])'

    page_count = 10
    page_step = 100
    all_movies = []
    for page in range(page_count):
        start = page * page_step + 1
        url = BASE_URL.format(start)

        response = requests.get(url, headers=HEADERS, timeout=10)
        # Fail loudly instead of silently parsing an error page.
        response.raise_for_status()
        tree = html.fromstring(response.content)

        for movie in tree.xpath(item_xpath):
            all_movies.append({
                'title': clean_title(movie.xpath(title_xpath)),
                'year': movie.xpath(year_xpath).strip(),
                'duration': movie.xpath(duration_xpath).strip(),
                'rating': movie.xpath(rating_xpath).strip()
            })

        # Be polite to the server between page requests.
        time.sleep(1)

    return all_movies

def save_to_csv(movies, filename):
    """Write movie records to a UTF-8 CSV file and print a confirmation.

    Args:
        movies: iterable of dicts keyed by ``title``, ``year``, ``duration``
            and ``rating``.
        filename: path of the CSV file; it is created or overwritten.
    """
    columns = ['title', 'year', 'duration', 'rating']
    # newline='' lets the csv module control the line endings itself.
    with open(filename, mode='w', newline='', encoding='utf-8') as out:
        table = csv.DictWriter(out, fieldnames=columns)
        table.writeheader()
        table.writerows(movies)
    print(f"Данные сохранены в {filename}")


if __name__ == "__main__":
    # The BeautifulSoup run is currently disabled; uncomment to enable it.
    # print("=== Парсинг с помощью BeautifulSoup ===")
    # movies_bs4 = parse_with_beautifulsoup()
    # save_to_csv(movies_bs4, 'imdb_top1000_bs4.csv')

    # Scrape with the lxml parser and store the result as CSV.
    print("\n=== Парсинг с помощью lxml ===")
    movies_lxml = parse_with_lxml()
    save_to_csv(movies_lxml, 'imdb_top1000_lxml.csv')

    # Optional summary of how many movies each parser collected (disabled).
    # print(f"\n Собрано фильмов (BS4): {len(movies_bs4)}")
    # print(f"Собрано фильмов (lxml): {len(movies_lxml)}")



=== Парсинг с помощью lxml ===
Данные сохранены в imdb_top1000_lxml.csv


In [None]:
!pip install selenium

import requests
import time
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

# Refresh the apt package index and install the chromium-chromedriver package
# (shell commands executed through IPython's ``!`` syntax).
!apt-get update
!apt install chromium-chromedriver
# Copy the driver binary into /usr/bin (presumably so it is found via PATH).
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
# NOTE(review): sys.path is Python's module search path, so inserting the
# driver binary's location here presumably has no effect on how the
# executable is located - confirm and consider removing.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

# No executable_path is passed, so Selenium is left to locate chromedriver itself.
service = Service()

# Chrome flags for running without a display inside a container/VM.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

wd = webdriver.Chrome(service=service, options=chrome_options)
# Configure the session before navigating.
wd.implicitly_wait(10)
# Give the headless browser a large viewport. This is an actual method call;
# a bare ``wd.fullscreen_window`` attribute reference (no parentheses) would
# be evaluated and discarded without resizing anything.
wd.set_window_size(1920, 1080)
wd.get("https://www.news29.ru/novosti/ekonomika/")

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://cli.github.com/packages stable InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading packag

In [None]:
# Scroll to the bottom of the page five times, pausing two seconds after each
# jump (presumably to let lazily loaded items appear before parsing).
scroll_round = 0
while scroll_round < 5:
    wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    scroll_round += 1

def parse():
    """Extract the news items from the page currently loaded in ``wd``.

    Parses ``wd.page_source`` (``wd`` is the module-level Selenium driver),
    finds every ``div`` whose class contains ``newItemContainer`` and reads
    the ``title``, ``lead`` and ``date`` blocks nested in its
    ``dataContainer`` block.

    Returns:
        list[dict]: one dict per article with the keys ``title``,
        ``description`` and ``date`` (whitespace-trimmed text). A block that
        is absent yields an empty string; containers without a
        ``dataContainer`` block are skipped.
    """
    def block_text(container, css_class):
        # ``find`` returns None when the block is absent; map that to ''.
        node = container.find('div', {'class': css_class})
        return node.text.strip() if node is not None else ''

    soup = BeautifulSoup(wd.page_source, 'html.parser')
    articles = soup.find_all('div', class_=lambda x: x and 'newItemContainer' in x)

    all_articles = []
    for article in articles:
        # Look the shared parent block up once instead of once per field.
        data = article.find('div', {'class': 'dataContainer'})
        if data is None:
            # Not a regular news item (no data block to read from).
            continue

        all_articles.append({
            'title': block_text(data, 'title'),
            'description': block_text(data, 'lead'),
            'date': block_text(data, 'date')
        })

    return all_articles

# Collect the articles from the rendered page and always release the browser,
# even when parsing fails.
try:
    collected_news = parse()
finally:
    wd.quit()

# Drop exact duplicates, keeping the first occurrence and the page order.
seen_items = set()
unique_news = []
for item in collected_news:
    item_key = (item['title'], item['description'], item['date'])
    if item_key not in seen_items:
        seen_items.add(item_key)
        unique_news.append(item)

# Save to CSV ("utf-8-sig" prefixes the file with a byte order mark).
with open('pomorie_news.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.DictWriter(f, fieldnames=['title', 'description', 'date'])
    writer.writeheader()
    writer.writerows(unique_news)

print(f"✅ Успешно собрано {len(unique_news)} новостей")
print("Данные сохранены в pomorie_news.csv")



MaxRetryError: HTTPConnectionPool(host='localhost', port=58989): Max retries exceeded with url: /session/55b0c82c35b1c1060fa80c400a21e03d/execute/sync (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7d831bad9340>: Failed to establish a new connection: [Errno 111] Connection refused'))