# Selenium

In [1]:
import random
from datetime import date
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

# Chrome launch configuration: run without a visible window, pass the two
# sandbox / shared-memory flags, and present a fixed desktop user-agent string.
chrome_options = Options()
for launch_flag in (
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
):
    chrome_options.add_argument(launch_flag)

In [2]:
# Pool of browser user-agent strings (desktop Chrome, Firefox, Safari, Edge and
# mobile Safari). The scraping loop draws one with random.choice() on every
# page iteration.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Mobile/15E148 Safari/604.1",
]

In [3]:
# Resolve the chromedriver executable through webdriver_manager, then start a
# Chrome session using the launch options prepared in `chrome_options`.
driver_path = ChromeDriverManager().install()
driver_service = ChromeService(driver_path)
browser = webdriver.Chrome(service=driver_service, options=chrome_options)

# Element lookups wait up to 5 seconds for a match before giving up.
browser.implicitly_wait(5)

In [4]:
# Search window (DD.MM.YYYY, both bounds are sent to the site as-is) and the
# base name of the CSV file the results are written to.
# Alternative values noted for other months:
#   start_date:  '01.04.2025', '01.05.2025', '01.06.2025'
#   end_date:    '30.04.2025', '31.05.2025', '30.06.2025'
#   output_name: 'april_2025', 'may_2025'
start_date, end_date = '01.03.2025', '31.03.2025'
output_name = 'march_2025'

In [5]:
# Search URL for "coupon income payment" news inside the configured date window.
base_url = f'https://nsddata.ru/ru/news?text=Выплата+купонного+дохода&from={start_date}&to={end_date}'

# Load the first results page so the pager can tell us how many pages exist.
browser.get(base_url)

# find_elements() returns an empty list when nothing matches, whereas
# find_element() raises NoSuchElementException -- which previously made the
# "pagination not found" fallback below unreachable.
last_page_matches = browser.find_elements(By.CLASS_NAME, "last")
last_page_element = last_page_matches[0].text.strip() if last_page_matches else ''

if last_page_element:
    # A non-numeric label still raises ValueError here on purpose: silently
    # scraping a single page would hide a change in the site's markup.
    last_page = int(last_page_element)
else:
    last_page = 1  # Default if pagination not found (single page of results)

In [6]:
def _first_or_none(parent, css_selector):
    """Return the first descendant of ``parent`` matching ``css_selector``, or None.

    Uses find_elements(), which yields an empty list on no match instead of
    raising NoSuchElementException. With an implicit wait configured, a miss
    still costs the full wait before the empty list is returned.
    """
    matches = parent.find_elements(By.CSS_SELECTOR, css_selector)
    return matches[0] if matches else None


def _parse_news_item(item):
    """Turn one news-list row element into a ``{'date', 'link', 'text'}`` record.

    Any piece whose element is absent from the row becomes an empty string, so
    a single malformed row does not abort the whole scrape.
    """
    date_cell = _first_or_none(item, '.col-sm-2.col-md-1.news_list__item__date')
    header = _first_or_none(item, '.col-sm-10.col-md-11.news_list__item__header')
    anchor = _first_or_none(header, 'a') if header is not None else None
    title = _first_or_none(item, '.news_list__item__header__title')

    return {
        'date': date_cell.text.strip() if date_cell is not None else '',
        'link': anchor.get_attribute('href') if anchor is not None else '',
        'text': title.text if title is not None else '',
    }


# Rows from every page are collected here and converted to a DataFrame once at
# the end; concatenating a frame per page re-copies all earlier rows each time.
records = []

# The wait does not change between pages, so set it once before the loop.
browser.implicitly_wait(4)

for page in tqdm(range(1, last_page + 1)):
    # NOTE(review): options are only read when the driver session is created,
    # so this does not change the user agent of the running `browser`; it only
    # appends another argument to `chrome_options`. Kept as-is because making
    # the rotation effective (e.g. a CDP Network.setUserAgentOverride call)
    # could change what the site serves -- confirm before enabling.
    chrome_options.add_argument(f"user-agent={random.choice(USER_AGENTS)}")

    # Same search as the first request, with the page number appended.
    browser.get(f'{base_url}&page={page}')

    # Compound class names are written as an explicit CSS selector rather than
    # being passed to By.CLASS_NAME, which is meant for a single class.
    for item in browser.find_elements(By.CSS_SELECTOR, '.row.news_list__item'):
        records.append(_parse_news_item(item))

# `columns` keeps the expected schema even when no rows were scraped.
table = pd.DataFrame(records, columns=['date', 'link', 'text'])

# Build the path portably and make sure the folder exists, so a missing
# directory cannot fail the export after the whole scrape has finished.
output_dir = Path('CICs')
output_dir.mkdir(parents=True, exist_ok=True)
table.to_csv(output_dir / f'{output_name}.csv')

100%|██████████| 168/168 [10:15<00:00,  3.66s/it]
