In [140]:
import pandas as pd
import os
import pickle
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, TimeoutException


def extract_channel_name(row):
    """Return the channel handle found in the first ``<td>`` of *row*.

    The cell text is cut at its last ``@`` and everything after it is
    returned with surrounding whitespace removed. When the text has no
    ``@`` the whole (stripped) text is returned.

    Args:
        row: An object exposing ``find_element(by, value)`` whose result
            has a ``text`` attribute (e.g. a Selenium table-row element).

    Returns:
        The extracted name, or ``""`` if anything goes wrong; the error
        is printed rather than raised so a bad row does not stop a scrape.
    """
    try:
        first_cell = row.find_element('tag name', 'td')
        # rpartition keeps only what follows the final '@' (or the whole
        # text when there is none), same as taking split('@')[-1].
        _, _, handle = first_cell.text.rpartition('@')
        return handle.strip()
    except Exception as e:
        print(f"Error extracting channel name: {str(e)}")
        return ""

def parse_table_rows(table_rows):
    """Convert parsed table rows into a list of channel dictionaries.

    Each row must expose ``find_all('td')`` returning cells that support
    ``get_text(strip=True)``. Rows with fewer than two cells are skipped.

    Args:
        table_rows: Iterable of parsed row objects.

    Returns:
        A list of dicts with keys ``'rank'`` (first-cell text before the
        first ``@``), ``'channel_id'`` (text between the first and second
        ``@``, or ``None`` when the cell has no ``@``) and
        ``'description'`` (text of the second cell).
    """
    parsed = []
    for row in table_rows:
        cells = row.find_all('td')
        if not cells or len(cells) <= 1:
            continue  # Not a data row: needs at least rank and description cells

        # The first cell holds "<rank>@<channel id>"; split it once.
        head_parts = cells[0].get_text(strip=True).split('@')
        parsed.append({
            'rank': head_parts[0],
            'channel_id': head_parts[1] if len(head_parts) > 1 else None,
            'description': cells[1].get_text(strip=True),
        })
    return parsed
    
def scrape_top30_channels(link):
    """Scrape the channel ranking table from *link* with headless Chrome.

    Steps: open the page, try to dismiss the cookie banner, wait for the
    banner to disappear, click the "show all rows" link, then parse every
    table row (except ``.rcount-row`` rows) with ``parse_table_rows``.

    The browser is always shut down before returning, including on the
    early-return and error paths.

    NOTE(review): ``BeautifulSoup`` is not among the imports visible next
    to this function — presumably it is imported in another notebook
    cell; confirm before running this as a standalone script.

    Args:
        link: URL of the ranking page.

    Returns:
        A list of channel dicts as produced by ``parse_table_rows``, or an
        empty list when the cookie banner never disappears or the
        "show all rows" link cannot be found or clicked.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if the ranking
            table itself is missing from the page.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--window-size=1920x1080")

    driver = webdriver.Chrome(executable_path='chromedriver', options=options)
    try:
        driver.get(link)

        # Best effort: the banner may be absent, so a failure here is only logged.
        try:
            cookie_element = driver.find_element(By.XPATH, '//div[@class="bottom__cookie-block"]')
            ok_button = cookie_element.find_element(By.CLASS_NAME, 'ok')
            actions = ActionChains(driver)
            actions.move_to_element(cookie_element).click(ok_button).perform()
        except Exception as e:
            print(f"Error accepting cookies for {link}: {str(e)}")

        try:
            wait = WebDriverWait(driver, 10)
            wait.until(EC.invisibility_of_element_located((By.CLASS_NAME, "bottom__cookie-block")))
        except TimeoutException:
            print(f"Timeout: Unable to make the element invisible for {link}")
            return []

        try:
            # The link lives under one of two container class variants;
            # try them in order and use the first that exists.
            show_all_element = None
            for xpath in ('//div[@class="more_rows pk"]/a', '//div[@class="more_rows"]/a'):
                try:
                    show_all_element = driver.find_element(By.XPATH, xpath)
                    break
                except NoSuchElementException:
                    continue

            if show_all_element is None:
                print(f"'Все позиции' link not found for {link}")
                return []

            # Click through JavaScript so overlays cannot intercept the click.
            driver.execute_script("arguments[0].click();", show_all_element)
        except Exception as e:
            print(f"Error clicking 'Все позиции' link for {link}: {str(e)}")
            return []

        # At this point, all the rows are visible. Get the table and parse the rows.
        table = driver.find_element(By.XPATH, '//div[@class="block_tables"]//table')
        rows = table.find_elements(By.CSS_SELECTOR, 'tbody > tr:not(.rcount-row)')

        # Snapshot each row's HTML while the browser is still alive, then
        # parse the snapshots offline.
        return parse_table_rows(
            [BeautifulSoup(row.get_attribute('outerHTML'), 'html.parser') for row in rows]
        )
    finally:
        # Runs on every exit path so no Chrome/chromedriver process is leaked.
        driver.quit()


def main():
    """Scrape every link in ``top30_links.csv`` and write ``common_channels.csv``.

    Reads the ``year``, ``month`` and ``link`` columns of
    ``top30_links.csv``, scrapes each link with ``scrape_top30_channels``
    and accumulates the results per year in ``top30_channels_data.pkl``
    (loaded first if it already exists, rewritten after every link so a
    crash keeps what was scraped so far).

    The output CSV has one row per channel id (sorted, so the file is
    reproducible), one boolean column per year 2017-2023 telling whether
    the channel appeared in that year, and a ``description`` column with
    the most recent description seen during this run. Rows whose channel
    id is ``None`` (no ``@`` in the scraped cell) are left out because
    they do not identify a channel.
    """
    df = pd.read_csv('top30_links.csv')
    df['year'] = df['year'].astype(int)

    # NOTE: unpickling can execute arbitrary code; only load a checkpoint
    # that this script produced itself.
    if os.path.exists('top30_channels_data.pkl'):
        with open('top30_channels_data.pkl', 'rb') as file:
            top30_channels_by_year = pickle.load(file)
    else:
        top30_channels_by_year = {}

    channel_descriptions = {}  # channel id -> most recent description

    for _, row in df.iterrows():
        year = row['year']
        link = row['link']
        month = row['month']

        year_channels = top30_channels_by_year.setdefault(year, [])

        print(f"Scraping data for year: {year}, month: {month}")

        channel_info = scrape_top30_channels(link)
        year_channels.extend(channel_info)

        # Later links overwrite earlier ones, so the last description wins.
        for channel in channel_info:
            if channel['channel_id'] is not None:
                channel_descriptions[channel['channel_id']] = channel['description']

        # Checkpoint after every link.
        with open('top30_channels_data.pkl', 'wb') as file:
            pickle.dump(top30_channels_by_year, file)

    common_channels_by_year = {
        year: {
            channel['channel_id']
            for channel in channels
            if channel['channel_id'] is not None
        }
        for year, channels in top30_channels_by_year.items()
    }

    # set().union(...) also works when nothing was scraped at all, where
    # set.union(*[]) would raise TypeError.
    channels = sorted(set().union(*common_channels_by_year.values()))
    years = range(2017, 2024)
    data = {
        channel: [channel in common_channels_by_year.get(year, set()) for year in years]
        for channel in channels
    }
    common_channels_df = pd.DataFrame(data, index=years).T

    # Add a column to the DataFrame with the most recent description for each channel
    common_channels_df['description'] = [
        channel_descriptions.get(channel_id, '') for channel_id in common_channels_df.index
    ]

    # Column labels are derived from the same year range used to build the data.
    common_channels_df.columns = [str(year) for year in years] + ['description']
    common_channels_df.index.name = 'channel name'

    common_channels_df.to_csv('common_channels.csv')


In [139]:
# Entry point: run the full scrape only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

Scraping data for year: 2017, month: November
Scraping data for year: 2018, month: February
Scraping data for year: 2018, month: July
Scraping data for year: 2019, month: January
Scraping data for year: 2019, month: July
Scraping data for year: 2020, month: January
Scraping data for year: 2020, month: July
Scraping data for year: 2021, month: January
Scraping data for year: 2021, month: February
Scraping data for year: 2021, month: July
Scraping data for year: 2022, month: January
Scraping data for year: 2022, month: July
Scraping data for year: 2023, month: January
Scraping data for year: 2023, month: June
