In [None]:
import re
import requests
from bs4 import BeautifulSoup, Tag
import csv
import sys
from tqdm import tqdm
import warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from datetime import datetime
from typing import List, Optional
from pydantic import BaseModel, Field, HttpUrl

# Ignore only the InsecureRequestWarning
warnings.simplefilter("ignore", InsecureRequestWarning)

class SearchResult(BaseModel):
    """
    Individual search result from ITMO news/pages
    """
    title: str
    url: HttpUrl
    description: Optional[str] = None
    date: Optional[datetime] = None
    content: Optional[str] = None

class SearchResponse(BaseModel):
    """
    Complete search response
    """
    query: str
    total_results: int
    results: List[SearchResult]
    search_time: datetime = Field(default_factory=datetime.now)

def clean_article_content(content_block: Tag) -> str:
    """
    Remove unnecessary elements and prepare text:
      - Remove <script> and <style>
      - Convert non-breaking spaces (\xa0) to regular spaces
      - Collapse multiple spaces/newlines
    """
    # 1. Remove <script> and <style> tags
    for unwanted_tag in content_block(["script", "style"]):
        unwanted_tag.decompose()

    # 2. Extract text
    text = content_block.get_text(separator="\n", strip=True)

    # 3. Remove non-breaking spaces
    text = text.replace("\xa0", " ")

    # 4. Collapse multiple newlines and extra spaces
    text = re.sub(r"\n\s*\n+", "\n\n", text)
    text = re.sub(r" {2,}", " ", text)

    # 5. Final strip
    text = text.strip()
    return text

def parse_itmo_article_page(html_content: str) -> dict:
    """
    Extracts data from a single ITMO article page:
      - title
      - publication_datetime (e.g. 2025-01-22T11:48:35+03:00)
      - views
      - authors (list of strings)
      - tags (list of strings)
      - cleaned article_text
    """
    soup = BeautifulSoup(html_content, "html.parser")
    
    # Title
    title_tag = soup.select_one("div.article h1")
    title = title_tag.get_text(strip=True) if title_tag else None

    # Date/time in time[datetime], e.g. 2025-01-22T11:48:35+03:00
    time_tag = soup.select_one("div.news-info-wrapper time")
    publication_datetime_str = time_tag.get("datetime", "").strip() if time_tag else None
    publication_datetime = None
    if publication_datetime_str:
        # Try to parse as an ISO datetime
        # If you need more robust parsing, install/use dateutil.parser.parse(...)
        try:
            publication_datetime = datetime.fromisoformat(publication_datetime_str)
        except ValueError:
            # fallback if the format is not strictly ISO or can't be parsed
            publication_datetime = None

    # Views (example: <span class="icon eye">1049</span>)
    views_span = time_tag.select_one("span.icon.eye") if time_tag else None
    views = views_span.get_text(strip=True) if views_span else None

    # Authors
    authors = []
    for author_block in soup.select(".author-block .author-item"):
        name_tag = author_block.select_one(".about h6 a[rel='author']")
        if name_tag:
            authors.append(name_tag.get_text(strip=True))

    # Tags
    tags = []
    for tag_item in soup.select("ul.tags li a"):
        tags.append(tag_item.get_text(strip=True))

    # Main article text
    article_block = soup.select_one(".content.js-mediator-article")
    article_text = clean_article_content(article_block) if article_block else ""

    return {
        "title": title,
        "publication_datetime": publication_datetime,
        "views": views,
        "authors": authors,
        "tags": tags,
        "article_text": article_text
    }


def parse_itmo_search_page(html_content: str):
    """
    Parses the main ITMO search result page, returning:
      - total_results (string or integer)
      - articles: list of {title, link, snippet, date (string)}
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Find total results (e.g. "152 результата")
    total_results_str = None
    h2_element = soup.select_one(".weeklyevents h2 span")
    if h2_element:
        total_results_str = h2_element.get_text(strip=True)

    # Convert e.g. "152 результата" -> 152 (if possible)
    total_results = 0
    if total_results_str:
        # Use regex to extract digits
        match = re.search(r"(\d+)", total_results_str)
        if match:
            total_results = int(match.group(1))

    # Parse each result (li.weeklyevent)
    results = []
    for item in soup.select(".weeklyevents ul li.weeklyevent"):
        h4 = item.find("h4")
        if not h4 or not h4.find("a"):
            continue

        # Title & Link
        title_tag = h4.find("a")
        title = title_tag.get_text(strip=True)
        link = title_tag["href"]
        if link.startswith("/"):
            link = "https://news.itmo.ru" + link

        # paragraphs typically contain snippet & date
        paragraphs = item.find_all("p")
        snippet = paragraphs[0].get_text(strip=True) if len(paragraphs) > 0 else None
        date_str = paragraphs[-1].get_text(strip=True) if len(paragraphs) > 1 else None

        results.append({
            "title": title,
            "link": link,
            "snippet": snippet,
            "date": date_str  # date as string for now
        })

    return {
        "total_results": total_results,
        "articles": results
    }

def get_search_results(
    query: str,
    max_articles: int = 5
) -> SearchResponse:
    """
    High-level function that:
      - Searches ITMO news for `query`
      - Parses search results
      - Fetches each article's content
      - Returns a Pydantic SearchResponse
    """
    # Build the search URL
    base_url = "https://news.itmo.ru/ru/search/"
    params = {"search": query}

    # 1. Fetch the search page
    response = requests.get(base_url, params=params, verify=False)
    response.raise_for_status()

    # 2. Parse the search page
    search_data = parse_itmo_search_page(response.text)
    total_found = search_data["total_results"]
    raw_articles = search_data["articles"]

    # 3. For each article (up to max_articles), parse additional data
    results_list = []
    for article_info in raw_articles[:max_articles]:
        url = article_info["link"]
        snippet = article_info["snippet"]
        date_str = article_info["date"]

        # 3a. Fetch & parse the article page
        article_resp = requests.get(url, verify=False)
        article_resp.raise_for_status()
        article_data = parse_itmo_article_page(article_resp.text)

        # 3b. Build a Pydantic SearchResult
        # We'll use the parse_itmo_article_page date if it's valid,
        # otherwise fallback to the date from the search snippet.
        final_date = article_data["publication_datetime"]

        # If parse_itmo_article_page could not parse any date/time, optionally try
        # to parse `date_str` from the snippet. This might be "22.01.2025" or similar.
        # For demonstration, we'll do a simple attempt:
        if not final_date and date_str:
            # Attempt a naive parse of "DD.MM.YYYY" 
            match = re.search(r"(\d{2}\.\d{2}\.\d{4})", date_str)
            if match:
                try:
                    final_date = datetime.strptime(match.group(1), "%d.%m.%Y")
                except ValueError:
                    final_date = None

        # Construct the SearchResult
        search_result = SearchResult(
            title=article_data["title"] or article_info["title"],
            url=url,
            description=snippet,
            date=final_date,
            content=article_data["article_text"]
        )
        results_list.append(search_result)

    # 4. Build & return our SearchResponse
    return SearchResponse(
        query=query,
        total_results=total_found,
        results=results_list
    )

if __name__ == "__main__":
    # Example usage
    from pprint import pprint

    query = "yandex"
    max_articles = 2  # parse first 3 articles in detail
    search_response = get_search_results(query, max_articles)

    # Print out the resulting data
    # Because it's a Pydantic model, we can do .dict() or .json() too
    print("--- SEARCH RESPONSE (dict) ---")
    pprint(search_response.dict())
    
    # Or just show each result nicely
    print("\n--- HUMAN-READABLE OUTPUT ---")
    print(f"Query: {search_response.query}")
    print(f"Total found: {search_response.total_results}")
    for idx, result in enumerate(search_response.results, 1):
        print(f"\nResult {idx}:")
        print(f"  Title: {result.title}")
        print(f"  URL:   {result.url}")
        print(f"  Date:  {result.date}")
        print(f"  Snippet/desc: {result.description}")
        # Show first 200 chars of content
        if result.content:
            print(f"  Content (truncated): {result.content[:200]}...")
        else:
            print("  Content: [empty]")

In [None]:
import re
import requests
from bs4 import BeautifulSoup, Tag
import csv
import sys
from tqdm import tqdm
import warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from datetime import datetime
from typing import List, Optional
from pydantic import BaseModel, Field, HttpUrl

# Ignore only the InsecureRequestWarning
warnings.simplefilter("ignore", InsecureRequestWarning)

class SearchResult(BaseModel):
    """
    Individual search result from ITMO news/pages
    """
    title: str
    url: HttpUrl
    description: Optional[str] = None
    date: Optional[datetime] = None
    content: Optional[str] = None

class SearchResponse(BaseModel):
    """
    Complete search response
    """
    query: str
    total_results: int
    results: List[SearchResult]
    search_time: datetime = Field(default_factory=datetime.now)

def clean_article_content(content_block: Tag) -> str:
    """
    Remove unnecessary elements and prepare text:
      - Remove <script> and <style>
      - Convert non-breaking spaces (\xa0) to regular spaces
      - Collapse multiple spaces/newlines
    """
    # 1. Remove <script> and <style> tags
    for unwanted_tag in content_block(["script", "style"]):
        unwanted_tag.decompose()

    # 2. Extract text
    text = content_block.get_text(separator="\n", strip=True)

    # 3. Remove non-breaking spaces
    text = text.replace("\xa0", " ")

    # 4. Collapse multiple newlines and extra spaces
    text = re.sub(r"\n\s*\n+", "\n\n", text)
    text = re.sub(r" {2,}", " ", text)

    # 5. Final strip
    text = text.strip()
    return text

def parse_itmo_article_page(html_content: str) -> dict:
    """
    Extracts data from a single ITMO article page:
      - title
      - publication_datetime (e.g. 2025-01-22T11:48:35+03:00)
      - views
      - authors (list of strings)
      - tags (list of strings)
      - cleaned article_text
    """
    soup = BeautifulSoup(html_content, "html.parser")
    
    # Title
    title_tag = soup.select_one("div.article h1")
    title = title_tag.get_text(strip=True) if title_tag else None

    # Date/time in time[datetime], e.g. 2025-01-22T11:48:35+03:00
    time_tag = soup.select_one("div.news-info-wrapper time")
    publication_datetime_str = time_tag.get("datetime", "").strip() if time_tag else None
    publication_datetime = None
    if publication_datetime_str:
        try:
            publication_datetime = datetime.fromisoformat(publication_datetime_str)
        except ValueError:
            publication_datetime = None

    # Views (example: <span class="icon eye">1049</span>)
    views_span = time_tag.select_one("span.icon.eye") if time_tag else None
    views = views_span.get_text(strip=True) if views_span else None

    # Authors
    authors = []
    for author_block in soup.select(".author-block .author-item"):
        name_tag = author_block.select_one(".about h6 a[rel='author']")
        if name_tag:
            authors.append(name_tag.get_text(strip=True))

    # Tags
    tags = []
    for tag_item in soup.select("ul.tags li a"):
        tags.append(tag_item.get_text(strip=True))

    # Main article text
    article_block = soup.select_one(".content.js-mediator-article")
    article_text = clean_article_content(article_block) if article_block else ""

    return {
        "title": title,
        "publication_datetime": publication_datetime,
        "views": views,
        "authors": authors,
        "tags": tags,
        "article_text": article_text
    }

def parse_itmo_search_page(html_content: str):
    """
    Parses the main ITMO search result page, returning:
      - total_results (integer)
      - articles: list of {title, link, snippet, date (string)}
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Find total results (e.g. "152 результата")
    total_results_str = None
    h2_element = soup.select_one(".weeklyevents h2 span")
    if h2_element:
        total_results_str = h2_element.get_text(strip=True)

    # Convert e.g. "152 результата" -> 152 (if possible)
    total_results = 0
    if total_results_str:
        match = re.search(r"(\d+)", total_results_str)
        if match:
            total_results = int(match.group(1))

    # Parse each result (li.weeklyevent)
    results = []
    for item in soup.select(".weeklyevents ul li.weeklyevent"):
        h4 = item.find("h4")
        if not h4 or not h4.find("a"):
            continue

        # Title & Link
        title_tag = h4.find("a")
        title = title_tag.get_text(strip=True)
        link = title_tag["href"]
        if link.startswith("/"):
            link = "https://news.itmo.ru" + link

        # paragraphs typically contain snippet & date
        paragraphs = item.find_all("p")
        snippet = paragraphs[0].get_text(strip=True) if len(paragraphs) > 0 else None
        date_str = paragraphs[-1].get_text(strip=True) if len(paragraphs) > 1 else None

        results.append({
            "title": title,
            "link": link,
            "snippet": snippet,
            "date": date_str  # date as string for now
        })

    return {
        "total_results": total_results,
        "articles": results
    }

def get_search_results(
    query: str,
    max_articles: int = 5,
    max_pages: int = 1
) -> SearchResponse:
    """
    High-level function that:
      - Searches ITMO news for `query`
      - Parses multiple pages if max_pages > 1
      - Collects search results across pages
      - Fetches each article's content (up to max_articles total)
      - Returns a Pydantic SearchResponse
    """
    base_url = "https://news.itmo.ru/ru/search/"
    all_articles = []
    total_found = 0

    for page_num in range(1, max_pages + 1):
        params = {
            "search": query,
            "page": page_num,
            "section": "news"  # важный параметр, учитывая пример ссылок
        }

        response = requests.get(base_url, params=params, verify=False)
        response.raise_for_status()

        search_data = parse_itmo_search_page(response.text)
        
        if page_num == 1:
            total_found = search_data["total_results"]

        # Добавляем все статьи с текущей страницы
        all_articles.extend(search_data["articles"])

        # Если статей на странице нет — прерываем цикл (больше страниц нет)
        if not search_data["articles"]:
            break

    # Обрезаем список статей, если хотим получить не более max_articles
    articles_to_fetch = all_articles[:max_articles]

    results_list = []
    for article_info in articles_to_fetch:
        url = article_info["link"]
        snippet = article_info["snippet"]
        date_str = article_info["date"]

        article_resp = requests.get(url, verify=False)
        article_resp.raise_for_status()
        article_data = parse_itmo_article_page(article_resp.text)

        final_date = article_data["publication_datetime"]

        # Если не удалось вытащить ISO-дату, пробуем распарсить date_str
        if not final_date and date_str:
            match = re.search(r"(\d{2}\.\d{2}\.\d{4})", date_str)
            if match:
                try:
                    final_date = datetime.strptime(match.group(1), "%d.%m.%Y")
                except ValueError:
                    final_date = None

        search_result = SearchResult(
            title=article_data["title"] or article_info["title"],
            url=url,
            description=snippet,
            date=final_date,
            content=article_data["article_text"]
        )
        results_list.append(search_result)

    return SearchResponse(
        query=query,
        total_results=total_found,
        results=results_list
    )

In [45]:
queries = [
    "История", "Основание", "Рейтинг", "Титулы", "Миссия",
    "Наука", "Исследования", "Гранты", "Лаборатории", "Публикации",
    "Поступление", "Проходной балл", "Приёмная", "ЕГЭ", "Бюджет",
    "Контракт", "Льготы", "Целевая", "Олимпиады", "Права", "Факультет"
    "Стипендии", "Повышенная", "Соцстипендия", "Студгранты", "Поддержка",
    "Спонсорство", "Президентская", "Международные", "Конкурсы", "Заявка",
    "Магистратура", "Аспирантура", "Докторантура", "Бакалавриат", "Направления",
    "Специальности", "Учплан", "Модули", "Заочка", "Дистанционка", "Новости"
    "Онлайн-курсы", "Допобразование", "Квалификация", "Профпереподг", "Семинары",
    "Летние школы", "Зимние школы", "Вечерние", "Курсы IT", "Компетенции",
    "Партнёры", "Обмен", "Двойной диплом", "Зарубежная", "Академмобильность",
    "Erasmus+", "Иностранцы", "Англопрограммы", "World rankings", "Спорт",
    "Студсовет", "Клубы", "Научные кружки", "Волонтёры", "Студжизнь",
    "Хакатоны", "Проекты", "Конференции", "Стажировки", "Аудитории",
    "Корпуса", "Общежитие", "Проживание", "Матбаза", "Коворкинги",
    "Библиотека", "Электронная", "Инфоресурсы", "Кампус-тур", "Адрес",
    "Транспорт", "Правила", "Академотпуск", "Перевод", "Регламент",
    "Экзамены", "Апелляция", "Э-расписание", "Выпускники", "Вакансии",
    "Трудоустройство", "Компании", "Карьерный центр", "Стартап", "Акселератор",
    "Предприниматель", "Инноватика", "Выдающиеся", "Известные", "Международные"
    "Искуственный интеллект", "Talent Hub", "AI", "ITMO", "ИТМО", "Ректор",
    "Деканат", "Кафедра", "Учёный совет", "Администрация", "Документы",
    "Сессия", "Зачёты", "Диплом", "Практика", "Семестр",
    "Методички", "Учебники", "Лекции", "Семинары", "Лабораторные",
    "Профком", "Общага", "Столовая", "Медпункт", "Спортзал",
    "Творчество", "Мероприятия", "Концерты", "Фестивали", "Конкурсы",
    "Робототехника", "Программирование", "Биотехнологии", "Нанотехнологии", "Квантовые технологии",
    "Машинное обучение", "Data Science", "Кибербезопасность", "Blockchain", "VR/AR",
    "Стажировки", "Обмен студентами", "Визы", "Языковые курсы", "Международные проекты",
    "Резюме", "Собеседование", "Практика", "Стартап-акселератор", "Бизнес-инкубатор",
    "Менторство", "Networking", "Soft skills", "Hard skills", "Portfolio",
    "Диссертация", "Научрук", "Патенты", "Scopus", "Web of Science",
    "РИНЦ", "Импакт-фактор", "Монография", "Рецензирование", "Индекс Хирша",
    "Психолог", "Тьютор", "Куратор", "Адаптация", "Инклюзив","Проректор",
    "Доступная среда", "Соцподдержка", "Материальная помощь", "Профилакторий", "Путёвки"
]

def main():
    csv_filename = "itmo_search_results_LARGE.csv"
    
    # Параметры, которые вы можете менять:
    max_articles_to_parse = 30  # Сколько статей максимально разобрать
    max_pages_to_crawl = 3      # Сколько страниц будет просмотрено для каждой query

    with open(csv_filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter=";")
        # Шапка таблицы
        writer.writerow(["Query", "Title", "url", "Date", "Snippet/desc", "content"])

        for query in tqdm(queries):
            try:
                search_response = get_search_results(
                    query,
                    max_articles=max_articles_to_parse,
                    max_pages=max_pages_to_crawl
                )

                print(f"\n=== Query: {query} (Found: {search_response.total_results}) ===")

                for idx, result in enumerate(search_response.results, 1):
                    row = [
                        query,
                        result.title,
                        str(result.url),
                        result.date.isoformat() if result.date else "",
                        result.description or "",
                        result.content or ""
                    ]
                    writer.writerow(row)

            except Exception as e:
                # При ошибках сети или парсинга выводим сообщение, но не прерываем весь процесс
                print(f"Error processing query '{query}': {e}", file=sys.stderr)

    print(f"\nAll results have been written to '{csv_filename}'.")


if __name__ == "__main__":
    main()

  1%|          | 1/176 [00:08<24:55,  8.55s/it]


=== Query: История (Found: 441) ===


  1%|          | 2/176 [00:17<24:54,  8.59s/it]


=== Query: Основание (Found: 64) ===


  2%|▏         | 3/176 [00:25<24:45,  8.59s/it]


=== Query: Рейтинг (Found: 758) ===


  2%|▏         | 4/176 [00:26<16:12,  5.65s/it]


=== Query: Титулы (Found: 0) ===


  3%|▎         | 5/176 [00:35<18:54,  6.63s/it]


=== Query: Миссия (Found: 294) ===


  3%|▎         | 6/176 [00:44<21:01,  7.42s/it]


=== Query: Наука (Found: 905) ===


  4%|▍         | 7/176 [00:52<21:31,  7.64s/it]


=== Query: Исследования (Found: 2386) ===


  5%|▍         | 8/176 [01:01<22:19,  7.97s/it]


=== Query: Гранты (Found: 242) ===


  5%|▌         | 9/176 [01:09<22:20,  8.03s/it]


=== Query: Лаборатории (Found: 1708) ===


  6%|▌         | 10/176 [01:17<22:31,  8.14s/it]


=== Query: Публикации (Found: 348) ===


  6%|▋         | 11/176 [01:25<22:36,  8.22s/it]


=== Query: Поступление (Found: 188) ===


  7%|▋         | 12/176 [01:33<22:08,  8.10s/it]


=== Query: Проходной балл (Found: 28) ===


  7%|▋         | 13/176 [01:42<22:38,  8.33s/it]


=== Query: Приёмная (Found: 80) ===


  8%|▊         | 14/176 [01:50<22:23,  8.29s/it]


=== Query: ЕГЭ (Found: 245) ===


  9%|▊         | 15/176 [01:58<21:53,  8.16s/it]


=== Query: Бюджет (Found: 482) ===


  9%|▉         | 16/176 [02:06<21:36,  8.11s/it]


=== Query: Контракт (Found: 206) ===


 10%|▉         | 17/176 [02:15<22:06,  8.34s/it]


=== Query: Льготы (Found: 65) ===


 10%|█         | 18/176 [02:26<24:14,  9.21s/it]


=== Query: Целевая (Found: 52) ===


 11%|█         | 19/176 [02:35<23:40,  9.04s/it]


=== Query: Олимпиады (Found: 433) ===


 11%|█▏        | 20/176 [02:44<23:13,  8.93s/it]


=== Query: Права (Found: 216) ===


 12%|█▏        | 21/176 [02:45<16:53,  6.54s/it]


=== Query: ФакультетСтипендии (Found: 0) ===


 12%|█▎        | 22/176 [02:52<17:15,  6.72s/it]


=== Query: Повышенная (Found: 24) ===


 13%|█▎        | 23/176 [02:53<12:55,  5.07s/it]


=== Query: Соцстипендия (Found: 0) ===


 14%|█▎        | 24/176 [02:54<09:50,  3.88s/it]


=== Query: Студгранты (Found: 0) ===


 14%|█▍        | 25/176 [03:03<13:18,  5.29s/it]


=== Query: Поддержка (Found: 333) ===


 15%|█▍        | 26/176 [03:05<11:21,  4.54s/it]


=== Query: Спонсорство (Found: 2) ===


 15%|█▌        | 27/176 [03:09<10:33,  4.25s/it]


=== Query: Президентская (Found: 7) ===


 16%|█▌        | 28/176 [03:17<13:21,  5.41s/it]


=== Query: Международные (Found: 393) ===


 16%|█▋        | 29/176 [03:25<15:18,  6.25s/it]


=== Query: Конкурсы (Found: 190) ===


 17%|█▋        | 30/176 [03:33<16:34,  6.81s/it]


=== Query: Заявка (Found: 134) ===


 18%|█▊        | 31/176 [03:42<17:58,  7.44s/it]


=== Query: Магистратура (Found: 216) ===


 18%|█▊        | 32/176 [03:51<18:55,  7.89s/it]


=== Query: Аспирантура (Found: 62) ===


 19%|█▉        | 33/176 [03:53<14:43,  6.18s/it]


=== Query: Докторантура (Found: 1) ===


 19%|█▉        | 34/176 [04:02<16:03,  6.78s/it]


=== Query: Бакалавриат (Found: 771) ===


 20%|█▉        | 35/176 [04:10<16:59,  7.23s/it]


=== Query: Направления (Found: 1807) ===


 20%|██        | 36/176 [04:18<17:48,  7.63s/it]


=== Query: Специальности (Found: 358) ===


 21%|██        | 37/176 [04:19<13:04,  5.64s/it]


=== Query: Учплан (Found: 0) ===


 22%|██▏       | 38/176 [04:28<14:53,  6.48s/it]


=== Query: Модули (Found: 132) ===


 22%|██▏       | 39/176 [04:29<11:00,  4.82s/it]


=== Query: Заочка (Found: 0) ===


 23%|██▎       | 40/176 [04:31<09:07,  4.03s/it]


=== Query: Дистанционка (Found: 1) ===


 23%|██▎       | 41/176 [04:32<07:04,  3.15s/it]


=== Query: НовостиОнлайн-курсы (Found: 0) ===


 24%|██▍       | 42/176 [04:34<06:23,  2.86s/it]


=== Query: Допобразование (Found: 1) ===


 24%|██▍       | 43/176 [04:41<08:56,  4.03s/it]


=== Query: Квалификация (Found: 22) ===


 25%|██▌       | 44/176 [04:43<07:46,  3.53s/it]


=== Query: Профпереподг (Found: 2) ===


 26%|██▌       | 45/176 [04:51<10:28,  4.80s/it]


=== Query: Семинары (Found: 227) ===


 26%|██▌       | 46/176 [04:59<12:31,  5.78s/it]


=== Query: Летние школы (Found: 50) ===


 27%|██▋       | 47/176 [05:06<13:14,  6.16s/it]


=== Query: Зимние школы (Found: 26) ===


 27%|██▋       | 48/176 [05:10<11:16,  5.28s/it]


=== Query: Вечерние (Found: 7) ===


 28%|██▊       | 49/176 [05:12<09:18,  4.40s/it]


=== Query: Курсы IT (Found: 2) ===


 28%|██▊       | 50/176 [05:20<11:29,  5.47s/it]


=== Query: Компетенции (Found: 463) ===


 29%|██▉       | 51/176 [05:29<13:23,  6.42s/it]


=== Query: Партнёры (Found: 329) ===


 30%|██▉       | 52/176 [05:37<14:50,  7.18s/it]


=== Query: Обмен (Found: 926) ===


 30%|███       | 53/176 [05:41<12:11,  5.95s/it]


=== Query: Двойной диплом (Found: 5) ===


 31%|███       | 54/176 [05:44<10:22,  5.10s/it]


=== Query: Зарубежная (Found: 6) ===


 31%|███▏      | 55/176 [05:45<07:53,  3.91s/it]


=== Query: Академмобильность (Found: 0) ===


 32%|███▏      | 56/176 [05:53<10:40,  5.34s/it]


=== Query: Erasmus+ (Found: 53) ===


 32%|███▏      | 57/176 [06:02<12:46,  6.44s/it]


=== Query: Иностранцы (Found: 34) ===


 33%|███▎      | 58/176 [06:04<09:34,  4.87s/it]


=== Query: Англопрограммы (Found: 0) ===


 34%|███▎      | 59/176 [06:06<08:13,  4.22s/it]


=== Query: World rankings (Found: 3) ===


 34%|███▍      | 60/176 [06:15<10:33,  5.46s/it]


=== Query: Спорт (Found: 1649) ===


 35%|███▍      | 61/176 [06:18<09:23,  4.90s/it]


=== Query: Студсовет (Found: 8) ===


 35%|███▌      | 62/176 [06:27<11:15,  5.92s/it]


=== Query: Клубы (Found: 69) ===


 36%|███▌      | 63/176 [06:28<08:25,  4.47s/it]


=== Query: Научные кружки (Found: 0) ===


 36%|███▋      | 64/176 [06:36<10:32,  5.65s/it]


=== Query: Волонтёры (Found: 83) ===


 37%|███▋      | 65/176 [06:37<07:55,  4.28s/it]


=== Query: Студжизнь (Found: 0) ===


 38%|███▊      | 66/176 [06:45<09:59,  5.45s/it]


=== Query: Хакатоны (Found: 59) ===


 38%|███▊      | 67/176 [06:54<11:37,  6.40s/it]


=== Query: Проекты (Found: 1725) ===


 39%|███▊      | 68/176 [07:03<12:46,  7.10s/it]


=== Query: Конференции (Found: 1014) ===


 39%|███▉      | 69/176 [07:11<13:25,  7.53s/it]


=== Query: Стажировки (Found: 377) ===


 40%|███▉      | 70/176 [07:20<13:48,  7.82s/it]


=== Query: Аудитории (Found: 525) ===


 40%|████      | 71/176 [07:28<13:52,  7.92s/it]


=== Query: Корпуса (Found: 226) ===


 41%|████      | 72/176 [07:37<14:04,  8.12s/it]


=== Query: Общежитие (Found: 53) ===


 41%|████▏     | 73/176 [07:45<14:05,  8.21s/it]


=== Query: Проживание (Found: 101) ===


 42%|████▏     | 74/176 [07:46<10:18,  6.06s/it]


=== Query: Матбаза (Found: 0) ===


 43%|████▎     | 75/176 [07:54<11:22,  6.76s/it]


=== Query: Коворкинги (Found: 31) ===


 43%|████▎     | 76/176 [08:02<11:52,  7.13s/it]


=== Query: Библиотека (Found: 113) ===


 44%|████▍     | 77/176 [08:11<12:22,  7.50s/it]


=== Query: Электронная (Found: 80) ===


 44%|████▍     | 78/176 [08:12<09:06,  5.57s/it]


=== Query: Инфоресурсы (Found: 0) ===


 45%|████▍     | 79/176 [08:13<06:46,  4.19s/it]


=== Query: Кампус-тур (Found: 0) ===


 45%|████▌     | 80/176 [08:21<08:39,  5.42s/it]


=== Query: Адрес (Found: 541) ===


 46%|████▌     | 81/176 [08:29<09:50,  6.22s/it]


=== Query: Транспорт (Found: 646) ===


 47%|████▋     | 82/176 [08:37<10:40,  6.81s/it]


=== Query: Правила (Found: 581) ===


 47%|████▋     | 83/176 [08:38<07:54,  5.10s/it]


=== Query: Академотпуск (Found: 0) ===


 48%|████▊     | 84/176 [08:46<09:06,  5.94s/it]


=== Query: Перевод (Found: 375) ===


 48%|████▊     | 85/176 [08:55<10:02,  6.62s/it]


=== Query: Регламент (Found: 129) ===


 49%|████▉     | 86/176 [09:02<10:27,  6.98s/it]


=== Query: Экзамены (Found: 156) ===


 49%|████▉     | 87/176 [09:05<08:28,  5.71s/it]


=== Query: Апелляция (Found: 4) ===


 50%|█████     | 88/176 [09:06<06:16,  4.27s/it]


=== Query: Э-расписание (Found: 0) ===


 51%|█████     | 89/176 [09:14<07:44,  5.33s/it]


=== Query: Выпускники (Found: 638) ===


 51%|█████     | 90/176 [09:22<08:49,  6.15s/it]


=== Query: Вакансии (Found: 77) ===


 52%|█████▏    | 91/176 [09:30<09:44,  6.88s/it]


=== Query: Трудоустройство (Found: 66) ===


 52%|█████▏    | 92/176 [09:38<09:57,  7.11s/it]


=== Query: Компании (Found: 2005) ===


 53%|█████▎    | 93/176 [09:39<07:17,  5.28s/it]


=== Query: Карьерный центр (Found: 0) ===


 53%|█████▎    | 94/176 [09:47<08:20,  6.10s/it]


=== Query: Стартап (Found: 1000) ===


 54%|█████▍    | 95/176 [09:55<09:03,  6.72s/it]


=== Query: Акселератор (Found: 449) ===


 55%|█████▍    | 96/176 [10:03<09:18,  6.98s/it]


=== Query: Предприниматель (Found: 810) ===


 55%|█████▌    | 97/176 [10:11<09:46,  7.42s/it]


=== Query: Инноватика (Found: 44) ===


 56%|█████▌    | 98/176 [10:19<09:55,  7.63s/it]


=== Query: Выдающиеся (Found: 66) ===


 56%|█████▋    | 99/176 [10:28<10:04,  7.85s/it]


=== Query: Известные (Found: 213) ===


 57%|█████▋    | 100/176 [10:29<07:20,  5.79s/it]


=== Query: МеждународныеИскуственный интеллект (Found: 0) ===


 57%|█████▋    | 101/176 [10:34<07:00,  5.61s/it]


=== Query: Talent Hub (Found: 13) ===

=== Query: AI (Found: 0) ===


 59%|█████▊    | 103/176 [10:41<05:42,  4.69s/it]


=== Query: ITMO (Found: 3046) ===


 59%|█████▉    | 104/176 [10:48<06:06,  5.09s/it]


=== Query: ИТМО (Found: 6970) ===


 60%|█████▉    | 105/176 [10:55<06:46,  5.73s/it]


=== Query: Ректор (Found: 2685) ===


 60%|██████    | 106/176 [11:02<07:12,  6.18s/it]


=== Query: Деканат (Found: 28) ===


 61%|██████    | 107/176 [11:11<07:43,  6.72s/it]


=== Query: Кафедра (Found: 311) ===


 61%|██████▏   | 108/176 [11:16<07:21,  6.49s/it]


=== Query: Учёный совет (Found: 17) ===


 62%|██████▏   | 109/176 [11:25<07:48,  6.99s/it]


=== Query: Администрация (Found: 109) ===


 62%|██████▎   | 110/176 [11:33<08:06,  7.37s/it]


=== Query: Документы (Found: 332) ===


 63%|██████▎   | 111/176 [11:41<08:20,  7.69s/it]


=== Query: Сессия (Found: 175) ===


 64%|██████▎   | 112/176 [11:49<08:00,  7.50s/it]


=== Query: Зачёты (Found: 23) ===


 64%|██████▍   | 113/176 [11:57<08:05,  7.71s/it]


=== Query: Диплом (Found: 879) ===


 65%|██████▍   | 114/176 [12:05<08:01,  7.77s/it]


=== Query: Практика (Found: 510) ===


 65%|██████▌   | 115/176 [12:13<08:04,  7.94s/it]


=== Query: Семестр (Found: 446) ===


 66%|██████▌   | 116/176 [12:16<06:33,  6.56s/it]


=== Query: Методички (Found: 7) ===


 66%|██████▋   | 117/176 [12:24<06:54,  7.03s/it]


=== Query: Учебники (Found: 41) ===


 67%|██████▋   | 118/176 [12:32<07:04,  7.31s/it]


=== Query: Лекции (Found: 1109) ===


 68%|██████▊   | 119/176 [12:41<07:13,  7.60s/it]


=== Query: Семинары (Found: 227) ===


 68%|██████▊   | 120/176 [12:49<07:14,  7.76s/it]


=== Query: Лабораторные (Found: 103) ===


 69%|██████▉   | 121/176 [12:57<07:07,  7.78s/it]


=== Query: Профком (Found: 45) ===


 69%|██████▉   | 122/176 [12:59<05:34,  6.19s/it]


=== Query: Общага (Found: 1) ===


 70%|██████▉   | 123/176 [13:05<05:30,  6.24s/it]


=== Query: Столовая (Found: 19) ===


 70%|███████   | 124/176 [13:09<04:35,  5.29s/it]


=== Query: Медпункт (Found: 6) ===


 71%|███████   | 125/176 [13:17<05:12,  6.12s/it]


=== Query: Спортзал (Found: 35) ===


 72%|███████▏  | 126/176 [13:25<05:38,  6.78s/it]


=== Query: Творчество (Found: 121) ===


 72%|███████▏  | 127/176 [13:33<05:45,  7.04s/it]


=== Query: Мероприятия (Found: 1474) ===


 73%|███████▎  | 128/176 [13:41<05:53,  7.37s/it]


=== Query: Концерты (Found: 33) ===


 73%|███████▎  | 129/176 [13:49<05:57,  7.61s/it]


=== Query: Фестивали (Found: 59) ===


 74%|███████▍  | 130/176 [13:57<05:58,  7.80s/it]


=== Query: Конкурсы (Found: 190) ===


 74%|███████▍  | 131/176 [14:05<05:55,  7.91s/it]


=== Query: Робототехника (Found: 247) ===


 75%|███████▌  | 132/176 [14:13<05:46,  7.87s/it]


=== Query: Программирование (Found: 431) ===


 76%|███████▌  | 133/176 [14:21<05:35,  7.81s/it]


=== Query: Биотехнологии (Found: 254) ===


 76%|███████▌  | 134/176 [14:29<05:34,  7.97s/it]


=== Query: Нанотехнологии (Found: 106) ===


 77%|███████▋  | 135/176 [14:37<05:26,  7.97s/it]


=== Query: Квантовые технологии (Found: 43) ===


 77%|███████▋  | 136/176 [14:45<05:17,  7.94s/it]


=== Query: Машинное обучение (Found: 205) ===


 78%|███████▊  | 137/176 [14:53<05:11,  8.00s/it]


=== Query: Data Science (Found: 69) ===


 78%|███████▊  | 138/176 [15:01<05:06,  8.08s/it]


=== Query: Кибербезопасность (Found: 81) ===


 79%|███████▉  | 139/176 [15:08<04:42,  7.64s/it]


=== Query: Blockchain (Found: 21) ===


 80%|███████▉  | 140/176 [15:15<04:26,  7.41s/it]


=== Query: VR/AR (Found: 25) ===


 80%|████████  | 141/176 [15:23<04:25,  7.59s/it]


=== Query: Стажировки (Found: 377) ===


 81%|████████  | 142/176 [15:29<03:59,  7.05s/it]


=== Query: Обмен студентами (Found: 14) ===


 81%|████████▏ | 143/176 [15:37<04:03,  7.39s/it]


=== Query: Визы (Found: 43) ===


 82%|████████▏ | 144/176 [15:40<03:17,  6.17s/it]


=== Query: Языковые курсы (Found: 8) ===


 82%|████████▏ | 145/176 [15:46<03:07,  6.04s/it]


=== Query: Международные проекты (Found: 15) ===


 83%|████████▎ | 146/176 [15:54<03:20,  6.69s/it]


=== Query: Резюме (Found: 222) ===


 84%|████████▎ | 147/176 [16:02<03:26,  7.13s/it]


=== Query: Собеседование (Found: 151) ===


 84%|████████▍ | 148/176 [16:10<03:26,  7.38s/it]


=== Query: Практика (Found: 510) ===


 85%|████████▍ | 149/176 [16:18<03:25,  7.61s/it]


=== Query: Стартап-акселератор (Found: 99) ===


 85%|████████▌ | 150/176 [16:26<03:16,  7.54s/it]


=== Query: Бизнес-инкубатор (Found: 263) ===


 86%|████████▌ | 151/176 [16:33<03:03,  7.34s/it]


=== Query: Менторство (Found: 23) ===


 86%|████████▋ | 152/176 [16:36<02:25,  6.07s/it]


=== Query: Networking (Found: 7) ===


 87%|████████▋ | 153/176 [16:43<02:31,  6.60s/it]


=== Query: Soft skills (Found: 226) ===


 88%|████████▊ | 154/176 [16:51<02:31,  6.90s/it]


=== Query: Hard skills (Found: 41) ===


 88%|████████▊ | 155/176 [16:58<02:23,  6.85s/it]


=== Query: Portfolio (Found: 21) ===


 89%|████████▊ | 156/176 [17:06<02:24,  7.21s/it]


=== Query: Диссертация (Found: 47) ===


 89%|████████▉ | 157/176 [17:07<01:42,  5.40s/it]


=== Query: Научрук (Found: 0) ===


 90%|████████▉ | 158/176 [17:15<01:48,  6.04s/it]


=== Query: Патенты (Found: 25) ===


 90%|█████████ | 159/176 [17:22<01:51,  6.55s/it]


=== Query: Scopus (Found: 126) ===


 91%|█████████ | 160/176 [17:30<01:51,  6.94s/it]


=== Query: Web of Science (Found: 62) ===


 91%|█████████▏| 161/176 [17:38<01:48,  7.26s/it]


=== Query: РИНЦ (Found: 1399) ===


 92%|█████████▏| 162/176 [17:47<01:47,  7.69s/it]


=== Query: Импакт-фактор (Found: 48) ===


 93%|█████████▎| 163/176 [17:50<01:20,  6.22s/it]


=== Query: Монография (Found: 5) ===


 93%|█████████▎| 164/176 [17:56<01:14,  6.20s/it]


=== Query: Рецензирование (Found: 17) ===


 94%|█████████▍| 165/176 [18:04<01:15,  6.86s/it]


=== Query: Индекс Хирша (Found: 31) ===


 94%|█████████▍| 166/176 [18:12<01:11,  7.16s/it]


=== Query: Психолог (Found: 400) ===


 95%|█████████▍| 167/176 [18:21<01:08,  7.56s/it]


=== Query: Тьютор (Found: 99) ===


 95%|█████████▌| 168/176 [18:28<01:01,  7.64s/it]


=== Query: Куратор (Found: 427) ===


 96%|█████████▌| 169/176 [18:37<00:55,  7.88s/it]


=== Query: Адаптация (Found: 54) ===


 97%|█████████▋| 170/176 [18:45<00:47,  8.00s/it]


=== Query: Инклюзив (Found: 51) ===


 97%|█████████▋| 171/176 [18:53<00:39,  7.88s/it]


=== Query: Проректор (Found: 431) ===


 98%|█████████▊| 172/176 [18:56<00:25,  6.44s/it]


=== Query: Доступная среда (Found: 6) ===


 98%|█████████▊| 173/176 [18:57<00:14,  4.82s/it]


=== Query: Соцподдержка (Found: 0) ===


 99%|█████████▉| 174/176 [18:58<00:07,  3.67s/it]


=== Query: Материальная помощь (Found: 0) ===


 99%|█████████▉| 175/176 [18:59<00:02,  2.87s/it]


=== Query: Профилакторий (Found: 0) ===


100%|██████████| 176/176 [19:02<00:00,  6.49s/it]


=== Query: Путёвки (Found: 9) ===

All results have been written to 'itmo_search_results_LARGE.csv'.





In [46]:
test = pd.read_csv('itmo_search_results_LARGE.csv', on_bad_lines='skip', delimiter=';')

In [50]:
test.drop_duplicates(subset='url')

Unnamed: 0,Query,Title,url,Date,Snippet/desc,content
0,История,Дополненная реальность: как работает и что жда...,https://news.itmo.ru/ru/education/trend/news/1...,2025-01-31T13:53:30+03:00,"…и объектами. Например, цифровой объект можно ...","Играть в виртуальные шахматы прямо в воздухе, ..."
1,История,"«Можно быть прикольным ученым, а не скучным за...",https://news.itmo.ru/ru/science/photonics/news...,2025-01-15T13:17:31+03:00,…й в пиджаке.\r\n\r\n― И как вы решили идти к ...,В школе Алексей Кохановский прочитал автобиогр...
2,История,«Инклюзия — логичная эволюция дизайна»: какие ...,https://news.itmo.ru/ru/education/trend/news/1...,2024-11-22T16:01:53+03:00,"…ь на будущем месте работы», — подчеркнул выпу...",Как сегодня создаются отечественные цифровые с...
3,История,Не только сфинксы и Эрмитаж: в Петербурге выбр...,https://news.itmo.ru/ru/education/cooperation/...,2024-10-31T10:00:10+03:00,"…о сфинксами, Исаакиевским собором и Бродским ...",В Петербурге подвели итоги проектной школы по ...
4,История,Никакой скучной физкультуры: как «Кронверкские...,https://news.itmo.ru/ru/university_live/social...,2024-10-11T10:02:50+03:00,…Содержание\r\n\r\n\r\n\tКраткаяисторияклуба\r...,Пять раз побеждали во всероссийском конкурсе «...
...,...,...,...,...,...,...
3883,Путёвки,Экспедиция StartupCабантуй: жесткий поиск лучш...,https://news.itmo.ru/ru/startups_and_business/...,2016-06-28T00:00:00+03:00,,Санкт-Петербург впервые оказался в маршрутном ...
3884,Путёвки,Первый день SLUSH-2015: Университет ИТМО зажиг...,https://news.itmo.ru/ru/archive/archive2/news/...,2015-11-12T00:00:00+03:00,,11 ноября завершилcя первый день работы крупне...
3885,Путёвки,Лучшим проектом акселератора Future Technologi...,https://news.itmo.ru/ru/archive/archive2/news/...,2015-09-07T00:00:00+03:00,,Полгода интенсивной работы подошли к концу: в ...
3886,Путёвки,Эксперты Russian Startup Tour выбрали лучшие и...,https://news.itmo.ru/ru/archive/archive2/news/...,2015-03-27T00:00:00+03:00,,Региональный этап Всероссийского стартап-тура ...


In [None]:
queries = [
    "История", "Основание", "Рейтинг", "Титулы", "Миссия",
    "Наука", "Исследования", "Гранты", "Лаборатории", "Публикации",
    "Поступление", "Проходной балл", "Приёмная", "ЕГЭ", "Бюджет",
    "Контракт", "Льготы", "Целевая", "Олимпиады", "Права", "Факультет"
    "Стипендии", "Повышенная", "Соцстипендия", "Студгранты", "Поддержка",
    "Спонсорство", "Президентская", "Международные", "Конкурсы", "Заявка",
    "Магистратура", "Аспирантура", "Докторантура", "Бакалавриат", "Направления",
    "Специальности", "Учплан", "Модули", "Заочка", "Дистанционка", "Новости"
    "Онлайн-курсы", "Допобразование", "Квалификация", "Профпереподг", "Семинары",
    "Летние школы", "Зимние школы", "Вечерние", "Курсы IT", "Компетенции",
    "Партнёры", "Обмен", "Двойной диплом", "Зарубежная", "Академмобильность",
    "Erasmus+", "Иностранцы", "Англопрограммы", "World rankings", "Спорт",
    "Студсовет", "Клубы", "Научные кружки", "Волонтёры", "Студжизнь",
    "Хакатоны", "Проекты", "Конференции", "Стажировки", "Аудитории",
    "Корпуса", "Общежитие", "Проживание", "Матбаза", "Коворкинги",
    "Библиотека", "Электронная", "Инфоресурсы", "Кампус-тур", "Адрес",
    "Транспорт", "Правила", "Академотпуск", "Перевод", "Регламент",
    "Экзамены", "Апелляция", "Э-расписание", "Выпускники", "Вакансии",
    "Трудоустройство", "Компании", "Карьерный центр", "Стартап", "Акселератор",
    "Предприниматель", "Инноватика", "Выдающиеся", "Известные", "Международные"
    "Искуственный интеллект", "Talent Hub", "AI", "ITMO", "ИТМО", "Ректор",
    "Проректор", 
]

queries_second = [
    "Деканат", "Кафедра", "Учёный совет", "Администрация", "Документы",
    "Сессия", "Зачёты", "Диплом", "Практика", "Семестр",
    "Методички", "Учебники", "Лекции", "Семинары", "Лабораторные",
    "Профком", "Общага", "Столовая", "Медпункт", "Спортзал",
    "Творчество", "Мероприятия", "Концерты", "Фестивали", "Конкурсы",
    "Робототехника", "Программирование", "Биотехнологии", "Нанотехнологии", "Квантовые технологии",
    "Машинное обучение", "Data Science", "Кибербезопасность", "Blockchain", "VR/AR",
    "Стажировки", "Обмен студентами", "Визы", "Языковые курсы", "Международные проекты",
    "Резюме", "Собеседование", "Практика", "Стартап-акселератор", "Бизнес-инкубатор",
    "Менторство", "Networking", "Soft skills", "Hard skills", "Portfolio",
    "Диссертация", "Научрук", "Патенты", "Scopus", "Web of Science",
    "РИНЦ", "Импакт-фактор", "Монография", "Рецензирование", "Индекс Хирша",
    "Психолог", "Тьютор", "Куратор", "Адаптация", "Инклюзив",
    "Доступная среда", "Соцподдержка", "Материальная помощь", "Профилакторий", "Путёвки"
]

def main():
    csv_filename = "itmo_search_results_second.csv"

    # Open CSV file for writing
    with open(csv_filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter=";")
        # Header row
        writer.writerow(["Query", "Title", "url", "Date", "Snippet/desc", "content"])

        for query in tqdm(queries):
            try:
                # Call your previously defined function:
                search_response = get_search_results(query, max_articles=40)

                # Print a short summary to console
                print(f"\n=== Query: {query} (Found: {search_response.total_results}) ===")

                for idx, result in enumerate(search_response.results, 1):
                    # Save row to CSV
                    row = [
                        query,
                        result.title,
                        str(result.url),
                        result.date.isoformat() if result.date else "",
                        result.description or "",
                        result.content or ""
                    ]
                    writer.writerow(row)

            except Exception as e:
                # If there's a network error or parse error, print it but continue
                print(f"Error processing query '{query}': {e}", file=sys.stderr)

    print(f"\nAll results have been written to '{csv_filename}'.")

if __name__ == "__main__":
    main()

In [51]:
import pandas as pd

parsed_data = pd.read_csv('itmo_search_results_LARGE.csv', on_bad_lines='skip', delimiter=';')

In [52]:
parsed_data = parsed_data.rename(columns={'URL': 'url', 'Content': 'content'})
parsed_data['source'] = parsed_data['url']
parsed_data = parsed_data.drop_duplicates(subset='url')

In [None]:
parsed_data.to_csv('../../data/processed/texts_LARGE.csv', index=False)

In [13]:
parsed_data_v1 = pd.read_csv('../../data/processed/texts.csv')
parsed_data_v2 = pd.read_csv('../../data/processed/texts_second.csv')

In [30]:
full_data = pd.concat([parsed_data_v1, parsed_data_v2]).drop_duplicates(subset='url')

In [31]:
full_data.to_csv('../../data/processed/texts_final.csv', index=False)