In [None]:
import re
import requests
from bs4 import BeautifulSoup, Tag
import csv
import sys
from tqdm import tqdm
import warnings
from requests.packages.u
from datetime import datetime
from typing import List, Optional
from pydantic import BaseModel, Field, HttpUrl

rllib3.exceptions import InsecureRequestWarning

# Ignore only the InsecureRequestWarning
warnings.simplefilter("ignore", InsecureRequestWarning)

class SearchResult(BaseModel):
    """
    Individual search result from ITMO news/pages
    """
    title: str
    url: HttpUrl
    description: Optional[str] = None
    date: Optional[datetime] = None
    content: Optional[str] = None

class SearchResponse(BaseModel):
    """
    Complete search response
    """
    query: str
    total_results: int
    results: List[SearchResult]
    search_time: datetime = Field(default_factory=datetime.now)

def clean_article_content(content_block: Tag) -> str:
    """
    Remove unnecessary elements and prepare text:
      - Remove <script> and <style>
      - Convert non-breaking spaces (\xa0) to regular spaces
      - Collapse multiple spaces/newlines
    """
    # 1. Remove <script> and <style> tags
    for unwanted_tag in content_block(["script", "style"]):
        unwanted_tag.decompose()

    # 2. Extract text
    text = content_block.get_text(separator="\n", strip=True)

    # 3. Remove non-breaking spaces
    text = text.replace("\xa0", " ")

    # 4. Collapse multiple newlines and extra spaces
    text = re.sub(r"\n\s*\n+", "\n\n", text)
    text = re.sub(r" {2,}", " ", text)

    # 5. Final strip
    text = text.strip()
    return text

def parse_itmo_article_page(html_content: str) -> dict:
    """
    Extracts data from a single ITMO article page:
      - title
      - publication_datetime (e.g. 2025-01-22T11:48:35+03:00)
      - views
      - authors (list of strings)
      - tags (list of strings)
      - cleaned article_text
    """
    soup = BeautifulSoup(html_content, "html.parser")
    
    # Title
    title_tag = soup.select_one("div.article h1")
    title = title_tag.get_text(strip=True) if title_tag else None

    # Date/time in time[datetime], e.g. 2025-01-22T11:48:35+03:00
    time_tag = soup.select_one("div.news-info-wrapper time")
    publication_datetime_str = time_tag.get("datetime", "").strip() if time_tag else None
    publication_datetime = None
    if publication_datetime_str:
        # Try to parse as an ISO datetime
        # If you need more robust parsing, install/use dateutil.parser.parse(...)
        try:
            publication_datetime = datetime.fromisoformat(publication_datetime_str)
        except ValueError:
            # fallback if the format is not strictly ISO or can't be parsed
            publication_datetime = None

    # Views (example: <span class="icon eye">1049</span>)
    views_span = time_tag.select_one("span.icon.eye") if time_tag else None
    views = views_span.get_text(strip=True) if views_span else None

    # Authors
    authors = []
    for author_block in soup.select(".author-block .author-item"):
        name_tag = author_block.select_one(".about h6 a[rel='author']")
        if name_tag:
            authors.append(name_tag.get_text(strip=True))

    # Tags
    tags = []
    for tag_item in soup.select("ul.tags li a"):
        tags.append(tag_item.get_text(strip=True))

    # Main article text
    article_block = soup.select_one(".content.js-mediator-article")
    article_text = clean_article_content(article_block) if article_block else ""

    return {
        "title": title,
        "publication_datetime": publication_datetime,
        "views": views,
        "authors": authors,
        "tags": tags,
        "article_text": article_text
    }


def parse_itmo_search_page(html_content: str):
    """
    Parses the main ITMO search result page, returning:
      - total_results (string or integer)
      - articles: list of {title, link, snippet, date (string)}
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Find total results (e.g. "152 результата")
    total_results_str = None
    h2_element = soup.select_one(".weeklyevents h2 span")
    if h2_element:
        total_results_str = h2_element.get_text(strip=True)

    # Convert e.g. "152 результата" -> 152 (if possible)
    total_results = 0
    if total_results_str:
        # Use regex to extract digits
        match = re.search(r"(\d+)", total_results_str)
        if match:
            total_results = int(match.group(1))

    # Parse each result (li.weeklyevent)
    results = []
    for item in soup.select(".weeklyevents ul li.weeklyevent"):
        h4 = item.find("h4")
        if not h4 or not h4.find("a"):
            continue

        # Title & Link
        title_tag = h4.find("a")
        title = title_tag.get_text(strip=True)
        link = title_tag["href"]
        if link.startswith("/"):
            link = "https://news.itmo.ru" + link

        # paragraphs typically contain snippet & date
        paragraphs = item.find_all("p")
        snippet = paragraphs[0].get_text(strip=True) if len(paragraphs) > 0 else None
        date_str = paragraphs[-1].get_text(strip=True) if len(paragraphs) > 1 else None

        results.append({
            "title": title,
            "link": link,
            "snippet": snippet,
            "date": date_str  # date as string for now
        })

    return {
        "total_results": total_results,
        "articles": results
    }

def get_search_results(
    query: str,
    max_articles: int = 5
) -> SearchResponse:
    """
    High-level function that:
      - Searches ITMO news for `query`
      - Parses search results
      - Fetches each article's content
      - Returns a Pydantic SearchResponse
    """
    # Build the search URL
    base_url = "https://news.itmo.ru/ru/search/"
    params = {"search": query}

    # 1. Fetch the search page
    response = requests.get(base_url, params=params, verify=False)
    response.raise_for_status()

    # 2. Parse the search page
    search_data = parse_itmo_search_page(response.text)
    total_found = search_data["total_results"]
    raw_articles = search_data["articles"]

    # 3. For each article (up to max_articles), parse additional data
    results_list = []
    for article_info in raw_articles[:max_articles]:
        url = article_info["link"]
        snippet = article_info["snippet"]
        date_str = article_info["date"]

        # 3a. Fetch & parse the article page
        article_resp = requests.get(url, verify=False)
        article_resp.raise_for_status()
        article_data = parse_itmo_article_page(article_resp.text)

        # 3b. Build a Pydantic SearchResult
        # We'll use the parse_itmo_article_page date if it's valid,
        # otherwise fallback to the date from the search snippet.
        final_date = article_data["publication_datetime"]

        # If parse_itmo_article_page could not parse any date/time, optionally try
        # to parse `date_str` from the snippet. This might be "22.01.2025" or similar.
        # For demonstration, we'll do a simple attempt:
        if not final_date and date_str:
            # Attempt a naive parse of "DD.MM.YYYY" 
            match = re.search(r"(\d{2}\.\d{2}\.\d{4})", date_str)
            if match:
                try:
                    final_date = datetime.strptime(match.group(1), "%d.%m.%Y")
                except ValueError:
                    final_date = None

        # Construct the SearchResult
        search_result = SearchResult(
            title=article_data["title"] or article_info["title"],
            url=url,
            description=snippet,
            date=final_date,
            content=article_data["article_text"]
        )
        results_list.append(search_result)

    # 4. Build & return our SearchResponse
    return SearchResponse(
        query=query,
        total_results=total_found,
        results=results_list
    )

if __name__ == "__main__":
    # Example usage
    from pprint import pprint

    query = "yandex"
    max_articles = 2  # parse first 3 articles in detail
    search_response = get_search_results(query, max_articles)

    # Print out the resulting data
    # Because it's a Pydantic model, we can do .dict() or .json() too
    print("--- SEARCH RESPONSE (dict) ---")
    pprint(search_response.dict())
    
    # Or just show each result nicely
    print("\n--- HUMAN-READABLE OUTPUT ---")
    print(f"Query: {search_response.query}")
    print(f"Total found: {search_response.total_results}")
    for idx, result in enumerate(search_response.results, 1):
        print(f"\nResult {idx}:")
        print(f"  Title: {result.title}")
        print(f"  URL:   {result.url}")
        print(f"  Date:  {result.date}")
        print(f"  Snippet/desc: {result.description}")
        # Show first 200 chars of content
        if result.content:
            print(f"  Content (truncated): {result.content[:200]}...")
        else:
            print("  Content: [empty]")

In [51]:


# Example shortened queries to parse (feel free to adjust or replace them):
queries = [
    "История", "Основание", "Рейтинг", "Титулы", "Миссия",
    "Наука", "Исследования", "Гранты", "Лаборатории", "Публикации",
    "Поступление", "Проходной балл", "Приёмная", "ЕГЭ", "Бюджет",
    "Контракт", "Льготы", "Целевая", "Олимпиады", "Права", "Факультет"
    "Стипендии", "Повышенная", "Соцстипендия", "Студгранты", "Поддержка",
    "Спонсорство", "Президентская", "Международные", "Конкурсы", "Заявка",
    "Магистратура", "Аспирантура", "Докторантура", "Бакалавриат", "Направления",
    "Специальности", "Учплан", "Модули", "Заочка", "Дистанционка", "Новости"
    "Онлайн-курсы", "Допобразование", "Квалификация", "Профпереподг", "Семинары",
    "Летние школы", "Зимние школы", "Вечерние", "Курсы IT", "Компетенции",
    "Партнёры", "Обмен", "Двойной диплом", "Зарубежная", "Академмобильность",
    "Erasmus+", "Иностранцы", "Англопрограммы", "World rankings", "Спорт",
    "Студсовет", "Клубы", "Научные кружки", "Волонтёры", "Студжизнь",
    "Хакатоны", "Проекты", "Конференции", "Стажировки", "Аудитории",
    "Корпуса", "Общежитие", "Проживание", "Матбаза", "Коворкинги",
    "Библиотека", "Электронная", "Инфоресурсы", "Кампус-тур", "Адрес",
    "Транспорт", "Правила", "Академотпуск", "Перевод", "Регламент",
    "Экзамены", "Апелляция", "Э-расписание", "Выпускники", "Вакансии",
    "Трудоустройство", "Компании", "Карьерный центр", "Стартап", "Акселератор",
    "Предприниматель", "Инноватика", "Выдающиеся", "Известные", "Международные"
    "Искуственный интеллект", "Talent Hub", "AI", "ITMO", "ИТМО", "Ректор",
    "Проректор"
]

def main():
    csv_filename = "itmo_search_results.csv"

    # Open CSV file for writing
    with open(csv_filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter=";")
        # Header row
        writer.writerow(["Query", "Title", "url", "Date", "Snippet/desc", "content"])

        for query in tqdm(queries):
            try:
                # Call your previously defined function:
                search_response = get_search_results(query, max_articles=40)

                # Print a short summary to console
                print(f"\n=== Query: {query} (Found: {search_response.total_results}) ===")

                for idx, result in enumerate(search_response.results, 1):
                    # Save row to CSV
                    row = [
                        query,
                        result.title,
                        str(result.url),
                        result.date.isoformat() if result.date else "",
                        result.description or "",
                        result.content or ""
                    ]
                    writer.writerow(row)

            except Exception as e:
                # If there's a network error or parse error, print it but continue
                print(f"Error processing query '{query}': {e}", file=sys.stderr)

    print(f"\nAll results have been written to '{csv_filename}'.")

if __name__ == "__main__":
    main()

  1%|          | 1/106 [00:02<04:58,  2.85s/it]


=== Query: История (Found: 440) ===


  2%|▏         | 2/106 [00:05<05:07,  2.96s/it]


=== Query: Основание (Found: 64) ===


  3%|▎         | 3/106 [00:08<04:51,  2.83s/it]


=== Query: Рейтинг (Found: 758) ===


  4%|▍         | 4/106 [00:09<03:40,  2.16s/it]


=== Query: Титулы (Found: 0) ===


  5%|▍         | 5/106 [00:12<03:57,  2.35s/it]


=== Query: Миссия (Found: 294) ===


  6%|▌         | 6/106 [00:15<04:07,  2.47s/it]


=== Query: Наука (Found: 905) ===


  7%|▋         | 7/106 [00:17<04:08,  2.51s/it]


=== Query: Исследования (Found: 2386) ===


  8%|▊         | 8/106 [00:20<04:08,  2.54s/it]


=== Query: Гранты (Found: 242) ===


  8%|▊         | 9/106 [00:22<04:07,  2.55s/it]


=== Query: Лаборатории (Found: 1708) ===


  9%|▉         | 10/106 [00:25<04:03,  2.53s/it]


=== Query: Публикации (Found: 348) ===


 10%|█         | 11/106 [00:28<04:07,  2.60s/it]


=== Query: Поступление (Found: 188) ===


 11%|█▏        | 12/106 [00:30<04:12,  2.68s/it]


=== Query: Проходной балл (Found: 28) ===


 12%|█▏        | 13/106 [00:33<04:09,  2.68s/it]


=== Query: Приёмная (Found: 80) ===


 13%|█▎        | 14/106 [00:36<04:08,  2.70s/it]


=== Query: ЕГЭ (Found: 245) ===


 14%|█▍        | 15/106 [00:38<03:58,  2.62s/it]


=== Query: Бюджет (Found: 482) ===


 15%|█▌        | 16/106 [00:41<03:59,  2.66s/it]


=== Query: Контракт (Found: 206) ===


 16%|█▌        | 17/106 [00:44<04:00,  2.71s/it]


=== Query: Льготы (Found: 65) ===


 17%|█▋        | 18/106 [00:47<04:03,  2.77s/it]


=== Query: Целевая (Found: 52) ===


 18%|█▊        | 19/106 [00:50<04:02,  2.78s/it]


=== Query: Олимпиады (Found: 433) ===


 19%|█▉        | 20/106 [00:52<03:56,  2.74s/it]


=== Query: Права (Found: 216) ===


 20%|█▉        | 21/106 [00:53<03:06,  2.20s/it]


=== Query: ФакультетСтипендии (Found: 0) ===


 21%|██        | 22/106 [00:56<03:19,  2.37s/it]


=== Query: Повышенная (Found: 24) ===


 22%|██▏       | 23/106 [00:57<02:45,  1.99s/it]


=== Query: Соцстипендия (Found: 0) ===


 23%|██▎       | 24/106 [00:58<02:20,  1.72s/it]


=== Query: Студгранты (Found: 0) ===


 24%|██▎       | 25/106 [01:01<02:41,  1.99s/it]


=== Query: Поддержка (Found: 333) ===


 25%|██▍       | 26/106 [01:02<02:26,  1.83s/it]


=== Query: Спонсорство (Found: 2) ===


 25%|██▌       | 27/106 [01:05<02:34,  1.96s/it]


=== Query: Президентская (Found: 7) ===


 26%|██▋       | 28/106 [01:07<02:44,  2.11s/it]


=== Query: Международные (Found: 393) ===


 27%|██▋       | 29/106 [01:10<02:56,  2.29s/it]


=== Query: Конкурсы (Found: 190) ===


 28%|██▊       | 30/106 [01:12<03:00,  2.38s/it]


=== Query: Заявка (Found: 134) ===


 29%|██▉       | 31/106 [01:15<03:02,  2.44s/it]


=== Query: Магистратура (Found: 216) ===


 30%|███       | 32/106 [01:18<03:06,  2.52s/it]


=== Query: Аспирантура (Found: 62) ===


 31%|███       | 33/106 [01:19<02:35,  2.13s/it]


=== Query: Докторантура (Found: 1) ===


 32%|███▏      | 34/106 [01:21<02:45,  2.30s/it]


=== Query: Бакалавриат (Found: 771) ===


 33%|███▎      | 35/106 [01:24<02:50,  2.40s/it]


=== Query: Направления (Found: 1807) ===


 34%|███▍      | 36/106 [01:27<02:57,  2.54s/it]


=== Query: Специальности (Found: 358) ===


 35%|███▍      | 37/106 [01:28<02:24,  2.09s/it]


=== Query: Учплан (Found: 0) ===


 36%|███▌      | 38/106 [01:31<02:36,  2.30s/it]


=== Query: Модули (Found: 132) ===


 37%|███▋      | 39/106 [01:32<02:07,  1.91s/it]


=== Query: Заочка (Found: 0) ===


 38%|███▊      | 40/106 [01:33<01:52,  1.70s/it]


=== Query: Дистанционка (Found: 1) ===


 39%|███▊      | 41/106 [01:34<01:39,  1.54s/it]


=== Query: НовостиОнлайн-курсы (Found: 0) ===


 40%|███▉      | 42/106 [01:35<01:33,  1.46s/it]


=== Query: Допобразование (Found: 1) ===


 41%|████      | 43/106 [01:38<01:57,  1.86s/it]


=== Query: Квалификация (Found: 22) ===


 42%|████▏     | 44/106 [01:40<01:48,  1.74s/it]


=== Query: Профпереподг (Found: 2) ===


 42%|████▏     | 45/106 [01:42<02:04,  2.05s/it]


=== Query: Семинары (Found: 227) ===


 43%|████▎     | 46/106 [01:45<02:17,  2.30s/it]


=== Query: Летние школы (Found: 50) ===


 44%|████▍     | 47/106 [01:48<02:26,  2.49s/it]


=== Query: Зимние школы (Found: 26) ===


 45%|████▌     | 48/106 [01:51<02:22,  2.46s/it]


=== Query: Вечерние (Found: 7) ===


 46%|████▌     | 49/106 [01:52<02:01,  2.14s/it]


=== Query: Курсы IT (Found: 2) ===


 47%|████▋     | 50/106 [01:55<02:10,  2.32s/it]


=== Query: Компетенции (Found: 463) ===


 48%|████▊     | 51/106 [01:58<02:18,  2.52s/it]


=== Query: Партнёры (Found: 329) ===


 49%|████▉     | 52/106 [02:01<02:26,  2.72s/it]


=== Query: Обмен (Found: 926) ===


 50%|█████     | 53/106 [02:03<02:11,  2.48s/it]


=== Query: Двойной диплом (Found: 5) ===


 51%|█████     | 54/106 [02:05<02:02,  2.36s/it]


=== Query: Зарубежная (Found: 6) ===


 52%|█████▏    | 55/106 [02:06<01:41,  2.00s/it]


=== Query: Академмобильность (Found: 0) ===


 53%|█████▎    | 56/106 [02:09<01:53,  2.27s/it]


=== Query: Erasmus+ (Found: 53) ===


 54%|█████▍    | 57/106 [02:12<02:03,  2.52s/it]


=== Query: Иностранцы (Found: 34) ===


 55%|█████▍    | 58/106 [02:13<01:41,  2.11s/it]


=== Query: Англопрограммы (Found: 0) ===


 56%|█████▌    | 59/106 [02:15<01:31,  1.95s/it]


=== Query: World rankings (Found: 3) ===


 57%|█████▋    | 60/106 [02:18<01:41,  2.21s/it]


=== Query: Спорт (Found: 1649) ===


 58%|█████▊    | 61/106 [02:20<01:43,  2.30s/it]


=== Query: Студсовет (Found: 8) ===


 58%|█████▊    | 62/106 [02:23<01:48,  2.46s/it]


=== Query: Клубы (Found: 69) ===


 59%|█████▉    | 63/106 [02:24<01:29,  2.09s/it]


=== Query: Научные кружки (Found: 0) ===


 60%|██████    | 64/106 [02:27<01:36,  2.29s/it]


=== Query: Волонтёры (Found: 83) ===


 61%|██████▏   | 65/106 [02:28<01:19,  1.94s/it]


=== Query: Студжизнь (Found: 0) ===


 62%|██████▏   | 66/106 [02:31<01:29,  2.23s/it]


=== Query: Хакатоны (Found: 59) ===


 63%|██████▎   | 67/106 [02:34<01:31,  2.36s/it]


=== Query: Проекты (Found: 1725) ===


 64%|██████▍   | 68/106 [02:36<01:33,  2.45s/it]


=== Query: Конференции (Found: 1014) ===


 65%|██████▌   | 69/106 [02:39<01:34,  2.56s/it]


=== Query: Стажировки (Found: 377) ===


 66%|██████▌   | 70/106 [02:42<01:37,  2.71s/it]


=== Query: Аудитории (Found: 525) ===


 67%|██████▋   | 71/106 [02:45<01:37,  2.78s/it]


=== Query: Корпуса (Found: 226) ===


 68%|██████▊   | 72/106 [02:48<01:37,  2.88s/it]


=== Query: Общежитие (Found: 53) ===


 69%|██████▉   | 73/106 [02:51<01:34,  2.88s/it]


=== Query: Проживание (Found: 101) ===


 70%|██████▉   | 74/106 [02:52<01:14,  2.32s/it]


=== Query: Матбаза (Found: 0) ===


 71%|███████   | 75/106 [02:55<01:19,  2.57s/it]


=== Query: Коворкинги (Found: 31) ===


 72%|███████▏  | 76/106 [02:58<01:21,  2.70s/it]


=== Query: Библиотека (Found: 113) ===


 73%|███████▎  | 77/106 [03:01<01:20,  2.79s/it]


=== Query: Электронная (Found: 80) ===


 74%|███████▎  | 78/106 [03:03<01:04,  2.31s/it]


=== Query: Инфоресурсы (Found: 0) ===


 75%|███████▍  | 79/106 [03:04<00:52,  1.96s/it]


=== Query: Кампус-тур (Found: 0) ===


 75%|███████▌  | 80/106 [03:07<00:59,  2.28s/it]


=== Query: Адрес (Found: 541) ===


 76%|███████▋  | 81/106 [03:09<01:00,  2.43s/it]


=== Query: Транспорт (Found: 646) ===


 77%|███████▋  | 82/106 [03:12<01:02,  2.60s/it]


=== Query: Правила (Found: 581) ===


 78%|███████▊  | 83/106 [03:14<00:50,  2.19s/it]


=== Query: Академотпуск (Found: 0) ===


 79%|███████▉  | 84/106 [03:16<00:50,  2.30s/it]


=== Query: Перевод (Found: 375) ===


 80%|████████  | 85/106 [03:19<00:51,  2.43s/it]


=== Query: Регламент (Found: 129) ===


 81%|████████  | 86/106 [03:22<00:50,  2.50s/it]


=== Query: Экзамены (Found: 156) ===


 82%|████████▏ | 87/106 [03:23<00:43,  2.28s/it]


=== Query: Апелляция (Found: 4) ===


 83%|████████▎ | 88/106 [03:24<00:34,  1.89s/it]


=== Query: Э-расписание (Found: 0) ===


 84%|████████▍ | 89/106 [03:27<00:36,  2.17s/it]


=== Query: Выпускники (Found: 638) ===


 85%|████████▍ | 90/106 [03:30<00:37,  2.36s/it]


=== Query: Вакансии (Found: 77) ===


 86%|████████▌ | 91/106 [03:33<00:37,  2.52s/it]


=== Query: Трудоустройство (Found: 66) ===


 87%|████████▋ | 92/106 [03:35<00:34,  2.50s/it]


=== Query: Компании (Found: 2004) ===


 88%|████████▊ | 93/106 [03:37<00:27,  2.09s/it]


=== Query: Карьерный центр (Found: 0) ===


 89%|████████▊ | 94/106 [03:39<00:27,  2.30s/it]


=== Query: Стартап (Found: 1000) ===


 90%|████████▉ | 95/106 [03:42<00:26,  2.44s/it]


=== Query: Акселератор (Found: 449) ===


 91%|█████████ | 96/106 [03:45<00:25,  2.52s/it]


=== Query: Предприниматель (Found: 810) ===


 92%|█████████▏| 97/106 [03:48<00:23,  2.62s/it]


=== Query: Инноватика (Found: 44) ===


 92%|█████████▏| 98/106 [03:50<00:21,  2.68s/it]


=== Query: Выдающиеся (Found: 66) ===


 93%|█████████▎| 99/106 [03:53<00:19,  2.74s/it]


=== Query: Известные (Found: 213) ===


 94%|█████████▍| 100/106 [03:54<00:13,  2.23s/it]


=== Query: МеждународныеИскуственный интеллект (Found: 0) ===


 96%|█████████▌| 102/106 [03:57<00:06,  1.72s/it]


=== Query: Talent Hub (Found: 13) ===

=== Query: AI (Found: 0) ===


 97%|█████████▋| 103/106 [04:00<00:05,  1.96s/it]


=== Query: ITMO (Found: 3045) ===


 98%|█████████▊| 104/106 [04:02<00:03,  1.90s/it]


=== Query: ИТМО (Found: 6969) ===


 99%|█████████▉| 105/106 [04:04<00:02,  2.11s/it]


=== Query: Ректор (Found: 2685) ===


100%|██████████| 106/106 [04:07<00:00,  2.33s/it]


=== Query: Проректор (Found: 431) ===

All results have been written to 'itmo_search_results.csv'.





In [None]:
parsed_data = pd.read_csv('itmo_search_results.csv', on_bad_lines='skip', delimiter=';')

In [63]:
parsed_data = parsed_data.rename(columns={'URL': 'url', 'Content': 'content'})
parsed_data['source'] =  parsed_data['url']

In [68]:
parsed_data.to_csv('../data/processed/texts.csv', index=False)

In [70]:
pd.read_csv('../data/processed/texts.csv')

Unnamed: 0,Query,Title,url,Date,Snippet/desc,content,source
0,История,"«Можно быть прикольным ученым, а не скучным за...",https://news.itmo.ru/ru/science/photonics/news...,2025-01-15T13:17:31+03:00,…й в пиджаке.\r\n\r\n― И как вы решили идти к ...,В школе Алексей Кохановский прочитал автобиогр...,https://news.itmo.ru/ru/science/photonics/news...
1,История,«Инклюзия — логичная эволюция дизайна»: какие ...,https://news.itmo.ru/ru/education/trend/news/1...,2024-11-22T16:01:53+03:00,"…ь на будущем месте работы», — подчеркнул выпу...",Как сегодня создаются отечественные цифровые с...,https://news.itmo.ru/ru/education/trend/news/1...
2,История,Не только сфинксы и Эрмитаж: в Петербурге выбр...,https://news.itmo.ru/ru/education/cooperation/...,2024-10-31T10:00:10+03:00,"…о сфинксами, Исаакиевским собором и Бродским ...",В Петербурге подвели итоги проектной школы по ...,https://news.itmo.ru/ru/education/cooperation/...
3,История,Никакой скучной физкультуры: как «Кронверкские...,https://news.itmo.ru/ru/university_live/social...,2024-10-11T10:02:50+03:00,…Содержание\r\n\r\n\r\n\tКраткаяисторияклуба\r...,Пять раз побеждали во всероссийском конкурсе «...,https://news.itmo.ru/ru/university_live/social...
4,История,"Виртуальные электростанции, но реальное электр...",https://news.itmo.ru/ru/science/cyberphysics/n...,2024-10-08T11:01:13+03:00,"…ается в науку. Еще лучше, если вам удалось по...",Джексон Джон Джусто приехал в Петербург из Тан...,https://news.itmo.ru/ru/science/cyberphysics/n...
...,...,...,...,...,...,...,...
782,Проректор,Нейросеть для учителя: как эффективно использо...,https://news.itmo.ru/ru/education/trend/news/1...,2024-09-04T17:20:13+03:00,…Искусственный интеллект» ИТМО Д…,Искусственный интеллект ― далеко не просто «иг...,https://news.itmo.ru/ru/education/trend/news/1...
783,Проректор,"ITMO CONF 2024: этика, бизнес и образование в ...",https://news.itmo.ru/ru/education/trend/news/1...,2024-09-02T20:25:49+03:00,…ИТМО — это ИИ\r\n\r\nНа открытии конференции ...,В этом году главной темой ITMO CONF стал тезис...,https://news.itmo.ru/ru/education/trend/news/1...
784,Проректор,В ИТМО Хайпарке построят комплекс для фиджитал...,https://news.itmo.ru/ru/startups_and_business/...,2024-06-03T15:22:29+03:00,…бразовательные программы и методические реком...,Представители образования и городской админист...,https://news.itmo.ru/ru/startups_and_business/...
785,Проректор,Поменялись местами: зачем деканы ИТМО неделю р...,https://news.itmo.ru/ru/education/official/new...,2024-05-13T09:30:00+03:00,"…цессами Студофиса, в университете провели экс...",В ИТМО студентам не нужно бегать по инстанциям...,https://news.itmo.ru/ru/education/official/new...
