In [2]:
from bs4 import BeautifulSoup
import requests


def get_guardian_news_items(timeout=30):
    """Scrape headline cards from the Guardian "World" section page.

    Downloads https://www.theguardian.com/world, parses the HTML with
    BeautifulSoup and pairs up the link, headline and image elements that are
    located through three CSS class selectors.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        list[dict]: One dict per card with the keys ``"title"`` (headline
        text), ``"href"`` (absolute article URL) and ``"image"`` (the image
        element's ``src`` attribute, ``None`` when the attribute is absent).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    from urllib.parse import urljoin

    url = 'https://www.theguardian.com/world'
    headers = {
        'Accept': '*/*',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.167 YaBrowser/22.7.3.799 Yowser/2.5 Safari/537.36'
    }

    req = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were
    # the section front page.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, 'lxml')

    items = []

    # NOTE(review): the three result lists are matched purely by position. If
    # a card on the page lacks one of the elements, every following item is
    # shifted; confirm against the page markup.
    for href, text, image in zip(
        soup.find_all(class_="dcr-lv2v9o"),
        soup.find_all(class_="show-underline dcr-1r9ptb2"),
        soup.find_all(class_="dcr-evn1e9")
    ):
        try:
            item_href = href.get('href')
            if not item_href:
                # Without a link there is no article to point to.
                continue

            items.append({
                "title": text.text,
                # urljoin resolves site-relative paths and leaves hrefs that
                # are already absolute untouched.
                "href": urljoin(url, item_href),
                "image": image.get('src')
            })
        except Exception as e:
            print(f"Ошибка при обработке элемента: {e}")

    return items



In [8]:
import os
import requests
import pandas as pd
import numpy as np
import spacy
import pickle
from transformers import MarianMTModel, MarianTokenizer, AlbertTokenizer, AlbertForSequenceClassification
import geonamescache
import torch

In [27]:
# spaCy English pipeline (its named entities are used further down) and the
# geonamescache lookup object.
nlp = spacy.load('en_core_web_md')
gc = geonamescache.GeonamesCache()

# ALBERT sequence-classification model and its tokenizer, both loaded from the
# local ./albert_model directory.
model_path = "./albert_model"

loaded_model = AlbertForSequenceClassification.from_pretrained(model_path)
tokenizer = AlbertTokenizer.from_pretrained(model_path)

# MarianMT translation model and tokenizer, identified by model name
# (presumably English -> Russian, judging by the "en-ru" suffix).
model_name_translator = "Helsinki-NLP/opus-mt-en-ru"
model_translator = MarianMTModel.from_pretrained(model_name_translator)
tokenizer_translator = MarianTokenizer.from_pretrained(model_name_translator)


# Naming the columns explicitly keeps the frame's schema stable even when the
# scraper returns an empty list, so later column lookups do not raise KeyError.
df = pd.DataFrame(get_guardian_news_items(), columns=["title", "href", "image"])

In [12]:
# Preview the first rows of the frame built from the scraped news items.
df.head()

Unnamed: 0,title,href,image
0,African and Caribbean nations agree move to se...,https://www.theguardian.com/world/2023/nov/17/...,https://i.guim.co.uk/img/media/6661dfa6bca51af...
1,South Africa ‘can’t afford’ to pay for new ant...,https://www.theguardian.com/global-development...,https://i.guim.co.uk/img/media/52f11bda57b348c...
2,Solar energy could power all health facilities...,https://www.theguardian.com/global-development...,https://i.guim.co.uk/img/media/943ce86712538fa...
3,Peers and MPs pledge to block Sunak’s Rwanda p...,https://www.theguardian.com/politics/2023/nov/...,https://i.guim.co.uk/img/media/488d8cbb863ba38...
4,UK government to present full law to set aside...,https://www.theguardian.com/uk-news/2023/nov/1...,https://i.guim.co.uk/img/media/28476b1086f473e...


In [28]:
# Classify every headline with the ALBERT model, one title at a time.
predicted_classes = []

# Inference only: no_grad() stops PyTorch from recording the autograd graph,
# which saves memory and time without changing the logits.
with torch.no_grad():
    for title in df['title']:
        # truncation=True clips over-long input to the tokenizer's maximum
        # length instead of passing an oversized sequence to the model.
        inputs = tokenizer([title], return_tensors="pt", truncation=True)
        logits = loaded_model(**inputs).logits
        predicted_classes.append(torch.argmax(logits, dim=1).item())

# Assign the whole column at once; the values are the argmax class indices.
df['predicted_class'] = predicted_classes

# Display the DataFrame to confirm the column was added.
df

Unnamed: 0,title,href,image,predicted_class
0,African and Caribbean nations agree move to se...,https://www.theguardian.com/world/2023/nov/17/...,https://i.guim.co.uk/img/media/6661dfa6bca51af...,1
1,South Africa ‘can’t afford’ to pay for new ant...,https://www.theguardian.com/global-development...,https://i.guim.co.uk/img/media/52f11bda57b348c...,0
2,Solar energy could power all health facilities...,https://www.theguardian.com/global-development...,https://i.guim.co.uk/img/media/943ce86712538fa...,0
3,Peers and MPs pledge to block Sunak’s Rwanda p...,https://www.theguardian.com/politics/2023/nov/...,https://i.guim.co.uk/img/media/488d8cbb863ba38...,1
4,UK government to present full law to set aside...,https://www.theguardian.com/uk-news/2023/nov/1...,https://i.guim.co.uk/img/media/28476b1086f473e...,0
5,UK ministers’ efforts to revive Rwanda policy ...,https://www.theguardian.com/uk-news/2023/nov/1...,https://i.guim.co.uk/img/media/30dd662cdbe2286...,1
6,Bolsonaro under investigation for ‘harassing’ ...,https://www.theguardian.com/world/2023/nov/19/...,https://i.guim.co.uk/img/media/f6f1d3e96964158...,0
7,Mystery of ‘decades-old’ plane wreck in Canadi...,https://www.theguardian.com/world/2023/nov/19/...,https://i.guim.co.uk/img/media/2b4420f61a69b99...,0
8,‘Hell de Janeiro’: scorching heat highlights B...,https://www.theguardian.com/global-development...,https://i.guim.co.uk/img/media/d59148bf90a093b...,0
9,Argentina holds breath as far-right Milei seiz...,https://www.theguardian.com/world/2023/nov/19/...,https://i.guim.co.uk/img/media/10deb8879e4b940...,0


In [35]:
# Keep only the headlines that the classifier assigned to class 1.
relevant_items = df[df['predicted_class'] == 1]
need_list = relevant_items['title'].tolist()
image_list = relevant_items['image'].tolist()
url_list = relevant_items['href'].tolist()

# Country lookup keyed by country name. It is the same for every headline, so
# it is fetched once instead of once per entity.
country_names = gc.get_countries_by_names()

# Abbreviations that are not keys of the country lookup, mapped to the country
# name stored in the result.
country_aliases = {
    "US": "United States of America",
    "UK": "United Kingdom",
}

all_news_data = []
for title, image, href in zip(need_list, image_list, url_list):
    # Translate the headline with the MarianMT model.
    inputs = tokenizer_translator.encode(title, return_tensors="pt")
    translated = model_translator.generate(inputs)
    translation = tokenizer_translator.decode(
        translated[0], skip_special_tokens=True)

    # Sort the geopolitical entities (spaCy label "GPE") of the headline into
    # countries and cities.
    # gc.get_cities() is keyed by geonameid rather than by city name, so it
    # cannot answer "is this name a city?". Every GPE that is neither a known
    # country nor an alias is therefore recorded as a city.
    found_countries = []
    found_cities = []
    for ent in nlp(title).ents:
        if ent.label_ != 'GPE':
            continue
        country_city_name = ent.text
        if country_city_name in country_names:
            found_countries.append(country_city_name)
        elif country_city_name in country_aliases:
            found_countries.append(country_aliases[country_city_name])
        else:
            found_cities.append(country_city_name)

    all_news_data.append({
        "title_en": title,
        "title_ru": translation,
        # Sequential id: the number of records collected before this one.
        "id": len(all_news_data),
        "href": href,
        "country": found_countries,
        "city": found_cities,
        "topical_keywords": None,
    })

all_news_data

[{'title_en': 'African and Caribbean nations agree move to seek reparations for slavery',
  'title_ru': 'Африканские и карибские государства соглашаются добиваться возмещения ущерба в связи с рабством',
  'id': 0,
  'href': 'https://www.theguardian.com/world/2023/nov/17/african-and-caribbean-nations-agree-move-to-seek-reparations-for-slavery',
  'country': [],
  'city': [],
  'topical_keywords': None},
 {'title_en': 'Peers and MPs pledge to block Sunak’s Rwanda plan as Braverman labels it ‘magical thinking’',
  'title_ru': 'Коллеги и члены парламента обязуются заблокировать план «Сунакс» Руанды, поскольку Брейверман называет его «магическим мышлением»10.',
  'id': 1,
  'href': 'https://www.theguardian.com/politics/2023/nov/16/westminster-peers-and-mps-pledge-to-block-sunaks-new-rwanda-plan-on-asylum-seekers',
  'country': ['Rwanda'],
  'city': [],
  'topical_keywords': None},
 {'title_en': 'UK\xa0ministers’ efforts to revive Rwanda policy likely to fail, lawyers say',
  'title_ru': 'По