In [16]:
from bs4 import BeautifulSoup
import requests


# Fetch the Guardian "World" front page and build one record per teaser:
# headline text, absolute article URL and thumbnail URL.
url = 'https://www.theguardian.com/world'
headers = {
    'Accept': '*/*',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.167 YaBrowser/22.7.3.799 Yowser/2.5 Safari/537.36'
}
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being parsed as news.
req = requests.get(url, headers=headers, timeout=30)
req.raise_for_status()
src = req.text
soup = BeautifulSoup(src, 'lxml')

# NOTE(review): the dcr-* class names look build-generated and may change
# between site releases - verify the selectors if the result comes back empty.
allHrefs = soup.find_all(class_="dcr-lv2v9o")
allText = soup.find_all(class_="show-underline dcr-adlhb4")
allImage = soup.find_all(class_="dcr-evn1e9")

# The three result lists are paired purely by position. zip() stops at the
# shortest one, which yields the same records the index loop produced without
# using IndexError as control flow.
items = []
for textTag, hrefTag, imageTag in zip(allText, allHrefs, allImage):
    items.append(
        {
            "title": textTag.text,
            "href": "https://www.theguardian.com" + str(hrefTag.get('href')),
            "image": imageTag.get('src')
        }
    )

# Report how many elements had no positional partner instead of printing the
# bare Exception class once per failed index.
max_len = max(len(allHrefs), len(allText), len(allImage))
if len(items) < max_len:
    print(
        f"Skipped {max_len - len(items)} unmatched element(s): "
        f"{len(allText)} titles, {len(allHrefs)} links, {len(allImage)} images"
    )

items


<class 'Exception'>
<class 'Exception'>


[{'title': 'Court blocks Kenya from deploying  police officers to Haiti to fight gangs',
  'href': 'https://www.theguardian.com/world/2023/oct/09/court-blocks-kenya-from-deploying-police-officers-to-haiti',
  'image': 'https://i.guim.co.uk/img/media/529306234c00f65e157443c318a9f560358528a8/0_367_5500_3300/master/5500.jpg?width=360&dpr=1&s=none'},
 {'title': ' Egypt: police officer shoots dead two Israeli tourists and Egyptian guide',
  'href': 'https://www.theguardian.com/world/2023/oct/08/egypt-police-officer-shoots-dead-two-israeli-tourists-and-egyptian-guide',
  'image': 'https://i.guim.co.uk/img/media/33684d646fe8f262f9669b2e2e860ab5846b2618/0_146_4380_2629/master/4380.jpg?width=120&dpr=1&s=none'},
 {'title': 'IMF\xa0clings to a hopeful agenda as crisis follows crisis',
  'href': 'https://www.theguardian.com/business/2023/oct/08/imf-clings-to-a-hopeful-agenda-as-crisis-follows-crisis',
  'image': 'https://i.guim.co.uk/img/media/1f751e6a3b74bf8f51f13e8b297970553e50e5eb/0_93_4800_288

In [14]:
import pandas as pd
import spacy
import numpy as np
import pickle
import geonamescache

# spaCy pipeline and GeoNames cache object used by this and later cells.
nlp = spacy.load('en_core_web_lg')
gc = geonamescache.GeonamesCache()

# One row per record in `items`.
df = pd.DataFrame(items)

# Reduce every title to its lemmas, dropping stop words, whitespace tokens and
# punctuation, and join the survivors with single spaces.
preprocessing_text = [
    ' '.join(
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_space or token.is_punct)
    )
    for text in df['title']
]
df['preprocessing_text'] = preprocessing_text

# Vector of the cleaned text, one per row.
df['vector'] = df['preprocessing_text'].apply(
    lambda cleaned: nlp(cleaned).vector
)

# From here on `preprocessing_text` is the stacked array of row vectors,
# not the list of cleaned strings built above.
preprocessing_text = np.stack(df['vector'])

In [15]:

# Load the pickled classifier from the working directory and label every row
# of the stacked feature array.
# SECURITY: pickle.load runs arbitrary code embedded in the file, so
# 'model_pickle' must only come from a trusted source.
# NOTE(review): the captured output links to scikit-learn's model-persistence
# limitations page, which suggests a warning was raised while unpickling -
# confirm the library version matches the one used to train the model.
with open('model_pickle', 'rb') as file:
    rfc = pickle.load(file)
# The filtering cells keep the rows whose entry in y_pred equals 1.
y_pred = rfc.predict(preprocessing_text)
y_pred


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


array([1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [5]:
# Keep only the rows whose predicted label is 1 and rebuild `items` from them.
# Assumes y_pred holds one label per row of df, in row order.
need_list = []
image_list = []
url_list = []

# One pass per column keeps the printed log grouped by column. Pairing each
# value with its label through zip() removes the hand-maintained counter and
# the stale-variable print in the href pass (it used to echo the last image).
for column, selected in (
    ('title', need_list),
    ('image', image_list),
    ('href', url_list),
):
    for value, label in zip(df[column], y_pred):
        if label == 1:
            print(value, label)
            selected.append(value)

# Re-number the surviving records from 0; the three lists are index-aligned
# because they were filtered with the same labels.
items = [
    {
        "title": title,
        "id": idx,
        "href": href,
        "image": image
    }
    for idx, (title, href, image) in enumerate(
        zip(need_list, url_list, image_list)
    )
]

Court blocks Kenya from deploying  police officers to Haiti to fight gangs 1
 Egypt: police officer shoots dead two Israeli tourists and Egyptian guide 1
IMF clings to a hopeful agenda as crisis follows crisis 1
France’s departure from Niger reflects years of failure in its former colonies 1
France to begin pulling out troops from Niger this week  1
Border walls and deportations: Biden’s migrant plans prompt outrage 1
Six suspects in murder of Ecuador presidential candidate killed in prison, authorities say 1
AOC slams sanctions against Venezuela and deportation flights 1
Campaigners aim to lower support for China on UN human rights council 1
PNG threatens to send refugees back to Australia unless it keeps funding humanitarian program 1
Three bears that holed up in factory in Japan are captured and killed 1
Closure of maternity wards fuels Chinese debate over population decline 1
Pro-Palestinian rally in Sydney calls for Australia to drop support for Israel 1
Hope for power bill relief

In [6]:
# Raw country and city lookups from the GeonamesCache instance `gc`.
# Both names are re-bound to flat lists of 'name' values further down in this
# cell, so these structures are only intermediate.
countries = gc.get_countries()
cities = gc.get_cities()
def gen_dict_extract(var, key):
    """Yield every value stored under `key` anywhere inside `var`.

    `var` may be an arbitrarily nested mix of dicts and lists. Dicts are
    scanned in insertion order; a matching value is yielded first and, if it
    is itself a dict or list, searched afterwards. Any other type yields
    nothing.
    """
    if isinstance(var, list):
        for element in var:
            yield from gen_dict_extract(element, key)
        return
    if not isinstance(var, dict):
        return
    for name, value in var.items():
        if name == key:
            yield value
        if isinstance(value, (dict, list)):
            yield from gen_dict_extract(value, key)
# Flatten the GeoNames structures into plain lists of names.
countries = [*gen_dict_extract(countries, 'name')]
cities = [*gen_dict_extract(cities, 'name')]

# Sets make the per-entity membership tests O(1) instead of scanning the name
# lists for every entity.
# Assumes every extracted 'name' value is a hashable string - TODO confirm.
country_lookup = set(countries)
city_lookup = set(cities)

# Abbreviations that appear in headlines, mapped to the full name to record.
COUNTRY_ALIASES = {
    "US": "United States of America",
    "UK": "United Kingdom",
}

# One record per GPE entity that is either a known country or not a known
# city; entities matching a known city are skipped.
info = []
for title in need_list:
    doc = nlp(title)
    for ent in doc.ents:
        if ent.label_ != 'GPE':
            continue
        if ent.text in country_lookup:
            country = ent.text
        elif ent.text in city_lookup:
            continue
        else:
            # Unknown place: expand a known abbreviation, otherwise keep the
            # entity text as-is.
            country = COUNTRY_ALIASES.get(ent.text, ent.text)
        info.append(
            {
                "country": country,
                "text": doc.text
            }
        )

In [7]:
import pandas as pd
import spacy
import numpy as np
import pickle
import geonamescache

# spaCy pipeline and GeoNames cache object used throughout this cell.
nlp = spacy.load('en_core_web_lg')
gc = geonamescache.GeonamesCache()

# One row per record currently held in `items`.
df = pd.DataFrame(items)

# Reduce every title to its lemmas, dropping stop words, whitespace tokens and
# punctuation, and join the survivors with single spaces.
preprocessing_text = [
    ' '.join(
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_space or token.is_punct)
    )
    for text in df['title']
]
df['preprocessing_text'] = preprocessing_text
df['vector'] = df['preprocessing_text'].apply(
    lambda cleaned: nlp(cleaned).vector
)

# From here on `preprocessing_text` is the stacked array of row vectors.
preprocessing_text = np.stack(df['vector'])

# NOTE(review): pickle.load executes code embedded in the file - only load a
# 'model_pickle' that comes from a trusted source.
with open('model_pickle', 'rb') as model_file:
    rfc = pickle.load(model_file)
y_pred = rfc.predict(preprocessing_text)

# Keep only the rows whose predicted label is 1 and rebuild `items` from them.
# Assumes y_pred holds one label per row of df, in row order.
need_list = []
image_list = []
url_list = []

# One pass per column keeps the printed log grouped by column. Pairing each
# value with its label through zip() removes the hand-maintained counter and
# the stale-variable print in the href pass (it used to echo the last image).
for column, selected in (
    ('title', need_list),
    ('image', image_list),
    ('href', url_list),
):
    for value, label in zip(df[column], y_pred):
        if label == 1:
            print(value, label)
            selected.append(value)

# Re-number the surviving records from 0; the three lists are index-aligned
# because they were filtered with the same labels.
items = [
    {
        "title": title,
        "id": idx,
        "href": href,
        "image": image
    }
    for idx, (title, href, image) in enumerate(
        zip(need_list, url_list, image_list)
    )
]

# Raw GeoNames lookups; the 'name' values are extracted from them further down.
countries = gc.get_countries()
cities = gc.get_cities()
def gen_dict_extract(var, key):
    """Yield every value stored under `key` anywhere inside `var`.

    `var` may be an arbitrarily nested mix of dicts and lists. Dicts are
    scanned in insertion order; a matching value is yielded first and, if it
    is itself a dict or list, searched afterwards. Any other type yields
    nothing.
    """
    if isinstance(var, list):
        for element in var:
            yield from gen_dict_extract(element, key)
        return
    if not isinstance(var, dict):
        return
    for name, value in var.items():
        if name == key:
            yield value
        if isinstance(value, (dict, list)):
            yield from gen_dict_extract(value, key)
# Flatten the GeoNames structures into plain lists of names.
countries = [*gen_dict_extract(countries, 'name')]
cities = [*gen_dict_extract(cities, 'name')]

# Sets make the per-entity membership tests O(1) instead of scanning the name
# lists for every entity.
# Assumes every extracted 'name' value is a hashable string - TODO confirm.
country_lookup = set(countries)
city_lookup = set(cities)

# Abbreviations that appear in headlines, mapped to the full name to record.
COUNTRY_ALIASES = {
    "US": "United States of America",
    "UK": "United Kingdom",
}

# One record per GPE entity that is either a known country or not a known
# city; entities matching a known city are skipped.
info = []
for title in need_list:
    doc = nlp(title)
    for ent in doc.ents:
        if ent.label_ != 'GPE':
            continue
        if ent.text in country_lookup:
            country = ent.text
        elif ent.text in city_lookup:
            continue
        else:
            # Unknown place: expand a known abbreviation, otherwise keep the
            # entity text as-is.
            country = COUNTRY_ALIASES.get(ent.text, ent.text)
        info.append(
            {
                "country": country,
                "text": doc.text
            }
        )

In [17]:
# One row per record currently held in `items`.
df = pd.DataFrame(items)

# Lemmatised title text with stop words, whitespace and punctuation removed.
preprocessing_text = [
    ' '.join(
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_space or token.is_punct)
    )
    for text in df['title']
]
df['preprocessing_text'] = preprocessing_text
df['vector'] = df['preprocessing_text'].apply(
    lambda cleaned: nlp(cleaned).vector
)

# From here on `preprocessing_text` is the stacked array of row vectors.
preprocessing_text = np.stack(df['vector'])

# NOTE(review): pickle.load executes code embedded in the file - only load a
# 'model_pickle' that comes from a trusted source.
with open('model_pickle', 'rb') as model_file:
    rfc = pickle.load(model_file)

y_pred = rfc.predict(preprocessing_text)

# Rows labelled 1 by the classifier, split into three index-aligned lists.
selected_rows = df[y_pred == 1]
need_list = selected_rows['title'].tolist()
image_list = selected_rows['image'].tolist()
url_list = selected_rows['href'].tolist()

# Records for the selected rows, numbered from 0.
items = [
    {
        "title": title,
        "id": position,
        "href": href,
        "image": image
    }
    for position, (title, image, href) in enumerate(
        zip(need_list, image_list, url_list)
    )
]

# Raw GeoNames lookups; the 'name' values are extracted from them below.
countries = gc.get_countries_by_names()
cities = gc.get_cities()

def extract_names(var, key):
    """Yield every value stored under `key` anywhere inside `var`.

    For a dict, the value under `key` (if present) is yielded before any of
    the dict's values are searched; dict and list values are then searched in
    insertion order. For a list, each element is searched in turn. Any other
    type yields nothing.
    """
    if isinstance(var, dict):
        if key in var:
            yield var[key]
        children = var.values()
    elif isinstance(var, list):
        children = var
    else:
        return
    for child in children:
        # Only containers can hold further matches.
        if isinstance(child, (dict, list)):
            yield from extract_names(child, key)

# Flatten the GeoNames structures into plain lists of names.
countries = list(extract_names(countries, 'name'))
cities = list(extract_names(cities, 'name'))

# Sets make the per-entity membership tests O(1) instead of scanning the name
# lists for every entity.
# Assumes every extracted 'name' value is a hashable string - TODO confirm.
country_lookup = set(countries)
city_lookup = set(cities)

# Abbreviations that appear in headlines, mapped to the full name to record.
COUNTRY_ALIASES = {
    "US": "United States of America",
    "UK": "United Kingdom",
}

items = []

for title, image, href in zip(need_list, image_list, url_list):
    # Look for the countries and cities mentioned in the current title.
    doc = nlp(title)
    found_countries = []
    found_cities = []
    for ent in doc.ents:
        if ent.label_ == 'GPE':
            place = ent.text
            if place in country_lookup:
                found_countries.append(place)
            elif place in city_lookup:
                # A GPE that matches a known city used to be dropped, which
                # left "city" empty for titles that name one.
                found_cities.append(place)
            else:
                # Unknown place: expand a known abbreviation, otherwise keep
                # the entity text and treat it as a country.
                found_countries.append(COUNTRY_ALIASES.get(place, place))
        elif ent.label_ == 'LOC':
            # Kept from the original: LOC entities are also recorded as cities.
            found_cities.append(ent.text)

    items.append(
        {
            "title": title,
            "id": len(items),
            "href": href,
            "image": image,
            "country": found_countries,
            "city": found_cities,
        }
    )
    

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [18]:
# Imported here because the notebook's other pprint imports sit in later
# cells, so a top-to-bottom run would hit a NameError on this line.
from pprint import pprint

pprint(items)

[{'city': [],
  'country': ['Kenya', 'Haiti'],
  'href': 'https://www.theguardian.com/world/2023/oct/09/court-blocks-kenya-from-deploying-police-officers-to-haiti',
  'id': 0,
  'image': 'https://i.guim.co.uk/img/media/529306234c00f65e157443c318a9f560358528a8/0_367_5500_3300/master/5500.jpg?width=360&dpr=1&s=none',
  'title': 'Court blocks Kenya from deploying  police officers to Haiti to '
           'fight gangs'},
 {'city': [],
  'country': ['Egypt'],
  'href': 'https://www.theguardian.com/world/2023/oct/08/egypt-police-officer-shoots-dead-two-israeli-tourists-and-egyptian-guide',
  'id': 1,
  'image': 'https://i.guim.co.uk/img/media/33684d646fe8f262f9669b2e2e860ab5846b2618/0_146_4380_2629/master/4380.jpg?width=120&dpr=1&s=none',
  'title': ' Egypt: police officer shoots dead two Israeli tourists and '
           'Egyptian guide'},
 {'city': [],
  'country': [],
  'href': 'https://www.theguardian.com/business/2023/oct/08/imf-clings-to-a-hopeful-agenda-as-crisis-follows-crisis',
  'i

In [11]:
from pprint import pprint


# Sets make the per-entity membership tests O(1) instead of scanning the
# `countries` / `cities` name collections for every entity.
# Assumes both hold hashable strings - TODO confirm.
country_lookup = set(countries)
city_lookup = set(cities)

# Abbreviations that appear in headlines, mapped to the full name to record.
COUNTRY_ALIASES = {
    "US": "United States of America",
    "UK": "United Kingdom",
}

items = []

for title, image, href in zip(need_list, image_list, url_list):
    # Look for the countries and cities mentioned in the current title.
    doc = nlp(title)
    found_countries = []
    found_cities = []
    for ent in doc.ents:
        if ent.label_ == 'GPE':
            place = ent.text
            if place in country_lookup:
                found_countries.append(place)
            elif place in city_lookup:
                # A GPE that matches a known city used to be dropped, which
                # left "city" empty for titles that name one.
                found_cities.append(place)
            else:
                # Unknown place: expand a known abbreviation, otherwise keep
                # the entity text and treat it as a country.
                found_countries.append(COUNTRY_ALIASES.get(place, place))
        elif ent.label_ == 'LOC':
            # Kept from the original: LOC entities are also recorded as cities.
            found_cities.append(ent.text)

    items.append(
        {
            "title": title,
            "id": len(items),
            "href": href,
            "image": image,
            "country": found_countries,
            "city": found_cities,
        }
    )

pprint(items)

[{'city': [],
  'country': ['Kenya', 'Haiti'],
  'href': 'https://www.theguardian.com/world/2023/oct/09/court-blocks-kenya-from-deploying-police-officers-to-haiti',
  'id': 0,
  'image': 'https://i.guim.co.uk/img/media/529306234c00f65e157443c318a9f560358528a8/0_367_5500_3300/master/5500.jpg?width=360&dpr=1&s=none',
  'title': 'Court blocks Kenya from deploying  police officers to Haiti to '
           'fight gangs'},
 {'city': [],
  'country': ['Egypt'],
  'href': 'https://www.theguardian.com/world/2023/oct/08/egypt-police-officer-shoots-dead-two-israeli-tourists-and-egyptian-guide',
  'id': 1,
  'image': 'https://i.guim.co.uk/img/media/33684d646fe8f262f9669b2e2e860ab5846b2618/0_146_4380_2629/master/4380.jpg?width=120&dpr=1&s=none',
  'title': ' Egypt: police officer shoots dead two Israeli tourists and '
           'Egyptian guide'},
 {'city': [],
  'country': [],
  'href': 'https://www.theguardian.com/business/2023/oct/08/imf-clings-to-a-hopeful-agenda-as-crisis-follows-crisis',
  'i

In [10]:
from pprint import pprint