In [1]:
from bs4 import BeautifulSoup
import requests


url = 'https://www.theguardian.com/world'
headers = {
    'Accept': '*/*', 
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.167 YaBrowser/22.7.3.799 Yowser/2.5 Safari/537.36'
}
req = requests.get(url, headers=headers)
src = req.text
soup = BeautifulSoup(src, 'lxml')
items = []
allHrefs = soup.find_all(class_="dcr-lv2v9o")
allText = soup.find_all(class_="show-underline dcr-adlhb4")
allImage = soup.find_all(class_="dcr-evn1e9")
max_len = max(len(allHrefs), len(allText))
for i in range(max_len):
    try:
        itemText = allText[i].text
        itemHref = allHrefs[i].get('href')
        itemImage = allImage[i].get('src')
        fullHref = "https://www.theguardian.com" + str(itemHref)
        items.append(
            {
                "title": itemText,
                "href": fullHref,
                "image": itemImage
            }
        )
    except Exception:
        print(Exception)

items


<class 'Exception'>
<class 'Exception'>


[{'title': 'Two\xa0tourists, from UK and South Africa, and guide killed in attack in Uganda',
  'href': 'https://www.theguardian.com/world/2023/oct/17/two-tourists-and-their-guide-killed-in-attack-in-uganda-national-park',
  'image': 'https://i.guim.co.uk/img/media/6b68bf677f76fa56debb2c129c646008709530c6/0_340_5472_3283/master/5472.jpg?width=360&dpr=1&s=none'},
 {'title': 'Zimbabwe women’s football coach in custody after indecent assault claim',
  'href': 'https://www.theguardian.com/football/2023/oct/17/zimbabwe-womens-team-coach-in-custody-indecent-assault-claim-shadreck-mlauzi',
  'image': 'https://i.guim.co.uk/img/media/16ba63c1d1cbb016371e1e9d6e5b02c60dadc60b/0_241_4245_2548/master/4245.jpg?width=120&dpr=1&s=none'},
 {'title': 'Malawi swelters in record heat with temperatures nearly 20C above average ',
  'href': 'https://www.theguardian.com/world/2023/oct/17/malawi-swelters-in-record-heat-with-temperatures-nearly-20c-above-average',
  'image': 'https://i.guim.co.uk/img/media/75c

In [2]:
import pandas as pd
import spacy
import numpy as np
import pickle
import geonamescache

nlp = spacy.load('en_core_web_lg')
gc = geonamescache.GeonamesCache()
   

df = pd.DataFrame(items)
preprocessing_text = []
for text in df['title']:
    doc = nlp(text)
    new_text = []
    for token in doc:
        if not token.is_stop and not token.is_space and not token.is_punct:
            new_text.append(token.lemma_)
    preprocessing_text.append(' '.join(new_text))
df['preprocessing_text'] = preprocessing_text
df['vector'] = df['preprocessing_text'].apply(
    lambda text: nlp(text).vector)
preprocessing_text = np.stack(df.vector)

In [5]:

with open('model_pickle', 'rb') as file:
    rfc = pickle.load(file)
y_pred = rfc.predict(preprocessing_text)
y_pred


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


array([1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1], dtype=int64)

# Было

In [4]:
need_list = []
image_list =[]
url_list = []
i = 0
for item in df['title']:
    if y_pred[i] == 1:
        print(item, y_pred[i])
        need_list.append(item)
    i += 1
i = 0
for item in df['image']:
    if y_pred[i] == 1:
        print(item, y_pred[i])
        image_list.append(item)
    i += 1
i = 0
for it in df['href']:
    if y_pred[i] == 1:
        print(item, y_pred[i])
        url_list.append(it)
    i += 1
items = []
for i in range(len(need_list)):
    items.append(
        {
            "title": need_list[i],
            "id": i,
            "href": url_list[i],
            "image": image_list[i]
        }
    )

Two tourists, from UK and South Africa, and guide killed in attack in Uganda 1
‘A dangerous game’: Republican chaos and indecision as crises shake the world 1
Bolsonaro was engineer of ‘wilful coup attempt’, Brazil congress inquiry alleges 1
Canada accuses China fighter jets of ‘reckless’ interception of military plane 1
Israel and Colombia in ferocious diplomatic spat over Hamas war 1
US and Venezuela set to agree deal on sanctions relief and open elections 1
Canada province uses constitutional override to advance pronoun legislation 1
US accuses China of pattern of ‘dangerous’ air force manoeuvres against military planes 1
Malaysia pulls out of Frankfurt book fair, blaming organisation’s pro-Israel stance  1
Finland faces growing Russian online threat, Finnish security services say 1
Russia-Ukraine war at a glance: what we know on day 602 of the invasion 1
EU leaders vow unified effort to mitigate humanitarian crisis in Gaza 1
Ukraine deploys US-supplied ATACMS missiles for first tim

In [6]:
countries = gc.get_countries()
cities = gc.get_cities()
def gen_dict_extract(var, key):
    if isinstance(var, dict):
        for k, v in var.items():
            if k == key:
                yield v
            if isinstance(v, (dict, list)):
                yield from gen_dict_extract(v, key)
    elif isinstance(var, list):
        for d in var:
            yield from gen_dict_extract(d, key)
countries = [*gen_dict_extract(countries, 'name')]
cities = [*gen_dict_extract(cities, 'name')]
info = []
for item in range(len(need_list)):
    doc = nlp(need_list[item])
    for ent in doc.ents:
        if ent.label_ == 'GPE':
            if ent.text in countries:
                info.append(
                    {
                        "country": ent.text,
                        "text": doc.text
                    }
                )
            elif ent.text not in cities:
                if ent.text == "US":
                    info.append(
                    {
                        "country": "United States of America",
                        "text": doc.text
                    }
                )
                elif ent.text == "UK":
                    info.append(
                    {
                        "country": "United Kingdom",
                        "text": doc.text
                    }
                )
                else:
                    info.append(
                        {
                            "country": ent.text,
                            "text": doc.text
                        }
                    )

In [7]:
import pandas as pd
import spacy
import numpy as np
import pickle
import geonamescache

nlp = spacy.load('en_core_web_lg')
gc = geonamescache.GeonamesCache()
   

df = pd.DataFrame(items)
preprocessing_text = []
for text in df['title']:
    doc = nlp(text)
    new_text = []
    for token in doc:
        if not token.is_stop and not token.is_space and not token.is_punct:
            new_text.append(token.lemma_)
    preprocessing_text.append(' '.join(new_text))
df['preprocessing_text'] = preprocessing_text
df['vector'] = df['preprocessing_text'].apply(
    lambda text: nlp(text).vector)
preprocessing_text = np.stack(df.vector)

with open('model_pickle', 'rb') as file:
    rfc = pickle.load(file)
y_pred = rfc.predict(preprocessing_text)

need_list = []
image_list =[]
url_list = []
i = 0
for item in df['title']:
    if y_pred[i] == 1:
        print(item, y_pred[i])
        need_list.append(item)
    i += 1
i = 0
for item in df['image']:
    if y_pred[i] == 1:
        print(item, y_pred[i])
        image_list.append(item)
    i += 1
i = 0
for it in df['href']:
    if y_pred[i] == 1:
        print(item, y_pred[i])
        url_list.append(it)
    i += 1
items = []
for i in range(len(need_list)):
    items.append(
        {
            "title": need_list[i],
            "id": i,
            "href": url_list[i],
            "image": image_list[i]
        }
    )

countries = gc.get_countries()
cities = gc.get_cities()
def gen_dict_extract(var, key):
    if isinstance(var, dict):
        for k, v in var.items():
            if k == key:
                yield v
            if isinstance(v, (dict, list)):
                yield from gen_dict_extract(v, key)
    elif isinstance(var, list):
        for d in var:
            yield from gen_dict_extract(d, key)
countries = [*gen_dict_extract(countries, 'name')]
cities = [*gen_dict_extract(cities, 'name')]
info = []
for item in range(len(need_list)):
    doc = nlp(need_list[item])
    for ent in doc.ents:
        if ent.label_ == 'GPE':
            if ent.text in countries:
                info.append(
                    {
                        "country": ent.text,
                        "text": doc.text
                    }
                )
            elif ent.text not in cities:
                if ent.text == "US":
                    info.append(
                    {
                        "country": "United States of America",
                        "text": doc.text
                    }
                )
                elif ent.text == "UK":
                    info.append(
                    {
                        "country": "United Kingdom",
                        "text": doc.text
                    }
                )
                else:
                    info.append(
                        {
                            "country": ent.text,
                            "text": doc.text
                        }
                    )

# СТАЛО

In [6]:
df = pd.DataFrame(items)
preprocessing_text = []

for text in df['title']:
    doc = nlp(text)
    new_text = [token.lemma_ for token in doc if not token.is_stop and not token.is_space and not token.is_punct]
    preprocessing_text.append(' '.join(new_text))

df['preprocessing_text'] = preprocessing_text
df['vector'] = df['preprocessing_text'].apply(lambda text: nlp(text).vector)
preprocessing_text = np.stack(df.vector)

with open('model_pickle', 'rb') as file:
    rfc = pickle.load(file)

y_pred = rfc.predict(preprocessing_text)

need_list = df[y_pred == 1]['title'].tolist()
image_list = df[y_pred == 1]['image'].tolist()
url_list = df[y_pred == 1]['href'].tolist()

items = []

for title, image, href in zip(need_list, image_list, url_list):
    items.append({
        "title": title,
        "id": len(items),
        "href": href,
        "image": image
    })

countries = gc.get_countries_by_names()
cities = gc.get_cities()

def extract_names(var, key):
    if isinstance(var, dict):
        if key in var:
            yield var[key]
        for k, v in var.items():
            if isinstance(v, (dict, list)):
                yield from extract_names(v, key)
    elif isinstance(var, list):
        for d in var:
            yield from extract_names(d, key)

countries = list(extract_names(countries, 'name'))
cities = list(extract_names(cities, 'name'))

items = []

for title, image, href in zip(need_list, image_list, url_list):
    item = {
        "title": title,
        "id": len(items),
        "href": href,
        "image": image,
    }
    # Ищем страны и города для текущего элемента
    doc = nlp(title)
    found_countries = []
    found_cities = []
    for ent in doc.ents:
        if ent.label_ == 'GPE':
            country_name = ent.text
            if country_name in countries:
                found_countries.append(country_name)
            elif country_name not in cities:
                if country_name == "US":
                    found_countries.append("United States of America")
                elif country_name == "UK":
                    found_countries.append("United Kingdom")
                else:
                    found_countries.append(country_name)
        elif ent.label_ == 'LOC':
            city_name = ent.text
            found_cities.append(city_name)

    item["country"] = found_countries
    item["city"] = found_cities
    items.append(item)
    

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [18]:
pprint(items)

[{'city': [],
  'country': ['Kenya', 'Haiti'],
  'href': 'https://www.theguardian.com/world/2023/oct/09/court-blocks-kenya-from-deploying-police-officers-to-haiti',
  'id': 0,
  'image': 'https://i.guim.co.uk/img/media/529306234c00f65e157443c318a9f560358528a8/0_367_5500_3300/master/5500.jpg?width=360&dpr=1&s=none',
  'title': 'Court blocks Kenya from deploying  police officers to Haiti to '
           'fight gangs'},
 {'city': [],
  'country': ['Egypt'],
  'href': 'https://www.theguardian.com/world/2023/oct/08/egypt-police-officer-shoots-dead-two-israeli-tourists-and-egyptian-guide',
  'id': 1,
  'image': 'https://i.guim.co.uk/img/media/33684d646fe8f262f9669b2e2e860ab5846b2618/0_146_4380_2629/master/4380.jpg?width=120&dpr=1&s=none',
  'title': ' Egypt: police officer shoots dead two Israeli tourists and '
           'Egyptian guide'},
 {'city': [],
  'country': [],
  'href': 'https://www.theguardian.com/business/2023/oct/08/imf-clings-to-a-hopeful-agenda-as-crisis-follows-crisis',
  'i

In [7]:
from pprint import pprint


items = []

for title, image, href in zip(need_list, image_list, url_list):
    item = {
        "title": title,
        "id": len(items),
        "href": href,
        "image": image,
    }
    # Ищем страны и города для текущего элемента
    doc = nlp(title)
    found_countries = []
    found_cities = []
    for ent in doc.ents:
        if ent.label_ == 'GPE':
            country_name = ent.text
            if country_name in countries:
                found_countries.append(country_name)
            elif country_name not in cities:
                if country_name == "US":
                    found_countries.append("United States of America")
                elif country_name == "UK":
                    found_countries.append("United Kingdom")
                else:
                    found_countries.append(country_name)
        elif ent.label_ == 'LOC':
            city_name = ent.text
            found_cities.append(city_name)

    item["country"] = found_countries
    item["city"] = found_cities
    items.append(item)

pprint(items)

[{'city': [],
  'country': ['United Kingdom', 'South Africa', 'Uganda'],
  'href': 'https://www.theguardian.com/world/2023/oct/17/two-tourists-and-their-guide-killed-in-attack-in-uganda-national-park',
  'id': 0,
  'image': 'https://i.guim.co.uk/img/media/6b68bf677f76fa56debb2c129c646008709530c6/0_340_5472_3283/master/5472.jpg?width=360&dpr=1&s=none',
  'title': 'Two\xa0tourists, from UK and South Africa, and guide killed in '
           'attack in Uganda'},
 {'city': [],
  'country': [],
  'href': 'https://www.theguardian.com/us-news/2023/oct/14/house-speaker-republican-leadership-absent-us-aid-israel-gaza-ukraine',
  'id': 1,
  'image': 'https://i.guim.co.uk/img/media/b794503a09d1071bd4dd8ba8539f7a8fedff14e8/0_297_8216_4933/master/8216.jpg?width=120&dpr=1&s=none',
  'title': '‘A dangerous game’: Republican chaos and indecision as crises '
           'shake the world'},
 {'city': [],
  'country': ['Brazil'],
  'href': 'https://www.theguardian.com/world/2023/oct/17/bolsonaro-brazil-cou

In [3]:
from googletrans import Translator

# Создаем экземпляр переводчика
translator = Translator()

# Текст для перевода
text_to_translate = "Hello, how are you?"

# Определяем исходный язык (английский) автоматически
detected_lang = translator.detect(text_to_translate).lang

# Переводим текст на русский
translated_text = translator.translate(text_to_translate, src=detected_lang, dest='ru')

# Выводим переведенный текст
print(f"Оригинальный текст: {text_to_translate}")
print(f"Переведенный текст: {translated_text.text}")


ConnectError: [Errno 11001] getaddrinfo failed

In [3]:
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-en-ru"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)




In [4]:
text_to_translate = "Hello, how are you?"
inputs = tokenizer.encode(text_to_translate, return_tensors="pt")
translated = model.generate(inputs)
translation = tokenizer.decode(translated[0], skip_special_tokens=True)

print(translation)


Привет, как дела?


In [1]:
from transformers import BlipProcessor, BlipForConditionalGeneration

model_name = "Salesforce/blip-image-captioning-large"
model = BlipForConditionalGeneration.from_pretrained(model_name)
processor = BlipProcessor.from_pretrained(model_name)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from PIL import Image

# Путь к изображению, которое нужно описать
image_path = "image.jpg"

# Откройте изображение и преобразуйте его в байты
image = Image.open(image_path)
image_bytes = processor(images=image, return_tensors="pt").pixel_values

# Сгенерируйте подпись к изображению
caption = model.generate(image_bytes)

# Декодируйте результат в текст
caption_text = processor.decode(caption[0], skip_special_tokens=True)

print(caption_text)

FileNotFoundError: [Errno 2] No such file or directory: 'image.jpg'

In [20]:
from bs4 import BeautifulSoup
import requests
import os



__file__='new_backend.ipynb'

def get_news_theguardian():
    url = 'https://www.theguardian.com/world'
    headers = {
        'Accept': '*/*', 
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.167 YaBrowser/22.7.3.799 Yowser/2.5 Safari/537.36'
    }
    req = requests.get(url, headers=headers)
    src = req.text
    soup = BeautifulSoup(src, 'lxml')
    items = []
    allHrefs = soup.find_all(class_="dcr-lv2v9o")
    allText = soup.find_all(class_="show-underline dcr-adlhb4")
    allImage = soup.find_all(class_="dcr-evn1e9")
    max_len = max(len(allHrefs), len(allText))
    for i in range(max_len):
        # try:
            itemText = allText[i].text
            itemHref = allHrefs[i].get('href')
            itemImage = allImage[i].get('src')

            fullHref = "https://www.theguardian.com" + str(itemHref)
            items.append(
                {
                    "title": itemText,
                    "href": fullHref,
                    "image": itemImage
                }
            )
        # except Exception:
        #     print(Exception)

    return items


In [21]:
get_news_theguardian()

Изображение успешно загружено и сохранено как 5075.jpg в папку 'photos'.
Изображение успешно загружено и сохранено как 1000.jpg в папку 'photos'.
Изображение успешно загружено и сохранено как 4000.jpg в папку 'photos'.
Изображение успешно загружено и сохранено как 2560.jpg в папку 'photos'.
Изображение успешно загружено и сохранено как 5568.jpg в папку 'photos'.
Изображение успешно загружено и сохранено как 5472.jpg в папку 'photos'.
Изображение успешно загружено и сохранено как 3000.jpg в папку 'photos'.
Изображение успешно загружено и сохранено как 899.jpg в папку 'photos'.
Изображение успешно загружено и сохранено как 4000.jpg в папку 'photos'.
Изображение успешно загружено и сохранено как 6318.jpg в папку 'photos'.
Изображение успешно загружено и сохранено как 5655.jpg в папку 'photos'.
Изображение успешно загружено и сохранено как 4530.jpg в папку 'photos'.
Изображение успешно загружено и сохранено как 4500.jpg в папку 'photos'.
Изображение успешно загружено и сохранено как 5000.j

IndexError: list index out of range

In [11]:
url = 'https://www.theguardian.com/world'
headers = {
    'Accept': '*/*', 
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.167 YaBrowser/22.7.3.799 Yowser/2.5 Safari/537.36'
}
req = requests.get(url, headers=headers)
src = req.text
soup = BeautifulSoup(src, 'lxml')
items = []
allHrefs = soup.find_all(class_="dcr-lv2v9o")
allText = soup.find_all(class_="show-underline dcr-adlhb4")
allImage = soup.find_all(class_="dcr-evn1e9")
max_len = max(len(allHrefs), len(allText))
for i in range(max_len):

        itemText = allText[i].text
        itemHref = allHrefs[i].get('href')
        itemImage = allImage[i].get('src')
        image_url = itemImage
        file_name = image_url.split("/")[-1]

        save_directory = os.path.join((os.path.abspath(__file__)))
        save_path = os.path.join(save_directory, file_name)
        response = requests.get(image_url)
        if response.status_code == 200:
            # Сохраняем изображение в файл
            with open(save_path, "wb") as file:
                file.write(response.content)
            print(f"Изображение успешно загружено и сохранено как {file_name} в папку 'photos'.")
        else:
            print("Не удалось загрузить изображение. Проверьте URL.")
        fullHref = "https://www.theguardian.com" + str(itemHref)
        items.append(
            {
                "title": itemText,
                "href": fullHref,
                "image": itemImage
            }
        )
    # except Exception:
    #     print(Exception)
items

NameError: name '__file__' is not defined

In [9]:
get_news_theguardian()

<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>
<class 'Exception'>


[]