In [42]:
import requests
from bs4 import BeautifulSoup

def parse_article(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Заголовок статьи: сначала ищем <h1>, если нет - <title>, <h2>
    title_tag = soup.find('h1') or soup.find('title') or soup.find('h2')
    title = title_tag.get_text(strip=True) if title_tag else ''

    # Основной текст статьи
    paragraphs = []
    # Стандартный вариант — внутри <article>
    article_zone = soup.find('article')
    if article_zone:
        paragraphs = article_zone.find_all('p')
    else:
        # fallback: берем div с макс. количеством <p>
        divs = soup.find_all('div')
        best_div = max(divs, key=lambda div: len(div.find_all('p')), default=None)
        if best_div and len(best_div.find_all('p')) > 3:
            paragraphs = best_div.find_all('p')
        # fallback на все <p>
        if not paragraphs:
            paragraphs = soup.find_all('p')

    text = ' '.join([p.get_text(strip=True) for p in paragraphs])

    # Возврат универсальной модели
    return {"title": title, "text": text}

In [43]:
import json

# Чтение списка ссылок:
with open('news/urls_for_draft.json', 'r') as f:
    urls = json.load(f)

# Итерация по всем url для парсинга и дальнейшей AI-обработки:
articles = []
for url in urls:
    article = parse_article(url)
    articles.append(article)

In [44]:
articles

[{'title': 'Just a moment...', 'text': ''},
 {'title': 'Just a moment...', 'text': ''},
 {'title': '', 'text': ''},
 {'title': 'BlackRock Continues to Offload Millions in Bitcoin, Ethereum, But for How Long?',
  'text': 'Vini Barbosa has covered the crypto industry professionally since 2020, summing up to over 10,000 hours of research, writing, and editing related content for media outlets and key industry players. Vini is an active commentator and a heavy user of the technology, truly believing in its revolutionary potential. Topics of interest include blockchain, open-source software, decentralized finance, and real-world utility. Vini Barbosa on X'},
 {'title': '', 'text': ''},
 {'title': 'Just a moment...', 'text': ''},
 {'title': '', 'text': ''},
 {'title': 'Just a moment...', 'text': ''},
 {'title': 'Just a moment...', 'text': ''},
 {'title': '', 'text': ''},
 {'title': "The former CEO of Goldman Sachs thinks that America is due for a crisis — and pinpoints the area of the market

In [45]:
def build_prompt(title, text):
    prompt = (f'''
        Analyze the following financial news article and return the result in the following Python dictionary structure:
        headline — a brief headline;
        hotness ∈ [0,1] — the hotness score;
        why_now — 1–2 sentences explaining why this is important now (novelty, confirmations, affected assets);
        entities — companies/tickers/countries/sectors mentioned in the article;
        timeline — key time markers (first mention → confirmation → clarification);
        draft — a draft for a post/cover note (headline, lead paragraph, 3 bullets, quote/note);
        dedup_group — ID of duplicate-cluster/reprints.
        ARTICLE TITLE: {title}
        ARTICLE TEXT: {text}
        Return a Python dictionary where keys are headline, hotness, why_now, entities, timeline, draft, dedup_group. Respond in English only.
        '''
    )
    return prompt

In [46]:
from openai.types.chat import ChatCompletionUserMessageParam
from openai import OpenAI

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key="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",
)

drafts = []

for article in articles:
    prompt = build_prompt(article["title"], article["text"])

    completion = client.chat.completions.create(
      extra_headers={},
      extra_body={},
      model="qwen/qwen3-vl-235b-a22b-thinking",
      messages = [
          ChatCompletionUserMessageParam(role="user", content=prompt)
      ]
    )

    draft = completion.choices[0].message.content
    drafts.append(draft)

In [47]:
drafts

['{\n    "headline": "No Content Available: Placeholder Article",\n    "hotness": 0.0,\n    "why_now": "The article contains no substantive text, making it impossible to assess current relevance or market impact; no novel information or asset movements can be identified.",\n    "entities": [],\n    "timeline": [],\n    "draft": "No Content Available: Placeholder Article\\nThe provided article consists solely of a placeholder title with no actual text content, preventing any meaningful financial analysis.\\n\\n• No entities, events, or market data were mentioned in the source material.\\n• No timeline or actionable information can be extracted from the empty article.\\n• Zero relevance to current financial markets due to complete lack of substantive content.\\n\\nNote: This appears to be a system error or incomplete article feed requiring technical review.",\n    "dedup_group": null\n}',
 '{\n    "headline": "Article Content Unavailable: Placeholder Received",\n    "hotness": 0.0,\n    

In [55]:
import pyjson5
import pandas as pd

parsed = []
for d in drafts:
    try:
        parsed.append(pyjson5.decode(d))
    except Exception:
        pass

df = pd.DataFrame(parsed)
print(df.head())

                                            headline  hotness  \
0          No Content Available: Placeholder Article     0.00   
1  Article Content Unavailable: Placeholder Received     0.00   
2                                No article provided     0.00   
3  BlackRock Continues to Offload Millions in Bit...     0.85   
4                                No Article Provided     0.00   

                                             why_now  \
0  The article contains no substantive text, maki...   
1  The article contains no substantive content (p...   
2  No news content was provided, so there is no c...   
3  BlackRock's sustained divestment from Bitcoin ...   
4  No article content was provided, making it imp...   

                                 entities timeline  \
0                                      []       []   
1                                      []       []   
2                                      []       []   
3  [BlackRock, Bitcoin, Ethereum, crypto]       []   
4 

In [56]:
df_sorted = df.sort_values(by="hotness", ascending=False)

In [57]:
df_sorted

Unnamed: 0,headline,hotness,why_now,entities,timeline,draft,dedup_group
63,Elon Musk Reaches $500 Billion Net Worth Miles...,0.97,This is the first time any individual has cros...,"[Elon Musk, Tesla, TSLA, SpaceX]",First mention: Current article (implied public...,Musk Makes History as World's First $500B Indi...,0
75,Oracle's Record-Shattering Stock Rally,0.95,"Oracle's stock has surged to all-time highs, b...","[Oracle, ORCL, cloud computing, AI infrastruct...",[],{'headline': 'Oracle's Record-Shattering Stock...,
50,"Fed Cuts Rates Amid Trump Pressure, First Cut ...",0.95,The Fed's first rate cut in nine months addres...,"[Federal Reserve, U.S., White House, Trump adm...",Monday (Senate confirms Miran) → Wednesday (ra...,Fed Cuts Rates Under Political Pressure\nThe F...,
67,Judge Blocks Trump's Attempt to Fire Fed Gover...,0.95,The ruling comes ahead of the Fed's critical r...,"[United States, Federal Reserve, Lisa Cook, Do...",{'first mention': 'Last month (Trump's social ...,{'headline': 'Fed Governor Cook Retains Positi...,none
88,"Trump Imposes $100K H-1B Visa Fee, Angering Te...",0.95,The policy took immediate effect after Friday'...,"[Amazon, Microsoft, JPMorgan Chase, Walmart, T...",{'first mention': 'Friday: Trump signs proclam...,{'headline': 'Tech Firms Scramble as Trump Sla...,
...,...,...,...,...,...,...,...
77,Missing Article Content,0.00,"No article content provided, so there is no cu...",[],[],"{'headline': 'Missing Article Content', 'lead'...",
83,Article Unavailable: Placeholder Title,0.00,The article is inaccessible with only a placeh...,[],[],Article Unavailable\nThe requested financial n...,
85,Article Unavailable: 403 Forbidden Error,0.00,The article is inaccessible due to a 403 Forbi...,[],[],{'headline': 'Critical Access Error Blocks Fin...,
90,Empty Article: Just a moment...,0.00,"The article contains no substantive content, m...",[],[],{'headline': 'Article Unavailable: System Plac...,


In [58]:
df_sorted["hotness"] = pd.to_numeric(df["hotness"], errors="coerce")

# Удаляем все строки, где hotness == 0
df_clear = df_sorted[df_sorted["hotness"] != 0]

print(df_clear.head())

                                             headline  hotness  \
63  Elon Musk Reaches $500 Billion Net Worth Miles...     0.97   
75             Oracle's Record-Shattering Stock Rally     0.95   
50  Fed Cuts Rates Amid Trump Pressure, First Cut ...     0.95   
67  Judge Blocks Trump's Attempt to Fire Fed Gover...     0.95   
88  Trump Imposes $100K H-1B Visa Fee, Angering Te...     0.95   

                                              why_now  \
63  This is the first time any individual has cros...   
75  Oracle's stock has surged to all-time highs, b...   
50  The Fed's first rate cut in nine months addres...   
67  The ruling comes ahead of the Fed's critical r...   
88  The policy took immediate effect after Friday'...   

                                             entities  \
63                   [Elon Musk, Tesla, TSLA, SpaceX]   
75  [Oracle, ORCL, cloud computing, AI infrastruct...   
50  [Federal Reserve, U.S., White House, Trump adm...   
67  [United States, Federal Rese

In [61]:
df_clear.drop(['dedup_group'], axis=1, inplace=True)
print(df_clear.head())

                                             headline  hotness  \
63  Elon Musk Reaches $500 Billion Net Worth Miles...     0.97   
75             Oracle's Record-Shattering Stock Rally     0.95   
50  Fed Cuts Rates Amid Trump Pressure, First Cut ...     0.95   
67  Judge Blocks Trump's Attempt to Fire Fed Gover...     0.95   
88  Trump Imposes $100K H-1B Visa Fee, Angering Te...     0.95   

                                              why_now  \
63  This is the first time any individual has cros...   
75  Oracle's stock has surged to all-time highs, b...   
50  The Fed's first rate cut in nine months addres...   
67  The ruling comes ahead of the Fed's critical r...   
88  The policy took immediate effect after Friday'...   

                                             entities  \
63                   [Elon Musk, Tesla, TSLA, SpaceX]   
75  [Oracle, ORCL, cloud computing, AI infrastruct...   
50  [Federal Reserve, U.S., White House, Trump adm...   
67  [United States, Federal Rese

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clear.drop(['dedup_group'], axis=1, inplace=True)


In [63]:
df_clear.head()

Unnamed: 0,headline,hotness,why_now,entities,timeline,draft
63,Elon Musk Reaches $500 Billion Net Worth Miles...,0.97,This is the first time any individual has cros...,"[Elon Musk, Tesla, TSLA, SpaceX]",First mention: Current article (implied public...,Musk Makes History as World's First $500B Indi...
75,Oracle's Record-Shattering Stock Rally,0.95,"Oracle's stock has surged to all-time highs, b...","[Oracle, ORCL, cloud computing, AI infrastruct...",[],{'headline': 'Oracle's Record-Shattering Stock...
50,"Fed Cuts Rates Amid Trump Pressure, First Cut ...",0.95,The Fed's first rate cut in nine months addres...,"[Federal Reserve, U.S., White House, Trump adm...",Monday (Senate confirms Miran) → Wednesday (ra...,Fed Cuts Rates Under Political Pressure\nThe F...
67,Judge Blocks Trump's Attempt to Fire Fed Gover...,0.95,The ruling comes ahead of the Fed's critical r...,"[United States, Federal Reserve, Lisa Cook, Do...",{'first mention': 'Last month (Trump's social ...,{'headline': 'Fed Governor Cook Retains Positi...
88,"Trump Imposes $100K H-1B Visa Fee, Angering Te...",0.95,The policy took immediate effect after Friday'...,"[Amazon, Microsoft, JPMorgan Chase, Walmart, T...",{'first mention': 'Friday: Trump signs proclam...,{'headline': 'Tech Firms Scramble as Trump Sla...


In [64]:
df_clear.shape

(48, 6)