In [2]:
import feedparser
import trafilatura
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
import torch
from transformers import pipeline
import spacy
from collections import Counter, defaultdict
import re
import yfinance as yf


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
cnbc_feed = feedparser.parse("https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10000664")
cnbc_articles = []
for entry in cnbc_feed.entries:
    cnbc_articles.append({
        "title": entry.title,
        "summary": entry.get("summary", ""),
        "link": entry.link,
        "published": entry.published
    })

In [3]:
# reuter_feed = feedparser.parse("https://ir.thomsonreuters.com/rss/news-releases.xml?items=15")

reuter_feed = feedparser.parse("https://news.google.com/rss/search?q=site%3Areuters.com&hl=en-US&gl=US&ceid=US%3Aen")
reuter_articles = []
for entry in reuter_feed.entries:
    reuter_articles.append({
        "title": entry.title,
        "summary": entry.get("summary", ""),
        "link": entry.link,
        "published": entry.published
    })

In [4]:
reuter_articles

[{'title': 'Trump says US oversight of Venezuela could last years, NYT reports - Reuters',
  'summary': '<a href="https://news.google.com/rss/articles/CBMiswFBVV95cUxNaDF0ck5aZHFqOXcwRjBDT1VtdWpRNDlFNkJvS2EzWXBiRGlYRXAtdndxRkV5X0lBX1RIRVR1SFJkbGxleFhkSmZYQzBSbU9ncUFSSm5FdnZTYjdBUTFiLVlPbHlfQTFZWDA1X2tWYksyM3JNWHY1Zl8wX25fZkJEX2lpMnYzakpjUDRvcWs0NkxScTBTdm84OHlEU3l3ejVuOEZuNGFJUDJRekZOUC1GMVhzYw?oc=5" target="_blank">Trump says US oversight of Venezuela could last years, NYT reports</a>&nbsp;&nbsp;<font color="#6f6f6f">Reuters</font>',
  'link': 'https://news.google.com/rss/articles/CBMiswFBVV95cUxNaDF0ck5aZHFqOXcwRjBDT1VtdWpRNDlFNkJvS2EzWXBiRGlYRXAtdndxRkV5X0lBX1RIRVR1SFJkbGxleFhkSmZYQzBSbU9ncUFSSm5FdnZTYjdBUTFiLVlPbHlfQTFZWDA1X2tWYksyM3JNWHY1Zl8wX25fZkJEX2lpMnYzakpjUDRvcWs0NkxScTBTdm84OHlEU3l3ejVuOEZuNGFJUDJRekZOUC1GMVhzYw?oc=5',
  'published': 'Thu, 08 Jan 2026 08:18:06 GMT'},
 {'title': 'Exclusive: Nvidia requires full upfront payment for H200 chips in China, sources say - Reuters',

In [4]:
def extract_article_text(url: str) -> str | None:
    downloaded = trafilatura.fetch_url(url)
    if not downloaded:
        return None

    text = trafilatura.extract(
        downloaded,
        include_comments=False,
        include_tables=False,
        include_formatting=False
    )
    return text

In [5]:
cnbc_articles = []

for entry in cnbc_feed.entries:
    url = entry.link
    text = extract_article_text(url)

    if not text:
        continue

    cnbc_articles.append({
        "title": entry.title,
        "url": url,
        "published": entry.get("published"),
        "text": text
    })

In [6]:
cnbc_articles

[{'title': 'Hopes rise for Chinese property support ahead of key March meeting',
  'url': 'https://www.cnbc.com/2026/01/09/china-property-slump-policy-shift-qiushi-2026-ahead-march-meeting.html',
  'published': 'Fri, 09 Jan 2026 06:44:40 GMT',
  'text': 'BEIJING — Chinese policymakers may be finally warming to the idea of tackling the country\'s worsening real estate slump, raising expectations that stronger support measures could be coming later this year.\nThe Communist Party\'s official journal Qiushi, which means "seeking truth," kicked off 2026 with a Jan. 1 article calling for "more powerful and precise measures" to stabilize property market expectations.\nSince then, the Hang Seng China A Properties Index, which includes developers Vanke and Seazen, has climbed more than 6% to start the year, reflecting growing investor optimism.\nThe Qiushi commentary was notable for its scope, said Ting Lu, chief China economist at Nomura.\n"This is the most comprehensive assessment of China\'

In [7]:
tokenizer = AutoTokenizer.from_pretrained("Falconsai/text_summarization")
model = AutoModelForSeq2SeqLM.from_pretrained("Falconsai/text_summarization")

In [8]:
def chunk_text(text, tokenizer, max_tokens=800, overlap=100):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []

    for i in range(0, len(tokens), max_tokens - overlap):
        chunk = tokens[i:i + max_tokens]
        chunks.append(tokenizer.decode(chunk))

    return chunks

def summarize_chunk(
    text,
    tokenizer,
    model,
    max_input_tokens=800,
    max_output_tokens=150
):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    )

    with torch.no_grad():
        summary_ids = model.generate(
            **inputs,
            max_new_tokens=max_output_tokens,
            num_beams=4,
            length_penalty=1.0,
            early_stopping=True
        )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

def summarize_article(text, tokenizer, model):
    chunks = chunk_text(text, tokenizer)

    summaries = [
        summarize_chunk(chunk, tokenizer, model)
        for chunk in chunks
    ]

    # Optional: compress summaries again if long
    combined_summary = " ".join(summaries)

    if len(tokenizer.encode(combined_summary)) > 400:
        combined_summary = summarize_chunk(
            combined_summary,
            tokenizer,
            model,
            max_input_tokens=400,
            max_output_tokens=150
        )

    return combined_summary

In [9]:
for article in cnbc_articles:
    article["summary"] = summarize_article(
        article["text"],
        tokenizer,
        model
    )

Token indices sequence length is longer than the specified maximum sequence length for this model (1304 > 512). Running this sequence through the model will result in indexing errors


In [10]:
summaries = [article["summary"] for article in cnbc_articles]
summaries

# cnbc_articles


["China's property downturn has dragged on despite a clear call from top leaders in Sept. 2024 to halt the sector's decline. China's property downturn has dragged on despite a clear call from top leaders in Sept. 2024 to halt the sector's decline. China's property downturn has dragged on despite a clear call from top leaders in Sept. 2024 to halt the sector's decline. In recent weeks, Vanke narrow is evident across the sector remains evident. In recent weeks, Vanke narrowly avoided default on a 2 billion yuan ($283 million) onshore bond. In a broader sign of strain, Chinese real estate developers' outstanding loan balance fell in the third quarter from a year ago. In a broader sign of strain, Chinese real estate developers' outstanding loan balance fell in the third quarter from a year ago.",
 '2% in extended trading after General Motors said it will record $7.1 billion in special charges for the fourth quarter of 2025 tied to its pullback in electric vehicles and restructuring efforts

# Test 1

In [11]:
def summarize_article_hierarchical(text, tokenizer, model, max_chunk_tokens=800):
    """Hierarchical summarization: chunk -> summarize -> recursively summarize"""
    chunks = chunk_text(text, tokenizer, max_tokens=max_chunk_tokens, overlap=150)
    
    # First level: summarize each chunk
    chunk_summaries = [
        summarize_chunk(chunk, tokenizer, model, max_output_tokens=100)
        for chunk in chunks
    ]
    
    # Second level: combine and summarize summaries
    combined = " ".join(chunk_summaries)
    
    # If still too long, recursively summarize
    while len(tokenizer.encode(combined)) > 400:
        combined_chunks = chunk_text(combined, tokenizer, max_tokens=400, overlap=50)
        combined = " ".join([
            summarize_chunk(chunk, tokenizer, model, max_output_tokens=80)
            for chunk in combined_chunks
        ])
    
    # Final pass: create coherent summary
    final_summary = summarize_chunk(
        combined,
        tokenizer,
        model,
        max_input_tokens=400,
        max_output_tokens=150
    )
    
    return final_summary

In [12]:
for article in cnbc_articles:
    article["summary"] = summarize_article_hierarchical(
        article["text"],
        tokenizer,
        model
    )

In [13]:
summaries = [article["summary"] for article in cnbc_articles]
summaries

["in Sept. 2024 to halt the sector's decline. China's property downturn has dragged on despite a clear call from top leaders in Sept. 2024 to halt the sector's decline. China's property downturn has dragged on despite a clear call from top leaders in Sept. 2024 to halt the sector's decline. In a broader sign of strain, China's real estate developers' outstanding loan balance fell in the third quarter from a year ago for",
 'reported record net revenue for its fiscal second quarter, leading shares to jump nearly 8%. Tilray Brands — The consumer packaged goods and cannabis giant reported record net revenue for its fiscal second quarter, leading shares to jump nearly 8%. Tilray posted revenue of $218 million for the period, leading shares to jump nearly 8%.',
 'affordability has found a clear villain: institutional investors that own large swaths of single-family homes in fast-growing Sun Belt cities. The message may be targeted at places like Atlanta and Jacksonville, metropolitan areas 

# Test 2

In [14]:
extractive_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_article_hybrid(text, tokenizer, model):
    """Extract key sentences, then abstractively summarize"""
    # Handle empty or None text
    if not text or not text.strip():
        return ""
    
    # Step 1: Extractive summarization to get key content
    # BART has a 1024 token limit, so we need to truncate if needed
    text_tokens = tokenizer.encode(text, add_special_tokens=False)
    
    if len(text_tokens) > 1024:
        # Truncate text for BART model (it has max_length=1024)
        truncated_text = tokenizer.decode(text_tokens[:1024], skip_special_tokens=True)
        
        try:
            result = extractive_summarizer(
                truncated_text,
                max_length=500,
                min_length=200,
                do_sample=False
            )
            # Check if result is not empty
            if result and len(result) > 0 and 'summary_text' in result[0]:
                extracted = result[0]['summary_text']
            else:
                # Fallback: use truncated text if summarization fails
                extracted = truncated_text
        except Exception as e:
            print(f"Extractive summarization failed: {e}")
            # Fallback: use truncated text
            extracted = truncated_text
    else:
        extracted = text
    
    # Step 2: Abstractive summarization for coherence
    final_summary = summarize_chunk(
        extracted,
        tokenizer,
        model,
        max_input_tokens=500,
        max_output_tokens=150
    )
    
    return final_summary

Device set to use cpu


In [15]:
for article in cnbc_articles:
    article["summary"] = summarize_article_hybrid(
        article["text"],
        tokenizer,
        model
    )

In [16]:
summaries = [article["summary"] for article in cnbc_articles]
summaries

["China A Properties Index has climbed more than 6% to start the year. China's property downturn has dragged on despite a clear call from top leaders in Sept. 2024 to halt the sector's decline. New home sales have nearly halved since Beijing started cracking down on developers' heavy reliance on debt for growth. In a broader sign of strain, Chinese real estate developers' outstanding loan balance fell in the third quarter from a year ago for the first time in more",
 '2% in extended trading after General Motors said it will record $7.1 billion in special charges for the fourth quarter of 2025 tied to its pullback in electric vehicles and restructuring efforts in China. Tilray Brands — The consumer packaged goods and cannabis giant reported record net revenue for its fiscal second quarter, leading shares to jump nearly 8%. Tilray posted revenue of $218 million for the period, while analysts polled by LSEG expected $211 million. Tilray Brands — The consumer packaged',
 'on housing afford

# Test 3

In [17]:
def summarize_article_improved(text, tokenizer, model):
    if not text or not text.strip():
        return ""
    
    # Better chunking with more overlap
    chunks = chunk_text(text, tokenizer, max_tokens=800, overlap=200)
    
    if len(chunks) == 1:
        # Single chunk, summarize directly
        return summarize_chunk(chunks[0], tokenizer, model, max_output_tokens=150)
    
    # Summarize each chunk
    chunk_summaries = [
        summarize_chunk(chunk, tokenizer, model, max_output_tokens=100)
        for chunk in chunks
    ]
    
    # Combine summaries
    combined = " ".join(chunk_summaries)
    
    # Final pass: create coherent unified summary
    # Use higher repetition penalty for final summary
    final_summary = summarize_chunk(
        combined,
        tokenizer,
        model,
        max_input_tokens=500,
        max_output_tokens=150
    )
    
    return final_summary

In [18]:
for article in cnbc_articles:
    article["summary"] = summarize_article_improved(
        article["text"],
        tokenizer,
        model
    )

In [None]:
summaries = [article["summary"] for article in cnbc_articles]
summaries

['the leadership" Qiushi argued against a view in Beijing that real estate is no longer important to China\'s economy. In recent weeks, Vanke narrowly avoided default on a 2 billion yuan ($283 million) onshore bond. In a broader sign of strain, Chinese real estate developers\' outstanding loan balance fell in the third quarter from a year ago. Qiushi argued against a view in Beijing that real estate is no longer important to China\'s economy ',
 '2% in extended trading after General Motors said it will record $7.1 billion in special charges for the fourth quarter of 2025 tied to its pullback in electric vehicles and restructuring efforts in China. Tilray Brands — The consumer packaged goods and cannabis giant reported record net revenue for its fiscal second quarter, leading shares to jump nearly 8%. Tilray posted revenue of $218 million for the period, while analysts polled by LSEG expected $211 million. Tilray Brands — The consumer packaged',
 'on housing affordability has found a cl

# Extraction of Sectors and Stocks

In [36]:
import spacy
from collections import Counter
import re

# Load spaCy
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Installing spaCy model...")
    import subprocess
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")

# Enhanced sector keywords
SECTOR_KEYWORDS = {
    'Technology': ['tech', 'software', 'technology', 'cloud', 'AI', 'artificial intelligence', 
                   'chip', 'semiconductor', 'digital', 'platform', 'app', 'data', 'cyber'],
    'Finance': ['bank', 'financial', 'finance', 'investment', 'trading', 'market', 
                'stock', 'equity', 'bond', 'credit', 'lending', 'mortgage'],
    'Healthcare': ['health', 'medical', 'pharmaceutical', 'drug', 'biotech', 'hospital', 
                   'treatment', 'patient', 'FDA', 'clinical', 'therapy'],
    'Energy': ['oil', 'gas', 'energy', 'petroleum', 'renewable', 'solar', 'wind', 
               'electric', 'power', 'fuel', 'drilling', 'crude'],
    'Retail': ['retail', 'store', 'shopping', 'consumer', 'e-commerce', 'online shopping', 
               'merchandise', 'sales', 'retailer'],
    'Automotive': ['car', 'automotive', 'vehicle', 'auto', 'truck', 'electric vehicle', 
                   'EV', 'manufacturing', 'Tesla'],
    'Real Estate': ['real estate', 'property', 'housing', 'construction', 'mortgage', 
                    'development', 'REIT'],
    'Telecommunications': ['telecom', 'communication', 'wireless', '5G', 'network', 'internet'],
    'Aerospace': ['aerospace', 'aircraft', 'defense', 'Boeing', 'space'],
    'Consumer Goods': ['consumer goods', 'packaged goods', 'CPG']
}

def extract_entities_from_text(text):
    """Extract stocks, companies, and sectors from text"""
    # Extract tickers (uppercase 1-5 letters, not common words)
    common_words = {'THE', 'AND', 'FOR', 'ARE', 'BUT', 'NOT', 'YOU', 'ALL', 'CAN'}
    ticker_pattern = re.compile(r'\b([A-Z]{2,5})\b')
    tickers = [m for m in ticker_pattern.findall(text) if m not in common_words]
    
    # Extract companies using NER
    doc = nlp(text)
    companies = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
    
    # Extract sectors
    text_lower = text.lower()
    sectors = [sector for sector, keywords in SECTOR_KEYWORDS.items() 
               if any(kw in text_lower for kw in keywords)]
    
    return tickers, companies, sectors

def get_most_talked_about(articles):
    """Get most mentioned stocks and sectors across all articles"""
    all_tickers = []
    all_companies = []
    all_sectors = []
    
    for article in articles:
        text = f"{article.get('title', '')} {article.get('text', '')}"
        tickers, companies, sectors = extract_entities_from_text(text)
        
        all_tickers.extend(tickers)
        all_companies.extend(companies)
        all_sectors.extend(sectors)
    
    return {
        'stocks': dict(Counter(all_tickers).most_common(20)),
        'companies': dict(Counter(all_companies).most_common(20)),
        'sectors': dict(Counter(all_sectors).most_common(10))
    }


""" Delete Duplicates"""
# def get_most_talked_about(articles):
#     """Get most mentioned stocks and sectors across all articles (1 count per article)."""
#     all_tickers = []
#     all_companies = []
#     all_sectors = []
    
#     for article in articles:
#         body = article.get("text", "")
#         title = article.get("title", "")
#         text = f"{title} {body}"
        
#         tickers, companies, sectors = extract_entities_from_text(text)
        
#         # De‑duplicate within a single article
#         unique_tickers = set(tickers)
#         unique_companies = set(companies)
#         unique_sectors = set(sectors)
        
#         all_tickers.extend(unique_tickers)
#         all_companies.extend(unique_companies)
#         all_sectors.extend(unique_sectors)
    
#     return {
#         'stocks': dict(Counter(all_tickers).most_common(20)),
#         'companies': dict(Counter(all_companies).most_common(20)),
#         'sectors': dict(Counter(all_sectors).most_common(10)),
#     }

# Usage
results = get_most_talked_about(cnbc_articles)

print(" Most Talked About Stocks:")
for ticker, count in list(results['stocks'].items())[:10]:
    print(f"  {ticker}: mentioned {count} times")

print("\nMost Talked About Companies:")
for company, count in list(results['companies'].items())[:10]:
    print(f"  {company}: mentioned {count} times")

print("\nMost Talked About Sectors:")
for sector, count in list(results['sectors'].items()):
    print(f"  {sector}: mentioned {count} times")

 Most Talked About Stocks:
  CNBC: mentioned 34 times
  CEO: mentioned 17 times
  IPO: mentioned 6 times
  AI: mentioned 5 times
  EV: mentioned 5 times
  RH: mentioned 5 times
  LSEG: mentioned 4 times
  UBS: mentioned 4 times
  GDP: mentioned 4 times
  MSCI: mentioned 3 times

Most Talked About Companies:
  Fed: mentioned 38 times
  CNBC: mentioned 33 times
  JPMorgan: mentioned 21 times
  Trump: mentioned 14 times
  Qiushi: mentioned 10 times
  Chevron: mentioned 8 times
  Goldman: mentioned 7 times
  Maduro: mentioned 7 times
  Apple: mentioned 6 times
  Intel: mentioned 5 times

Most Talked About Sectors:
  Finance: mentioned 30 times
  Technology: mentioned 26 times
  Energy: mentioned 21 times
  Automotive: mentioned 18 times
  Retail: mentioned 11 times
  Real Estate: mentioned 11 times
  Healthcare: mentioned 7 times
  Telecommunications: mentioned 6 times
  Aerospace: mentioned 3 times
  Consumer Goods: mentioned 1 times


# Full Pipeline

In [21]:
def process_feed(link, tokenizer, model):
    """Process RSS feed: extract articles, summarize, and analyze"""
    feed = feedparser.parse(link)
    articles = []
    
    for entry in feed.entries:
        url = entry.link
        text = extract_article_text(url)
        
        if not text:
            continue
            
        articles.append({
            "title": entry.title,
            "link": url,
            "published": entry.get("published"),
            "text": text  
        })
    
    # Step 2: Summarize articles
    for article in articles:
        article["summary"] = summarize_article_improved(
            article["text"],
            tokenizer,
            model
        )
    
    
    results = get_most_talked_about(articles)
    
    return results

In [22]:
final = process_feed('https://www.fool.com/a/feeds/partner/googlechromefollow?apikey=5e092c1f-c5f9-4428-9219-908a47d2e2de', tokenizer, model)
# final = process_feed("https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10000664", tokenizer, model)
final

KeyboardInterrupt: 

In [37]:
cnbc_feed = feedparser.parse("https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10000664")
cnbc_articles = []

for entry in cnbc_feed.entries:
    url = entry.link
    text = extract_article_text(url)

    if not text:
        continue

    cnbc_articles.append({
        "title": entry.title,
        "url": url,
        "published": entry.get("published"),
        "text": text
    })

res = get_most_talked_about(cnbc_articles)
res

{'stocks': {'CNBC': 34,
  'CEO': 17,
  'IPO': 6,
  'AI': 5,
  'EV': 5,
  'RH': 5,
  'LSEG': 4,
  'UBS': 4,
  'GDP': 4,
  'MSCI': 3,
  'PDVSA': 3,
  'QXO': 3,
  'FOMC': 3,
  'LEO': 3,
  'BTIG': 2,
  'CES': 2,
  'COIN': 2,
  'FDA': 2,
  'HSBC': 1,
  'WD': 1},
 'companies': {'Fed': 38,
  'CNBC': 33,
  'JPMorgan': 21,
  'Trump': 14,
  'Qiushi': 10,
  'Chevron': 8,
  'Goldman': 7,
  'Maduro': 7,
  'Apple': 6,
  'Intel': 5,
  'IPO': 5,
  'Apple Card': 4,
  'Barclays': 4,
  'Bank of America': 4,
  'Microchip Technology': 4,
  'Nike': 4,
  'General Motors': 3,
  'Digital': 3,
  'UBS': 3,
  'PDVSA': 3},
 'sectors': {'Finance': 30,
  'Technology': 26,
  'Energy': 21,
  'Automotive': 18,
  'Retail': 11,
  'Real Estate': 11,
  'Healthcare': 7,
  'Telecommunications': 6,
  'Aerospace': 3,
  'Consumer Goods': 1}}

# Sentiment

In [24]:
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="ProsusAI/finbert",
    tokenizer="ProsusAI/finbert"
)

def get_text_sentiment_score(text: str) -> float:
    """
    Return sentiment score in [-1, 1]:
      > 0 = positive, < 0 = negative, 0 ~ neutral.
    """
    if not text or not text.strip():
        return 0.0
    
    result = sentiment_analyzer(text[:512])[0]
    label = result["label"].upper()
    score = float(result["score"])
    
    if "POS" in label:
        return score
    if "NEG" in label:
        return -score
    return 0.0

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu


In [None]:
for article in cnbc_articles:
    print(article.get("summary"))

a deal to take over the Apple Card from its original issuer, Goldman Sachs. Goldman said that the transaction will boost its earnings by 46 cents per share. The deal further enmeshing JPMorgan with a dominant tech player burnishes the bank's reputation as a leader in American finance.
says its asset management division has parted ways with controversial proxy advisors for shareholder votes. In an internal memo, the firm said it no longer needs third-party data collection or voting recommendations. In an internal memo, the firm said it no longer needs third-party data collection or voting recommendations. In an internal memo, the firm said it no longer needs third-party data collection or voting recommendations.
. The board of GameStop granted Cohen performance-based stock options tied to a $100 billion market-capitalization target. GameStop shares slid 36% last year and the company currently has a market cap of $9.3 billion. The board of GameStop granted Cohen performance-based stock o

In [31]:
for article in cnbc_articles:
    text_for_sentiment = article.get("summary") or article.get("text") or ""
    article["sentiment_score"] = get_text_sentiment_score(text_for_sentiment)

In [32]:
mf_articles

[{'title': 'Hopes rise for Chinese property support ahead of key March meeting',
  'summary': 'Ahead of a major policy meeting, China may be reassessing its approach after a flagship journal urged more forceful property measures.',
  'link': 'https://www.cnbc.com/2026/01/09/china-property-slump-policy-shift-qiushi-2026-ahead-march-meeting.html',
  'published': 'Fri, 09 Jan 2026 06:44:40 GMT',
  'sentiment_score': 0.8240274786949158},
 {'title': 'Stocks making the biggest moves after hours: General Motors, Intel, Tilray Brands & more',
  'summary': 'These are the stocks posting the largest moves in extended trading.',
  'link': 'https://www.cnbc.com/2026/01/08/stocks-making-the-biggest-moves-after-hours-gm-intc-tlry.html',
  'published': 'Thu, 08 Jan 2026 23:06:32 GMT',
  'sentiment_score': 0.0},
 {'title': 'Why Trump is going after institutional homebuyers: They dominate markets like Atlanta, Jacksonville',
  'summary': "Trump's message may be aimed at places like Atlanta and Jacksonvi

In [33]:
mf_feed

{'bozo': False,
 'entries': [{'links': [{'rel': 'alternate',
     'type': 'text/html',
     'href': 'https://www.cnbc.com/2026/01/09/china-property-slump-policy-shift-qiushi-2026-ahead-march-meeting.html'}],
   'link': 'https://www.cnbc.com/2026/01/09/china-property-slump-policy-shift-qiushi-2026-ahead-march-meeting.html',
   'id': '108250289',
   'guidislink': False,
   'metadata_type': 'cnbcnewsstory',
   'metadata_id': '108250289',
   'metadata_sponsored': 'false',
   'title': 'Hopes rise for Chinese property support ahead of key March meeting',
   'title_detail': {'type': 'text/plain',
    'language': None,
    'base': 'https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10000664',
    'value': 'Hopes rise for Chinese property support ahead of key March meeting'},
   'summary': 'Ahead of a major policy meeting, China may be reassessing its approach after a flagship journal urged more forceful property measures.',
   'summary_detail': {'type': 'text/html',
   

# Organized

## Retreive News

In [94]:
cnbc_feed = feedparser.parse("https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10000664")
cnbc_articles = []

for entry in cnbc_feed.entries:
    url = entry.link
    text = extract_article_text(url)

    if not text:
        continue

    cnbc_articles.append({
        "title": entry.title,
        "url": url,
        "published": entry.get("published"),
        "text": text
    })

## Extract Entity Mentions

### Functions

In [98]:
# Load spaCy
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Installing spaCy model...")
    import subprocess
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")

# Enhanced sector keywords
SECTOR_KEYWORDS = {
    'Technology': ['tech', 'software', 'technology', 'cloud', 'AI', 'artificial intelligence', 
                   'chip', 'semiconductor', 'digital', 'platform', 'app', 'data', 'cyber'],
    'Finance': ['bank', 'financial', 'finance', 'investment', 'trading', 'market', 
                'stock', 'equity', 'bond', 'credit', 'lending', 'mortgage'],
    'Healthcare': ['health', 'medical', 'pharmaceutical', 'drug', 'biotech', 'hospital', 
                   'treatment', 'patient', 'FDA', 'clinical', 'therapy'],
    'Energy': ['oil', 'gas', 'energy', 'petroleum', 'renewable', 'solar', 'wind', 
               'electric', 'power', 'fuel', 'drilling', 'crude'],
    'Retail': ['retail', 'store', 'shopping', 'consumer', 'e-commerce', 'online shopping', 
               'merchandise', 'sales', 'retailer'],
    'Automotive': ['car', 'automotive', 'vehicle', 'auto', 'truck', 'electric vehicle', 
                   'EV', 'manufacturing', 'Tesla'],
    'Real Estate': ['real estate', 'property', 'housing', 'construction', 'mortgage', 
                    'development', 'REIT'],
    'Telecommunications': ['telecom', 'communication', 'wireless', '5G', 'network', 'internet'],
    'Aerospace': ['aerospace', 'aircraft', 'defense', 'Boeing', 'space'],
    'Consumer Goods': ['consumer goods', 'packaged goods', 'CPG']
}

def extract_entities_from_text(text):
    """Extract stocks, companies, and sectors from text"""
    # Extract tickers (uppercase 1-5 letters, not common words)
    common_words = {'THE', 'AND', 'FOR', 'ARE', 'BUT', 'NOT', 'YOU', 'ALL', 'CAN'}
    ticker_pattern = re.compile(r'\b([A-Z]{2,5})\b')
    tickers = [m for m in ticker_pattern.findall(text) if m not in common_words]
    
    # Extract companies using NER
    doc = nlp(text)
    companies = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
    
    # Extract sectors
    text_lower = text.lower()
    sectors = [sector for sector, keywords in SECTOR_KEYWORDS.items() 
               if any(kw in text_lower for kw in keywords)]
    
    return tickers, companies, sectors

# def get_most_talked_about(articles):
#     """Get most mentioned stocks and sectors across all articles"""
#     all_tickers = []
#     all_companies = []
#     all_sectors = []
    
#     for article in articles:
#         text = f"{article.get('title', '')} {article.get('text', '')}"
#         tickers, companies, sectors = extract_entities_from_text(text)
        
#         all_tickers.extend(tickers)
#         all_companies.extend(companies)
#         all_sectors.extend(sectors)
    
#     return {
#         'stocks': dict(Counter(all_tickers).most_common(20)),
#         'companies': dict(Counter(all_companies).most_common(20)),
#         'sectors': dict(Counter(all_sectors).most_common(10))
#     }


""" Delete Duplicates"""
def get_most_talked_about_no_dupes(articles):
    """Get most mentioned stocks and sectors across all articles (1 count per article)."""
    all_tickers = []
    all_companies = []
    all_sectors = []
    
    for article in articles:
        body = article.get("text", "")
        title = article.get("title", "")
        text = f"{title} {body}"
        
        tickers, companies, sectors = extract_entities_from_text(text)
        
        # De‑duplicate within a single article
        unique_tickers = set(tickers)
        unique_companies = set(companies)
        unique_sectors = set(sectors)
        
        all_tickers.extend(unique_tickers)
        all_companies.extend(unique_companies)
        all_sectors.extend(unique_sectors)
    
    return {
        'stocks': dict(Counter(all_tickers).most_common(20)),
        'companies': dict(Counter(all_companies).most_common(20)),
        'sectors': dict(Counter(all_sectors).most_common(10)),
    }

### Use

In [100]:
res = get_most_talked_about_no_dupes(cnbc_articles)
res

{'stocks': {'CNBC': 23,
  'CEO': 14,
  'AI': 3,
  'LSEG': 3,
  'EV': 3,
  'UBS': 2,
  'MSCI': 2,
  'GDP': 2,
  'CES': 2,
  'QXO': 2,
  'IPO': 2,
  'RH': 2,
  'HSBC': 1,
  'WD': 1,
  'BTIG': 1,
  'RTX': 1,
  'JP': 1,
  'ISS': 1,
  'IQ': 1,
  'IMF': 1},
 'companies': {'CNBC': 23,
  'Trump': 6,
  'Chevron': 5,
  'Fed': 4,
  'JPMorgan': 4,
  'Barclays': 4,
  'Bank of America': 4,
  'Maduro': 3,
  'Exxon Mobil': 3,
  'EV': 3,
  'Taiwan Semiconductor Manufacturing': 3,
  'Truth Social': 2,
  'UBS': 2,
  'Digital': 2,
  'American Express': 2,
  'Goldman Sachs': 2,
  'The Wall Street Journal': 2,
  'Treasury': 2,
  'Bloomberg News': 2,
  'Nvidia': 2},
 'sectors': {'Finance': 30,
  'Technology': 26,
  'Energy': 21,
  'Automotive': 18,
  'Real Estate': 11,
  'Retail': 11,
  'Healthcare': 7,
  'Telecommunications': 6,
  'Aerospace': 3,
  'Consumer Goods': 1}}

## Get Article Summary

### Functions

In [101]:
def summarize_article_improved(text, tokenizer, model):
    if not text or not text.strip():
        return ""
    
    # Better chunking with more overlap
    chunks = chunk_text(text, tokenizer, max_tokens=800, overlap=200)
    
    if len(chunks) == 1:
        # Single chunk, summarize directly
        return summarize_chunk(chunks[0], tokenizer, model, max_output_tokens=150)
    
    # Summarize each chunk
    chunk_summaries = [
        summarize_chunk(chunk, tokenizer, model, max_output_tokens=100)
        for chunk in chunks
    ]
    
    # Combine summaries
    combined = " ".join(chunk_summaries)
    
    # Final pass: create coherent unified summary
    # Use higher repetition penalty for final summary
    final_summary = summarize_chunk(
        combined,
        tokenizer,
        model,
        max_input_tokens=500,
        max_output_tokens=150
    )
    
    return final_summary

### Use

In [102]:
for article in cnbc_articles:
    article["summary"] = summarize_article_improved(
        article["text"],
        tokenizer,
        model
    )

summaries = [article["summary"] for article in cnbc_articles]
summaries

['the leadership" Qiushi argued against a view in Beijing that real estate is no longer important to China\'s economy. In recent weeks, Vanke narrowly avoided default on a 2 billion yuan ($283 million) onshore bond. In a broader sign of strain, Chinese real estate developers\' outstanding loan balance fell in the third quarter from a year ago. Qiushi argued against a view in Beijing that real estate is no longer important to China\'s economy ',
 '2% in extended trading after General Motors said it will record $7.1 billion in special charges for the fourth quarter of 2025 tied to its pullback in electric vehicles and restructuring efforts in China. Tilray Brands — The consumer packaged goods and cannabis giant reported record net revenue for its fiscal second quarter, leading shares to jump nearly 8%. Tilray posted revenue of $218 million for the period, while analysts polled by LSEG expected $211 million. Tilray Brands — The consumer packaged',
 'on housing affordability has found a cl

## Sentiment

### Functions

In [103]:
# sa_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
# sa_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

sa_tokenizer = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
sa_model = AutoModelForSequenceClassification.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model = sa_model,
    tokenizer = sa_tokenizer
)

Device set to use cpu


In [104]:
def get_text_sentiment_score(text: str, max_chars=512) -> float:
    """
    Chunk long text into ~max_chars pieces, run sentiment on each,
    and average the scores.
    """
    if not text or not text.strip():
        return 0.0

    # Simple char-based chunks to avoid tokenizer mismatch issues
    chunks = [text[i:i+max_chars] for i in range(0, len(text), max_chars)]

    scores = []
    for chunk in chunks:
        result = sentiment_analyzer(chunk)[0]
        label = result["label"].upper()
        score = float(result["score"])
        if "POS" in label:
            scores.append(score)
        elif "NEG" in label:
            scores.append(-score)
        else:
            scores.append(0.0)

    return sum(scores) / len(scores) if scores else 0.0

### Use

In [105]:
for article in cnbc_articles:
    text_for_sentiment = article.get("text") or ""
    article["sentiment_score"] = get_text_sentiment_score(text_for_sentiment)
cnbc_articles


[{'title': 'Hopes rise for Chinese property support ahead of key March meeting',
  'url': 'https://www.cnbc.com/2026/01/09/china-property-slump-policy-shift-qiushi-2026-ahead-march-meeting.html',
  'published': 'Fri, 09 Jan 2026 06:44:40 GMT',
  'text': 'BEIJING — Chinese policymakers may be finally warming to the idea of tackling the country\'s worsening real estate slump, raising expectations that stronger support measures could be coming later this year.\nThe Communist Party\'s official journal Qiushi, which means "seeking truth," kicked off 2026 with a Jan. 1 article calling for "more powerful and precise measures" to stabilize property market expectations.\nSince then, the Hang Seng China A Properties Index, which includes developers Vanke and Seazen, has climbed more than 6% to start the year, reflecting growing investor optimism.\nThe Qiushi commentary was notable for its scope, said Ting Lu, chief China economist at Nomura.\n"This is the most comprehensive assessment of China\'

## Reference Average Sentiment Score

### Functions

In [106]:
def get_entities_with_sentiment(articles):
    stock_stats   = defaultdict(lambda: {"mentions": 0, "scores": []})
    company_stats = defaultdict(lambda: {"mentions": 0, "scores": []})
    sector_stats  = defaultdict(lambda: {"mentions": 0, "scores": []})

    for article in articles:
        title = article.get("title", "")
        body  = article.get("text", "")
        text  = f"{title} {body}"

        # entities from the article
        tickers, companies, sectors = extract_entities_from_text(text)

        # de‑duplicate per article so one article = one “mention” per entity
        tickers   = set(tickers)
        companies = set(companies)
        sectors   = set(sectors)

        sentiment = float(article.get("sentiment_score", 0.0))

        for t in tickers:
            stock_stats[t]["mentions"] += 1
            stock_stats[t]["scores"].append(sentiment)

        for c in companies:
            company_stats[c]["mentions"] += 1
            company_stats[c]["scores"].append(sentiment)

        for s in sectors:
            sector_stats[s]["mentions"] += 1
            sector_stats[s]["scores"].append(sentiment)

    def finalize(stats_dict):
        items = []
        for name, data in stats_dict.items():
            if not data["scores"]:
                avg = 0.0
            else:
                avg = sum(data["scores"]) / len(data["scores"])
            items.append({
                "name": name,
                "mentions": data["mentions"],
                "avg_sentiment": avg,
            })
        items.sort(key=lambda x: x["mentions"], reverse=True)
        return items

    return {
        "stocks":    finalize(stock_stats),
        "companies": finalize(company_stats),
        "sectors":   finalize(sector_stats),
    }

In [107]:
def get_entities_with_sentiment_all_mentions(articles):
    stock_stats   = defaultdict(lambda: {"mentions": 0, "scores": []})
    company_stats = defaultdict(lambda: {"mentions": 0, "scores": []})
    sector_stats  = defaultdict(lambda: {"mentions": 0, "scores": []})

    for article in articles:
        title = article.get("title", "")
        body  = article.get("text", "")
        text  = f"{title} {body}"

        tickers, companies, sectors = extract_entities_from_text(text)
        sentiment = float(article.get("sentiment_score", 0.0))

        # NOTE: no `set(...)`
        for t in tickers:
            stock_stats[t]["mentions"] += 1
            stock_stats[t]["scores"].append(sentiment)

        for c in companies:
            company_stats[c]["mentions"] += 1
            company_stats[c]["scores"].append(sentiment)

        for s in sectors:
            sector_stats[s]["mentions"] += 1
            sector_stats[s]["scores"].append(sentiment)

    def finalize(stats_dict): 
        items = []
        for name, data in stats_dict.items():
            if not data["scores"]:
                avg = 0.0
            else:
                avg = sum(data["scores"]) / len(data["scores"])
            items.append({
                "name": name,
                "mentions": data["mentions"],
                "avg_sentiment": avg,
            })
        items.sort(key=lambda x: x["mentions"], reverse=True)
        return items

    return {
        "stocks":    finalize(stock_stats),
        "companies": finalize(company_stats),
        "sectors":   finalize(sector_stats),
    }

### Use

In [108]:
results = get_entities_with_sentiment(cnbc_articles)

for stock in results["companies"]:
    print(stock["name"], "mentions:", stock["mentions"], "avg_sentiment:", round(stock["avg_sentiment"], 3))

CNBC mentions: 23 avg_sentiment: 0.278
Trump mentions: 6 avg_sentiment: 0.142
Chevron mentions: 5 avg_sentiment: 0.601
Fed mentions: 4 avg_sentiment: -0.114
JPMorgan mentions: 4 avg_sentiment: 0.073
Barclays mentions: 4 avg_sentiment: 0.777
Bank of America mentions: 4 avg_sentiment: 0.354
Maduro mentions: 3 avg_sentiment: 0.186
Exxon Mobil mentions: 3 avg_sentiment: 0.538
EV mentions: 3 avg_sentiment: 0.664
Taiwan Semiconductor Manufacturing mentions: 3 avg_sentiment: 0.797
Truth Social mentions: 2 avg_sentiment: -0.15
UBS mentions: 2 avg_sentiment: 0.267
Digital mentions: 2 avg_sentiment: 0.043
American Express mentions: 2 avg_sentiment: 0.061
Goldman Sachs mentions: 2 avg_sentiment: 0.334
The Wall Street Journal mentions: 2 avg_sentiment: 0.029
Treasury mentions: 2 avg_sentiment: -0.092
Bloomberg News mentions: 2 avg_sentiment: 0.246
Nvidia mentions: 2 avg_sentiment: 0.468
CES mentions: 2 avg_sentiment: 0.246
Veeva Systems mentions: 2 avg_sentiment: 0.246
Aon mentions: 2 avg_sentimen

## Improved Pipeline

This section adds a cleaner, more accurate setup for ticker and company detection,
sentence-level sentiment, and sector tagging. It is designed to reduce noise
from regex-only tickers and article-level sentiment.


In [3]:
from collections import defaultdict
from pathlib import Path
import csv
import re

FEEDS = [
    "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=10000664",
]

MAX_ARTICLES_PER_FEED = 30
CONTEXT_SENTENCES = 1
TICKER_LIST_PATH = Path("tickers.csv")  # optional: columns ticker,name

TICKER_RE = re.compile(r"(?<![A-Z])\$?[A-Z]{1,5}(?![A-Z])")
TICKER_STOP = {
    "A", "AN", "AND", "ARE", "AS", "AT", "BE", "BUT", "BY", "CAN", "CO", "FOR",
    "FROM", "HAS", "HAVE", "IN", "IS", "IT", "ITS", "NOT", "OF", "ON", "OR",
    "THE", "TO", "WAS", "WERE", "WILL", "WITH",
}

SECTOR_KEYWORDS = {
    "Technology": ["tech", "software", "technology", "cloud", "ai", "artificial intelligence",
                   "chip", "semiconductor", "digital", "platform", "app", "data", "cyber"],
    "Finance": ["bank", "financial", "finance", "investment", "trading", "market",
                "stock", "equity", "bond", "credit", "lending", "mortgage"],
    "Healthcare": ["health", "medical", "pharmaceutical", "drug", "biotech", "hospital",
                    "treatment", "patient", "fda", "clinical", "therapy"],
    "Energy": ["oil", "gas", "energy", "petroleum", "renewable", "solar", "wind",
               "electric", "power", "fuel", "drilling", "crude"],
    "Retail": ["retail", "store", "shopping", "consumer", "e-commerce", "online shopping",
               "merchandise", "sales", "retailer"],
    "Automotive": ["car", "automotive", "vehicle", "auto", "truck", "electric vehicle",
                   "ev", "manufacturing", "tesla"],
    "Real Estate": ["real estate", "property", "housing", "construction", "mortgage",
                    "development", "reit"],
    "Telecommunications": ["telecom", "communication", "wireless", "5g", "network", "internet"],
    "Aerospace": ["aerospace", "aircraft", "defense", "boeing", "space"],
    "Consumer Goods": ["consumer goods", "packaged goods", "cpg"],
}


def load_ticker_map(path: Path):
    ticker_to_name = {}
    name_to_ticker = {}
    if not path.exists():
        return ticker_to_name, name_to_ticker

    with path.open("r", newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            ticker = (row.get("ticker") or "").strip().upper()
            name = (row.get("name") or "").strip()
            if not ticker or not name:
                continue
            ticker_to_name[ticker] = name
            name_to_ticker[name.lower()] = ticker

    return ticker_to_name, name_to_ticker


ticker_to_name, name_to_ticker = load_ticker_map(TICKER_LIST_PATH)


def fetch_articles(feed_url, max_items=30):
    feed = feedparser.parse(feed_url)
    articles = []
    for entry in feed.entries[:max_items]:
        text = extract_article_text(entry.link)
        if not text:
            continue
        articles.append({
            "title": entry.title,
            "url": entry.link,
            "published": entry.get("published"),
            "text": text,
        })
    return articles


def get_tickers(text):
    tickers = []
    for m in TICKER_RE.findall(text):
        t = m.replace("$", "").upper()
        if t in TICKER_STOP:
            continue
        if ticker_to_name and t not in ticker_to_name:
            continue
        tickers.append(t)
    return tickers


def get_companies(doc):
    companies = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
    mapped = []
    for name in companies:
        ticker = name_to_ticker.get(name.lower())
        if ticker:
            mapped.append(ticker)
        else:
            mapped.append(name)
    return mapped


def get_sectors(text_lower):
    return [
        sector for sector, keywords in SECTOR_KEYWORDS.items()
        if any(kw in text_lower for kw in keywords)
    ]


def sentence_windows(sentences, idx, window=1):
    start = max(0, idx - window)
    end = min(len(sentences), idx + window + 1)
    return " ".join(sentences[start:end])


def analyze_article_entities(article, window=1):
    text = f"{article.get('title','')} {article.get('text','')}"
    if not text.strip():
        return [], [], []

    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]

    stock_scores = defaultdict(list)
    company_scores = defaultdict(list)
    sector_scores = defaultdict(list)

    cache = {}

    for i, sent in enumerate(sentences):
        sent_doc = nlp(sent)
        tickers = set(get_tickers(sent))
        companies = set(get_companies(sent_doc))
        sectors = set(get_sectors(sent.lower()))

        if not tickers and not companies and not sectors:
            continue

        window_text = sentence_windows(sentences, i, window=window)
        if window_text not in cache:
            cache[window_text] = get_text_sentiment_score(window_text)
        s = cache[window_text]

        for t in tickers:
            stock_scores[t].append(s)
        for c in companies:
            company_scores[c].append(s)
        for sec in sectors:
            sector_scores[sec].append(s)

    return stock_scores, company_scores, sector_scores


def aggregate_entities_with_sentiment(articles, window=1):
    stock_stats = defaultdict(lambda: {"mentions": 0, "scores": []})
    company_stats = defaultdict(lambda: {"mentions": 0, "scores": []})
    sector_stats = defaultdict(lambda: {"mentions": 0, "scores": []})

    for article in articles:
        stock_scores, company_scores, sector_scores = analyze_article_entities(article, window=window)

        for name, scores in stock_scores.items():
            stock_stats[name]["mentions"] += 1
            stock_stats[name]["scores"].append(sum(scores) / len(scores))

        for name, scores in company_scores.items():
            company_stats[name]["mentions"] += 1
            company_stats[name]["scores"].append(sum(scores) / len(scores))

        for name, scores in sector_scores.items():
            sector_stats[name]["mentions"] += 1
            sector_stats[name]["scores"].append(sum(scores) / len(scores))

    def finalize(stats):
        rows = []
        for name, data in stats.items():
            avg = sum(data["scores"]) / len(data["scores"]) if data["scores"] else 0.0
            rows.append({
                "name": name,
                "mentions": data["mentions"],
                "avg_sentiment": avg,
            })
        rows.sort(key=lambda x: x["mentions"], reverse=True)
        return rows

    return {
        "stocks": finalize(stock_stats),
        "companies": finalize(company_stats),
        "sectors": finalize(sector_stats),
    }


# Run pipeline
all_articles = []
for feed in FEEDS:
    all_articles.extend(fetch_articles(feed, max_items=MAX_ARTICLES_PER_FEED))

results = aggregate_entities_with_sentiment(all_articles, window=CONTEXT_SENTENCES)

results["stocks"][:10], results["sectors"][:10]



NameError: name 'extract_article_text' is not defined