# Setup

## Imports

In [1]:
import feedparser
import trafilatura
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
import torch
from transformers import pipeline
import spacy
from collections import Counter, defaultdict
import re
from pathlib import Path
import csv
from youtube_transcript_api import YouTubeTranscriptApi
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google_auth_oauthlib.flow import InstalledAppFlow
# import io
from dotenv import load_dotenv
from googleapiclient.discovery import build
import os
from youtube_transcript_api.proxies import WebshareProxyConfig, GenericProxyConfig
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
import json
import time
import random
from pathlib import Path
import requests
from tqdm import tqdm
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

## spaCy

In [2]:
# Load the small English spaCy pipeline, downloading it on first use.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Installing spaCy model...")
    import subprocess
    import sys
    # Run the download with the kernel's own interpreter so the model is
    # installed into the environment this notebook runs in; check=True makes a
    # failed download raise here instead of failing again on the load below.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")


## Sentiment Analyzer

In [3]:
# Hugging Face "sentiment-analysis" pipeline that uses the ProsusAI/finbert
# checkpoint for both the model and the tokenizer.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="ProsusAI/finbert",
    tokenizer="ProsusAI/finbert"
)

def get_text_sentiment_score(text: str, max_chars=512) -> float:
    """Return the mean signed sentiment of ``text``.

    The text is cut into consecutive slices of ``max_chars`` characters and
    each slice is classified by ``sentiment_analyzer``. A label containing
    "POS" contributes ``+score``, one containing "NEG" contributes ``-score``
    and any other label contributes ``0.0``. The result is the arithmetic mean
    over all slices; empty or whitespace-only input yields ``0.0``.
    """
    if not text or not text.strip():
        return 0.0

    def signed_score(piece):
        # Only the first prediction returned for the slice is used.
        top = sentiment_analyzer(piece)[0]
        tag = top["label"].upper()
        magnitude = float(top["score"])
        if "POS" in tag:
            return magnitude
        if "NEG" in tag:
            return -magnitude
        return 0.0

    values = [
        signed_score(text[start:start + max_chars])
        for start in range(0, len(text), max_chars)
    ]
    return sum(values) / len(values) if values else 0.0

Device set to use cpu


## Entity Mentions

In [4]:
# NOTE(review): not referenced anywhere else in the visible notebook — confirm
# whether this setting is still needed.
CONTEXT_SENTENCES = 1
# CSV of known tickers read by load_ticker_map (which returns empty maps when
# the file does not exist).
TICKER_LIST_PATH = Path("tickers.csv")  # optional: columns ticker,name

# Ticker candidate: 2-5 consecutive uppercase ASCII letters, optionally
# prefixed by "$", with no uppercase letter directly before or after the match.
TICKER_RE = re.compile(r"(?<![A-Z])\$?[A-Z]{2,5}(?![A-Z])")
# Uppercase common words that get_tickers discards instead of reporting as
# tickers. (Single-letter "A" can never be produced by TICKER_RE, which needs
# at least two letters.)
TICKER_STOP = {
    "A", "AN", "AND", "ARE", "AS", "AT", "BE", "BUT", "BY", "CAN", "CO", "FOR",
    "FROM", "HAS", "HAVE", "IN", "IS", "IT", "ITS", "NOT", "OF", "ON", "OR",
    "THE", "TO", "WAS", "WERE", "WILL", "WITH",
}

# Canonical labels for known entities. Keys are lower-case names without
# punctuation (the form normalize_entity looks up); values are the label that
# replaces them.
ENTITY_ALIASES = {
    # companies
    "meta": "META",
    "facebook": "META",

    "google": "GOOGL",
    "alphabet": "GOOGL",

    "apple": "AAPL",
    "amazon": "AMZN",
    "microsoft": "MSFT",

    # institutions
    "fed": "Federal Reserve",
    "federal reserve": "Federal Reserve",
    "doj": "Department of Justice",
    "department of justice": "Department of Justice",
    "supreme court": "Supreme Court",
    "cnn": "CNN",
}

# Sector name -> lower-case keywords. get_sectors reports a sector when any of
# its keywords is found in the lower-cased text it is given.
# Note that "mortgage" is listed under both Finance and Real Estate.
SECTOR_KEYWORDS = {
    "Technology": ["tech", "software", "technology", "cloud", "ai", "artificial intelligence",
                   "chip", "semiconductor", "digital", "platform", "app", "data", "cyber"],
    "Finance": ["bank", "financial", "finance", "investment", "trading", "market",
                "stock", "equity", "bond", "credit", "lending", "mortgage"],
    "Healthcare": ["health", "medical", "pharmaceutical", "drug", "biotech", "hospital",
                    "treatment", "patient", "fda", "clinical", "therapy"],
    "Energy": ["oil", "gas", "energy", "petroleum", "renewable", "solar", "wind",
               "electric", "power", "fuel", "drilling", "crude"],
    "Retail": ["retail", "store", "shopping", "consumer", "e-commerce", "online shopping",
               "merchandise", "sales", "retailer"],
    "Automotive": ["car", "automotive", "vehicle", "auto", "truck", "electric vehicle",
                   "ev", "manufacturing", "tesla"],
    "Real Estate": ["real estate", "property", "housing", "construction", "mortgage",
                    "development", "reit"],
    "Telecommunications": ["telecom", "communication", "wireless", "5g", "network", "internet"],
    "Aerospace": ["aerospace", "aircraft", "defense", "boeing", "space"],
    "Consumer Goods": ["consumer goods", "packaged goods", "cpg"],
}

def normalize_company_name(name):
    """Lower-case a company name and remove common corporate suffix tokens.

    Every occurrence of "inc.", "corp." and "corporation" (removed in that
    order) is deleted from the lower-cased name, then surrounding whitespace
    is stripped.
    """
    cleaned = name.lower()
    for token in ("inc.", "corp.", "corporation"):
        cleaned = cleaned.replace(token, "")
    return cleaned.strip()

def extract_article_text(url: str) -> str | None:
    """Download ``url`` with trafilatura and return its extracted text.

    Returns ``None`` when the download yields nothing; otherwise returns
    whatever ``trafilatura.extract`` produces with comments, tables and
    formatting excluded.
    """
    page = trafilatura.fetch_url(url)
    if page:
        return trafilatura.extract(
            page,
            include_comments=False,
            include_tables=False,
            include_formatting=False,
        )
    return None

def load_ticker_map(path: Path):
    """Read a ``ticker,name`` CSV into two lookup dictionaries.

    Returns ``(ticker_to_name, name_to_ticker)``. Tickers are stripped and
    upper-cased; names are stripped, and the reverse map is keyed by
    ``normalize_company_name(name)``. Rows lacking either value are ignored.
    Both dictionaries are empty when ``path`` does not exist.
    """
    by_ticker = {}
    by_name = {}
    if path.exists():
        with path.open("r", newline="", encoding="utf-8") as handle:
            for record in csv.DictReader(handle):
                symbol = (record.get("ticker") or "").strip().upper()
                company = (record.get("name") or "").strip()
                if symbol and company:
                    by_ticker[symbol] = company
                    by_name[normalize_company_name(company)] = symbol

    return by_ticker, by_name


# Module-level lookup tables used by get_tickers and get_companies; both are
# empty when the optional ticker CSV is absent.
ticker_to_name, name_to_ticker = load_ticker_map(TICKER_LIST_PATH)


def fetch_articles(feed_url, max_items=30):
    """Parse a feed and return its first ``max_items`` entries with body text.

    Each returned dict has the keys ``title``, ``url``, ``published`` and
    ``text``. Entries for which ``extract_article_text`` returns nothing are
    left out.
    """
    parsed = feedparser.parse(feed_url)
    collected = []
    for item in parsed.entries[:max_items]:
        body = extract_article_text(item.link)
        if body:
            collected.append({
                "title": item.title,
                "url": item.link,
                "published": item.get("published"),
                "text": body,
            })
    return collected


def get_tickers(text):
    """Return the distinct ticker symbols found in ``text`` (order unspecified).

    Candidates come from ``TICKER_RE`` with any "$" removed. A candidate is
    dropped when it is in ``TICKER_STOP`` or, if a ticker list was loaded, when
    it is not in ``ticker_to_name``.
    """
    candidates = (raw.replace("$", "").upper() for raw in TICKER_RE.findall(text))
    accepted = {
        symbol
        for symbol in candidates
        if symbol not in TICKER_STOP
        and (not ticker_to_name or symbol in ticker_to_name)
    }
    return list(accepted)

def get_companies(doc):
    """Return one entry per ORG entity in ``doc``, in document order.

    An entity whose normalized name is a key of ``name_to_ticker`` is reported
    as that ticker; any other ORG entity is reported by its original text.
    """
    resolved = []
    for ent in doc.ents:
        if ent.label_ == "ORG":
            lookup_key = normalize_company_name(ent.text)
            resolved.append(name_to_ticker.get(lookup_key, ent.text))
    return resolved


def get_sectors(text_lower, sector_keywords=None):
    """Return the sectors whose keywords occur in ``text_lower`` as whole words.

    Keywords are matched as complete words or phrases, optionally followed by
    a plural "s"/"es", rather than as raw substrings. Plain substring matching
    produced false positives such as "ai" in "said", "ev" in "every" and "app"
    in "apple".

    Args:
        text_lower: Text to scan; expected to be lower-cased already, since
            the keywords are lower-case.
        sector_keywords: Optional mapping of sector name to a list of
            keywords. Defaults to the module-level ``SECTOR_KEYWORDS``.

    Returns:
        Sector names in the mapping's iteration order; empty for empty text.
    """
    if sector_keywords is None:
        sector_keywords = SECTOR_KEYWORDS
    if not text_lower:
        return []

    matched = []
    for sector, keywords in sector_keywords.items():
        if not keywords:
            continue
        alternatives = "|".join(re.escape(kw) for kw in keywords)
        # Lookarounds instead of \b so keywords containing "-" or digits
        # ("e-commerce", "5g") are still anchored on word edges.
        pattern = rf"(?<!\w)(?:{alternatives})(?:es|s)?(?!\w)"
        if re.search(pattern, text_lower):
            matched.append(sector)
    return matched

## YouTube Data API Scrape

In [5]:
# Load variables from .env; the YouTube Data API key is read from API_KEY
# (os.getenv yields None when it is not set).
load_dotenv()
YOUTUBE_API_KEY = os.getenv("API_KEY")
CHANNEL_ID = "UCrp_UI8XtuYfpiqluWLD7Lw"  # CNBC channel
MAX_VIDEOS = 100  # passed as max_results to fetch_youtube_videos_with_api

def fetch_youtube_videos_with_api(channel_id, api_key, max_results=100):
    """Fetch a channel's uploads and their metadata using the YouTube Data API.

    The channel's uploads playlist is resolved once, then read page by page.
    Each page's video IDs are looked up with ``videos().list`` to obtain the
    snippet and statistics. No transcripts are fetched here.

    Args:
        channel_id: YouTube channel ID.
        api_key: YouTube Data API developer key.
        max_results: Maximum number of videos to return.

    Returns:
        A list of dicts with the keys ``title``, ``video_id``, ``url``,
        ``published``, ``published_date``, ``author``, ``summary``,
        ``transcript_text`` (always ``None`` here), ``view_count`` and
        ``like_count``. On an API error the videos collected so far are
        returned after the error is printed.
    """
    # The Data API accepts at most 50 for maxResults on list calls, and
    # videos().list takes a bounded number of IDs, so page in chunks of <= 50.
    page_size_limit = 50

    youtube = build('youtube', 'v3', developerKey=api_key)
    videos = []
    next_page_token = None
    uploads_playlist_id = None

    print(f"Fetching videos from channel {channel_id}...")

    try:
        if max_results > 0:
            # Resolve the uploads playlist once, before paging.
            channel_response = youtube.channels().list(
                part='contentDetails',
                id=channel_id
            ).execute()

            channel_items = channel_response.get('items')
            if channel_items:
                uploads_playlist_id = channel_items[0]['contentDetails']['relatedPlaylists']['uploads']
            else:
                print(f"‚ùå Channel {channel_id} not found")

        while uploads_playlist_id and len(videos) < max_results:
            response = youtube.playlistItems().list(
                part='snippet,contentDetails',
                playlistId=uploads_playlist_id,
                maxResults=min(page_size_limit, max_results - len(videos)),
                pageToken=next_page_token
            ).execute()

            video_ids = [
                item['contentDetails']['videoId']
                for item in response.get('items', [])
                if 'videoId' in item.get('contentDetails', {})
            ]

            # Get detailed video information
            if video_ids:
                video_details = youtube.videos().list(
                    part='snippet,statistics',
                    id=','.join(video_ids)
                ).execute()

                for item in video_details.get('items', []):
                    snippet = item.get('snippet', {})
                    # 'statistics' may be missing from an item; default to
                    # zero counts instead of raising KeyError.
                    statistics = item.get('statistics', {})
                    videos.append({
                        'title': snippet.get('title', ''),
                        'video_id': item['id'],
                        'url': f"https://www.youtube.com/watch?v={item['id']}",
                        'published': snippet.get('publishedAt', ''),
                        'published_date': snippet.get('publishedAt', ''),
                        'author': snippet.get('channelTitle', ''),
                        'summary': snippet.get('description', ''),  # Full description
                        'transcript_text': None,  # filled in later by the transcript step
                        'view_count': statistics.get('viewCount', 0),
                        'like_count': statistics.get('likeCount', 0),
                    })

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

            print(f"  Fetched {len(videos)} videos so far...")

    except Exception as e:
        # Best effort: report the failure and return what was collected.
        print(f"‚ùå Error fetching videos: {e}")

    print(f"‚úÖ Total videos fetched: {len(videos)}")
    return videos


# Main

## Retrieve latest 100 videos

## Retrieve and cache Transcripts

### Testing

In [None]:
# Diagnostic: show which proxy settings the process environment exposes and
# which proxies `requests` would pick up for a YouTube URL.
import os, requests
print("HTTP_PROXY =", os.environ.get("HTTP_PROXY"))
print("HTTPS_PROXY =", os.environ.get("HTTPS_PROXY"))
print("Proxies seen by requests:", requests.utils.get_environ_proxies("https://www.youtube.com"))

In [None]:
# Diagnostic: route traffic through the Webshare endpoint and print the
# outward-facing IP for five consecutive requests.
import os
import requests
from urllib.parse import quote

# URL-encode credentials
# NOTE(review): os.getenv returns None for an unset variable; confirm that
# PROXY_USER and PROXY_PASS are defined before running this cell.
proxy_user = quote(os.getenv("PROXY_USER"), safe='')
proxy_pass = quote(os.getenv("PROXY_PASS"), safe='')

# Set rotating proxy globally
# This changes os.environ for the rest of the kernel session, not only for
# this cell.
proxy_url = f"http://{proxy_user}:{proxy_pass}@p.webshare.io:80"
os.environ['HTTP_PROXY'] = proxy_url
os.environ['HTTPS_PROXY'] = proxy_url

# Test rotation by checking your IP multiple times
print("Testing proxy rotation...")
for i in range(5):
    try:
        response = requests.get('https://api.ipify.org?format=json', 
                               proxies={'http': proxy_url, 'https': proxy_url},
                               timeout=10)
        ip = response.json().get('ip')
        print(f"Request {i+1}: IP = {ip}")
    except Exception as e:
        # Errors are printed per request so the loop always completes.
        print(f"Request {i+1}: Error = {e}")

In [6]:
# Smoke test: fetch one transcript through the Webshare proxy configuration.
# The output recorded below this cell shows the request failing with a
# RetryError caused by repeated HTTP 429 responses.
ytt_api = YouTubeTranscriptApi(
    proxy_config=WebshareProxyConfig(
        proxy_username = os.getenv("PROXY_USER"),
        proxy_password = os.getenv("PROXY_PASS"),
    )
)
transcript = ytt_api.fetch('wN2s5uGh1YQ')
transcript

RetryError: HTTPSConnectionPool(host='www.google.com', port=443): Max retries exceeded with url: /sorry/index?continue=https://www.youtube.com/watch%3Fv%3DwN2s5uGh1YQ&q=EgQXX5aRGJj2wMsGIjDgUktWauhYQDmp5GNAVeYpHQU0kM_WB31C4CJx0aoU4SEWWxHDgWoOJjjNLtp6ws4yAnJSWgFD (Caused by ResponseError('too many 429 error responses'))

### Actual

In [6]:
# JSON file that stores fetched transcripts keyed by video ID.
TRANSCRIPT_CACHE_PATH = Path('daily_transcripts.json')
MAX_CACHE_SIZE = 100  # Maximum number of videos to keep in cache

def load_transcript_cache(path):
    """Load the transcript cache stored as JSON at ``path``.

    Returns the decoded JSON content. An empty dict is returned when the file
    does not exist, is empty or whitespace-only, or is not valid JSON; the
    last two cases also print a warning.
    """
    if not path.exists():
        return {}

    try:
        raw = path.read_text(encoding='utf-8')
        if not raw.strip():
            print("‚ö†Ô∏è  Cache file is empty, starting fresh")
            return {}
        return json.loads(raw)
    except json.JSONDecodeError as err:
        print(f"‚ö†Ô∏è  Cache file is corrupted: {err}")
        print("   Starting with fresh cache")
        return {}

def save_transcript_cache(path, cache):
    """Write ``cache`` to ``path`` as indented UTF-8 JSON without raising.

    The JSON is written to a sibling temporary file that is then renamed over
    ``path``. An interrupted or failed write therefore cannot leave a
    truncated cache file behind. Any failure is printed and swallowed, so a
    save problem never aborts a long transcript run.

    Args:
        path: ``pathlib.Path`` of the cache file.
        cache: JSON-serialisable mapping of video ID to transcript text.
    """
    tmp_path = None
    try:
        tmp_path = path.with_name(path.name + ".tmp")
        tmp_path.write_text(json.dumps(cache, ensure_ascii=False, indent=2), encoding='utf-8')
        tmp_path.replace(path)
    except Exception as e:
        print(f"‚ö†Ô∏è  Failed to save cache: {e}")
        # Do not leave a stale temporary file next to the cache.
        if tmp_path is not None:
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                pass

def fetch_transcript_with_backoff(video_id, max_retries=10):
    """
    Fetch a video's transcript through the Webshare proxy, retrying with
    exponential backoff and jitter.

    Args:
        video_id: YouTube video ID.
        max_retries: Maximum number of fetch attempts.

    Returns:
        The transcript segments joined with single spaces, or ``None`` when
        transcripts are disabled or missing, the video looks unavailable, or
        every attempt failed.
    """
    base_delay = 5.0
    # Cap the exponential term: uncapped, late attempts would sleep for tens
    # of minutes (5s * 2**9 on the tenth attempt).
    max_delay = 300.0
    rate_limit_markers = ('429', 'too many requests', 'rate limit', 'forbidden', '403')
    unavailable_markers = ('unavailable', 'unplayable', 'private', 'deleted')

    for attempt in range(1, max_retries + 1):
        is_last_attempt = attempt == max_retries
        try:
            # Add random delay before each retry (rate limit avoidance)
            if attempt > 1:
                time.sleep(random.uniform(2, 5))
            # A new client is built for every attempt, as before.
            ytt_api = YouTubeTranscriptApi(
                proxy_config=WebshareProxyConfig(
                    proxy_username = os.getenv("PROXY_USER"),
                    proxy_password = os.getenv("PROXY_PASS"),
                )
            )
            transcript = ytt_api.fetch(video_id)
            return ' '.join(seg.text for seg in transcript)

        except (TranscriptsDisabled, NoTranscriptFound):
            # These are not rate limits, just unavailable transcripts
            return None

        except Exception as e:
            error_msg = str(e).lower()

            # Check for rate limit indicators
            if any(marker in error_msg for marker in rate_limit_markers):
                print(f"‚ö†Ô∏è  Rate limit detected (attempt {attempt}/{max_retries})")
                if is_last_attempt:
                    # Nothing left to retry, so do not sleep before giving up.
                    return None
                wait_time = min(base_delay * (2 ** (attempt - 1)), max_delay) + random.uniform(5, 15)
                print(f"   Waiting {wait_time:.1f}s before retry...")
                time.sleep(wait_time)
                continue

            # Other errors
            error_type = type(e).__name__
            print(f"Attempt {attempt} failed for {video_id}: {error_type}: {str(e)[:100]}")
            # Skip immediately for unavailable/unplayable videos (no point retrying)
            if any(skip in error_msg or skip in error_type.lower() for skip in unavailable_markers):
                print(f"  ‚è≠Ô∏è  Skipping (video unavailable)")
                return None
            if is_last_attempt:
                return None

            # Exponential backoff for other errors
            wait_time = min(base_delay * (1.5 ** attempt), max_delay) + random.uniform(1, 3)
            time.sleep(wait_time)

    return None

def attach_transcripts(videos, cache_path=TRANSCRIPT_CACHE_PATH, max_cache_size=MAX_CACHE_SIZE, delay_between_requests=3.0, retry_failed=False):
    """
    Attach transcripts to videos, using an on-disk cache and paced requests.

    The cache is first trimmed to ``max_cache_size`` entries, preferring the
    IDs in ``videos`` (in their given order) over older cached IDs. Each video
    then gets ``transcript_text`` from the cache or from
    ``fetch_transcript_with_backoff``; fetched results, including ``None``
    for failures, are stored in the cache.

    Args:
        videos: List of video dictionaries (modified in place)
        cache_path: Path to cache file
        max_cache_size: Maximum number of videos to keep in cache
        delay_between_requests: Base delay between requests in seconds (default: 3.0)
        retry_failed: When True, cached entries with an empty/None transcript
            are fetched again instead of being reused (default: False, which
            keeps cached failures as before)

    Returns:
        The same ``videos`` list.
    """
    latest_ids = [v.get('video_id') for v in videos if v.get('video_id')]
    total_videos = len(latest_ids)
    print(f"\nüìù Processing {total_videos} videos for transcripts...\n")
    print(f"‚è±Ô∏è  Using delays between requests to avoid rate limits\n")

    # Load cache
    cache = load_transcript_cache(cache_path)
    old_cache_size = len(cache)

    # Ordered list: current videos first, then older cached IDs not in the
    # current list. A set makes the membership test O(1).
    latest_id_set = set(latest_ids)
    all_video_ids = latest_ids + [old_vid for old_vid in cache if old_vid not in latest_id_set]

    # Keep only the first max_cache_size IDs
    videos_to_keep = all_video_ids[:max_cache_size]

    # Filter cache
    filtered_cache = {vid: cache[vid] for vid in videos_to_keep if vid in cache}
    removed_count = old_cache_size - len(filtered_cache)

    print(f"üì¶ Cache status: {old_cache_size} total ‚Üí keeping {len(filtered_cache)} (removed {removed_count} oldest)\n")

    cache = filtered_cache

    success_count = 0
    failed_count = 0
    cached_count = 0
    actual_idx = 0

    for idx, video in enumerate(videos, start=1):
        vid = video.get('video_id')
        if not vid:
            continue

        actual_idx = idx

        # Check cache first; with retry_failed, an empty cached value counts
        # as a miss so earlier failures get another attempt.
        cache_hit = vid in cache and not (retry_failed and not cache[vid])
        if cache_hit:
            video['transcript_text'] = cache[vid]
            cached_count += 1
            print(f"[{idx}/{total_videos}] ‚úì Cached: {vid} - {video.get('title', 'N/A')[:50]}")
            if cache[vid]:
                print(f"  Preview: {cache[vid][:150]}...\n")
            continue

        # Add delay between requests to avoid rate limits
        delay = delay_between_requests + random.uniform(1, 3)
        print(f"[{idx}/{total_videos}] Fetching: {vid} (waiting {delay:.1f}s)...")
        time.sleep(delay)

        # Fetch transcript
        try:
            transcript_text = fetch_transcript_with_backoff(vid)
        except Exception as e:
            print(f"‚ö†Ô∏è  Unexpected error: {e}")
            transcript_text = None

        video['transcript_text'] = transcript_text
        cache[vid] = transcript_text

        if transcript_text:
            success_count += 1
            print(f"‚úì Success: {video.get('title', 'N/A')[:50]}")
            print(f"  Preview: {transcript_text[:150]}...\n")
        else:
            failed_count += 1
            print(f"‚úó Failed/No transcript: {video.get('title', 'N/A')[:50]}\n")

        # Save after every 10 fetch attempts. Counting fetches rather than
        # the enumerate index keeps saves regular even when most videos come
        # from the cache.
        if (success_count + failed_count) % 10 == 0:
            save_transcript_cache(cache_path, cache)
            print(f"  üíæ Cache saved at {idx} videos\n")

    # Final save
    save_transcript_cache(cache_path, cache)

    print(f"\nüìä Summary:")
    print(f"  ‚úì Successfully fetched: {success_count}")
    print(f"  ‚úì From cache: {cached_count}")
    print(f"  ‚úó Failed/No transcript: {failed_count}")
    print(f"  Total processed: {actual_idx}/{total_videos}")
    print(f"  üì¶ Final cache size: {len(cache)}/{max_cache_size}")
    print()

    return videos

def refresh_transcripts_in_dict(videos, cache_path=Path('daily_transcripts.json')):
    """Overwrite in-memory transcripts with the values stored in the cache file.

    For each video that has a ``video_id`` and a non-None cached value that
    differs from its current ``transcript_text``, the cached value replaces
    it. The number of replacements is printed. ``videos`` is modified in place
    and returned; it is returned untouched when the cache file is missing.
    """
    if not cache_path.exists():
        return videos

    cached = json.loads(cache_path.read_text(encoding='utf-8'))
    overwritten = 0
    for entry in videos:
        key = entry.get('video_id')
        if not key:
            continue
        fresh = cached.get(key)
        if fresh is None or entry.get('transcript_text') == fresh:
            continue
        entry['transcript_text'] = fresh
        overwritten += 1

    print(f'Overwrote {overwritten} transcripts from cache')
    return videos

In [8]:
# Fetch metadata for up to MAX_VIDEOS uploads from the configured channel.
youtube_videos_api = fetch_youtube_videos_with_api(CHANNEL_ID, YOUTUBE_API_KEY, max_results=MAX_VIDEOS)

Fetching videos from channel UCrp_UI8XtuYfpiqluWLD7Lw...
  Fetched 50 videos so far...
  Fetched 100 videos so far...
‚úÖ Total videos fetched: 100


In [9]:
# Fill in transcript_text for each video (cache first, then network) using a
# 5-second base delay between requests.
youtube_videos_api = attach_transcripts(youtube_videos_api, delay_between_requests = 5)


üìù Processing 100 videos for transcripts...

‚è±Ô∏è  Using delays to avoid rate limits (no proxy)

üì¶ Cache status: 40 total ‚Üí keeping 40 (removed 0 oldest)

[1/100] ‚úì Cached: UIEMxGany2k - EXCLUSIVE: CNBC's Joe Kernen interviews President 
[2/100] ‚úì Cached: Ia0Ev2Xuj0g - LIVE: Trump speaks at the World Economic Forum in 
[3/100] ‚úì Cached: 4YJmAZY8V4w - Lightning Round: Shell is 'just an ok oil company'
  Preview: >> OF COURSE MY STEP PLAY. ITSELF AND THEN THE LIGHTNING ROUND IS OVER. ARE YOU READY, SKI DADDY? THE LIGHTNING ROUND. LET'S START WITH SKIP IN CALIFO...

[4/100] ‚úì Cached: tKgYSl5KSq0 - Mad Money 01/20/26 | Audio Only
  Preview: Hey, I'm Kramer. Welcome to Mad Money. Welcome to Crayer. Other friends, I'm just trying to save a little bit of money. My job is not just to educate,...

[5/100] ‚úì Cached: _LVNB2Hc5t0 - Jim Cramer sounds the alarm on speculation and see
  Preview: Money. Wel MONEY. WELCOME TO CRAMER PICKER FRIENDS I'M JUST TRYING TO SAVE YOU A LITTL

## Add transcripts to dict

In [None]:
# Overwrite in-memory transcripts with cache values when available.
# Uses the function's default cache path, 'daily_transcripts.json'.

youtube_videos_api = refresh_transcripts_in_dict(youtube_videos_api)


## Check rotating proxies

In [None]:
# Build the proxy URL from credentials and endpoint stored in the environment.
# NOTE(review): os.getenv returns None for an unset variable, so the "+" on
# PROXY_HOST fails if it is missing — confirm all four variables are defined.
# NOTE(review): unlike the earlier proxy test cell, the credentials are not
# URL-encoded here — confirm they contain no reserved characters.
username = os.getenv("PROXY_USER")
password = os.getenv("PROXY_PASS")
endpoint = os.getenv("PROXY_HOST") +  ":" + str(os.getenv("PROXY_PORT"))

proxy_url = f"http://{username}:{password}@{endpoint}"

def check_ip_rotation(num_requests=10):
    """Request http://ipinfo.io/json ``num_requests`` times through the proxy.

    Prints the IP reported for each request (or the error), then the number of
    successful requests, the number of distinct IPs and a rotation verdict.
    Returns the list of IPs from the successful requests.
    """
    proxy_map = dict.fromkeys(('http', 'https'), proxy_url)

    observed = []
    for attempt in range(1, num_requests + 1):
        try:
            # Plain http endpoint keeps the check simple.
            reply = requests.get('http://ipinfo.io/json', proxies=proxy_map, timeout=10)
            address = reply.json().get('ip')
            observed.append(address)
            print(f"Request {attempt}: IP = {address}")
        except Exception as err:
            print(f"Request {attempt} failed: {err}")

    distinct = set(observed)
    verdict = 'ROTATING ‚úì' if len(distinct) > 1 else 'NOT ROTATING ‚úó'
    print(f"\nTotal requests: {len(observed)}")
    print(f"Unique IPs: {len(distinct)}")
    print(f"IPs are {verdict}")
    return observed

# Test rotation: ten requests through the proxy; the returned IP list is the
# cell's displayed value.
print("Testing IP rotation with Webshare:")
check_ip_rotation(10)

# After Transcript

## Sentiment and Summarizer

In [None]:
# Set the tokenizers parallelism switch to "false" via its environment variable.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# NOTE: this binds the name `logging` to transformers' logging helper for the
# rest of the notebook.
from transformers import logging
logging.set_verbosity_error()

# BART (facebook/bart-large-cnn) summarization pipeline used by summarize_long_text.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Rebinds sentiment_analyzer (first created in the "Sentiment Analyzer" cell)
# to a FinBERT pipeline built without an explicit tokenizer argument.
sentiment_analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")

def chunk_text_words(text, chunk_words=500):
    """Split ``text`` on whitespace and regroup it into chunks of at most
    ``chunk_words`` words, each chunk joined with single spaces."""
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_words):
        pieces.append(" ".join(tokens[start:start + chunk_words]))
    return pieces

def summarize_long_text(text):
    """Summarize ``text`` in two passes with the ``summarizer`` pipeline.

    Each 500-word chunk is summarized (40-120 length bounds); the partial
    summaries are joined with spaces and summarized once more (60-180 length
    bounds). Sampling is disabled and inputs are truncated on both passes.
    """
    def condense(passage, longest, shortest):
        outcome = summarizer(
            passage,
            max_length=longest,
            min_length=shortest,
            do_sample=False,
            truncation=True,
        )
        return outcome[0]["summary_text"]

    partials = [
        condense(piece, 120, 40)
        for piece in chunk_text_words(text, chunk_words=500)
    ]
    return condense(" ".join(partials), 180, 60)


def analyze_video_sentiment(video, debug=False):
    """Analyze sentiment of a video's title and transcript separately.

    Sets three keys on ``video`` and returns it:

    * ``title_sentiment``: classifier result for the first 512 characters of
      the title, or ``None`` when there is no title or classification fails.
    * ``transcript_summary``: summary of the first 1000 transcript words, or
      ``None`` when the transcript is missing, shorter than 200 characters or
      100 words, or summarization fails.
    * ``transcript_sentiment``: classifier result for the first 512
      characters of the summary. If summarization fails, the first 512
      characters of the transcript are classified instead; ``None`` when
      nothing could be classified.

    Args:
        video: Video dictionary; ``title`` and ``transcript_text`` may be
            absent or ``None``.
        debug: When True, print progress and error details.
    """
    # Title: Direct sentiment (no summarization)
    title = video.get('title') or ''
    if title:
        try:
            title_sentiment = sentiment_analyzer(title[:512])[0]
            video['title_sentiment'] = title_sentiment
        except Exception as e:
            if debug:
                print(f"    Title sentiment failed: {e}")
            video['title_sentiment'] = None
    else:
        video['title_sentiment'] = None

    # Transcript: Summarize -> Sentiment
    # `or ''` matters: the key usually exists with value None (no transcript),
    # and .get()'s default only applies to a missing key, so the length report
    # below would otherwise fail on None.
    transcript_text = video.get('transcript_text') or ''

    if debug:
        print(f"   Transcript length: {len(transcript_text)} chars, {len(transcript_text.split())} words")

    # Check if transcript exists and is long enough
    if not transcript_text or len(transcript_text.strip()) < 200:
        if debug:
            print(f"    Transcript too short or missing")
        video['transcript_summary'] = None
        video['transcript_sentiment'] = None
        return video

    try:
        # Clean and truncate transcript
        transcript_text = transcript_text.strip()
        words = transcript_text.split()

        if debug:
            print(f"   Word count: {len(words)}")

        # Skip transcripts with too few words to summarize meaningfully
        if len(words) < 100:
            if debug:
                print(f"    Too few words: {len(words)}")
            video['transcript_summary'] = None
            video['transcript_sentiment'] = None
            return video

        if len(words) > 1000:
            transcript_text = ' '.join(words[:1000])
            if debug:
                print(f"    Truncated to 1000 words")

        if debug:
            print(f"   Generating summary...")
            print(f"   First 200 chars: {transcript_text[:200]}")

        summary = summarize_long_text(transcript_text)

        if debug:
            print(f"   Summary: {summary}")

        # Sentiment of summary
        transcript_sentiment = sentiment_analyzer(summary[:512])[0]

        if debug:
            print(f"   Sentiment: {transcript_sentiment}")

        video['transcript_summary'] = summary
        video['transcript_sentiment'] = transcript_sentiment

    except Exception as e:
        if debug:
            print(f"   Error: {type(e).__name__}: {str(e)}")
        video['transcript_summary'] = None
        # Fall back to classifying the raw transcript when summarization fails.
        try:
            video['transcript_sentiment'] = sentiment_analyzer(transcript_text[:512])[0]
        except Exception:
            video['transcript_sentiment'] = None

    return video

# Test on first video with debug output (on a copy, so the real entry is
# untouched by the dry run)
print("\nTesting first video with debug output:\n")
if youtube_videos_api:
    test_video = youtube_videos_api[0].copy()
    print(f"Title: {test_video.get('title')}")
    analyze_video_sentiment(test_video, debug=True)
    print("\n" + "="*80 + "\n")

# Ask user if they want to continue
response = input("Continue with all videos? (y/n): ")

if response.strip().lower() == 'y':
    print("\n Analyzing all videos...")

    # analyze_video_sentiment already sets the transcript fields to None when
    # a transcript is missing, so call it for every video. Skipping videos
    # without a transcript discarded their title sentiment as well.
    for video in tqdm(youtube_videos_api, desc="Processing videos", unit="video"):
        analyze_video_sentiment(video, debug=False)

    print("Analysis complete!\n")

In [None]:
def save_to_json(videos, filename):
    """Write ``videos`` to ``filename`` as UTF-8 JSON with a 2-space indent,
    keeping non-ASCII characters unescaped."""
    with open(filename, mode="w", encoding="utf-8") as sink:
        json.dump(obj=videos, fp=sink, indent=2, ensure_ascii=False)

# Persist the analysed video list (titles, transcripts and sentiment fields).
save_to_json(youtube_videos_api, "youtube_analysis.json")

## Aggregation

In [None]:
def normalize_entity(name: str, aliases=None) -> str:
    """Return a canonical display form for an entity name.

    The name is stripped and lower-cased, a leading article ("the", "a",
    "an") and all punctuation are removed, and whitespace runs are collapsed.
    If the result is a key of ``aliases``, the alias value is returned.
    Otherwise names that were written entirely in upper case (tickers such as
    "AAPL") stay upper case and everything else is title-cased.

    Args:
        name: Raw entity text; falsy values are returned unchanged.
        aliases: Optional mapping of cleaned lower-case name to canonical
            label. Defaults to the module-level ``ENTITY_ALIASES``.
    """
    if not name:
        return name
    if aliases is None:
        aliases = ENTITY_ALIASES

    stripped = name.strip()
    n = stripped.lower()

    n = re.sub(r"^(the|a|an)\s+", "", n)
    n = re.sub(r"[^\w\s]", "", n)
    n = re.sub(r"\s+", " ", n)

    if n in aliases:
        return aliases[n]

    # Judge the casing on the original text: n is already lower-cased, so
    # n.isupper() could never be true and tickers came out as "Aapl".
    return n.upper() if stripped.isupper() else n.title()

def sentiment_to_score(sentiment):
    """Convert a ``{'label', 'score'}`` classifier result to a signed float.

    Returns ``+score`` for labels containing "POS", ``-score`` for labels
    containing "NEG", ``0.0`` for any other label, and ``None`` for a falsy
    input.
    """
    if not sentiment:
        return None
    tag = str(sentiment.get('label', '')).upper()
    magnitude = float(sentiment.get('score', 0))
    for marker, sign in (('POS', 1), ('NEG', -1)):
        if marker in tag:
            return sign * magnitude
    return 0.0

def extract_video_text(video, prefer_summary=True):
    """Return the video's title followed by its transcript text, stripped.

    The transcript part is ``transcript_summary`` when ``prefer_summary`` is
    true and a summary exists, otherwise ``transcript_text`` if present,
    otherwise empty.
    """
    heading = video.get('title', '')
    summary = video.get('transcript_summary') if prefer_summary else None
    body = summary or video.get('transcript_text') or ''
    return f"{heading} {body}".strip()

def analyze_video_entities_split(video):
    """Extract entities and sentiment scores for a video's title and summary.

    The "summary" text is ``transcript_summary`` when present, else
    ``transcript_text``, else empty. Returns a dict with the keys ``"title"``
    and ``"summary"``, each mapping to a tuple
    ``(tickers, companies, sectors, score)`` where the first three are sets
    and ``score`` comes from ``sentiment_to_score``.
    """
    title = video.get('title', '') or ''
    # Prefer the summary, falling back to the full transcript.
    summary = video.get('transcript_summary') or video.get('transcript_text') or ''

    def entities_of(text):
        # Empty text yields three empty sets without invoking spaCy.
        if not text:
            return set(), set(), set()
        parsed = nlp(text)
        return (
            set(get_tickers(text)),
            set(get_companies(parsed)) if parsed else set(),
            set(get_sectors(text.lower())),
        )

    title_entities = entities_of(title)
    summary_entities = entities_of(summary)

    return {
        "title": (*title_entities, sentiment_to_score(video.get('title_sentiment'))),
        "summary": (*summary_entities, sentiment_to_score(video.get('transcript_sentiment'))),
    }

def aggregate_youtube_entities(videos):
    """Aggregate mention counts and average sentiment per entity.

    Every video is split into a title part and a summary part by
    ``analyze_video_entities_split``. For each part, each ticker, company and
    sector name is normalized with ``normalize_entity`` and counted. The
    part's sentiment score, when available, is recorded at most once per
    normalized name, per part, per video and per category.

    Returns:
        A dict with the keys ``"stocks"``, ``"companies"`` and ``"sectors"``.
        Each value is a list of rows (``name``, ``title_mentions``,
        ``avg_title_sentiment``, ``summary_mentions``,
        ``avg_summary_sentiment``) sorted by total mentions, descending.
        Averages are ``None`` when no score was recorded.
    """

    def new_bucket():
        return {
            "title_mentions": 0,
            "title_scores": [],  # One score per video where entity appears in title
            "summary_mentions": 0,
            "summary_scores": [],  # One score per video where entity appears in summary
        }

    stats_by_kind = {
        "stocks": defaultdict(new_bucket),
        "companies": defaultdict(new_bucket),
        "sectors": defaultdict(new_bucket),
    }

    for video in videos:
        parts = analyze_video_entities_split(video)

        for part_name, (tickers, companies, sectors, score) in parts.items():
            grouped = (("stocks", tickers), ("companies", companies), ("sectors", sectors))
            for kind, names in grouped:
                # Track scored names per category. With one set shared across
                # categories, a company whose normalized name equals a ticker
                # (e.g. "Meta" and "META") never received its score.
                scored = set()
                for raw_name in names:
                    name = normalize_entity(raw_name)
                    bucket = stats_by_kind[kind][name]
                    bucket[f"{part_name}_mentions"] += 1
                    # Only add score once per video per part
                    if score is not None and name not in scored:
                        bucket[f"{part_name}_scores"].append(score)
                        scored.add(name)

    def average(values):
        return sum(values) / len(values) if values else None

    def finalize(stats):
        rows = [
            {
                "name": name,
                "title_mentions": data["title_mentions"],
                "avg_title_sentiment": average(data["title_scores"]),
                "summary_mentions": data["summary_mentions"],
                "avg_summary_sentiment": average(data["summary_scores"]),
            }
            for name, data in stats.items()
        ]
        rows.sort(key=lambda x: (x["title_mentions"] + x["summary_mentions"]), reverse=True)
        return rows

    return {kind: finalize(stats) for kind, stats in stats_by_kind.items()}

In [None]:
# Aggregate entity mentions over all videos and persist them; the report cells
# further down read entity_mentions.json back from disk.
result = aggregate_youtube_entities(youtube_videos_api)
save_to_json(result, "entity_mentions.json")


In [None]:
# Inspect the aggregated company rows (sorted by total mentions).
result['companies']

## Turn mentions into a readable text file

In [None]:
def format_sentiment(score):
    """Render a sentiment score with an explicit sign and four decimals,
    or "N/A" when the score is None."""
    return "N/A" if score is None else format(score, "+.4f")

def format_mentions(item):
    """Return a four-line text block describing one aggregated entity row:
    its name, total mentions, and the title/summary mention counts with their
    formatted average sentiment."""
    title_count = item.get("title_mentions", 0)
    summary_count = item.get("summary_mentions", 0)
    combined = title_count + summary_count
    title_sent = format_sentiment(item.get("avg_title_sentiment"))
    summary_sent = format_sentiment(item.get("avg_summary_sentiment"))

    return (
        f"  Name: {item['name']}\n"
        f"  Total Mentions: {combined}\n"
        f"    - Title Mentions: {title_count} (Sentiment: {title_sent})\n"
        f"    - Summary Mentions: {summary_count} (Sentiment: {summary_sent})"
    )

In [None]:
# Read back the aggregated mentions saved as entity_mentions.json.
json_path = Path("entity_mentions.json")
with json_path.open('r', encoding='utf-8') as f:
    data = json.load(f)

# Build the output text: a banner, then one block per section. Sections are
# separated by two blank lines; the last one is followed by a single blank
# line and the closing rule.
heavy_rule = "=" * 80
light_rule = "-" * 80
section_keys = ("stocks", "companies", "sectors")

output_lines = [heavy_rule, "ENTITY MENTIONS REPORT", heavy_rule, ""]

for position, section_key in enumerate(section_keys):
    output_lines.append(section_key.upper())
    output_lines.append(light_rule)

    entries = data.get(section_key)
    if entries:
        output_lines.extend(
            f"\n{rank}. {format_mentions(entry)}"
            for rank, entry in enumerate(entries, 1)
        )
    else:
        output_lines.append(f"  No {section_key} found.")

    output_lines.append("")
    if position < len(section_keys) - 1:
        output_lines.append("")

output_lines.append(heavy_rule)

# Join all lines
output_text = "\n".join(output_lines)

print(output_text)

In [None]:
# Save the report text built in the previous cell.
output_path = Path("entity_mentions.txt")
output_path.write_text(output_text, encoding='utf-8')

print(f"Successfully saved to {output_path}")

# Filter by Date

In [None]:
# Load a saved list of videos for the date-filter example below.
# NOTE: this rebinds `data`, which the report cells above used for the entity
# mentions dictionary.
with open('100vids.json', 'r', encoding='utf-8') as file:
    # Use json.load() to convert the file content to a Python object
    data = json.load(file)

In [None]:
from datetime import datetime, timedelta

def filter_by_date_range(videos, chosen_date_str, days=7):
    """Return the videos published in the ``days`` days up to a chosen date.

    The window runs from midnight ``days`` days before the chosen date up to
    and including the whole chosen day. Previously the upper bound was
    midnight at the start of the chosen day, which excluded everything
    published on that day.

    Args:
        videos: Iterable of video dicts with an ISO-8601 timestamp under
            ``published`` or ``published_date``; videos without one are
            skipped.
        chosen_date_str: End date, format 'YYYY-MM-DD'.
        days: Window length in days before the chosen date (default 7).

    Returns:
        A list of the matching video dicts, in input order.

    Raises:
        ValueError: If ``chosen_date_str`` or a timestamp cannot be parsed.
    """
    chosen_date = datetime.strptime(chosen_date_str, "%Y-%m-%d")
    start_date = chosen_date - timedelta(days=days)
    end_exclusive = chosen_date + timedelta(days=1)

    filtered = []

    for v in videos:
        published_str = v.get("published") or v.get("published_date")
        if not published_str:
            continue

        # A trailing "Z" means UTC; spell it as an offset so fromisoformat
        # accepts it on every supported Python version.
        published_dt = datetime.fromisoformat(published_str.replace("Z", "+00:00"))
        offset = published_dt.utcoffset()
        if offset is not None:
            # Convert to naive UTC so the comparison with the naive window
            # bounds is valid for any timestamp offset.
            published_dt = (published_dt - offset).replace(tzinfo=None)

        if start_date <= published_dt < end_exclusive:
            filtered.append(v)

    return filtered

In [None]:
# Display the loaded video list (this renders the whole object).
data

In [None]:
# Example: keep the videos published in the week leading up to the chosen date.
chosen_date = "2026-01-15"

filtered_videos = filter_by_date_range(data, chosen_date)
filtered_videos