In [25]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib.dates as mdates
# Notebook-wide plotting defaults: ggplot theme, 12x8 figures, extra-large text.
plt.style.use('ggplot')
params = {'legend.fontsize': 'x-large', 'figure.figsize': (12, 8)}
# Every axis/tick text size shares the same value, so fill those keys in one go
# (insertion order is kept identical to listing them one by one).
params.update(
    dict.fromkeys(
        ('axes.labelsize', 'axes.titlesize', 'xtick.labelsize', 'ytick.labelsize'),
        'x-large',
    )
)
pylab.rcParams.update(params)

import nest_asyncio
nest_asyncio.apply()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
from __future__ import annotations

import datetime as dt
import io
import json
import logging
import math
import os
import pathlib
import re
import time
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser as dateparser
from pdfminer.high_level import extract_text as pdf_extract_text
from tqdm import tqdm
from unidecode import unidecode

In [27]:
def fetch_live_kalshi_powell_mention_market(
    series_ticker: str = "KXPOWELLMENTION",
    event_ticker: str = "KXPOWELLMENTION-25AUG24",
):
    """
    Download the market list of one Kalshi event.

    Parameters
    ----------
    series_ticker : series segment of the API path (defaults to the Powell "mention" series).
    event_ticker  : event segment of the API path (defaults to the 24 Aug 2025 event).

    Returns
    -------
    Whatever the JSON response stores under ``["event"]["markets"]``.

    Raises
    ------
    requests.HTTPError on a non-2xx response; KeyError if the payload lacks the expected keys.
    """
    url = f"https://api.elections.kalshi.com/v1/series/{series_ticker}/events/{event_ticker}"
    # Browser-like User-Agent, unchanged from the value the notebook always sent.
    user_agent = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0 Safari/537.36"
    )
    res = requests.get(url, headers={"User-Agent": user_agent}, timeout=30)
    res.raise_for_status()
    return res.json()["event"]["markets"]


In [56]:
# Inclusive range of calendar years whose speech index pages get scraped.
BASE_YEAR_START = 2010
BASE_YEAR_END = dt.date.today().year

# Yearly speech index page; "{year}" is filled in per request.
FRB_YEAR_URL = "https://www.federalreserve.gov/newsevents/speech/{year}-speeches.htm"

# Headers sent with every scraper request.
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; PowellSpeechScraper/1.0; +https://www.federalreserve.gov/)",
}

# (min, max) seconds to pause between requests.
SLEEP_BETWEEN_REQUESTS = (0.1, 0.3)
def _nap():
    """Pause for a random duration drawn uniformly from SLEEP_BETWEEN_REQUESTS."""
    import random

    shortest, longest = SLEEP_BETWEEN_REQUESTS
    time.sleep(random.uniform(shortest, longest))

# Working directories (relative to the notebook's current directory).
DATA_DIR = pathlib.Path("data")
OUT_DIR = pathlib.Path("out")
SPEECH_DIR = DATA_DIR.joinpath("speeches")

# Create all of them up front; already-existing directories are left alone.
for _required_dir in (DATA_DIR, OUT_DIR, SPEECH_DIR):
    _required_dir.mkdir(parents=True, exist_ok=True)
del _required_dir

# Root logger: timestamped messages at INFO and above.
LOGGING_LEVEL = logging.INFO
logging.basicConfig(
    format="%(asctime)s %(levelname)s: %(message)s",
    level=LOGGING_LEVEL,
)

In [76]:
# https://kalshi-public-docs.s3.amazonaws.com/contract_terms/MENTION.pdf
def build_word_pattern(word: str) -> re.Pattern:
    """
    Compile a case-insensitive regex for a single contract word.

    Kalshi-aligned single-token rule:
    - Include plural and possessive: word, word's, words, words'
    - Exclude tense/derivational inflections (no -ed, -ing, -ion, etc.)
    - Allow hyphen/adjacent context via word boundaries

    Parameters
    ----------
    word : the literal base word; it is regex-escaped before use.

    Returns
    -------
    A compiled pattern with ``re.I`` set.
    """
    # At most ONE plural/possessive suffix may follow the word. Allowing the suffix
    # group to repeat would let stacked "s" characters through, so that e.g.
    # "prince" would match "princess" and "care" would match "caress".
    #
    # For "words'" followed by a non-word character the matched text is "words"
    # (a trailing apostrophe cannot be followed by \b there); it still counts once.
    pat = rf"\b{re.escape(word)}(?:'s|s'?)?\b"
    return re.compile(pat, re.I)


def build_phrase_pattern(tokens: List[str], pluralize_idx: Optional[List[int]] = None) -> re.Pattern:
    """
    Build a regex for phrases where specific token indices may take plural/possessive.
    Example: tokens=['law','and','order'], pluralize_idx=[0] -> matches 'laws and order'

    Parameters
    ----------
    tokens        : the phrase's words in order; each one is regex-escaped.
    pluralize_idx : positions in ``tokens`` that may carry one ``'s`` / ``s`` / ``s'``
                    suffix. ``None`` or empty means no token may be inflected.

    Returns
    -------
    A compiled, case-insensitive pattern anchored on word boundaries at both ends.
    """
    plural_positions = set(pluralize_idx or [])
    parts = []
    for i, tok in enumerate(tokens):
        part = re.escape(tok)
        if i in plural_positions:
            # A single optional suffix only: a repeatable group would accept stacked
            # "s" characters (e.g. "prince" -> "princess").
            part += r"(?:'s|s'?)?"
        parts.append(part)
    # Tokens may be separated by one or more whitespace characters and/or hyphens.
    inner = r"(?:[\s\-]+)".join(parts)
    return re.compile(rf"\b{inner}\b", re.I)


# Contract label -> regex that counts a "mention" of that contract in a transcript.
# compare_to_markets() looks markets up by their name/title, so the keys must match
# the market names exactly.
#
# Each key appears exactly once. The literal previously listed several keys twice;
# in a dict display the last value silently wins, so only those later definitions
# were ever in effect. They are the ones kept here, at the key's original position.
WORD_PATTERNS: Dict[str, re.Pattern] = {
    "Trump": re.compile(r"\btrump\b", re.I),
    "Projection": re.compile(r"\bprojection(s)?\b", re.I),
    "Good afternoon": re.compile(r"\bgood afternoon\b", re.I),
    "Russia": re.compile(r"\brussia(n)?\b", re.I),
    "Pandemic": re.compile(r"\bpandemic(s)?\b", re.I),
    "Median": re.compile(r"\bmedian(s)?\b", re.I),
    "Administration": re.compile(r"\badministration\b", re.I),
    "Tariff": build_word_pattern("tariff"),
    "Renovation": re.compile(r"\brenovation(s)?\b", re.I),
    "Regulator/ regulatory / regulation": re.compile(r"\bregulat(?:or|ory|ion|ions|ors)\b", re.I),
    "Overheat": re.compile(r"\boverheat(?:ed|ing)?\b", re.I),
    "Michelle / Bowman": re.compile(r"\bmichelle\b|\bbowman\b", re.I),
    "Layoff": re.compile(r"\blayoff(s)?\b|\blay off\b", re.I),
    # Words may be separated by whitespace and/or hyphens.
    "Good morning": re.compile(r"\bgood[\s\-]+morning\b", re.I),
    "Energy": build_word_pattern("energy"),
    "Dollar": build_word_pattern("dollar"),
    "Dissent": build_word_pattern("dissent"),   # no tense inflections
    "Cut": re.compile(r"\bcut(s|ting)?\b", re.I),
    "Crypto / Bitcoin": re.compile(r"\b(bitcoin|crypto(?:currency)?|stablecoin(?:'s|s'?)*?)\b", re.I),
    "Credit": build_word_pattern("credit"),
    "Consumer confidence": re.compile(r"\bconsumer confidence\b", re.I),
    "Balance of risks": re.compile(r"\bbalance of risks\b", re.I),
    "Anchor": build_word_pattern("anchor"),
    "Transitory": re.compile(r"\btransitory\b", re.I),
    "Symposium": re.compile(r"\bsymposium\b", re.I),
    "Meeting": re.compile(r"\bmeeting(s)?\b", re.I),
    "Chair": re.compile(r"\bchair\b", re.I),
    # "dual" followed within 20 characters by "mandate".
    "Dual": re.compile(r"\bdual\b.{0,20}\bmandate\b|\bdual mandate\b", re.I),
}


# Contracts that pay out only when a word is said at least k times in one speech.
# Maps contract label -> (compiled case-insensitive pattern, minimum count k).
THRESHOLD_CONTRACTS = {
    label: (re.compile(regex, re.I), minimum)
    for label, regex, minimum in (
        ("Tariff (10+ times)", r"\btariff(s)?\b", 10),
        ("Labor (30+ times)", r"\blabor\b", 30),
        ("Labor (40+ times)", r"\blabor\b", 40),
    )
}

In [77]:
def get_html(url: str) -> str:
    """GET ``url`` with the shared scraper headers and return the decoded body.

    Raises requests.HTTPError on a non-2xx status; the request times out after 30 s.
    """
    response = requests.get(url, timeout=30, headers=REQUEST_HEADERS)
    response.raise_for_status()
    return response.text


def get_binary(url: str) -> bytes:
    """GET ``url`` with the shared scraper headers and return the raw body bytes.

    Raises requests.HTTPError on a non-2xx status; the request times out after 60 s.
    """
    response = requests.get(url, timeout=60, headers=REQUEST_HEADERS)
    response.raise_for_status()
    return response.content


@dataclass
class SpeechItem:
    """One entry collected from a yearly speech index page."""

    # Date of the speech as parsed from the index entry; the scraper falls back to
    # 1 January of the index year when no date can be parsed.
    date: dt.date
    # Text of the entry's first link.
    title: str
    # Link target of the entry.
    url: str
    # True when the URL ends in ".pdf" (case-insensitive).
    is_pdf: bool


def find_powell_items_for_year(year: int) -> List[SpeechItem]:
    """
    Scrape one yearly speech index page and return the entries that mention "Powell".

    Parameters
    ----------
    year : calendar year substituted into FRB_YEAR_URL.

    Returns
    -------
    One SpeechItem per matching ``div.row`` that contains a link. Returns an empty
    list when the index page cannot be fetched (the failure is logged, not raised).

    Date resolution, first hit wins:
      1. an ``M/D/YYYY`` string inside a <span>/<strong>/<em> of the entry,
      2. a "Month D, YYYY" string anywhere in the entry text,
      3. eight digits following "powell" in the link URL (read as YYYYMMDD),
      4. 1 January of ``year``.
    """
    url = FRB_YEAR_URL.format(year=year)
    try:
        html = get_html(url)
    except Exception as e:
        # Best effort: a missing/broken year page must not abort the whole crawl.
        logging.warning(f"Failed to fetch year page {url}: {e}")
        return []
    soup = BeautifulSoup(html, "lxml")

    numeric_date_re = re.compile(r"\b\d{1,2}/\d{1,2}/\d{4}\b")
    written_date_re = re.compile(
        r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w* \d{1,2}, \d{4}\b"
    )

    # Year pages list many officials. We filter entries where the text contains "Powell".
    # NOTE(review): if div.row elements can nest, an outer row would also match and
    # contribute its first link — confirm against the live markup.
    items: List[SpeechItem] = []
    for block in soup.select("div.row"):
        text = " ".join(block.stripped_strings)
        if "Powell" not in text:
            continue
        # The title link is the first <a href> inside the entry.
        a = block.find("a", href=True)
        if not a:
            continue
        title = a.get_text(strip=True)
        # Resolve against the index page URL so that root-relative, document-relative
        # and protocol-relative hrefs all become proper absolute URLs.
        url_abs = urljoin(url, a["href"])

        date_str = None
        # 1. Look for a numeric date in the entry's inline elements.
        for maybe in block.find_all(["span", "strong", "em"]):
            m = numeric_date_re.search(maybe.get_text(" ", strip=True))
            if m:
                date_str = m.group(0)
                break
        if not date_str:
            # 2. Fall back to a written-out date anywhere in the entry text.
            m = written_date_re.search(text)
            if m:
                date_str = m.group(0)
        if not date_str:
            # 3. Last resort: infer from URL structure powellYYYYMMDD*.htm
            m = re.search(r"powell(\d{8})", url_abs)
            if m:
                ymd = m.group(1)
                date_str = f"{ymd[4:6]}/{ymd[6:8]}/{ymd[0:4]}"
            else:
                # 4. Otherwise just use Jan 1 of the index year.
                date_str = f"01/01/{year}"

        try:
            d = dateparser.parse(date_str).date()
        except Exception:
            # Unparseable date text: keep the entry, pinned to the start of the year.
            d = dt.date(year, 1, 1)

        is_pdf = url_abs.lower().endswith(".pdf")
        items.append(SpeechItem(date=d, title=title, url=url_abs, is_pdf=is_pdf))

    return items


def extract_text_from_html(url: str, html: str, try_pdf_first: bool = True) -> str:
    """
    Turn a speech page into plain transcript text.

    Strategy:
    - With ``try_pdf_first`` set, look for a linked PDF; if it downloads and yields
      more than 200 characters of text, that text is returned as-is.
    - Otherwise collect the <p> elements of ``#article`` (falling back to ``#content``,
      then to the whole document) after stripping page chrome, and return them
      joined by blank lines, transliterated to ASCII, with runs of spaces/tabs collapsed.

    ``url`` is only used to resolve a relative PDF link.
    """
    soup = BeautifulSoup(html, "lxml")

    # ---------- Transcript PDF, when one is linked ----------
    if try_pdf_first:
        # Tried in order; the first selector that matches anything wins.
        pdf_selectors = (
            'a[href$=".pdf"][href*="/newsevents/speech/files/"]',
            '#videoDetails45469 a[href$=".pdf"]',  # id varies across pages, this is fine as a fallback
            '#content a[href$=".pdf"]',
        )
        candidates = (soup.select_one(css) for css in pdf_selectors)
        pdf_link = next((hit for hit in candidates if hit), None)
        if pdf_link and pdf_link.get("href"):
            pdf_url = urljoin(url, pdf_link["href"])
            try:
                text_pdf = extract_text_from_pdf_bytes(get_binary(pdf_url))
                # Only accept a plausible transcript; tiny results fall through to HTML.
                if text_pdf and len(text_pdf) > 200:
                    return text_pdf
            except Exception as e:
                logging.warning(f"PDF fallback failed ({pdf_url}): {e}")

    # ---------- HTML: extract the speech paragraphs ----------
    container = soup.select_one("#article") or soup.select_one("#content") or soup

    # Page chrome that is not part of the speech body.
    chrome_selectors = (
        ".page-header",
        ".header-group",
        ".breadcrumb",
        ".shareDL",
        ".watchLive",
        ".panel-related",
        ".panel",
        ".embed-responsive",
        ".video-js",
        "script",
        "style",
        ".sr-only",
        "#videoDetails45469",
        ".heading",
        ".list-unstyled",
        "noscript",
    )
    for css in chrome_selectors:
        for junk in container.select(css):
            junk.decompose()

    meta_classes = {"article__time", "speaker", "location"}  # date/speaker/location lines
    boilerplate = {"share", "watch live"}
    paragraphs = []
    for p in container.select("p"):
        if meta_classes & set(p.get("class", [])):
            continue
        content = p.get_text(" ", strip=True)
        if content and content.lower() not in boilerplate:
            paragraphs.append(content)

    # No usable <p> at all: take every bit of text left in the container.
    if paragraphs:
        text = "\n\n".join(paragraphs)
    else:
        text = container.get_text(" ", strip=True)

    return re.sub(r"[ \t]+", " ", unidecode(text)).strip()


def extract_text_from_pdf_bytes(b: bytes) -> str:
    """
    Extract the text of an in-memory PDF.

    Parameters
    ----------
    b : raw bytes of the PDF document.

    Returns
    -------
    The extracted text, transliterated to ASCII, with every run of whitespace
    collapsed to a single space and the ends trimmed. Empty string if the
    extractor returns nothing.

    Errors raised by the PDF extractor propagate to the caller.
    """
    # pdfminer accepts a binary file-like object, so parse straight from memory.
    # This avoids the shared on-disk scratch file (and its cleanup) entirely.
    text = pdf_extract_text(io.BytesIO(b)) or ""
    text = unidecode(text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


# -----------------------
# Formal policy-related heuristic
# -----------------------
# Lower-case phrases whose presence marks a speech as "formal" / policy-related.
FORMAL_KEYWORDS = [
    "jackson hole",
    "monetary policy",
    "economic outlook",
    "price stability",
]


def IS_FORMAL(title: str, body: str) -> bool:
    """
    Return True if any FORMAL_KEYWORDS phrase occurs in the title or the body.

    Matching is a case-insensitive substring test. ``None`` for either argument is
    treated as an empty string, mirroring IS_JACKSON_HOLE.
    """
    t = (title or "").lower()
    b = (body or "").lower()
    return any(k in t or k in b for k in FORMAL_KEYWORDS)


def IS_JACKSON_HOLE(title: str, body: str, url: str) -> bool:
    """
    Heuristic flag for Jackson Hole Economic Policy Symposium appearances.

    True when, case-insensitively,
    - the title or body contains 'jackson hole' or 'economic policy symposium', or
    - the url contains 'jackson-hole'.
    ``None`` arguments are treated as empty strings.
    """
    title_lc = (title or "").lower()
    body_lc = (body or "").lower()
    url_lc = (url or "").lower()

    if "jackson-hole" in url_lc:
        return True
    phrases = ("jackson hole", "economic policy symposium")
    return any(phrase in field for phrase in phrases for field in (title_lc, body_lc))


# -----------------------
# Corpus build
# -----------------------
def build_powell_corpus() -> pd.DataFrame:
    """
    Build the index of speech entries for every year in BASE_YEAR_START..BASE_YEAR_END.

    Each year's index page is scraped with find_powell_items_for_year(), with a short
    pause between years. Entries are de-duplicated by URL, sorted by ISO date string,
    written to ``DATA_DIR / "scraped_index.csv"`` and returned.

    Returns
    -------
    DataFrame with columns ``date`` (ISO string), ``title``, ``url``, ``is_pdf``.
    The frame is empty, but still has these columns, when nothing was scraped.
    """
    # Fix the schema up front: with no rows at all, a column-less frame would make
    # drop_duplicates(subset=["url"]) / sort_values("date") raise KeyError.
    columns = ["date", "title", "url", "is_pdf"]
    rows = []
    for year in range(BASE_YEAR_START, BASE_YEAR_END + 1):
        logging.info(f"Fetching year {year} index …")
        rows.extend(
            {
                "date": it.date.isoformat(),
                "title": it.title,
                "url": it.url,
                "is_pdf": it.is_pdf,
            }
            for it in find_powell_items_for_year(year)
        )
        _nap()
    df = pd.DataFrame(rows, columns=columns).drop_duplicates(subset=["url"]).sort_values("date")
    df.to_csv(DATA_DIR / "scraped_index.csv", index=False)
    logging.info(f"Indexed {len(df)} Powell items.")
    return df


def fetch_and_cache_text(url: str, is_pdf: bool) -> str:
    """
    Fetch page and return full transcript text.
    - If the URL is a PDF (or marked is_pdf), parse it as a PDF.
    - If HTML, prefer the embedded transcript PDF when present, otherwise parse paragraphs.
    Caches the final cleaned text in data/speeches/.

    Only non-empty results are cached, and an empty cache file is treated as a miss.
    A failed download/parse therefore returns "" for this call but is retried on the
    next one instead of being remembered forever.

    Parameters
    ----------
    url    : address of the speech page or PDF.
    is_pdf : force PDF handling even if the URL does not end in ".pdf".

    Returns
    -------
    The stripped transcript text, or "" when fetching/parsing failed.
    """
    # Cache file name: URL with every non-alphanumeric run replaced by "_", capped at 150 chars.
    slug = re.sub(r"[^a-zA-Z0-9]+", "_", url.strip("/"))[:150]
    path = SPEECH_DIR / f"{slug}.txt"
    if path.exists():
        cached = path.read_text(encoding="utf-8", errors="ignore")
        if cached.strip():
            return cached
        # Empty file: left behind by an earlier failed attempt, so fetch again.

    logging.info(f"Downloading: {url}")
    text = ""
    try:
        if is_pdf or url.lower().endswith(".pdf"):
            text = extract_text_from_pdf_bytes(get_binary(url))
        else:
            html = get_html(url)
            # NOTE: pass try_pdf_first=True so HTML pages still prefer the transcript PDF
            text = extract_text_from_html(url, html, try_pdf_first=True)
    except Exception as e:
        # Best effort: one bad speech must not abort the whole corpus build.
        logging.warning(f"Failed to fetch/parse {url}: {e}")
        text = ""

    text = (text or "").strip()
    if text:
        path.write_text(text, encoding="utf-8")
    _nap()
    return text

In [78]:
# Function words left out of the token frequency summary.
STOPWORDS = {
    # articles / conjunctions
    "a", "an", "the", "and", "or", "but", "if", "while",
    # prepositions
    "of", "to", "for", "in", "on", "by", "with", "from", "as", "at", "into",
    # demonstratives
    "this", "that", "those", "these", "there", "here", "such",
    # forms of "to be"
    "is", "are", "was", "were", "be", "been", "being",
    # negation
    "not", "no",
    # pronouns / possessives
    "it", "its", "we", "you", "i", "our", "their", "his", "her", "they", "them",
}

# A token is an ASCII letter followed by one or more letters, apostrophes or hyphens
# (so single-letter words never become tokens).
TOKEN_RE = re.compile(r"[A-Za-z][A-Za-z'-]+")


def tokenize(text: str) -> List[str]:
    """Return every TOKEN_RE match in ``text``, lower-cased, in order of appearance."""
    return list(map(str.lower, TOKEN_RE.findall(text)))


def count_pattern(text: str, pat: re.Pattern) -> int:
    """Number of non-overlapping matches of ``pat`` in ``text``."""
    return sum(1 for _ in re.finditer(pat, text))


def compute_word_probs(df: pd.DataFrame, subset_mask: pd.Series) -> pd.DataFrame:
    """
    Historical hit rate of every contract over the speeches selected by ``subset_mask``.

    ``df`` must have a ``text`` column. A mask that is not a Series is wrapped in one
    on ``df.index``; a Series with a different index is reindexed to ``df.index``
    with missing labels treated as False.

    Returns one row per contract, sorted by contract name, with columns
    ``contract``, ``speeches`` (sample size), ``hit_speeches`` (speeches meeting the
    contract), ``mentions_total`` and ``p_hist`` (hit_speeches / speeches, NaN for an
    empty sample). WORD_PATTERNS contracts need at least one mention in a speech;
    THRESHOLD_CONTRACTS need at least their own k mentions.
    """
    if isinstance(subset_mask, pd.Series):
        if not subset_mask.index.equals(df.index):
            subset_mask = subset_mask.reindex(df.index, fill_value=False)
    else:
        subset_mask = pd.Series(subset_mask, index=df.index)

    selected = df[subset_mask].copy()
    n_speeches = len(selected)

    def _tally(contract, pattern, min_count):
        # Mentions per speech, then how many speeches reach the required count.
        per_speech = [count_pattern(body, pattern) for body in selected["text"]]
        qualifying = sum(1 for c in per_speech if c >= min_count)
        return {
            "contract": contract,
            "speeches": n_speeches,
            "hit_speeches": qualifying,
            "mentions_total": sum(per_speech),
            "p_hist": qualifying / n_speeches if n_speeches else float("nan"),
        }

    records = [_tally(name, pat, 1) for name, pat in WORD_PATTERNS.items()]
    records += [_tally(name, pat, k) for name, (pat, k) in THRESHOLD_CONTRACTS.items()]
    return pd.DataFrame(records).sort_values(["contract"])


def summarize_tokens(df: pd.DataFrame, subset_mask: pd.Series, topn: int = 200) -> pd.DataFrame:
    """
    Most frequent non-stopword tokens across the ``text`` of the selected rows.

    Returns a frame with columns ``token`` and ``count`` holding at most ``topn``
    rows, highest count first; ties keep first-seen order.
    """
    selected = df[subset_mask].copy()
    freq = {}
    for body in selected["text"]:
        for tok in tokenize(body):
            if tok not in STOPWORDS:
                freq[tok] = freq.get(tok, 0) + 1
    ranked = sorted(freq.items(), key=lambda pair: pair[1], reverse=True)
    return pd.DataFrame(ranked[:topn], columns=["token", "count"])


def implied_prob(mkt: Dict) -> Optional[float]:
    """
    Market-implied probability from a market dict whose prices are on a 0-100 scale.

    Uses the bid/ask midpoint when ``yes_bid`` and ``yes_ask`` are both numeric with
    bid >= 0 and ask > 0; otherwise ``last_price`` when numeric and >= 0; else None.
    """
    def _numeric(value) -> bool:
        return isinstance(value, (int, float))

    bid, ask = mkt.get("yes_bid"), mkt.get("yes_ask")
    if _numeric(bid) and _numeric(ask) and bid >= 0 and ask > 0:
        return (0.5 * (bid + ask)) / 100.0

    last = mkt.get("last_price")
    if _numeric(last) and last >= 0:
        return last / 100.0

    return None


def normalize_contract_name(raw: str) -> str:
    """Return the market name with surrounding whitespace removed.

    Market names are expected to equal the contract keys already (for example
    "Regulator/ regulatory / regulation"), so no further mapping is applied.
    """
    cleaned = raw.strip()
    return cleaned


def compare_to_markets(hist_df: pd.DataFrame, markets: List[Dict], verbose: bool = False) -> pd.DataFrame:
    """
    Compare historical probabilities to market implied probabilities.
    If verbose=True, include per-contract sample statistics drawn from hist_df.

    Parameters
    ----------
    hist_df : frame with columns contract, p_hist, speeches, hit_speeches, mentions_total.
    markets : market dicts; the contract is identified by "name" (or "title" when
              "name" is absent), the price via implied_prob(), the ticker via "ticker_name".
    verbose : add sample size, hit counts, a 95% Wilson interval and a normal-approximation
              standard error for p_hist.

    Returns
    -------
    One row per market that has both a historical and a market probability, sorted by
    ``hist_minus_mkt`` ascending. When no market qualifies, an empty frame with the
    same columns is returned.
    """

    def _wilson_interval(p: float, n: int, z: float = 1.96) -> Tuple[float, float]:
        # Wilson score interval for a binomial proportion, clipped to [0, 1].
        if n <= 0 or p is None or math.isnan(p):
            return (float("nan"), float("nan"))
        denom = 1 + (z**2) / n
        center = (p + (z**2) / (2 * n)) / denom
        margin = (z * math.sqrt((p * (1 - p) / n) + (z**2) / (4 * n * n))) / denom
        return (max(0.0, center - margin), min(1.0, center + margin))

    # Output schema, fixed up front so that an empty result still has its columns
    # (sorting a column-less frame by "hist_minus_mkt" would raise KeyError).
    columns = ["contract", "p_hist", "p_mkt", "hist_minus_mkt", "ticker"]
    if verbose:
        columns += [
            "n_speeches",
            "hit_speeches",
            "mentions_total",
            "p_hist_wilson_lo",
            "p_hist_wilson_hi",
            "p_hist_se_approx",
        ]

    # Per-contract lookup built from hist_df.
    stats_map = {}
    for _, r in hist_df.iterrows():
        stats_map[r["contract"]] = {
            "p_hist": r.get("p_hist"),
            "speeches": r.get("speeches"),
            "hit_speeches": r.get("hit_speeches"),
            "mentions_total": r.get("mentions_total"),
        }

    rows = []
    for m in markets:
        cname = normalize_contract_name(m.get("name", m.get("title", "")))
        if not cname:
            continue
        s = stats_map.get(cname, {})
        p_hist = s.get("p_hist")
        p_mkt = implied_prob(m)
        # Skip markets lacking either side of the comparison.
        if p_mkt is None or p_hist is None or math.isnan(p_hist):
            continue

        row = {
            "contract": cname,
            "p_hist": p_hist,
            "p_mkt": p_mkt,
            "hist_minus_mkt": p_hist - p_mkt,
            "ticker": m.get("ticker_name"),
        }

        if verbose:
            N = int(s.get("speeches") or 0)
            lo, hi = _wilson_interval(p_hist, N)
            row.update(
                {
                    "n_speeches": N,  # size of historical sample used for this contract
                    "hit_speeches": s.get("hit_speeches"),  # speeches that met the contract
                    "mentions_total": s.get("mentions_total"),  # total mentions across the sample
                    "p_hist_wilson_lo": lo,  # 95% Wilson lower bound
                    "p_hist_wilson_hi": hi,  # 95% Wilson upper bound
                    "p_hist_se_approx": math.sqrt(p_hist * (1 - p_hist) / N) if N > 0 else float("nan"),
                }
            )

        rows.append(row)

    return pd.DataFrame(rows, columns=columns).sort_values("hist_minus_mkt")

# main:

In [79]:
# 1. Scrape the yearly index pages into one frame (date, title, url, is_pdf).
idx = build_powell_corpus()
logging.info("Fetching & caching speech texts …")
# 2. Download (or read from the on-disk cache) the transcript of every indexed item.
texts = []
for row in tqdm(idx.itertuples(index=False), total=len(idx)):
    txt = fetch_and_cache_text(row.url, bool(row.is_pdf))
    texts.append(txt)
idx["text"] = texts

# 3. Per-speech features: token count and the two heuristic category flags.
idx["n_words"] = idx["text"].apply(lambda s: len(tokenize(s)))
idx["formal"] = idx.apply(lambda r: IS_FORMAL(r["title"], r["text"]), axis=1)
idx["jackson_hole"] = idx.apply(lambda r: IS_JACKSON_HOLE(r["title"], r["text"], r["url"]), axis=1)

# Persist the full corpus, transcripts included.
idx.to_csv(DATA_DIR / "powell_speech_corpus.csv", index=False)

# 4. Row selections: everything, "formal" speeches only, Jackson Hole speeches only.
mask_all = pd.Series(True, index=idx.index)
mask_formal = idx["formal"].fillna(False)
mask_jh = idx["jackson_hole"].fillna(False)

# 5. Historical hit rate of every contract within each selection.
probs_all = compute_word_probs(idx, mask_all)
probs_formal = compute_word_probs(idx, mask_formal)
probs_jh = compute_word_probs(idx, mask_jh)

# 6. Live market data (network call).
markets = fetch_live_kalshi_powell_mention_market()

# 7. Historical vs. market-implied probability per contract; verbose adds sample
#    statistics. These three frames are what the following cells display.
verbose = True
comp_all = compare_to_markets(probs_all, markets, verbose)
comp_formal = compare_to_markets(probs_formal, markets, verbose)
comp_jh = compare_to_markets(probs_jh, markets, verbose)

2025-08-20 13:21:27,985 INFO: Fetching year 2010 index …
2025-08-20 13:21:28,390 INFO: Fetching year 2011 index …
2025-08-20 13:21:28,783 INFO: Fetching year 2012 index …
2025-08-20 13:21:29,229 INFO: Fetching year 2013 index …
2025-08-20 13:21:29,669 INFO: Fetching year 2014 index …
2025-08-20 13:21:30,100 INFO: Fetching year 2015 index …
2025-08-20 13:21:30,416 INFO: Fetching year 2016 index …
2025-08-20 13:21:30,718 INFO: Fetching year 2017 index …
2025-08-20 13:21:30,990 INFO: Fetching year 2018 index …
2025-08-20 13:21:31,367 INFO: Fetching year 2019 index …
2025-08-20 13:21:31,812 INFO: Fetching year 2020 index …
2025-08-20 13:21:32,243 INFO: Fetching year 2021 index …
2025-08-20 13:21:32,579 INFO: Fetching year 2022 index …
2025-08-20 13:21:32,919 INFO: Fetching year 2023 index …
2025-08-20 13:21:33,307 INFO: Fetching year 2024 index …
2025-08-20 13:21:33,779 INFO: Fetching year 2025 index …
2025-08-20 13:21:34,182 INFO: Indexed 134 Powell items.
2025-08-20 13:21:34,182 INFO: Fe

## All Powell speeches

In [80]:
# Comparison over every indexed speech, keyed by contract; rows are ordered by
# hist_minus_mkt ascending (market most above the historical rate first).
comp_all.set_index("contract")

Unnamed: 0_level_0,p_hist,p_mkt,hist_minus_mkt,ticker,n_speeches,hit_speeches,mentions_total,p_hist_wilson_lo,p_hist_wilson_hi,p_hist_se_approx
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Tariff,0.037313,0.9,-0.862687,KXPOWELLMENTION-25AUG24-TARI,134,5,11,0.016041,0.084375,0.016373
Pandemic,0.283582,0.82,-0.536418,KXPOWELLMENTION-25AUG24-PAND,134,38,238,0.214125,0.365102,0.038938
Dual,0.291045,0.82,-0.528955,KXPOWELLMENTION-25AUG24-DUAL,134,39,65,0.220813,0.372924,0.039241
Good morning,0.141791,0.615,-0.473209,KXPOWELLMENTION-25AUG24-GOODM,134,19,19,0.092689,0.210859,0.030135
Energy,0.246269,0.705,-0.458731,KXPOWELLMENTION-25AUG24-ENER,134,33,64,0.181068,0.325612,0.037219
Cut,0.179104,0.63,-0.450896,KXPOWELLMENTION-25AUG24-CUT,134,24,35,0.123414,0.252682,0.033124
Balance of risks,0.074627,0.475,-0.400373,KXPOWELLMENTION-25AUG24-BALA,134,10,11,0.041038,0.131926,0.022701
Transitory,0.074627,0.47,-0.395373,KXPOWELLMENTION-25AUG24-TRAN,134,10,24,0.041038,0.131926,0.022701
Layoff,0.104478,0.425,-0.320522,KXPOWELLMENTION-25AUG24-LAYO,134,14,21,0.06326,0.167741,0.026424
Projection,0.261194,0.565,-0.303806,KXPOWELLMENTION-25AUG24-PROJ,134,35,226,0.194213,0.341486,0.037948


## Formal Powell speeches only

In [81]:
# Same comparison restricted to speeches flagged by IS_FORMAL, keyed by contract.
comp_formal.set_index("contract")

Unnamed: 0_level_0,p_hist,p_mkt,hist_minus_mkt,ticker,n_speeches,hit_speeches,mentions_total,p_hist_wilson_lo,p_hist_wilson_hi,p_hist_se_approx
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Tariff,0.05618,0.9,-0.84382,KXPOWELLMENTION-25AUG24-TARI,89,5,11,0.024233,0.124856,0.024408
Good morning,0.11236,0.615,-0.50264,KXPOWELLMENTION-25AUG24-GOODM,89,10,10,0.062187,0.194612,0.033476
Pandemic,0.337079,0.82,-0.482921,KXPOWELLMENTION-25AUG24-PAND,89,30,218,0.247427,0.440213,0.050107
Dual,0.426966,0.82,-0.393034,KXPOWELLMENTION-25AUG24-DUAL,89,38,64,0.329326,0.530651,0.052431
Cut,0.258427,0.63,-0.371573,KXPOWELLMENTION-25AUG24-CUT,89,23,33,0.178814,0.358031,0.046404
Balance of risks,0.11236,0.475,-0.36264,KXPOWELLMENTION-25AUG24-BALA,89,10,11,0.062187,0.194612,0.033476
Transitory,0.11236,0.47,-0.35764,KXPOWELLMENTION-25AUG24-TRAN,89,10,24,0.062187,0.194612,0.033476
Energy,0.359551,0.705,-0.345449,KXPOWELLMENTION-25AUG24-ENER,89,32,63,0.267576,0.463148,0.050866
Layoff,0.134831,0.425,-0.290169,KXPOWELLMENTION-25AUG24-LAYO,89,12,19,0.078842,0.221041,0.036204
Tariff (10+ times),0.0,0.24,-0.24,KXPOWELLMENTION-25AUG24-TARR,89,0,11,0.0,0.041378,0.0


## Jackson Hole Powell speeches only

In [82]:
# Same comparison restricted to speeches flagged by IS_JACKSON_HOLE, keyed by contract.
comp_jh.set_index("contract")

Unnamed: 0_level_0,p_hist,p_mkt,hist_minus_mkt,ticker,n_speeches,hit_speeches,mentions_total,p_hist_wilson_lo,p_hist_wilson_hi,p_hist_se_approx
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Tariff,0.047619,0.9,-0.852381,KXPOWELLMENTION-25AUG24-TARI,21,1,1,0.008456,0.226698,0.046471
Good morning,0.190476,0.615,-0.424524,KXPOWELLMENTION-25AUG24-GOODM,21,4,4,0.076674,0.40001,0.085689
Pandemic,0.47619,0.82,-0.34381,KXPOWELLMENTION-25AUG24-PAND,21,10,119,0.283437,0.676308,0.108985
Balance of risks,0.142857,0.475,-0.332143,KXPOWELLMENTION-25AUG24-BALA,21,3,3,0.049809,0.346365,0.07636
Layoff,0.142857,0.425,-0.282143,KXPOWELLMENTION-25AUG24-LAYO,21,3,6,0.049809,0.346365,0.07636
Transitory,0.190476,0.47,-0.279524,KXPOWELLMENTION-25AUG24-TRAN,21,4,14,0.076674,0.40001,0.085689
Cut,0.380952,0.63,-0.249048,KXPOWELLMENTION-25AUG24-CUT,21,8,14,0.207508,0.591217,0.105971
Tariff (10+ times),0.0,0.24,-0.24,KXPOWELLMENTION-25AUG24-TARR,21,0,1,0.0,0.154644,0.0
Dual,0.619048,0.82,-0.200952,KXPOWELLMENTION-25AUG24-DUAL,21,13,17,0.408783,0.792492,0.105971
Dollar,0.095238,0.285,-0.189762,KXPOWELLMENTION-25AUG24-DOLL,21,2,9,0.026518,0.289146,0.064056
