In [2]:
import requests
import json
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
import pandas as pd

In [3]:
try:
    stopwords.words('english')
except LookupError:
    print("Downloading NLTK stopwords...")
    nltk.download('stopwords')
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("Downloading NLTK tokenizer...")
    nltk.download('punkt')

Downloading NLTK stopwords...
Downloading NLTK tokenizer...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [4]:
def get_speech_urls():
    urls = [
        # 2024
        'https://www.federalreserve.gov/newsevents/speech/powell20240519a.htm',
        'https://www.federalreserve.gov/newsevents/speech/powell20240416a.htm',
        'https://www.federalreserve.gov/newsevents/speech/powell20240403a.htm',
        'https://www.federalreserve.gov/newsevents/testimony/powell20240306a.htm',
        # 2023
        'https://www.federalreserve.gov/newsevents/speech/powell20231201a.htm',
        'https://www.federalreserve.gov/newsevents/speech/powell20231109a.htm',
        'https://www.federalreserve.gov/newsevents/speech/powell20230825a.htm',
        'https://www.federalreserve.gov/newsevents/testimony/powell20230621a.htm',
        'https://www.federalreserve.gov/newsevents/speech/powell20230519a.htm',
        # 2022
        'https://www.federalreserve.gov/newsevents/speech/powell20221130a.htm',
        'https://www.federalreserve.gov/newsevents/speech/powell20220826a.htm',
        'https://www.federalreserve.gov/newsevents/speech/powell20220517a.htm',
        'https://www.federalreserve.gov/newsevents/testimony/powell20220302a.htm',
        # 2021
        'https://www.federalreserve.gov/newsevents/speech/powell20211122a.htm',
        'https://www.federalreserve.gov/newsevents/speech/powell20210827a.htm',
        'https://www.federalreserve.gov/newsevents/testimony/powell20210622a.htm',
        # 2020
        'https://www.federalreserve.gov/newsevents/speech/powell20200827a.htm',
        'https://www.federalreserve.gov/newsevents/speech/powell20200616a.htm',
        'https://www.federalreserve.gov/newsevents/testimony/powell20200211a.htm'
    ]
    return urls


def fetch_and_clean_text(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the main content area of the page (selectors may need adjustment)
        # This is a common selector for the Fed's website
        content = soup.find('div', id='content')
        if content:
            return content.get_text()
        return ""
    except requests.exceptions.RequestException as e:
        print(f"Could not fetch {url}: {e}")
        return None


In [5]:
def analyze_documents(documents, keywords_map):
    doc_count = len(documents)
    if doc_count == 0:
        return {key: 0 for key in keywords_map}

    # Dictionary to store the count of documents containing each keyword
    keyword_doc_counts = defaultdict(int)

    for doc in documents:
        # Normalize the document text for searching
        doc_lower = doc.lower()
        for key, search_terms in keywords_map.items():
            # Check if any of the search terms for a given key are in the document
            found = False
            for term in search_terms:
                # Use regex to find whole words/phrases to avoid partial matches (e.g., 'cut' in 'execute')
                if re.search(r"\b" + re.escape(term) + r"\b", doc_lower):
                    found = True
                    break
            if found:
                keyword_doc_counts[key] += 1

    # Calculate document frequency as a percentage
    doc_frequencies = {key: (count / doc_count) * 100 for key, count in keyword_doc_counts.items()}

    return doc_frequencies


def load_market_data(market_json):
    """Loads the market data from a JSON string."""
    return json.loads(market_json)["markets"]


def compare_analysis_with_market(doc_frequencies, market_data):
    """
    Compares the historical analysis with current market pricing and identifies mispricings.

    Returns:
        pandas.DataFrame: A DataFrame summarizing the comparison.
    """
    comparison_data = []
    for contract in market_data:
        contract_name = contract["name"]
        implied_prob = contract.get("last_price", 0)

        # Find the corresponding historical frequency
        historical_prob = doc_frequencies.get(contract_name, 0.0)

        mispricing_gap = implied_prob - historical_prob

        comparison_data.append(
            {
                "Contract": contract_name,
                "Historical Prob. (%)": round(historical_prob, 2),
                "Market Implied Prob. (%)": implied_prob,
                "Mispricing Gap (%)": round(mispricing_gap, 2),
            }
        )

    df = pd.DataFrame(comparison_data)
    df = df.sort_values(by="Mispricing Gap (%)", ascending=False)
    return df

In [12]:
KEYWORDS_MAP = {
    "Trump": ["trump"],
    "Projection": ["projection", "projections"],
    "Good afternoon": ["good afternoon"],
    "Russia": ["russia", "russian"],
    "Pandemic": ["pandemic"],
    "Median": ["median"],
    "Administration": ["administration"],
    "Tariff (10+ times)": ["tariff"],  # Note: Analysis is for mention, not count
    "Tariff": ["tariff"],
    "Renovation": ["renovation"],
    "Regulator/ regulatory / regulation": ["regulator", "regulatory", "regulation"],
    "Overheat": ["overheat", "overheating"],
    "Michelle / Bowman": ["michelle", "bowman"],
    "Layoff": ["layoff", "layoffs"],
    "Labor (40+ times)": ["labor"],  # Note: Analysis is for mention, not count
    "Labor (30+ times)": ["labor"],  # Note: Analysis is for mention, not count
    "Good morning": ["good morning"],
    "Energy": ["energy"],
    "Dollar": ["dollar"],
    "Dissent": ["dissent"],
    "Cut": ["cut"],
    "Crypto / Bitcoin": ["crypto", "bitcoin"],
    "Credit": ["credit"],
    "Consumer confidence": ["consumer confidence"],
    "Balance of risks": ["balance of risks"],
    "Anchor": ["anchor", "anchored"],
    "Transitory": ["transitory"],
    "Symposium": ["symposium"],
    "Soft landing": ["soft landing"],
    "Political": ["political"],
    "Meeting": ["meeting"],
    "Dot plot": ["dot plot"],
    "Dual": ["dual"],
    "Climate": ["climate"],
    "Christopher / Waller": ["christopher", "waller"],
    "Chair": ["chair"],
}

urls = get_speech_urls()
speech_documents = []
for url in urls:
    text = fetch_and_clean_text(url)
    if text:
        speech_documents.append(text)

historical_frequencies = analyze_documents(speech_documents, KEYWORDS_MAP)

market_json_string = """
{
	"markets": [
	{ "id": "177fef2a-8ab7-42e6-8e3c-df37313cf57f", "name": "Trump", "last_price": 13 },
	{ "id": "cb914ed6-6e3c-4264-aa72-d56cfc2fa421", "name": "Projection", "last_price": 54 },
	{ "id": "aace5557-b664-4b14-be21-6c639a002f31", "name": "Good afternoon", "last_price": 1 },
	{ "id": "07662840-ecfe-4968-acbe-1c18189ddc58", "name": "Russia", "last_price": 22 },
	{ "id": "9bc9de9e-9ff6-46c2-a57e-74fba23f560d", "name": "Pandemic", "last_price": 77 },
	{ "id": "60915fc6-3680-4072-8128-30405eda6913", "name": "Median", "last_price": 36 },
	{ "id": "21778f2e-881d-4a72-bb36-5aa3ccecf236", "name": "Administration", "last_price": 27 },
	{ "id": "91ce2d29-0d01-49d8-bc27-13222eea8566", "name": "Tariff (10+ times)", "last_price": 27 },
	{ "id": "67a583ad-5670-419d-a845-3790cac1872d", "name": "Renovation", "last_price": 11 },
	{ "id": "304b39c8-728a-496c-acb1-d6074ebb3cda", "name": "Regulator/ regulatory / regulation", "last_price": 26 },
	{ "id": "4a058829-db01-44be-989a-161d843f1dda", "name": "Overheat", "last_price": 13 },
	{ "id": "5581b2bd-643b-4b79-97cd-0d1b24d284d6", "name": "Michelle / Bowman", "last_price": 9 },
	{ "id": "d89c85ae-1e68-4b17-998d-08cb74bdd39c", "name": "Layoff", "last_price": 41 },
	{ "id": "af47ac02-d277-432c-a605-a8aa7ecfa47d", "name": "Labor (40+ times)", "last_price": 14 },
	{ "id": "78129fad-a50d-4021-ae36-9cbae7fcb819", "name": "Labor (30+ times)", "last_price": 28 },
	{ "id": "e0be432b-bdfc-4ee4-9e24-c80a0a3d877c", "name": "Good morning", "last_price": 62 },
	{ "id": "dea412f3-9f9c-4016-83f3-4cb7a99c13a3", "name": "Energy", "last_price": 71 },
	{ "id": "a0003033-bcb5-41f4-a690-9741ae833248", "name": "Dollar", "last_price": 31 },
	{ "id": "51c7c6e8-e1cf-4b14-bf94-d35abfa8b5b7", "name": "Dissent", "last_price": 13 },
	{ "id": "be63a919-1b02-4948-b3ee-bd79a2ca7e4e", "name": "Cut", "last_price": 62 },
	{ "id": "5379a6e1-d9b4-4d22-9c10-c4490ef22bf2", "name": "Crypto / Bitcoin", "last_price": 10 },
	{ "id": "d379d7bf-4e89-4818-a09d-6cc6f24e7143", "name": "Credit", "last_price": 34 },
	{ "id": "9f943a58-a507-4ffe-89ab-7cb29fc89848", "name": "Consumer confidence", "last_price": 21 },
	{ "id": "996e9147-2b94-472a-83a5-86c9397e8072", "name": "Balance of risks", "last_price": 45 },
	{ "id": "627e5d46-8fae-4569-ba2d-579fa4f59583", "name": "Anchor", "last_price": 29 },
	{ "id": "50b8013f-9625-443f-a69f-ae8d7d98b050", "name": "Transitory", "last_price": 45 },
	{ "id": "e6d1b39d-f0c3-4396-aad6-294e1b6101b2", "name": "Tariff", "last_price": 91 },
	{ "id": "fda7f9d1-b44d-470b-bbc6-e26ea08e97cb", "name": "Symposium", "last_price": 34 },
	{ "id": "b9b78a7c-50d1-4919-a411-9aabb050149b", "name": "Soft landing", "last_price": 11 },
	{ "id": "2fd0944e-2d53-43a9-9204-9d890eaf362a", "name": "Political", "last_price": 10 },
	{ "id": "f008d276-86d9-464e-ac2d-8281f477088b", "name": "Meeting", "last_price": 70 },
	{ "id": "dc883260-991f-4ccb-b57e-56520a06ce59", "name": "Dot plot", "last_price": 10 },
	{ "id": "654d5127-ea84-408d-9303-4d2f1855522a", "name": "Dual", "last_price": 83 },
	{ "id": "da3a20a1-3040-49db-80f9-bc600fc27399", "name": "Climate", "last_price": 18 },
	{ "id": "b91002b1-4b62-4c28-945c-03e51849046c", "name": "Christopher / Waller", "last_price": 11 },
	{ "id": "5c82cfd3-01fc-4c7f-9931-5226eec4ea51", "name": "Chair", "last_price": 31 }
	]
}
"""

market_data = load_market_data(market_json_string)
comparison_df = compare_analysis_with_market(historical_frequencies, market_data)

Could not fetch https://www.federalreserve.gov/newsevents/speech/powell20240416a.htm: 404 Client Error: Not Found for url: https://www.federalreserve.gov/newsevents/speech/powell20240416a.htm
Could not fetch https://www.federalreserve.gov/newsevents/speech/powell20230519a.htm: 404 Client Error: Not Found for url: https://www.federalreserve.gov/newsevents/speech/powell20230519a.htm
Could not fetch https://www.federalreserve.gov/newsevents/speech/powell20220517a.htm: 404 Client Error: Not Found for url: https://www.federalreserve.gov/newsevents/speech/powell20220517a.htm
Could not fetch https://www.federalreserve.gov/newsevents/speech/powell20211122a.htm: 404 Client Error: Not Found for url: https://www.federalreserve.gov/newsevents/speech/powell20211122a.htm
Could not fetch https://www.federalreserve.gov/newsevents/speech/powell20200616a.htm: 404 Client Error: Not Found for url: https://www.federalreserve.gov/newsevents/speech/powell20200616a.htm


In [14]:
pd.set_option("display.width", 1000)
print("\n--- Analysis vs. Market Pricing ---")
comparison_df.set_index("Contract")


--- Analysis vs. Market Pricing ---


Unnamed: 0_level_0,Historical Prob. (%),Market Implied Prob. (%),Mispricing Gap (%)
Contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tariff,0.0,91,91.0
Cut,7.14,62,54.86
Good morning,14.29,62,47.71
Dual,42.86,83,40.14
Dollar,0.0,31,31.0
Transitory,14.29,45,30.71
Tariff (10+ times),0.0,27,27.0
Layoff,14.29,41,26.71
Regulator/ regulatory / regulation,0.0,26,26.0
Consumer confidence,0.0,21,21.0
