# Word Frequency Analysis: Rising and Declining Terms in Singapore Budget Speeches

This notebook analyzes word frequency trends across **67 Singapore budget speeches** spanning from 1965 to 2026 (61 unique years, with some years containing multiple supplementary budgets).

## Methodology

### Objective
Identify the **top 12 rising words** and **top 12 declining words** by comparing:
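- **First 10 years**: the first 10 distinct years present in the corpus, 1965-1975 (speeches from the early era)
- **Last 10 years**: the last 10 distinct years present in the corpus, 2017-2026 (speeches from the recent era)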

### Key Metric: Normalized Frequency
To compare word usage across speeches of different lengths, we use **mentions per 10,000 words**:

$$\text{Normalized Frequency} = \frac{\text{Word Count in Year}}{\text{Total Words in That Year's Speeches}} \times 10{,}000$$

### Rising/Declining Score
For each word, we calculate the **change in average normalized frequency**:

$$\text{Change Score} = \bar{f}_{\text{last 10 years}} - \bar{f}_{\text{first 10 years}}$$

Where $\bar{f}$ is the mean normalized frequency across the period.

- **Positive scores** indicate **rising** words (more frequent in recent years)
- **Negative scores** indicate **declining** words (less frequent in recent years)

In [None]:
import os
import re
from pathlib import Path
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set style for plots
# NOTE(review): the style sheet is applied before the seaborn palette; presumably
# reversing the two calls would let the style sheet reset the palette — confirm before reordering.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably need a recent matplotlib — TODO confirm.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

## 1. Load and Process the Corpus

Each file is named with the pattern: `YYYY-MM-DD_Minister_Name.txt` (a supplementary budget carries an extra `_supplementary` suffix before `.txt`).

We extract the year from the filename and aggregate speeches by year (some years have multiple supplementary budgets).

In [None]:
# Path to corpus.
# Set the BUDGET_CORPUS_PATH environment variable to point the notebook at a corpus
# directory on another machine; when it is unset the original location is used.
CORPUS_PATH = Path(os.environ.get(
    'BUDGET_CORPUS_PATH',
    '/Users/wongpeiting/Desktop/CU/python-work/budget-strict/corpus',
))

def extract_year(filename):
    """Return the leading four-digit year of a corpus filename as an int.

    Filenames look like '1965-12-13_Lim_Kim_San.txt'. If the name does not
    start with four digits, None is returned instead.
    """
    leading_digits = re.match(r'(\d{4})', filename)
    if leading_digits is None:
        return None
    return int(leading_digits.group(1))

# UK to US spelling mappings (normalize to US spelling).
# Every key is a British form and every value is a *different* American form.
# Words spelled the same way in both variants ('programmed', 'fulfilled',
# 'fulfilling', 'enrolled') are intentionally not listed: looking them up falls
# back to the word itself, and listing them would only inflate the reported
# number of normalized variants (len(UK_TO_US_SPELLING)).
UK_TO_US_SPELLING = {
    # -ise -> -ize
    'organisation': 'organization', 'organisations': 'organizations',
    'organise': 'organize', 'organised': 'organized', 'organising': 'organizing',
    'recognise': 'recognize', 'recognised': 'recognized', 'recognising': 'recognizing',
    'realise': 'realize', 'realised': 'realized', 'realising': 'realizing',
    'utilise': 'utilize', 'utilised': 'utilized', 'utilising': 'utilizing',
    'maximise': 'maximize', 'maximised': 'maximized', 'maximising': 'maximizing',
    'minimise': 'minimize', 'minimised': 'minimized', 'minimising': 'minimizing',
    'prioritise': 'prioritize', 'prioritised': 'prioritized', 'prioritising': 'prioritizing',
    'emphasise': 'emphasize', 'emphasised': 'emphasized', 'emphasising': 'emphasizing',
    'stabilise': 'stabilize', 'stabilised': 'stabilized', 'stabilising': 'stabilizing',
    'modernise': 'modernize', 'modernised': 'modernized', 'modernising': 'modernizing',
    'liberalise': 'liberalize', 'liberalised': 'liberalized', 'liberalising': 'liberalizing',
    'privatise': 'privatize', 'privatised': 'privatized', 'privatising': 'privatizing',
    'specialise': 'specialize', 'specialised': 'specialized', 'specialising': 'specializing',
    'subsidise': 'subsidize', 'subsidised': 'subsidized', 'subsidising': 'subsidizing',
    'computerise': 'computerize', 'computerised': 'computerized', 'computerising': 'computerizing',
    'standardise': 'standardize', 'standardised': 'standardized', 'standardising': 'standardizing',
    'harmonise': 'harmonize', 'harmonised': 'harmonized', 'harmonising': 'harmonizing',
    'capitalise': 'capitalize', 'capitalised': 'capitalized', 'capitalising': 'capitalizing',
    'centralise': 'centralize', 'centralised': 'centralized', 'centralising': 'centralizing',
    'decentralise': 'decentralize', 'decentralised': 'decentralized', 'decentralising': 'decentralizing',

    # -our -> -or
    'labour': 'labor', 'labours': 'labors',
    'colour': 'color', 'colours': 'colors', 'coloured': 'colored',
    'favour': 'favor', 'favours': 'favors', 'favoured': 'favored', 'favourable': 'favorable',
    'honour': 'honor', 'honours': 'honors', 'honoured': 'honored', 'honourable': 'honorable',
    'neighbour': 'neighbor', 'neighbours': 'neighbors', 'neighbourhood': 'neighborhood',
    'behaviour': 'behavior', 'behaviours': 'behaviors',
    'endeavour': 'endeavor', 'endeavours': 'endeavors',

    # -re -> -er
    'centre': 'center', 'centres': 'centers', 'centred': 'centered',
    'metre': 'meter', 'metres': 'meters',
    'litre': 'liter', 'litres': 'liters',
    'fibre': 'fiber', 'fibres': 'fibers',
    'theatre': 'theater', 'theatres': 'theaters',

    # -ogue -> -og
    'catalogue': 'catalog', 'catalogues': 'catalogs',
    'dialogue': 'dialog', 'dialogues': 'dialogs',
    'analogue': 'analog',

    # -ence -> -ense
    'defence': 'defense', 'defences': 'defenses',
    'offence': 'offense', 'offences': 'offenses',
    'licence': 'license', 'licences': 'licenses',

    # -gramme -> -gram
    'programme': 'program', 'programmes': 'programs',
    'kilogramme': 'kilogram', 'kilogrammes': 'kilograms',

    # Other common variations
    'cheque': 'check', 'cheques': 'checks',
    'grey': 'gray',
    'ageing': 'aging',
    'judgement': 'judgment', 'judgements': 'judgments',
    'acknowledgement': 'acknowledgment', 'acknowledgements': 'acknowledgments',
    'fulfil': 'fulfill', 'fulfilment': 'fulfillment',
    'enrol': 'enroll', 'enrolment': 'enrollment',
    'skilful': 'skillful',
    'instalment': 'installment', 'instalments': 'installments',
    'counselling': 'counseling', 'counsellor': 'counselor',
    'travelling': 'traveling', 'traveller': 'traveler',
    'modelling': 'modeling',
    'levelling': 'leveling',
    'labelling': 'labeling',
    'signalling': 'signaling',
    'cancelled': 'canceled', 'cancelling': 'canceling',
}

def normalize_spelling(word):
    """Return the US spelling of `word` when UK_TO_US_SPELLING has an entry for it.

    Any word without an entry is handed back unchanged.
    """
    if word in UK_TO_US_SPELLING:
        return UK_TO_US_SPELLING[word]
    return word

def tokenize(text):
    """Split text into lowercase alphabetic tokens with UK spellings normalized.

    The text is lowercased first; every maximal run of ASCII letters delimited by
    word boundaries becomes one token (digits and punctuation never appear in a
    token), and each token is passed through normalize_spelling().
    """
    lowered = text.lower()
    return [
        normalize_spelling(found.group(0))
        for found in re.finditer(r'\b[a-zA-Z]+\b', lowered)
    ]

# Load all speeches grouped by year
speeches_by_year = defaultdict(list)
file_count = 0

# Sorted so that speeches within a year are appended in filename (i.e. date) order
for filepath in sorted(CORPUS_PATH.glob('*.txt')):
    year = extract_year(filepath.name)
    if year:
        file_count += 1
        speeches_by_year[year].append(filepath.read_text(encoding='utf-8'))

# Fail fast with a readable message: without this, a wrong or empty corpus
# directory only surfaces below as "min() arg is an empty sequence".
if not speeches_by_year:
    raise FileNotFoundError(
        f"No speech files named like 'YYYY-MM-DD_Name.txt' were found in {CORPUS_PATH}"
    )

print(f"Total speeches loaded: {file_count}")
print(f"Unique years: {len(speeches_by_year)}")
print(f"Year range: {min(speeches_by_year.keys())} - {max(speeches_by_year.keys())}")
print(f"\nUK/US spelling variants normalized: {len(UK_TO_US_SPELLING)}")

# Show years with multiple speeches
print("\nYears with multiple speeches:")
for year in sorted(speeches_by_year.keys()):
    if len(speeches_by_year[year]) > 1:
        print(f"  {year}: {len(speeches_by_year[year])} speeches")

## 2. Calculate Word Frequencies Per Year

For each year, we:
1. Concatenate all speeches for that year
2. Tokenize into words
3. Count word occurrences
4. Calculate total word count for normalization

In [None]:
# Count words per year
word_counts_by_year = {}  # year -> Counter of words
total_words_by_year = {}  # year -> total word count

# All speeches of a year are joined with a space and tokenized once
for year in sorted(speeches_by_year):
    year_tokens = tokenize(' '.join(speeches_by_year[year]))
    word_counts_by_year[year] = Counter(year_tokens)
    total_words_by_year[year] = len(year_tokens)

# Display summary
print("Total words per year:\n")
for year, n_words in sorted(total_words_by_year.items()):
    print(f"{year}: {n_words:,} words")

## 3. Build Normalized Frequency Matrix

Create a DataFrame where:
- **Rows** = unique words
- **Columns** = years
- **Values** = normalized frequency (mentions per 10,000 words)

$$f_{w,y} = \frac{\text{count}(w, y)}{\text{total}(y)} \times 10{,}000$$

In [None]:
# Get all unique words across all years
all_words = set()
for counter in word_counts_by_year.values():
    all_words.update(counter.keys())

print(f"Total unique words: {len(all_words):,}")

# Walk the vocabulary in sorted order. Iterating the set directly yields an order
# that can differ between kernel sessions, which would make the row order of the
# matrix (and therefore head() output and tie-breaking in later rankings) irreproducible.
vocabulary = sorted(all_words)

# Build normalized frequency matrix
years = sorted(word_counts_by_year.keys())
freq_data = {}

for year in years:
    total = total_words_by_year[year]
    counter = word_counts_by_year[year]
    # Normalized frequency: mentions per 10,000 words
    freq_data[year] = {word: (counter.get(word, 0) / total) * 10000 for word in vocabulary}

# Create DataFrame (rows = words, columns = years)
freq_df = pd.DataFrame(freq_data)
freq_df = freq_df.fillna(0)

print(f"\nFrequency matrix shape: {freq_df.shape} (words x years)")
freq_df.head()

## 4. Define Early and Recent Periods

We compare:
- **Early period**: First 10 distinct years in the corpus
- **Recent period**: Last 10 distinct years in the corpus

In [None]:
# Define periods
# Number of distinct corpus years in each comparison window
PERIOD_YEARS = 10

all_years = sorted(freq_df.columns)

# With fewer than 2 * PERIOD_YEARS years the two windows would share years and the
# early-vs-recent comparison would be meaningless.
assert len(all_years) >= 2 * PERIOD_YEARS, (
    f"Need at least {2 * PERIOD_YEARS} distinct years for non-overlapping periods, "
    f"found {len(all_years)}"
)

early_years = all_years[:PERIOD_YEARS]
recent_years = all_years[-PERIOD_YEARS:]

print(f"Early period years: {early_years}")
print(f"Recent period years: {recent_years}")

## 5. Calculate Rising and Declining Words

### Mathematical Formula

For each word $w$:

$$\bar{f}_{\text{early}}(w) = \frac{1}{|Y_{\text{early}}|} \sum_{y \in Y_{\text{early}}} f_{w,y}$$

$$\bar{f}_{\text{recent}}(w) = \frac{1}{|Y_{\text{recent}}|} \sum_{y \in Y_{\text{recent}}} f_{w,y}$$

$$\Delta(w) = \bar{f}_{\text{recent}}(w) - \bar{f}_{\text{early}}(w)$$

- **Rising words**: Words with highest positive $\Delta$
- **Declining words**: Words with lowest (most negative) $\Delta$

In [None]:
# A word is kept only if its average frequency reaches this many mentions per
# 10,000 words in EITHER period; anything rarer is treated as noise.
MIN_FREQ_THRESHOLD = 1.0  # At least 1 mention per 10,000 words on average

# Average normalized frequency of every word inside each period
early_mean = freq_df[early_years].mean(axis=1)
recent_mean = freq_df[recent_years].mean(axis=1)

# Positive = used more in the recent period, negative = used less
change_score = recent_mean - early_mean

# One row per word, positionally aligned with the rows of freq_df
word_analysis = pd.DataFrame({
    'word': freq_df.index,
    'early_mean_freq': early_mean.to_numpy(),
    'recent_mean_freq': recent_mean.to_numpy(),
    'change_score': change_score.to_numpy(),
})

# The larger of the two period averages decides whether the word passes the threshold
period_peak = word_analysis[['early_mean_freq', 'recent_mean_freq']].max(axis=1)
word_analysis_filtered = word_analysis[period_peak >= MIN_FREQ_THRESHOLD]

print(f"Words meeting minimum frequency threshold: {len(word_analysis_filtered):,}")

In [None]:
# Expanded stopwords list - common words that don't carry meaning
# Includes: articles, prepositions, pronouns, auxiliaries, conjunctions, 
# common verbs, adverbs, and budget-speech-specific filler words
#
# STOPWORDS is a set of lowercase tokens. Rows of the word-analysis table whose
# 'word' value is in this set are removed by the filter that follows the literal.
#
# Notes for maintainers:
# - Several words are listed under more than one heading (e.g. 'her', 'whose',
#   'used', 'most', 'almost'). A set keeps a single copy, so len(STOPWORDS)
#   counts unique words, not the number of literals written below.
# - Tokens reach this filter after tokenize() has normalized UK spellings, so
#   'honourable' is already 'honorable' by then; the UK form is listed only for
#   completeness.
# - The fragments in the final group ('s', 't', 'll', 'don', ...) are presumably
#   what contractions turn into once tokenize() keeps only alphabetic runs.
# - Domain words such as 'budget', 'growth' and 'economy' are excluded on purpose
#   (see the "Budget speech fillers" group), so they can never appear in the
#   rising/declining rankings.

STOPWORDS = {
    # Articles & determiners
    'the', 'a', 'an', 'this', 'that', 'these', 'those', 'my', 'your', 'his', 'her',
    'its', 'our', 'their', 'some', 'any', 'no', 'every', 'each', 'all', 'both',
    'few', 'many', 'much', 'most', 'other', 'another', 'such', 'what', 'which',
    'whose', 'whatever', 'whichever',
    
    # Pronouns
    'i', 'me', 'we', 'us', 'you', 'he', 'him', 'she', 'her', 'it', 'they', 'them',
    'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'themselves',
    'who', 'whom', 'whose', 'whoever', 'whomever',
    
    # Prepositions
    'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'as', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'between', 'under', 'over', 'out',
    'up', 'down', 'off', 'about', 'against', 'among', 'around', 'behind', 'beside',
    'beyond', 'within', 'without', 'along', 'across', 'upon', 'towards', 'toward',
    'throughout', 'despite', 'via', 'per', 'including', 'regarding', 'concerning',
    
    # Conjunctions
    'and', 'or', 'but', 'nor', 'so', 'yet', 'for', 'because', 'since', 'although',
    'though', 'while', 'whereas', 'if', 'unless', 'until', 'when', 'whenever',
    'where', 'wherever', 'whether', 'however', 'therefore', 'thus', 'hence',
    'moreover', 'furthermore', 'nevertheless', 'nonetheless', 'otherwise',
    
    # Auxiliary/Modal verbs
    'be', 'is', 'am', 'are', 'was', 'were', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing', 'done',
    'will', 'would', 'shall', 'should', 'may', 'might', 'must', 'can', 'could',
    'need', 'dare', 'ought', 'used',
    
    # Common verbs (too generic to be meaningful)
    'get', 'got', 'getting', 'gets',
    'make', 'made', 'making', 'makes',
    'take', 'took', 'taken', 'taking', 'takes',
    'give', 'gave', 'given', 'giving', 'gives',
    'go', 'went', 'gone', 'going', 'goes',
    'come', 'came', 'coming', 'comes',
    'see', 'saw', 'seen', 'seeing', 'sees',
    'know', 'knew', 'known', 'knowing', 'knows',
    'think', 'thought', 'thinking', 'thinks',
    'say', 'said', 'saying', 'says',
    'tell', 'told', 'telling', 'tells',
    'put', 'putting', 'puts',
    'let', 'letting', 'lets',
    'keep', 'kept', 'keeping', 'keeps',
    'set', 'setting', 'sets',
    'seem', 'seemed', 'seeming', 'seems',
    'want', 'wanted', 'wanting', 'wants',
    'look', 'looked', 'looking', 'looks',
    'use', 'used', 'using', 'uses',
    'find', 'found', 'finding', 'finds',
    'show', 'showed', 'shown', 'showing', 'shows',
    'try', 'tried', 'trying', 'tries',
    'leave', 'left', 'leaving', 'leaves',
    'call', 'called', 'calling', 'calls',
    'ask', 'asked', 'asking', 'asks',
    'turn', 'turned', 'turning', 'turns',
    'begin', 'began', 'begun', 'beginning', 'begins',
    'start', 'started', 'starting', 'starts',
    'move', 'moved', 'moving', 'moves',
    'run', 'ran', 'running', 'runs',
    'bring', 'brought', 'bringing', 'brings',
    'hold', 'held', 'holding', 'holds',
    'write', 'wrote', 'written', 'writing', 'writes',
    'read', 'reading', 'reads',
    'learn', 'learned', 'learnt', 'learning', 'learns',
    'change', 'changed', 'changing', 'changes',
    'follow', 'followed', 'following', 'follows',
    'stop', 'stopped', 'stopping', 'stops',
    'mean', 'meant', 'meaning', 'means',
    'add', 'added', 'adding', 'adds',
    'play', 'played', 'playing', 'plays',
    'pay', 'paid', 'paying', 'pays',
    'hear', 'heard', 'hearing', 'hears',
    'include', 'included', 'including', 'includes',
    'believe', 'believed', 'believing', 'believes',
    'allow', 'allowed', 'allowing', 'allows',
    'meet', 'met', 'meeting', 'meets',
    'lead', 'led', 'leading', 'leads',
    'live', 'lived', 'living', 'lives',
    'stand', 'stood', 'standing', 'stands',
    'happen', 'happened', 'happening', 'happens',
    'carry', 'carried', 'carrying', 'carries',
    'talk', 'talked', 'talking', 'talks',
    'appear', 'appeared', 'appearing', 'appears',
    'produce', 'produced', 'producing', 'produces',
    'sit', 'sat', 'sitting', 'sits',
    'offer', 'offered', 'offering', 'offers',
    'consider', 'considered', 'considering', 'considers',
    'expect', 'expected', 'expecting', 'expects',
    'suggest', 'suggested', 'suggesting', 'suggests',
    'remain', 'remained', 'remaining', 'remains',
    'require', 'required', 'requiring', 'requires',
    'report', 'reported', 'reporting', 'reports',
    'decide', 'decided', 'deciding', 'decides',
    'reach', 'reached', 'reaching', 'reaches',
    'rise', 'rose', 'risen', 'rising', 'rises',
    'pass', 'passed', 'passing', 'passes',
    'sell', 'sold', 'selling', 'sells',
    'buy', 'bought', 'buying', 'buys',
    'create', 'created', 'creating', 'creates',
    'spend', 'spent', 'spending', 'spends',
    'grow', 'grew', 'grown', 'growing', 'grows',
    'open', 'opened', 'opening', 'opens',
    'walk', 'walked', 'walking', 'walks',
    'win', 'won', 'winning', 'wins',
    'lose', 'lost', 'losing', 'loses',
    'send', 'sent', 'sending', 'sends',
    'build', 'built', 'building', 'builds',
    'fall', 'fell', 'fallen', 'falling', 'falls',
    'cut', 'cutting', 'cuts',
    'kill', 'killed', 'killing', 'kills',
    'reduce', 'reduced', 'reducing', 'reduces',
    'develop', 'developed', 'developing', 'develops',
    'remember', 'remembered', 'remembering', 'remembers',
    'speak', 'spoke', 'spoken', 'speaking', 'speaks',
    'agree', 'agreed', 'agreeing', 'agrees',
    'raise', 'raised', 'raising', 'raises',
    'pick', 'picked', 'picking', 'picks',
    'pull', 'pulled', 'pulling', 'pulls',
    'push', 'pushed', 'pushing', 'pushes',
    'watch', 'watched', 'watching', 'watches',
    'drive', 'drove', 'driven', 'driving', 'drives',
    'break', 'broke', 'broken', 'breaking', 'breaks',
    'draw', 'drew', 'drawn', 'drawing', 'draws',
    'explain', 'explained', 'explaining', 'explains',
    'receive', 'received', 'receiving', 'receives',
    'determine', 'determined', 'determining', 'determines',
    'serve', 'served', 'serving', 'serves',
    'apply', 'applied', 'applying', 'applies',
    'prepare', 'prepared', 'preparing', 'prepares',
    'accept', 'accepted', 'accepting', 'accepts',
    'achieve', 'achieved', 'achieving', 'achieves',
    'obtain', 'obtained', 'obtaining', 'obtains',
    'contain', 'contained', 'containing', 'contains',
    'present', 'presented', 'presenting', 'presents',
    'exist', 'existed', 'existing', 'exists',
    'result', 'resulted', 'resulting', 'results',
    'continue', 'continued', 'continuing', 'continues',
    'provide', 'provided', 'providing', 'provides',
    'ensure', 'ensured', 'ensuring', 'ensures',
    'enable', 'enabled', 'enabling', 'enables',
    'increase', 'increased', 'increasing', 'increases',
    'decrease', 'decreased', 'decreasing', 'decreases',
    'expand', 'expanded', 'expanding', 'expands',
    'extend', 'extended', 'extending', 'extends',
    'maintain', 'maintained', 'maintaining', 'maintains',
    'establish', 'established', 'establishing', 'establishes',
    'address', 'addressed', 'addressing', 'addresses',
    'implement', 'implemented', 'implementing', 'implements',
    'enhance', 'enhanced', 'enhancing', 'enhances',
    'strengthen', 'strengthened', 'strengthening', 'strengthens',
    'improve', 'improved', 'improving', 'improves',
    'invest', 'invested', 'investing', 'invests',
    'fund', 'funded', 'funding', 'funds',
    'allocate', 'allocated', 'allocating', 'allocates',
    
    # Adverbs
    'also', 'just', 'only', 'very', 'even', 'well', 'back', 'still', 'too',
    'here', 'there', 'now', 'then', 'again', 'already', 'always', 'never',
    'often', 'sometimes', 'usually', 'really', 'quite', 'rather', 'almost',
    'enough', 'especially', 'particularly', 'certainly', 'clearly', 'simply',
    'finally', 'actually', 'recently', 'probably', 'perhaps', 'maybe',
    'indeed', 'currently', 'recently', 'generally', 'specifically', 'directly',
    'certainly', 'obviously', 'definitely', 'necessarily', 'relatively',
    'eventually', 'immediately', 'effectively', 'significantly', 'substantially',
    
    # Numbers/quantity words & numerical terms (budget filler)
    'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
    'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen',
    'eighteen', 'nineteen', 'twenty', 'thirty', 'forty', 'fifty', 'sixty',
    'seventy', 'eighty', 'ninety', 'hundred', 'thousand',
    'million', 'millions', 'billion', 'billions', 'trillion', 'trillions',
    'first', 'second', 'third', 'fourth', 'fifth', 'last', 'next', 'previous',
    'same', 'different', 'various', 'several', 'whole', 'entire', 'full',
    'half', 'part', 'less', 'least', 'more', 'most', 'further', 'additional',
    'total', 'totals', 'totalling', 'overall', 'aggregate', 'sum', 'average',
    'approximately', 'roughly', 'nearly', 'almost', 'around', 'circa', 'about',
    
    # Common adjectives (too generic)
    'good', 'better', 'best', 'bad', 'worse', 'worst',
    'great', 'small', 'large', 'big', 'little', 'long', 'short',
    'high', 'low', 'higher', 'lower', 'highest', 'lowest',
    'new', 'old', 'young', 'early', 'late',
    'important', 'major', 'main', 'key', 'significant', 'able', 'certain',
    'clear', 'likely', 'possible', 'available', 'necessary', 'true', 'real',
    'right', 'wrong', 'sure', 'hard', 'easy', 'simple', 'complex',
    'own', 'particular', 'special', 'specific', 'general', 'common',
    'similar', 'basic', 'free', 'full', 'single', 'open', 'close',
    'strong', 'weak', 'positive', 'negative', 'public', 'private',
    
    # Budget speech fillers & parliamentary language
    'mr', 'mrs', 'ms', 'sir', 'madam', 'speaker', 'chairman', 'member', 'members',
    'honorable', 'honourable', 'minister', 'ministers', 'government', 'parliament',
    'singapore', 'singaporean', 'singaporeans', 'thank', 'please', 'like', 'way', 'ways',
    'thing', 'things', 'time', 'times', 'year', 'years', 'month', 'months', 'day', 'days',
    'week', 'weeks', 'quarter', 'quarters', 'annual', 'annually', 'fiscal',
    'point', 'points', 'fact', 'facts', 'case', 'cases', 'example', 'examples',
    'number', 'numbers', 'amount', 'amounts', 'level', 'levels', 'rate', 'rates',
    'term', 'terms', 'area', 'areas', 'part', 'parts', 'place', 'places',
    'end', 'ends', 'side', 'sides', 'kind', 'kinds', 'sort', 'sorts', 'type', 'types',
    'form', 'forms', 'group', 'groups', 'line', 'lines', 'order', 'orders',
    'problem', 'problems', 'question', 'questions', 'issue', 'issues',
    'reason', 'reasons', 'result', 'results', 'effect', 'effects',
    'need', 'needs', 'view', 'views', 'idea', 'ideas', 'interest', 'interests',
    'system', 'systems', 'plan', 'plans', 'period', 'periods', 'state', 'states',
    'matter', 'matters', 'basis', 'base', 'range', 'ranges',
    'cent', 'cents', 'percent', 'percentage', 'percentages', 'proportion', 'proportions',
    'figure', 'figures', 'estimate', 'estimates', 'estimated', 'projection', 'projections',
    'budget', 'budgets', 'budgeted', 'budgeting', 'expenditure', 'expenditures',
    'revenue', 'revenues', 'income', 'incomes', 'spending', 'spendings',
    'growth', 'gdp', 'economy', 'economic', 'economies', 'financial', 'finance',
    'policy', 'policies', 'measure', 'measures', 'initiative', 'initiatives',
    'program', 'programs', 'scheme', 'schemes', 'project', 'projects',
    'sector', 'sectors', 'industry', 'industries', 'industrial',
    
    # Single letters and short tokens
    's', 't', 'd', 'll', 've', 're', 'm', 'don', 'doesn', 'didn', 'won', 'wouldn',
    'couldn', 'shouldn', 'isn', 'aren', 'wasn', 'weren', 'hasn', 'haven', 'hadn',
    'fy', 'eg', 'ie', 'etc', 'vs',
}

# Keep only the rows whose word is not a stopword
is_stopword = word_analysis_filtered['word'].isin(STOPWORDS)
word_analysis_clean = word_analysis_filtered.loc[~is_stopword]

print(f"Stopwords defined: {len(STOPWORDS)}")
print(f"Words after removing stopwords: {len(word_analysis_clean):,}")

In [None]:
def _print_ranking(title, ranking):
    """Print one ranking table: a banner with `title`, a header row, then one line per word."""
    banner = "=" * 60
    print(banner)
    print(title)
    print(banner)
    print(f"{'Word':<15} {'Early Freq':>12} {'Recent Freq':>12} {'Change':>10}")
    print("-" * 60)
    for entry in ranking.itertuples(index=False):
        print(f"{entry.word:<15} {entry.early_mean_freq:>12.2f} {entry.recent_mean_freq:>12.2f} {entry.change_score:>+10.2f}")


# Get top 12 rising and declining words (largest / smallest change scores)
top_rising = word_analysis_clean.nlargest(12, 'change_score')
top_declining = word_analysis_clean.nsmallest(12, 'change_score')

_print_ranking("TOP 12 RISING WORDS (most increased in recent years)", top_rising)
print()  # blank line between the two tables
_print_ranking("TOP 12 DECLINING WORDS (most decreased in recent years)", top_declining)

## 6. Extract Yearly Frequency Data for Selected Words

Now we prepare the full time series data for each of our 24 selected words (12 rising + 12 declining).

In [None]:
# Words chosen by the two rankings, rising first and declining second
rising_words = list(top_rising['word'])
declining_words = list(top_declining['word'])
selected_words = [*rising_words, *declining_words]

print(f"Rising words: {rising_words}")
print(f"\nDeclining words: {declining_words}")

# Pull the full yearly series of each selected word out of the frequency matrix
selected_freq_df = freq_df.loc[selected_words].copy()
selected_freq_df.index.name = 'word'

# Years as rows and words as columns is the more convenient layout for plotting
selected_freq_transposed = selected_freq_df.transpose()
selected_freq_transposed.index.name = 'year'

print(f"\nTime series shape: {selected_freq_transposed.shape}")
selected_freq_transposed.head()

## 7. Visualize Rising Words

In [None]:
# Plot rising words: one small panel per word on a 4 x 3 grid
fig, axes = plt.subplots(4, 3, figsize=(15, 16))
axes = axes.flatten()
years_plot = selected_freq_transposed.index  # same x-axis for every panel

for ax, word in zip(axes, rising_words):
    values = selected_freq_transposed[word]

    ax.fill_between(years_plot, values, alpha=0.3, color='#2ecc71')
    ax.plot(years_plot, values, color='#27ae60', linewidth=2)
    ax.set_title(f'"{word}"', fontsize=14, fontweight='bold')
    ax.set_xlabel('Year')
    ax.set_ylabel('per 10k words')
    ax.set_xlim(years_plot.min(), years_plot.max())
    ax.set_ylim(0, None)

    # Add trend annotation. The '+' format flag prints the real sign of the number,
    # so a non-positive change can never be rendered as "+-0.3".
    early_avg = values.loc[early_years].mean()
    recent_avg = values.loc[recent_years].mean()
    change = recent_avg - early_avg
    ax.annotate(f'{change:+.1f}', xy=(0.95, 0.95), xycoords='axes fraction',
                fontsize=12, ha='right', va='top', color='#27ae60', fontweight='bold')

# Hide panels left over when fewer than 12 words were selected
for unused_ax in axes[len(rising_words):]:
    unused_ax.set_visible(False)

plt.suptitle('TOP 12 RISING WORDS\n(mentions per 10,000 words)', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('rising_words_sparklines.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Visualize Declining Words

In [None]:
# Plot declining words: one small panel per word on a 4 x 3 grid
fig, axes = plt.subplots(4, 3, figsize=(15, 16))
axes = axes.flatten()
years_plot = selected_freq_transposed.index  # same x-axis for every panel

for ax, word in zip(axes, declining_words):
    values = selected_freq_transposed[word]

    ax.fill_between(years_plot, values, alpha=0.3, color='#e74c3c')
    ax.plot(years_plot, values, color='#c0392b', linewidth=2)
    ax.set_title(f'"{word}"', fontsize=14, fontweight='bold')
    ax.set_xlabel('Year')
    ax.set_ylabel('per 10k words')
    ax.set_xlim(years_plot.min(), years_plot.max())
    ax.set_ylim(0, None)

    # Add trend annotation. The '+' format flag always prints the sign; negative
    # changes look exactly as before, and a non-negative one is shown unambiguously.
    early_avg = values.loc[early_years].mean()
    recent_avg = values.loc[recent_years].mean()
    change = recent_avg - early_avg
    ax.annotate(f'{change:+.1f}', xy=(0.95, 0.95), xycoords='axes fraction',
                fontsize=12, ha='right', va='top', color='#c0392b', fontweight='bold')

# Hide panels left over when fewer than 12 words were selected
for unused_ax in axes[len(declining_words):]:
    unused_ax.set_visible(False)

plt.suptitle('TOP 12 DECLINING WORDS\n(mentions per 10,000 words)', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('declining_words_sparklines.png', dpi=150, bbox_inches='tight')
plt.show()

## 9. Export Data for D3 Visualization

Export the time series data in a format suitable for D3 sparklines on the website.

In [None]:
import json


def _build_word_entry(word, trend):
    """Return the JSON-ready record for one selected word.

    `trend` is stored verbatim ('rising' or 'declining'). Averages, their difference
    and each yearly frequency are rounded to 2 decimals and converted to plain
    Python numbers so the json module never receives numpy scalars.
    """
    series = selected_freq_transposed[word]
    early_avg = series.loc[early_years].mean()
    recent_avg = series.loc[recent_years].mean()
    return {
        'word': word,
        'trend': trend,
        'early_avg': round(float(early_avg), 2),
        'recent_avg': round(float(recent_avg), 2),
        'change': round(float(recent_avg - early_avg), 2),
        'yearly_data': [
            {'year': int(year), 'frequency': round(float(frequency), 2)}
            for year, frequency in series.items()
        ],
    }


# Prepare data for export
export_data = {
    'metadata': {
        # Taken from the loading step so the figure cannot drift from the corpus
        'total_speeches': file_count,
        'unique_years': len(speeches_by_year),
        'year_range': [min(speeches_by_year.keys()), max(speeches_by_year.keys())],
        'early_period': [int(year) for year in early_years],
        'recent_period': [int(year) for year in recent_years],
        'metric': 'mentions per 10,000 words'
    },
    'rising_words': [_build_word_entry(word, 'rising') for word in rising_words],
    'declining_words': [_build_word_entry(word, 'declining') for word in declining_words],
}

# Save to JSON
with open('word_frequency_data.json', 'w', encoding='utf-8') as f:
    json.dump(export_data, f, indent=2)

print("Data exported to word_frequency_data.json")
print(f"\nMetadata:")
print(json.dumps(export_data['metadata'], indent=2))
print(f"\nSample rising word entry:")
print(json.dumps(export_data['rising_words'][0], indent=2))

## 10. Summary Statistics

In [None]:
# Summary table
# Period averages for every selected word, computed once (column-wise means keep
# the order of selected_words)
early_avgs = selected_freq_transposed.loc[early_years, selected_words].mean()
recent_avgs = selected_freq_transposed.loc[recent_years, selected_words].mean()

summary_df = pd.DataFrame({
    'Word': selected_words,
    # Label counts follow the actual list lengths, so the table still builds when
    # fewer than 12 words were selected on either side
    'Trend': ['Rising'] * len(rising_words) + ['Declining'] * len(declining_words),
    'Early Avg (per 10k)': early_avgs.to_numpy(),
    'Recent Avg (per 10k)': recent_avgs.to_numpy(),
    'Change': (recent_avgs - early_avgs).to_numpy(),
})

summary_df = summary_df.round(2)
print("Summary of Word Frequency Changes")
print("=" * 70)
print(summary_df.to_string(index=False))

In [None]:
# Write the summary table to disk (row index omitted) and confirm the file name
summary_csv_name = 'word_frequency_summary.csv'
summary_df.to_csv(summary_csv_name, index=False)
print(f"Summary exported to {summary_csv_name}")

## 11. Finance Minister "Pet Words" Analysis

Each Finance Minister has a distinctive vocabulary that reflects their era and priorities. We identify **distinctive words** for each FM — words they used significantly more frequently than other ministers.

### Methodology

For each Finance Minister $m$:

1. Calculate the **normalized frequency** of each word across all their speeches:
$$f_m(w) = \frac{\sum_{s \in S_m} \text{count}(w, s)}{\sum_{s \in S_m} \text{total}(s)} \times 10{,}000$$

2. Calculate the **distinctiveness ratio** — how much more this FM used the word compared to other FMs:
$$\text{ratio}(w, m) = \frac{f_m(w)}{\max_{m' \neq m} f_{m'}(w) + 0.5}$$

The $+0.5$ smoothing prevents division by zero and reduces noise from rarely-used words.

3. Select words where:
   - FM frequency ≥ 2.0 per 10k words (meaningful usage)
   - Word length ≥ 5 characters (filter acronyms)
   - Word is not in stopwords list
   - Distinctiveness ratio is highest among valid words

In [None]:
# Load speeches grouped by Finance Minister
def extract_fm(filename):
    """Return the Finance Minister's name encoded in a corpus filename.

    Filenames look like '1965-12-13_Lim_Kim_San.txt'; an optional
    '_supplementary' suffix before the extension is not part of the name.
    Underscores in the name are turned into spaces. Returns None when the
    filename does not start with the expected date prefix.
    """
    parsed = re.match(r'\d{4}-\d{2}-\d{2}_(.+?)(?:_supplementary)?\.txt', filename)
    if parsed is None:
        return None
    name_parts = parsed.group(1).split('_')
    return ' '.join(name_parts)

# Read every speech once, grouping the texts by Finance Minister. The year of
# each file is recorded in the same pass so the report below does not have to
# re-scan the corpus directory for every minister.
speeches_by_fm = defaultdict(list)   # fm -> list of speech texts
fm_file_count = defaultdict(int)     # fm -> number of speech files
years_by_fm = defaultdict(list)      # fm -> year of each speech file

for filepath in sorted(CORPUS_PATH.glob('*.txt')):
    fm = extract_fm(filepath.name)
    if fm:
        fm_file_count[fm] += 1
        years_by_fm[fm].append(extract_year(filepath.name))
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
        speeches_by_fm[fm].append(text)

print("Finance Ministers and their speech counts:")
print("-" * 40)
# Chronological order: sort ministers by the year of their earliest speech
for fm in sorted(speeches_by_fm, key=lambda name: min(years_by_fm[name])):
    years = years_by_fm[fm]
    print(f"{fm}: {fm_file_count[fm]} speeches ({min(years)}-{max(years)})")

In [None]:
# Calculate normalized word frequencies per FM
word_counts_by_fm = {}  # fm -> Counter of words
total_words_by_fm = {}  # fm -> total word count

for fm, texts in speeches_by_fm.items():
    # All of an FM's speeches are pooled before counting, so long speeches
    # weigh more than short ones.
    combined_text = ' '.join(texts)
    words = tokenize(combined_text)
    word_counts_by_fm[fm] = Counter(words)
    total_words_by_fm[fm] = len(words)

# Get all unique words across all FMs
all_words_fm = set()
for counter in word_counts_by_fm.values():
    all_words_fm.update(counter.keys())

# Iterating a set of strings yields a different order on every kernel start
# (string hashing is randomized), which would make the matrix row order — and
# therefore tie-breaking in later top-N selections — irreproducible.
# Sorting the vocabulary pins the row order.
vocabulary_fm = sorted(all_words_fm)

# Build normalized frequency matrix (words x FMs), in mentions per 10,000 words
fm_list = sorted(word_counts_by_fm.keys())
fm_freq_data = {}

for fm in fm_list:
    total = total_words_by_fm[fm]
    counter = word_counts_by_fm[fm]
    fm_freq_data[fm] = {word: (counter.get(word, 0) / total) * 10000 for word in vocabulary_fm}

fm_freq_df = pd.DataFrame(fm_freq_data)
fm_freq_df = fm_freq_df.fillna(0)

print(f"FM frequency matrix shape: {fm_freq_df.shape} (words x FMs)")
print(f"\nTotal words per FM:")
for fm in fm_list:
    print(f"  {fm}: {total_words_by_fm[fm]:,} words")

In [None]:
# Extended stopwords for FM analysis (includes generic budget terms).
# Union of the base STOPWORDS set (defined outside this cell) and the generic
# policy/administrative vocabulary listed below. Words in this set are excluded
# when ranking each Finance Minister's distinctive words, and the set is reused
# by the persistent-words filter later in the notebook.
FM_STOPWORDS = STOPWORDS | {
    # Additional generic terms (singular/plural and inflected forms listed explicitly)
    'country', 'countries', 'people', 'need', 'needs', 'work', 'working', 'works',
    'world', 'global', 'national', 'local', 'companies', 'company', 'business',
    'cost', 'costs', 'price', 'prices', 'value', 'values', 'market', 'markets',
    'service', 'services', 'social', 'economic', 'political', 'development',
    'developing', 'developed', 'progress', 'future', 'current', 'present',
    'potential', 'opportunity', 'opportunities', 'challenge', 'challenges',
    'effort', 'efforts', 'action', 'actions', 'step', 'steps', 'approach',
    'strategy', 'strategies', 'target', 'targets', 'goal', 'goals', 'objective',
    'objectives', 'priority', 'priorities', 'focus', 'support', 'supporting',
    'supported', 'assist', 'assistance', 'help', 'helping', 'helps',
    'achieve', 'achieved', 'achieving', 'success', 'successful', 'effective',
    'efficiency', 'efficient', 'quality', 'performance', 'standards', 'standard',
    'capacity', 'capabilities', 'capability', 'resources', 'resource', 'asset',
    'assets', 'investment', 'investments', 'contribution', 'contributions',
    'benefit', 'benefits', 'benefiting', 'impact', 'impacts', 'outcomes', 'outcome',
    'framework', 'frameworks', 'structure', 'structures', 'process', 'processes',
    'activities', 'activity', 'operations', 'operation', 'management', 'managing',
    'enterprise', 'enterprises', 'organization', 'organizations', 'institution',
    'institutions', 'agency', 'agencies', 'department', 'departments', 'ministry',
    'board', 'boards', 'council', 'councils', 'committee', 'committees',
    'legislation', 'laws', 'law', 'regulations', 'regulation', 'rules', 'rule',
    'conditions', 'condition', 'circumstances', 'situation', 'situations',
    'environment', 'environments', 'context', 'aspects', 'aspect', 'factors',
    'factor', 'elements', 'element', 'features', 'feature', 'characteristics',
    'property', 'properties', 'nature', 'forms', 'modes', 'model', 'models',
}

def get_distinctive_words(fm, freq_df, n=10, min_freq=2.0, min_len=5,
                          smoothing=0.5, stopwords=None):
    """
    Find words distinctively used by this FM compared to others.
    Uses ratio: FM_freq / (max_other_FM_freq + smoothing)

    Parameters
    ----------
    fm : column label of the Finance Minister to analyse.
    freq_df : DataFrame of normalized frequencies, words as index and one
        column per FM.
    n : number of top-ranked words to return.
    min_freq : minimum frequency the FM must reach for a word to qualify.
    min_len : minimum word length in characters.
    smoothing : constant added to the denominator; keeps the ratio finite when
        no other FM used the word and damps ratios for rarely used words.
    stopwords : collection of words to exclude; defaults to FM_STOPWORDS.

    Returns
    -------
    Series of distinctiveness ratios indexed by word, largest first, with at
    most ``n`` entries.
    """
    if stopwords is None:
        stopwords = FM_STOPWORDS

    fm_freq = freq_df[fm]
    other_fms = [f for f in freq_df.columns if f != fm]
    if other_fms:
        other_max = freq_df[other_fms].max(axis=1)
    else:
        # With a single FM there is nobody to compare against; treating the
        # competing frequency as zero avoids an all-NaN ratio.
        other_max = pd.Series(0.0, index=freq_df.index)

    # Calculate distinctiveness ratio
    ratio = fm_freq / (other_max + smoothing)

    # Filter criteria
    valid = (
        (fm_freq >= min_freq) &                          # Minimum frequency
        (~fm_freq.index.isin(stopwords)) &               # Not a stopword
        (fm_freq.index.str.len() >= min_len) &           # Minimum length
        (fm_freq.index.str.islower()) &                  # Lowercase only (filter acronyms)
        (fm_freq.index.str.isalpha())                    # Alphabetic only
    )

    return ratio[valid].nlargest(n)

# Calculate distinctive words for each FM
print("=" * 70)
print("FINANCE MINISTER DISTINCTIVE WORDS")
print("=" * 70)
print("(Words each FM used significantly more than other ministers)")
print()

# Scan the corpus directory once to learn which years each FM delivered
# speeches in, instead of re-globbing it for every sort key and loop pass.
years_by_fm = defaultdict(list)
for corpus_file in CORPUS_PATH.glob('*.txt'):
    years_by_fm[extract_fm(corpus_file.name)].append(extract_year(corpus_file.name))

# fm -> Series of distinctiveness ratios; filled in chronological order
fm_distinctive_words = {}

# Ministers are visited in order of their earliest speech
for fm in sorted(fm_freq_df.columns, key=lambda name: min(years_by_fm[name])):
    distinctive = get_distinctive_words(fm, fm_freq_df, n=10)
    fm_distinctive_words[fm] = distinctive

    years = years_by_fm[fm]
    era = f"{min(years)}-{max(years)}" if min(years) != max(years) else str(min(years))

    print(f"\n{fm} ({era}):")
    print("-" * 50)
    for word, ratio in distinctive.items():
        freq = fm_freq_df.loc[word, fm]
        print(f"  {word:<20} (freq: {freq:.1f}/10k, ratio: {ratio:.1f}x)")

In [None]:
# Create summary table of FM pet words (top 3 per FM)
print("\n" + "=" * 70)
print("SUMMARY: FINANCE MINISTER PET WORDS (Top 3)")
print("=" * 70)
print("\nFor curated_story.json fm_words field:\n")

# Scan the corpus directory once to learn which years each FM delivered
# speeches in, instead of re-globbing it for every sort key and loop pass.
years_by_fm = defaultdict(list)
for corpus_file in CORPUS_PATH.glob('*.txt'):
    years_by_fm[extract_fm(corpus_file.name)].append(extract_year(corpus_file.name))

fm_summary = []
# Ministers are visited in order of their earliest speech
for fm in sorted(fm_freq_df.columns, key=lambda name: min(years_by_fm[name])):
    years = years_by_fm[fm]
    first_decade = min(years) // 10 * 10
    last_decade = max(years) // 10 * 10
    # A single label when the tenure stays within one decade, otherwise a range
    decade = f"{first_decade}s" if first_decade == last_decade else f"{first_decade}s-{last_decade}s"

    top_words = list(fm_distinctive_words[fm].head(3).index)
    words_joined = ', '.join(top_words)

    fm_summary.append({
        'fm': fm,
        'era': decade,
        'years': f"{min(years)}-{max(years)}",
        'words': words_joined
    })

    print(f'{{"fm": "{fm}", "era": "{decade}", "words": "{words_joined}"}}')

# Create DataFrame for display
summary_table = pd.DataFrame(fm_summary)
print("\n")
print(summary_table.to_string(index=False))

In [None]:
# Export FM distinctive words to JSON
fm_export_data = {
    'methodology': {
        'metric': 'distinctiveness ratio = FM_freq / (max_other_FM_freq + 0.5)',
        'min_frequency': 2.0,
        'min_word_length': 5,
        'normalized_to': 'mentions per 10,000 words'
    },
    'finance_ministers': []
}

# Scan the corpus directory once to learn which years each FM delivered
# speeches in, instead of re-globbing it for every sort key and loop pass.
years_by_fm = defaultdict(list)
for corpus_file in CORPUS_PATH.glob('*.txt'):
    years_by_fm[extract_fm(corpus_file.name)].append(extract_year(corpus_file.name))

# Ministers are exported in order of their earliest speech
for fm in sorted(fm_freq_df.columns, key=lambda name: min(years_by_fm[name])):
    years = years_by_fm[fm]

    distinctive = fm_distinctive_words[fm]
    # float() converts NumPy scalars to plain Python floats for JSON output
    words_data = [
        {
            'word': word,
            'frequency': float(round(fm_freq_df.loc[word, fm], 2)),
            'distinctiveness_ratio': float(round(ratio, 2))
        }
        for word, ratio in distinctive.items()
    ]

    fm_export_data['finance_ministers'].append({
        'name': fm,
        'year_range': [min(years), max(years)],
        'num_speeches': fm_file_count[fm],
        'total_words': total_words_by_fm[fm],
        'distinctive_words': words_data
    })

# Explicit encoding keeps the output independent of the platform default
with open('fm_distinctive_words.json', 'w', encoding='utf-8') as f:
    json.dump(fm_export_data, f, indent=2)

print("FM distinctive words exported to fm_distinctive_words.json")
print(f"\nSample entry:")
print(json.dumps(fm_export_data['finance_ministers'][0], indent=2))

## 12. Persistent Words Analysis (Across All Decades)

Some words appear consistently across all seven calendar decades covered by the budget speeches (1960s–2020s). These represent the enduring expectations of the social compact — demands that never went away.

### Methodology

1. Group speeches by decade (1960s, 1970s, ..., 2020s)
2. Calculate normalized frequency for each word per decade (mentions per 10,000 words)
3. Identify words that:
   - Appear in **every decade** with frequency ≥ 1.0 per 10k words
   - Are not stopwords
   - Show meaningful presence across the corpus

In [None]:
# Group speeches by decade
def get_decade(year):
    """Map a year to its two-digit decade label, e.g. 1965 -> '60s', 2024 -> '20s'.

    The century is discarded, so years a hundred years apart share a label.
    """
    tens_digit = (year % 100) // 10
    return str(tens_digit) + "0s"

# Pool the per-year speech texts into their decade bucket
speeches_by_decade = defaultdict(list)
for year in speeches_by_year:
    speeches_by_decade[get_decade(year)] += speeches_by_year[year]

# Sort decades in chronological order (labels alone would sort '00s' first)
decade_order = ['60s', '70s', '80s', '90s', '00s', '10s', '20s']
decades_present = [label for label in decade_order if label in speeches_by_decade]

# Number of distinct corpus years falling in each decade
years_per_decade = Counter(get_decade(y) for y in speeches_by_year.keys())

print("Speeches grouped by decade:")
print("-" * 40)
for decade in decades_present:
    words = tokenize(' '.join(speeches_by_decade[decade]))
    print(f"{decade}: {years_per_decade[decade]} years, {len(words):,} total words")

In [None]:
# Calculate word frequencies per decade
word_counts_by_decade = {}   # decade -> Counter of words
total_words_by_decade = {}   # decade -> total word count

for decade in decades_present:
    combined = ' '.join(speeches_by_decade[decade])
    words = tokenize(combined)
    word_counts_by_decade[decade] = Counter(words)
    total_words_by_decade[decade] = len(words)

# Get all unique words across decades
all_words_decades = set()
for counter in word_counts_by_decade.values():
    all_words_decades.update(counter.keys())

# Iterating a set of strings yields a different order on every kernel start
# (string hashing is randomized), so the rows shown by head() below would
# change from run to run. Sorting the vocabulary pins the row order.
vocabulary_decades = sorted(all_words_decades)

# Build normalized frequency matrix (words x decades), in mentions per 10,000 words
decade_freq_data = {}
for decade in decades_present:
    total = total_words_by_decade[decade]
    counter = word_counts_by_decade[decade]
    decade_freq_data[decade] = {word: (counter.get(word, 0) / total) * 10000 for word in vocabulary_decades}

decade_freq_df = pd.DataFrame(decade_freq_data)
decade_freq_df = decade_freq_df[decades_present]  # Ensure correct column order
decade_freq_df = decade_freq_df.fillna(0)

print(f"Decade frequency matrix shape: {decade_freq_df.shape} (words x decades)")
decade_freq_df.head()

In [None]:
# Find words that appear consistently across ALL decades
# Criteria:
# - Present in every decade with frequency >= MIN_DECADE_FREQ
# - Not a stopword (base list, FM-analysis list, and the extra exclusions below)
# No topical (expectation/demand) filter is applied in this cell.

MIN_DECADE_FREQ = 1.0  # Minimum frequency per decade (per 10k words)

# A word qualifies only if its lowest decade frequency clears the threshold
all_decades_present = decade_freq_df.min(axis=1) >= MIN_DECADE_FREQ

# Filter out stopwords
persistent_stopwords = STOPWORDS | FM_STOPWORDS | {
    'workers', 'businesses', 'families', 'seniors',  # These are rising words
    'labor', 'workforce', 'employment', 'employed', 'employees',
    'training', 'productivity', 'productive', 'production'
}
not_stopword = ~decade_freq_df.index.isin(persistent_stopwords)

# Apply filters. The explicit copy gives us an independent frame, so adding the
# summary columns below neither triggers SettingWithCopyWarning nor risks
# touching decade_freq_df.
consistent_words = decade_freq_df.loc[all_decades_present & not_stopword].copy()

print(f"Words appearing in ALL {len(decades_present)} decades (≥{MIN_DECADE_FREQ}/10k each):")
print(f"Found {len(consistent_words)} consistent words")
print()

# Calculate overall presence score (average across decades)
decade_values = consistent_words[decades_present]
consistent_words['avg_freq'] = decade_values.mean(axis=1)
consistent_words['min_freq'] = decade_values.min(axis=1)
consistent_words['max_freq'] = decade_values.max(axis=1)
# 1.0 = perfectly consistent; max_freq >= MIN_DECADE_FREQ here, so never zero
consistent_words['consistency'] = consistent_words['min_freq'] / consistent_words['max_freq']

# Sort by average frequency
consistent_sorted = consistent_words.sort_values('avg_freq', ascending=False)

print("Top 30 consistent words (by average frequency):")
print("-" * 80)
print(f"{'Word':<20} {'Avg':>8} {'Min':>8} {'Max':>8} {'Consist.':>8}")
print("-" * 80)
for word, row in consistent_sorted.head(30).iterrows():
    print(f"{word:<20} {row['avg_freq']:>8.1f} {row['min_freq']:>8.1f} {row['max_freq']:>8.1f} {row['consistency']:>8.2f}")

In [None]:
# Get rates for specific "demand" words that persist across decades
# These represent enduring expectations: effort, skills, competitiveness

# Target words related to expectations/demands
demand_related_words = [
    'effort', 'efforts',
    'skill', 'skills', 'skilled',
    'train', 'trained',
    'competitive', 'competitiveness', 'compete',
    'improve', 'improvement', 'improvements',
    'contribute', 'contribution', 'contributions',
    'upgrade', 'upgrading',
    'adapt', 'adaptable',
    'productive', 'productivity',
    'efficient', 'efficiency',
    'innovative', 'innovate',
    'strive', 'striving'
]

# A word is listed only if it reaches MIN_DEMAND_FREQ (per 10k words) in at
# least MIN_DEMAND_DECADES of the decades present.
MIN_DEMAND_FREQ = 0.5
MIN_DEMAND_DECADES = 5

print("=" * 80)
print("PERSISTENT DEMAND WORDS - Frequency by Decade")
print("=" * 80)
print("(Mentions per 10,000 words)")
print()
print(f"{'Word':<18}", end='')
for d in decades_present:
    print(f"{d:>8}", end='')
print(f"{'Avg':>8}")
print("-" * 80)

for word in demand_related_words:
    # Words that never occur in the corpus have no row in the matrix
    if word not in decade_freq_df.index:
        continue
    row = decade_freq_df.loc[word, decades_present]
    if (row >= MIN_DEMAND_FREQ).sum() < MIN_DEMAND_DECADES:
        continue
    avg = row.mean()
    decade_cells = ''.join(f"{row[d]:>8.1f}" for d in decades_present)
    print(f"{word:<18}{decade_cells}{avg:>8.1f}")

In [None]:
# Export persistent words for curated_story.json
# Select meaningful words that represent enduring expectations/demands

SELECTED_PERSISTENT_WORDS = ['effort', 'skills', 'training', 'competitive', 'improve', 'productivity']

print("=" * 70)
print("PERSISTENT WORDS FOR curated_story.json")
print("=" * 70)
print("\nword_data format for 'The expectation persisted' section:\n")

word_data_export = []

for word in SELECTED_PERSISTENT_WORDS:
    # Words absent from the frequency matrix are left out of the export
    if word in decade_freq_df.index:
        # float() converts NumPy scalars to plain Python floats; otherwise
        # NumPy 2 prints them as np.float64(...), which cannot be pasted into JSON.
        rates = [float(round(decade_freq_df.loc[word, d], 1)) for d in decades_present]
        word_entry = {"word": word, "rates": rates}
        word_data_export.append(word_entry)
        # json.dumps guarantees the printed snippet is valid JSON
        print(json.dumps(word_entry))

print(f'\n"decades": {json.dumps(decades_present)}')

# Also export to JSON file
persistent_export = {
    'description': 'Words that appeared consistently across all decades of budget speeches',
    'metric': 'mentions per 10,000 words',
    'decades': decades_present,
    'word_data': word_data_export
}

# Explicit encoding keeps the output independent of the platform default
with open('persistent_words_data.json', 'w', encoding='utf-8') as f:
    json.dump(persistent_export, f, indent=2)

print("\n\nExported to persistent_words_data.json")