# Exploring American Presidency Project Kaggle

In [None]:
import os 


# Switch the working directory; the relative file names used in the cells
# below are resolved against it.
# NOTE(review): hard-coded, machine-specific path - adjust before running elsewhere.
os.chdir(r"C:\Users\shime\Downloads")

# Print the working directory that is now in effect.
print(os.getcwd())

C:\Users\shime\Downloads
C:\Users\shime\Downloads


### We scraped the data using the following scraper 

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time
from pathlib import Path

# Site root; relative hrefs found on listing pages are appended to it.
BASE_URL = "https://www.presidency.ucsb.edu"
# Path of the "statements" category listing.
LIST_PATH = "/documents/app-categories/statements"
# Full listing URL; scrape_statements() adds "?page=N" to it.
LIST_URL = BASE_URL + LIST_PATH
# CSV file (relative to the working directory) that scraped rows are appended to.
OUTFILE = "presidential_statements.csv"
# HTTP headers sent with every request made by fetch().
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; Scraper/1.0)"}
TEMP_SAVE_EVERY = 100   # flush the CSV file to disk every N saved records
# Seconds to sleep after each saved document.
DELAY_BETWEEN_REQUESTS = 0.35

def fetch(url, timeout=15, tries=3):
    """Download *url* with a GET request, retrying on failure.

    Args:
        url: URL to request.
        timeout: Timeout in seconds handed to ``requests.get``.
        tries: Maximum number of attempts.

    Returns:
        The response of the first attempt that succeeds (no exception and
        ``raise_for_status()`` passes), or ``None`` if every attempt failed.
    """
    for attempt in range(1, tries + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=timeout)
            resp.raise_for_status()
        except Exception as e:
            # Deliberately broad: the scraper is best-effort and one bad URL
            # must not abort a long run.
            print(f"Fetch error ({attempt}/{tries}) for {url}: {e}")
            # Back off only when another attempt follows; sleeping after the
            # last failure would just delay returning None.
            if attempt < tries:
                time.sleep(1)
        else:
            return resp
    return None

def extract_detail_content(detail_url):
    """Fetch one document page and pull out its text fields.

    Args:
        detail_url: URL of the document's detail page.

    Returns:
        A ``(content, categories, citation)`` tuple of strings. Each element
        is ``""`` when its node is not found; all three are ``""`` when the
        page could not be fetched.
    """
    response = fetch(detail_url)
    if not response:
        return "", "", ""

    page = BeautifulSoup(response.text, "html.parser")

    # Main document body, one line per text fragment.
    body_node = page.select_one("div.field-docs-content")
    if body_node:
        content = body_node.get_text("\n", strip=True)
    else:
        content = ""

    # Category links (may be absent); joined into one comma-separated string.
    category_links = page.select("div.group-meta a, div.field-ds-filed-under- a, .field-ds-filed-under a")
    categories = ", ".join(link.get_text(strip=True) for link in category_links)

    # Citation block.
    citation_node = page.select_one(".field-prez-document-citation, .ucsbapp_citation")
    if citation_node:
        citation = citation_node.get_text(" ", strip=True)
    else:
        citation = ""

    return content, categories, citation

def read_existing_urls(outfile):
    """Collect the URLs already stored in a previously written CSV.

    Args:
        outfile: Path of the CSV file; it is read with ``csv.DictReader``, so
            its first row is taken as the column names.

    Returns:
        A set with the stripped, non-empty values of the ``url`` column. The
        set is empty when the file does not exist or reading it fails.
    """
    csv_path = Path(outfile)
    if not csv_path.exists():
        return set()

    seen = set()
    try:
        with csv_path.open("r", encoding="utf-8", newline="") as handle:
            for record in csv.DictReader(handle):
                url = record.get("url")
                if url:
                    seen.add(url.strip())
    except Exception as e:
        # Any read problem discards what was collected so far.
        print("Error reading existing CSV, will start fresh:", e)
        return set()
    return seen

def scrape_statements(max_pages=None):
    """Scrape the statements listing and append unseen documents to ``OUTFILE``.

    Listing pages are requested one after another starting at ``?page=0``.
    For every listed document whose URL is not already in the CSV, the detail
    page is fetched and one row is appended.

    Args:
        max_pages: Maximum number of listing pages to process. ``None`` means
            keep going until a listing page fails to load or has no items.

    Returns:
        The number of new rows written during this call.
    """
    existing_urls = read_existing_urls(OUTFILE)
    page = 0
    total_saved = 0

    # prepare CSV writer (append mode)
    headers = ["title", "url", "president", "date", "content", "categories", "citation"]
    outfile_path = Path(OUTFILE)
    # A missing file and an existing zero-byte file both still need a header.
    write_header = not outfile_path.exists() or outfile_path.stat().st_size == 0

    # The context manager closes the file on every exit path, including a
    # failure while writing the header.
    with open(outfile_path, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=headers,
            quoting=csv.QUOTE_ALL,
            escapechar='\\',
        )
        if write_header:
            writer.writeheader()

        while True:
            if max_pages is not None and page >= max_pages:
                print("Reached max_pages limit, stopping.")
                break

            page_url = f"{LIST_URL}?page={page}"
            resp = fetch(page_url)
            if not resp:
                print("Failed to fetch listing page:", page_url)
                break

            soup = BeautifulSoup(resp.text, "html.parser")

            # listing item containers for statements
            items = soup.select("div.views-row, div.node-teaser, div.node-documents.node-teaser")
            # keep only containers that hold a document link
            items = [it for it in items if it.select_one("a[href*='/documents/']")]

            if not items:
                print(f"No items found on page {page}. Stopping.")
                break

            num_items = len(items)
            print(f"Scraping page {page+1}: {num_items} statements (Total so far: {total_saved + num_items})")

            for item in items:
                # Title + link
                title_a = item.select_one(".field-title a, h3 a, a[href*='/documents/']")
                if not title_a:
                    print("Skipping item (no title link). snippet:", item.get_text(" ", strip=True)[:150])
                    continue
                title = title_a.get_text(strip=True)
                href = (title_a.get("href") or "").strip()
                if not href:
                    # Without a target the "document" URL would be the site
                    # root; skip instead of storing a junk row.
                    print("Skipping item (empty link). snippet:", item.get_text(" ", strip=True)[:150])
                    continue
                full_link = href if href.startswith("http") else BASE_URL + href

                # skip if already scraped (earlier run or earlier in this run)
                if full_link in existing_urls:
                    continue

                # president (the "Related" link on the right column)
                pres_a = item.select_one(".col-sm-4 a, .views-field-field-president a, .field-title ~ .col-sm-4 a")
                president = pres_a.get_text(strip=True) if pres_a else ""

                # date: prefer the "content" attribute, fall back to the visible text
                date_span = item.select_one("span.date-display-single, .views-field-field-docs-date span, .views-field-created span")
                date = date_span.get("content", date_span.get_text(strip=True)) if date_span else ""

                # fetch detail content
                content, categories, citation = extract_detail_content(full_link)

                writer.writerow({
                    "title": title,
                    "url": full_link,
                    "president": president,
                    "date": date,
                    "content": content,
                    "categories": categories,
                    "citation": citation,
                })
                existing_urls.add(full_link)
                total_saved += 1

                # periodic flush so an interrupted run keeps most of its work
                if total_saved % TEMP_SAVE_EVERY == 0:
                    csvfile.flush()
                    print(f"Checkpoint: saved {total_saved} records so far.")

                time.sleep(DELAY_BETWEEN_REQUESTS)

            page += 1
            # small delay between pages
            time.sleep(0.5)

    print("Finished. Total new records saved:", total_saved)
    return total_saved

if __name__ == "__main__":
    # Start the scrape when this code is executed directly.
    # Use max_pages=2 for a quick test; max_pages=None puts no cap on the
    # number of listing pages.
    scrape_statements(max_pages=None)


In [None]:
# The original scraper had issues with formatting columns, so the raw CSV is
# re-read here without trusting its first row and the column names are set
# explicitly.

import pandas as pd

df_final = pd.read_csv("presidential_statements.csv", header=None)

df_final.columns = [
    "title",
    "url",
    "president",
    "date",
    "content",
    "categories",
    "citation"
]

# The scraper writes a header row whenever it creates the file. With
# header=None that row arrives as an ordinary record, so drop it when the
# first row merely repeats the column names.
if len(df_final) and df_final.iloc[0].astype(str).tolist() == list(df_final.columns):
    df_final = df_final.iloc[1:].reset_index(drop=True)

df_final.to_csv("presidential_statements_scraped.csv", index=False)

# From here on, I work with the American Presidency Project presidential statements CSV

In [13]:
# Load the reformatted statements file
import pandas as pd

dataset = pd.read_csv("presidential_statements_scraped.csv")

# Quick inspection, one item per line: dimensions, first rows, column index
print(dataset.shape, dataset.head(), dataset.columns, sep="\n")

(12399, 7)
                                               title  \
0            Joint Statement on U.S.â€“Ukraine Meeting   
1  Statement on Signing the Epstein Files Transpa...   
2  Joint Statement on a Framework for a United St...   
3  Joint Statement on a Framework for United Stat...   
4  Joint Statement on a Framework for United Stat...   

                                                 url  \
0  https://www.presidency.ucsb.edu/documents/join...   
1  https://www.presidency.ucsb.edu/documents/stat...   
2  https://www.presidency.ucsb.edu/documents/join...   
3  https://www.presidency.ucsb.edu/documents/join...   
4  https://www.presidency.ucsb.edu/documents/join...   

                    president                       date  \
0  Donald J. Trump (2nd Term)  2025-11-23T00:00:00+00:00   
1  Donald J. Trump (2nd Term)  2025-11-19T00:00:00+00:00   
2  Donald J. Trump (2nd Term)  2025-11-14T00:00:00+00:00   
3  Donald J. Trump (2nd Term)  2025-11-13T00:00:00+00:00   
4  Donald J. 

### Adding features for future visualization 

In [14]:
#Add party affiliation based on president name
#Make copy of df to keep original intact
dataset_raw = dataset.copy()

# From this point on we manipulate `dataset` and keep `dataset_raw` as the original.

#Check dataset again
print(dataset.columns)

#Check presidents in dataset
print(dataset['president'].unique())
print(len(dataset['president'].unique()))

#Add party affiliation (keys must match the 'president' column exactly)
party_affiliation = {
    "Donald J. Trump (1st Term)": "Republican",
    "Donald J. Trump (2nd Term)": "Republican",
    "Joseph R. Biden, Jr.": "Democrat",
    "Barack Obama": "Democrat",
    "George W. Bush": "Republican",
    "William J. Clinton": "Democrat",
    "George Bush": "Republican",
    "Ronald Reagan": "Republican",
    "Jimmy Carter": "Democrat",
    "Gerald R. Ford": "Republican",
    "Richard Nixon": "Republican",
    "Lyndon B. Johnson": "Democrat",
    "John F. Kennedy": "Democrat",
    "Dwight D. Eisenhower": "Republican",
    "Harry S Truman": "Democrat",
    "Franklin D. Roosevelt": "Democrat",
    "Herbert Hoover": "Republican",
    "Calvin Coolidge": "Republican",
    "Warren G. Harding": "Republican",
    "Woodrow Wilson": "Democrat",
    "William Howard Taft": "Republican",
    "Theodore Roosevelt": "Republican",
    "William McKinley": "Republican",
    "Grover Cleveland": "Democrat",
    "Benjamin Harrison": "Republican",
    "Chester A. Arthur": "Republican",
    "James A. Garfield": "Republican",
    "Rutherford B. Hayes": "Republican",
    "Ulysses S. Grant": "Republican",
    "Andrew Johnson": "Democrat",
    "Abraham Lincoln": "Republican",
    "James Buchanan": "Democrat",
    "Franklin Pierce": "Democrat",
    "Millard Fillmore": "Whig",
    "Zachary Taylor": "Whig",
    "James K. Polk": "Democrat",
    "John Tyler": "Whig",
    # NOTE(review): confirm this spelling against the source data if this
    # president ever appears (possibly listed as "William Henry Harrison").
    "William Harrison": "Whig",
    "Martin Van Buren": "Democrat",
    "Andrew Jackson": "Democrat",
    "John Quincy Adams": "National Republican",
    "James Monroe": "Democrat-Republican",
    "James Madison": "Democrat-Republican",
    "Thomas Jefferson": "Democrat-Republican",
    "John Adams": "Federalist",
    "George Washington": "Federalist",
}

#Check how many presidents are listed in the mapping
print(len(party_affiliation))

#Map party affiliation to df; names missing from the mapping become NaN
dataset['party'] = dataset['president'].map(party_affiliation)

#Check if it correctly mapped
print(dataset[['president', 'party']].drop_duplicates().sort_values(by='president'))

# Prior to the two-party system we know, there were other parties such as Whig,
# Federalist, National Republican, and Democrat-Republican. We keep these as is for now.

#Remove Gavin Newsom speeches, if any, since he is not a president.
# .copy() makes the filtered frame independent, so adding columns to it later
# does not raise pandas' chained-assignment (SettingWithCopy) warning.
dataset = dataset[dataset['president'] != 'Gavin Newsom'].copy()

#Check dataset again to see if drop worked
"Gavin Newsom" in dataset['president'].unique()

Index(['title', 'url', 'president', 'date', 'content', 'categories',
       'citation'],
      dtype='object')
['Donald J. Trump (2nd Term)' 'Gavin Newsom' 'Joseph R. Biden, Jr.'
 'Donald J. Trump (1st Term)' 'Barack Obama' 'George W. Bush'
 'William J. Clinton' 'George Bush' 'Ronald Reagan' 'Jimmy Carter'
 'Gerald R. Ford' 'Richard Nixon' 'Lyndon B. Johnson' 'John F. Kennedy'
 'Dwight D. Eisenhower' 'Harry S Truman' 'Franklin D. Roosevelt'
 'Herbert Hoover' 'Calvin Coolidge' 'Warren G. Harding' 'Woodrow Wilson'
 'William Howard Taft' 'Theodore Roosevelt' 'William McKinley'
 'Grover Cleveland']
25
46
                        president       party
1890                 Barack Obama    Democrat
12383             Calvin Coolidge  Republican
1497   Donald J. Trump (1st Term)  Republican
0      Donald J. Trump (2nd Term)  Republican
10773        Dwight D. Eisenhower  Republican
11805       Franklin D. Roosevelt    Democrat
40                   Gavin Newsom         NaN
6601                  Ge

False

### Dataset is inspected, now we can preprocess the text

In [15]:
#Preprocess text

#Import necessary libraries
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords') # Download if needed
from nltk.tokenize import word_tokenize
nltk.download('punkt') #Download if needed
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet') #Download if needed

#Define stopwords
# Snapshot of the NLTK English stop words for preprocess(). It lives under its
# own name so that rebinding `stopwords` elsewhere cannot change what
# preprocess() filters.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
# Kept for backward compatibility: `stopwords` has always been rebound to the
# English stop-word set at this point.
stopwords = set(_ENGLISH_STOPWORDS)

# One shared lemmatizer instead of a new instance for every document.
_LEMMATIZER = WordNetLemmatizer()

#Create function to preprocess text
def preprocess(text):
    """Normalize one document into a space-separated string of lemmas.

    Steps: lowercase, strip URLs and HTML entities, replace everything except
    ``a-z``, whitespace and apostrophes with spaces, collapse whitespace,
    tokenize, drop English stop words, lemmatize.

    Args:
        text: Raw document text. Non-string values (for example the NaN that
            pandas uses for an empty CSV field) are treated as empty.

    Returns:
        The cleaned text; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()  # Lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'&[a-z]+;', ' ', text)  # Remove HTML entities
    # Remove punctuation, digits and special characters. No separate digit
    # pass is needed afterwards because digits are already gone here.
    text = re.sub(r"[^a-z\s']", ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    tokens = word_tokenize(text)  # Tokenization
    tokens = [word for word in tokens if word not in _ENGLISH_STOPWORDS]  # Remove stopwords
    tokens = [_LEMMATIZER.lemmatize(word) for word in tokens]  # Lemmatization
    return ' '.join(tokens)  # Join tokens back to string

#Apply to text
dataset['cleaned_content'] = dataset['content'].apply(preprocess)

#Check df
print(dataset[['content', 'cleaned_content']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shime\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shime\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shime\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                             content  \
0  On 23 November 2025, representatives of the Un...   
1  Jeffrey Epstein, who was charged by the Trump ...   
2  Today, the United States of America (United St...   
3  The United States of America (United States, o...   
4  President Donald J. Trump and President Daniel...   

                                     cleaned_content  
0  november representative united state ukraine m...  
1  jeffrey epstein charged trump justice departme...  
2  today united state america united state swiss ...  
3  united state america united state u republic e...  
4  president donald j trump president daniel nobo...  


In [None]:
# Topic modeling with LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Extra stop words: terms that dominated the topics in previous runs.
# NOTE(review): this rebinds the module-level name `stopwords` (previously the
# NLTK English set); anything that reads that global after this cell sees this
# short list instead.
stopwords = ['american', 'america', 'states', 'state', 'president']

# Bag-of-words counts over unigrams, bigrams and trigrams of the cleaned text
vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    stop_words=stopwords,
    min_df=2,
    max_df=0.95,
)
X = vectorizer.fit_transform(dataset['cleaned_content'])

# Fit a 7-topic LDA model (fixed seed for reproducibility)
lda = LatentDirichletAllocation(n_components=7, random_state=42).fit(X)

# Helper that prints the leading terms of each topic
def display_topics(model, feature_names, no_top_words):
    """Print every topic's index followed by its highest-weighted terms.

    Args:
        model: Fitted model exposing ``components_`` (one weight row per topic).
        feature_names: Sequence mapping a column index to its term.
        no_top_words: Number of terms to list per topic.

    For each topic two lines are printed: ``Topic <i>:`` and the terms joined
    by ``|``, ordered from highest to lowest weight.
    """
    for topic_idx, weights in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print("|".join(top_terms))

# Show the ten highest-weighted terms of every LDA topic
no_top_words = 10
feature_names = vectorizer.get_feature_names_out()
display_topics(lda, feature_names, no_top_words)

# Model-fit metrics on the training matrix X: perplexity, then the model's
# score. These are likelihood-based measures, not a topic-coherence metric.
print(lda.perplexity(X), lda.score(X), sep="\n")


Topic 0:
soviet state agreement country energy united world united state arm security
Topic 1:
including support country state global united security cooperation development international
Topic 2:
act provision section executive congress law authority national president branch
Topic 3:
american year today job business family america economy care work
Topic 4:
people american life world family nation year day state united
Topic 5:
president united state united state minister prime prime minister country cooperation security
Topic 6:
government year state congress federal people program new american work
Topic 7:
program health act american congress child law legislation today service
Topic 8:
united international state ukraine united state people support russia right country
Topic 9:
state united united state nuclear government weapon nation international president country
21783.700169002244
-37146336.044040665


In [None]:
#Attempting topic modeling with Gensim LDA
import gensim
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

# Gensim input: token lists, an id<->token dictionary, and a bag-of-words corpus
texts = [doc.split() for doc in dataset['cleaned_content']]
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

# Train a 10-topic LDA model (15 passes over the corpus, fixed seed)
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    passes=15,
    random_state=42,
)

# Print every topic with its term weights
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

# c_v topic coherence computed against the tokenized texts
coherence_model_lda = CoherenceModel(
    coherence='c_v',
    model=lda_model,
    texts=texts,
    dictionary=dictionary,
)
coherence_lda = coherence_model_lda.get_coherence()
print(f"Coherence Score: {coherence_lda}")

In [None]:
#Topic modeling again this time with NMF
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features of the cleaned text (note: `vectorizer` is rebound here)
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
X_tfidf = vectorizer.fit_transform(dataset['cleaned_content'])

# Factorize into 5 topics with a fixed seed
nmf = NMF(n_components=5, random_state=42).fit(X_tfidf)

# Print the ten highest-weighted terms per NMF topic
no_top_words = 10
feature_names = vectorizer.get_feature_names_out()
display_topics(nmf, feature_names, no_top_words)

In [None]:
#Visualizations for topic modeling

### First classifier