In [None]:
import os
import re
import time
import random
import requests
from bs4 import BeautifulSoup
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the scraper's output folder
# (SAVE_FOLDER) lives under this path.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:

# Speaker labels that identify Trump. Literal dots are escaped so that e.g.
# "Mrs Trump:" is not mistaken for "Mr. Trump:". Matching is case-insensitive.
_TRUMP_LABEL_RE = re.compile(
    r'^(?:PRESIDENT TRUMP:|THE PRESIDENT:|Donald Trump:|TRUMP:'
    r'|Mr\. Trump:|President Donald J\. Trump:)',
    re.IGNORECASE,
)

# Any other speaker label: up to six words, each starting with a capital
# letter (e.g. "MR. SPICER:", "Vice President Pence:", "Q:"), followed by a
# colon. Requiring capitalised words keeps ordinary sentences that merely
# contain a colon ("We will follow two simple rules: ...") from being treated
# as a change of speaker.
_OTHER_SPEAKER_RE = re.compile(
    r"^(?:[A-Z][A-Za-z.'\-]*\s+){0,5}[A-Z][A-Za-z.'\-]*\s*:"
)


def extract_trump_statements(text):
    """Extract only the statements made by Trump from a speech transcript.

    The transcript is processed line by line:

    * A line starting with one of the Trump speaker labels switches capturing
      on; the label is removed and the rest of the line is kept.
    * A line starting with any other speaker label switches capturing off and
      is dropped.
    * Any other line is kept only while capturing is on.

    Capturing starts on, so a transcript without any speaker labels is
    returned in full (with each line stripped).

    Args:
        text: Transcript text with one paragraph per line.

    Returns:
        The kept lines joined with newlines, stripped of leading/trailing
        whitespace.
    """
    out_lines = []
    capturing = True  # Unlabelled leading text is attributed to Trump

    for raw_line in text.split("\n"):
        line = raw_line.strip()

        trump_label = _TRUMP_LABEL_RE.match(line)
        if trump_label:
            capturing = True
            # Keep whatever follows the label on the same line.
            out_lines.append(line[trump_label.end():].strip())
        elif _OTHER_SPEAKER_RE.match(line):
            # Someone else is speaking: drop this line and what follows
            # until the next Trump label.
            capturing = False
        elif capturing:
            out_lines.append(line)

    return "\n".join(out_lines).strip()


def remove_questions(transcript):
    """Remove reporter-question lines from a transcript.

    A question line is a line consisting of a lone ``Q``, or starting with
    ``Q`` followed by whitespace and the question text. The whole line is
    removed, including its trailing newline when there is one, so a question
    on the very last line is removed as well.

    Args:
        transcript: Transcript text with one paragraph per line.

    Returns:
        The transcript without question lines, stripped of leading/trailing
        whitespace.
    """
    # [^\S\n] is "whitespace except newline": the separator after Q must stay
    # on the same line, otherwise a bare "Q" line would swallow the next line.
    cleaned_transcript = re.sub(r'(?m)^Q(?:[^\S\n].*)?$\n?', '', transcript)

    return cleaned_transcript.strip()

In [None]:

# Set folder path in Google Drive
# (exist_ok=True makes re-running this cell harmless once the folder exists)
SAVE_FOLDER = "/content/drive/MyDrive/TFG/POST_ELEC_SCRIPTS"
os.makedirs(SAVE_FOLDER, exist_ok=True)

# Base URL
# NOTE(review): BASE_URL is not referenced by any live code visible in this
# notebook; article links are used exactly as they appear in the page HTML.
BASE_URL = "https://trumpwhitehouse.archives.gov"
# Listing page where the crawl starts; subsequent pages are reached by
# following each page's "pagination__prev" link.
start_url = "https://trumpwhitehouse.archives.gov/remarks/page/199/"

def _fetch_transcript_text(link, timeout=30):
    """Download one remarks document and return its paragraph text, or None on failure."""
    try:
        doc_response = requests.get(link, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch document: {link} ({exc})")
        return None
    if doc_response.status_code != 200:
        print(f"Failed to fetch document: {link}")
        return None

    doc_soup = BeautifulSoup(doc_response.text, "html.parser")
    content_div = doc_soup.find("div", {"class": "page-content__content editor"})
    if content_div is None:
        print("Content not found.")
        return None

    # One transcript paragraph per line.
    return "\n".join(p.get_text(strip=True) for p in content_div.find_all("p"))


def scrape_page(url, target_year=2017, max_documents=200, min_words=450, timeout=30):
    """Scrape one listing page and save the qualifying Trump transcripts.

    For every ``<article>`` on the page the document is downloaded, reduced to
    Trump's own statements, stripped of reporter questions and, when long
    enough, written to ``SAVE_FOLDER`` as ``<sanitised title>.txt``.

    Uses and updates the module-level globals ``count`` (number of documents
    saved so far) and ``stop_scraping`` (set to True once an article outside
    ``target_year`` is seen or ``max_documents`` have been saved).

    Args:
        url: URL of the listing page.
        target_year: Only articles dated in this year are processed; the
            first article from another year stops the whole crawl.
        max_documents: Stop the crawl once this many documents are saved.
        min_words: Transcripts with fewer words (after filtering) are skipped.
        timeout: Timeout in seconds for each HTTP request.

    Returns:
        The parsed listing page (BeautifulSoup), so the caller can locate the
        pagination link, or None if the page could not be fetched.
    """
    global count
    global stop_scraping
    print(f"Fetching page: {url}")

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error fetching page: {url} ({exc})")
        return None
    if response.status_code != 200:
        print(f"Error fetching page: {url}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    for article in soup.find_all("article"):
        if stop_scraping:
            break

        title_link = article.find("a")
        if not title_link:
            continue

        title = title_link.text.strip()
        link = title_link.get("href")
        if not link:
            print(f"Skipping {title} - no link found.")
            continue

        time_tag = article.find("time")
        if time_tag is None:
            print(f"Skipping {title} - no date found.")
            continue
        date = time_tag.text.strip()

        # The listing shows dates like "Jan 21, 2017": the year is the last 4 chars.
        try:
            year = int(date[-4:])
        except ValueError:
            print(f"Skipping {title} - unrecognised date: {date!r}")
            continue

        if year != target_year or count >= max_documents:
            print('Scraping stopped with',count, 'documents scraped until year', year)
            stop_scraping = True
            break

        print(f"Fetching document: {title} ({date})")
        if "trump" not in title.lower() and 'address' not in title.lower():
            print(f"Title does not contain 'trump': {title}")
            continue

        # Random pause between document requests to avoid hammering the server.
        time.sleep(random.uniform(2, 5))

        text = _fetch_transcript_text(link, timeout=timeout)
        if text is None:
            # Never fall through with a missing (or previous document's) text.
            continue

        # retaining trump-only text
        extracted_text = extract_trump_statements(text)

        # removing the questions
        script = remove_questions(extracted_text)

        word_count = len(script.split())
        if word_count < min_words:
            print(f"Skipping {title} ({date}) - Only {word_count} words.")
            continue

        # NOTE(review): the filename depends on the title only, so two documents
        # with the same title overwrite each other while count still increases.
        clean_title = re.sub(r"[^\w\-_]", "_", title)
        filename = f"{SAVE_FOLDER}/{clean_title}.txt"

        with open(filename, "w", encoding="utf-8") as file:
            file.write(f"{script}")

        print(f"Saved to Google Drive: {filename} ({word_count} words)\n")
        count += 1

    return soup

def get_previous_page(soup):
    """Return the URL behind the page's "previous" pagination link.

    Looks up the first ``<a>`` element with class ``pagination__prev`` in
    ``soup``. Returns its ``href`` value, or None when there is no such
    element or it carries no ``href`` attribute.
    """
    anchor = soup.find("a", class_="pagination__prev")
    if not anchor:
        return None
    if "href" not in anchor.attrs:
        return None
    return anchor["href"]

# Crawl state shared with scrape_page through module-level globals.
current_url = start_url
stop_scraping = False
count = 0

# Walk from the start page along the "previous" links until a page fails to
# load, there is no further link, or scrape_page raises the stop flag.
while current_url and not stop_scraping:
    soup = scrape_page(current_url)
    if not soup:
        break
    current_url = get_previous_page(soup)


Fetching page: https://trumpwhitehouse.archives.gov/remarks/page/199/
Fetching document: Remarks by President Trump and Vice President Pence at CIA Headquarters (Jan 21, 2017)
Saved to Google Drive: /content/drive/MyDrive/TFG/POST_ELEC_SCRIPTS/Remarks_by_President_Trump_and_Vice_President_Pence_at_CIA_Headquarters.txt (2395 words)

Fetching document: The Inaugural Address (Jan 20, 2017)
Skipping The Inaugural Address (Jan 20, 2017) - Only 405 words.
Fetching page: https://trumpwhitehouse.archives.gov/remarks/page/198/
Fetching document: Remarks by President Trump in Strategy and Policy Forum (Feb 3, 2017)
Saved to Google Drive: /content/drive/MyDrive/TFG/POST_ELEC_SCRIPTS/Remarks_by_President_Trump_in_Strategy_and_Policy_Forum.txt (1049 words)

Fetching document: Remarks by President Trump at Signing of Executive Order on Fiduciary Rule (Feb 3, 2017)
Skipping Remarks by President Trump at Signing of Executive Order on Fiduciary Rule (Feb 3, 2017) - Only 66 words.
Fetching document: Rem

## Extras: scratch cells (commented out, kept for reference)

In [None]:
# BASE_URL = "https://trumpwhitehouse.archives.gov"
# start_url = "https://trumpwhitehouse.archives.gov/remarks/page/199/"

# response = requests.get(start_url)
# soup = BeautifulSoup(response.text, "html.parser")
# articles = soup.find_all("article")

# i = 0
# for article in articles:
#   # if i<1:
#     title_link = article.find("a")
#     title = title_link.text.strip()
#     link = title_link["href"]
#     date = article.find("time").text.strip()
#     # i += 1



# doc_response = requests.get(link)
# doc_soup = BeautifulSoup(doc_response.text, "html.parser")
# content_div = doc_soup.find("div", {"class": "page-content__content editor"})
# paragraphs = content_div.find_all("p")  # Get all <p> elements
# useful_text = "\n".join(p.get_text(strip=True) for p in paragraphs)  # Join text content
# ut =  extract_trump_statements(useful_text)
# print(ut)
# # print(text)

In [None]:
# lines = useful_text.split("\n")

# out_lines = []
# capturing = True

# for line in lines:
#     line = line.strip()

#     # Stop capturing if "END" is found
#     if re.match(r'^END\d', line):
#         capturing = False
#         break  # Stop processing further lines

#     # print(line)
#     if re.match(r'^(PRESIDENT TRUMP:|THE PRESIDENT:|Donald Trump:|TRUMP:|President Trump:|Mr. Trump:)', line, re.IGNORECASE):
#         print(2)
#         capturing = True  # Start capturing Trump's speech
#         line = re.sub(r'^(PRESIDENT TRUMP:|THE PRESIDENT:|Donald Trump:|TRUMP:|President Trump:|Mr. Trump:)', "", line, flags=re.IGNORECASE).strip()
#         out_lines.append(line)

#     # Stop capturing if another speaker is detected
#     elif re.match(r'^[A-Z\s.]+[:]', line):
#         print(line)
#         print(0)
#         capturing = False

#     elif capturing:
#         # print(1)
#         out_lines.append(line)

# uts = "\n".join(out_lines).strip()
# print(uts)