In [None]:
import os
import random
import re
import time
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so that the SAVE_FOLDER paths used by
# the scraping cells (which live under /content/drive/MyDrive) can be written.
# `drive` comes from `google.colab`, so this cell only runs inside Colab.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
def extract_trump_statements(text):
    """Return only the transcript lines attributed to Trump.

    The transcript is processed line by line (each line is stripped first):

    * A line starting with one of the Trump speaker labels (matched
      case-insensitively) switches capturing on; the label is removed and the
      remainder of the line is kept.
    * A line starting with any other "Name:" / "First Last:" style label
      switches capturing off and is dropped.
    * Any other line is kept only while capturing is on.

    Capturing starts switched on, so text that appears before the first
    speaker label (or a transcript with no labels at all) is kept.

    Args:
        text: Full transcript as a single string with newline-separated lines.

    Returns:
        The kept lines joined with newlines, with surrounding whitespace
        stripped.
    """
    trump_label = re.compile(
        r'^(PRESIDENT TRUMP:|THE PRESIDENT:|Donald Trump:|TRUMP:|President Trump:|Mr. Trump:|President Donald J. Trump:)',
        re.IGNORECASE,
    )
    # One capitalised word, optionally followed by a second word, then a colon.
    other_label = re.compile(r'^[A-Z][A-Za-z]+(?:\s[A-Za-z]+)?:')

    kept = []
    trump_is_speaking = True  # unlabeled leading text counts as Trump's

    for raw_line in text.split("\n"):
        stripped = raw_line.strip()

        label = trump_label.match(stripped)
        if label:
            # Trump's turn begins: keep whatever follows the label.
            trump_is_speaking = True
            kept.append(stripped[label.end():].strip())
            continue

        if other_label.match(stripped):
            # Someone else's turn begins: drop this line and what follows.
            trump_is_speaking = False
            continue

        if trump_is_speaking:
            kept.append(stripped)

    return "\n".join(kept).strip()


def remove_questions(transcript):
    """Remove every line that starts with ``Question.`` from a transcript.

    Args:
        transcript: Transcript text with newline-separated lines.

    Returns:
        The transcript without the ``Question.`` lines, with surrounding
        whitespace stripped.
    """
    # The trailing newline is optional so that a "Question." line at the very
    # end of the text (no newline after it) is removed as well.
    cleaned_transcript = re.sub(r'(?m)^Question\..*\n?', '', transcript)
    return cleaned_transcript.strip()


In [None]:

# def extract_trump_statements(text):
#     """Extracts only statements made by Trump in an interview, ignoring other speakers like audience members."""
#     out_lines = []

#     lines = text.split("\n")
#     for line in lines:
#         line = line.strip()  # Remove extra spaces

#         if re.match(r'^(THE PRESIDENT:|Donald Trump:|DONALD TRUMP:|TRUMP:|Trump:|President Trump:|Mr. Trump:)', line, re.IGNORECASE):
#             line = re.sub(r'^(THE PRESIDENT:|Donald Trump:|DONALD TRUMP:|TRUMP:|Trump:|President Trump:|Mr. Trump:)', "", line, flags=re.IGNORECASE).strip()
#             out_lines.append(line)

#         elif re.match(r'^[A-Za-z\s]+:\s*', line):
#             continue
#         else:
#             out_lines.append(line)

#     return "\n".join(out_lines).strip()


# Output folder on the mounted Google Drive for the transcripts saved by
# scrape_page; it is created here if it does not exist yet.
# NOTE: the "separate links" cell further down rebinds SAVE_FOLDER, so re-run
# this cell before scraping again in the same session.
SAVE_FOLDER = "/content/drive/MyDrive/TFG/PRE_ELEC_SCRIPTS"
os.makedirs(SAVE_FOLDER, exist_ok=True)

# Site root used to turn the hrefs found on the site into absolute URLs.
BASE_URL = "https://www.presidency.ucsb.edu"
# First listing page handed to the scraping loop.
start_url = "https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/documents-related-to-presidential-elections/2016/report/200301/all/63"

def scrape_page(url, stop_year=2017, min_words=450, timeout=30):
    """Scrape one archive listing page and save each qualifying transcript.

    For every data row of the first table on the page, the linked document is
    downloaded, reduced to Trump's statements (``extract_trump_statements``),
    stripped of ``Question.`` lines (``remove_questions``) and written to
    ``SAVE_FOLDER`` as ``<sanitised title>.txt``.

    A row is skipped when it has fewer than three cells, has no usable title
    link, has "press release" in its title, has a date that does not end in a
    four-digit year, cannot be downloaded, or yields fewer than ``min_words``
    words after cleaning.

    Side effects:
        * Increments the global ``count`` for every file written.
        * Sets the global ``stop_scraping`` to True once a document dated
          ``stop_year`` or later is reached.
        * Sleeps 2-5 seconds before each document request.

    Args:
        url: URL of the listing page.
        stop_year: First year that is out of scope; reaching a document from
            this year or later stops the scrape.
        min_words: Minimum number of words a cleaned transcript must have to
            be saved.
        timeout: Timeout in seconds for each HTTP request.

    Returns:
        The parsed listing page (``BeautifulSoup``), or ``None`` when the page
        could not be fetched or contains no table.
    """
    global count
    global stop_scraping
    print(f"Fetching page: {url}")

    # A timeout keeps a stalled connection from hanging the whole run, and a
    # network error is reported like any other failed fetch instead of raising.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error fetching page: {url} ({exc})")
        return None
    if response.status_code != 200:
        print(f"Error fetching page: {url}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table")
    if not table:
        print("No table found on the page.")
        return None

    # The first <tr> is skipped (presumably the header row).
    for row in table.find_all("tr")[1:]:
        if stop_scraping:
            break

        columns = row.find_all("td")
        if len(columns) < 3:
            continue

        title_link = columns[0].find("a")
        if not title_link or not title_link.get("href"):
            continue

        title = title_link.text.strip()
        if "press release" in title.lower():
            print(f"Skipping {title} - Contains 'press release'.")
            continue

        date = columns[1].text.strip()
        link = urljoin(BASE_URL, title_link["href"])

        # The year is read from the last four characters of the date cell
        # (e.g. "Jun 16, 2015"); rows without one are skipped, not fatal.
        year_match = re.search(r"\d{4}$", date)
        if not year_match:
            print(f"Skipping {title} - Could not read a year from date '{date}'.")
            continue
        year = int(year_match.group())
        if year >= stop_year:
            print(f"Reached documents from {year}. Stopping with {count} documents obtained.")
            stop_scraping = True
            break

        print(f"Fetching document: {title} ({date})")
        time.sleep(random.uniform(2, 5))  # be polite to the server

        try:
            doc_response = requests.get(link, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to fetch document: {title} - {link} ({exc})")
            continue
        if doc_response.status_code != 200:
            print(f"Failed to fetch document: {title} - {link}")
            continue

        doc_soup = BeautifulSoup(doc_response.text, "html.parser")
        content_div = doc_soup.find("div", {"class": "field-docs-content"})
        # The placeholder is far below min_words, so a page without the
        # content div ends up skipped by the word-count check below.
        text = content_div.get_text().strip() if content_div else "Content not found."

        extracted_text = extract_trump_statements(text)
        script = remove_questions(extracted_text)

        word_count = len(script.split())
        if word_count < min_words:
            print(f"Skipping {title} ({date}) - Only {word_count} words.")
            continue

        # NOTE(review): the filename depends on the title only, so two
        # documents with the same title overwrite each other.
        clean_title = re.sub(r"[^\w\-_]", "_", title)
        filename = f"{SAVE_FOLDER}/{clean_title}.txt"

        with open(filename, "w", encoding="utf-8") as file:
            file.write(script)

        print(f"Saved to Google Drive: {filename} ({word_count} words)\n")
        count += 1

    return soup

def get_next_page(soup):
    """Return the absolute URL of the pager's "next ›" link.

    Args:
        soup: Parsed listing page (``BeautifulSoup``).

    Returns:
        The URL of the next listing page, or ``None`` when the page has no
        "next ›" link (or the link carries no ``href``).
    """
    # `string=` is the current name of the keyword; the older `text=` spelling
    # is deprecated in BeautifulSoup and emits a DeprecationWarning.
    next_link = soup.find("a", string="next ›")
    if next_link and next_link.get("href"):
        return urljoin(BASE_URL, next_link["href"])
    return None

# Reset the module-level state that scrape_page reads and updates.
count = 0
stop_scraping = False
current_url = start_url

# Walk the listing pages one after another until there is no next page,
# a page cannot be scraped, or scrape_page raises the stop flag.
while current_url and not stop_scraping:
    soup = scrape_page(current_url)
    if not soup:
        break
    current_url = get_next_page(soup)


Fetching page: https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/documents-related-to-presidential-elections/2016/report/200301/all/63
Fetching document: Remarks Announcing Candidacy for President in New York City (Jun 16, 2015)
Saved to Google Drive: /content/drive/MyDrive/TFG/PRE_ELEC_SCRIPTS/Remarks_Announcing_Candidacy_for_President_in_New_York_City.txt (6411 words)

Skipping Trump Campaign Press Release - Donald Trump: Obama Is A Horrible Negotiator "We Got Traitor Bergdahl, They Got 5 Killer Terrorists" - Contains 'press release'.
Skipping Trump Campaign Press Release - Trump on Hillary: I Was Watching Her Talk About Income Inequality...Have You Looked at Her Donor List? - Contains 'press release'.
Skipping Trump Campaign Press Release - Donald J. Trump Declares Candidacy for President of the United States - Contains 'press release'.
Skipping Trump Campaign Press Release - Donald Trump: I Would Build a Great, Great Wall on Our Southern Border and 

  next_link = soup.find("a", text="next ›")


Skipping Trump Campaign Press Release - Donald J. Trump Announces State Directors in Florida, Texas and Virginia - Contains 'press release'.
Skipping Trump Campaign Press Release - Trump's Campaign Manager Opens Up About Strategy - Contains 'press release'.
Skipping Trump Campaign Press Release - Donald J. Trump Announces State Directors in Alabama and Illinois - Contains 'press release'.
Skipping Trump Campaign Press Release - Donald J. Trump Receives Endorsement of Iowa State Senator Brad Zaun - Contains 'press release'.
Skipping Trump Campaign Press Release - Donald J. Trump Receives Endorsements from Oklahoma Leaders - Contains 'press release'.
Skipping Trump Campaign Press Release - Donald J. Trump Calls on All Presidential Candidates to Return Dark Money Sent to Super PAC's - Contains 'press release'.
Skipping Trump Campaign Press Release - Donald J. Trump Officially Qualifies for Ballot in Nevada and Kentucky - Contains 'press release'.
Skipping Trump Campaign Press Release - Io

In [None]:
# CODE FOR SEPARATE LINKS

# Set folder path in Google Drive
# NOTE: this rebinds the same global SAVE_FOLDER that scrape_page writes to;
# after running this cell, scrape_page would also save into SEPARATE_DOCS
# until the earlier configuration cell is run again.
SAVE_FOLDER = "/content/drive/MyDrive/TFG/SEPARATE_DOCS"
os.makedirs(SAVE_FOLDER, exist_ok=True)


def fetch_document(url, timeout=30):
    """Download one document page and save Trump's statements from it.

    The page text is reduced with ``extract_trump_statements`` and
    ``remove_questions`` and written to ``SAVE_FOLDER`` as
    ``<sanitised title>.txt``. Nothing is written when the page cannot be
    fetched or has no ``field-docs-content`` div.

    The title is taken from ``<h1 class="title">``. When that element is
    missing or empty, the last path segment of ``url`` is used instead, so
    different documents do not all end up in the same "Untitled" file.

    Args:
        url: URL of the document page.
        timeout: Timeout in seconds for the HTTP request.

    Returns:
        None.
    """
    print(f"Fetching document: {url}")

    # A timeout keeps a stalled connection from hanging the cell, and a
    # network error is reported like any other failed fetch instead of raising.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error fetching document: {url} ({exc})")
        return
    if response.status_code != 200:
        print(f"Error fetching document: {url}")
        return

    soup = BeautifulSoup(response.text, "html.parser")

    title_tag = soup.find("h1", class_="title")
    title = title_tag.text.strip() if title_tag else ""
    if not title:
        # Fall back to the URL slug, which is unique per document.
        slug = urlparse(url).path.rstrip("/").rsplit("/", 1)[-1]
        title = slug or "Untitled Document3"

    content_div = soup.find("div", {"class": "field-docs-content"})
    if not content_div:
        # Do not save a placeholder string as if it were a transcript.
        print(f"Content not found, nothing saved: {url}")
        return
    text = content_div.get_text().strip()

    extracted_text = extract_trump_statements(text)
    script = remove_questions(extracted_text)

    word_count = len(script.split())

    clean_title = re.sub(r"[^\w\-_]", "_", title)
    filename = f"{SAVE_FOLDER}/{clean_title}.txt"

    with open(filename, "w", encoding="utf-8") as file:
        file.write(script)

    print(f"Saved to Google Drive: {filename} ({word_count} words)\n")

# Example usage: fetch a single document by its URL and save the cleaned
# transcript under the current SAVE_FOLDER.
single_document_url = "https://www.presidency.ucsb.edu/documents/statement-donald-j-trump-response-hillary-clinton-0"
fetch_document(single_document_url)

Fetching document: https://www.presidency.ucsb.edu/documents/statement-donald-j-trump-response-hillary-clinton-0
Saved to Google Drive: /content/drive/MyDrive/TFG/SEPARATE_DOCS/Untitled_Document3.txt (237 words)

