<a href="https://colab.research.google.com/github/terraray2/Python_Automation/blob/main/Regex_Web_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
# BeautifulSoup: for parsing HTML content
# requests: for fetching web pages
# re: for regular expressions (used for text cleaning)
# pandas: for data manipulation and analysis
# google.colab.drive: to mount Google Drive (if running in Google Colab)

from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from google.colab import drive


# Source page hosting the text of MLK's "I Have a Dream" speech
url = 'http://www.analytictech.com/mb021/mlk.htm'

# Download the page. The explicit timeout keeps the notebook from hanging
# forever if the server never answers (requests sets no timeout by default).
page = requests.get(url, timeout=30)

# Stop on an HTTP error status (4xx/5xx) so that an error page is never
# parsed and counted as if it were the speech.
page.raise_for_status()

# Parse the downloaded HTML with Python's built-in 'html.parser' backend
soup = BeautifulSoup(page.text, 'html.parser')


# --- Pull the speech out of the parsed page ---
# Every <p> element in the document is collected
mlkj_speech = soup.find_all('p')

# Reduce each paragraph element to its text content, one list entry per <p>
speech_combined = [paragraph.get_text() for paragraph in mlkj_speech]

# Glue the paragraph texts together, separated by single spaces
string_speech = " ".join(speech_combined)

# --- Clean the speech text ---
# Replace Windows-style line breaks (\r\n) with spaces
string_speech_cleaned = string_speech.replace('\r\n', ' ')

# Strip punctuation and special characters: drop every character that is
# neither a word character (letter, digit, underscore) nor whitespace
speech_no_punt = re.sub(r'[^\w\s]', '', string_speech_cleaned)

# Convert the text to lowercase for case-insensitive analysis
speech_lower = speech_no_punt.lower()

# Split the text into words on runs of whitespace.
# str.split() with no arguments ignores leading/trailing whitespace, whereas
# re.split(r'\s+', ...) returns an empty string '' at either end in that case,
# which would then be counted as a bogus "word" in the frequency table.
speech_broken_out = speech_lower.split()


# --- Word frequency ---
# Load the word list into a single-column DataFrame (one row per word) ...
word_table = pd.DataFrame(speech_broken_out)

# ... then tally how many times each distinct word occurs.
# Note: `value_counts()` returns a pandas Series, despite the `df` name.
df = word_table.value_counts()


# --- Write the word counts out as CSV ---
# Make Google Drive available under /content/drive (Google Colab only)
drive.mount('/content/drive')

# Destination file inside the mounted Drive
output_csv = '/content/drive/My Drive/Data Analyst/MLKJ_Speech_Counts.csv'

# Export the counts:
#   index_label -> name of the column holding the words
#   header      -> name of the column holding the counts
df.to_csv(output_csv, index_label='Word', header=['Counts'])