1. Updating Eliza

In [1]:
import re

user_input_history = []

def reflect(fragment):
    """Reflects user input to make responses more natural."""
    reflections = {
        "am": "are",
        "was": "were",
        "i": "you",
        "i'd": "you would",
        "i've": "you have",
        "i'll": "you will",
        "my": "your",
        "are": "am",
        "you've": "I have",
        "you'll": "I will",
        "your": "my",
        "yours": "mine",
        "you": "me",
        "me": "you"
    }
    words = fragment.lower().split()
    return ' '.join([reflections.get(word, word) for word in words])

def is_repeated_input(user_input):
    """Check if user input has been asked before."""
    user_input_normalized = user_input.lower().strip()
    if user_input_normalized in user_input_history:
        return True
    user_input_history.append(user_input_normalized)
    return False

def eliza_response(user_input):
    """Generates ELIZA-style responses based on input."""
    
    # Check if input is repeated - if so, respond sarcastically
    if is_repeated_input(user_input):
        return "Didn't we already talk about this? I think you're stuck on repeat! ngani"
    
    patterns = [
        (r"I need (.*)", "Why do you need {0}?"),
        (r"Why don't you (.*)", "Do you really think I don't {0}?"),
        (r"I feel (.*)", "Tell me more about feeling {0}."),
        # New patterns - a. depressed all the time
        (r"I want to know the reasons? why I am? feeling depressed all the time", "Depression is a serious matter. Have you considered talking to a professional about these feelings?"),
        # New patterns - b. feeling stressed
        (r"I am? feeling stressed", "Stress can be overwhelming. What specific situation is causing you stress?"),
        # New patterns - c. crush feelings invalidated
        (r"My feelings towards? my crush are? invalidated", "Your feelings are valid and deserve to be respected. Why do you think they're being dismissed?"),
        # New patterns - d. you don't understand me
        (r"You do(n't| not) understand me", "I'm sorry you feel that way. Help me understand what you're going through."),
        # New patterns - e. can't focus on studies
        (r"I ca(n't| not) focus on my studies", "Difficulty focusing can be frustrating. What's preventing you from concentrating?")
    ]
    
    for pattern, response in patterns:
        match = re.match(pattern, user_input, re.IGNORECASE)
        # print(match)
        if match:
            # Only print captured group if it exists (for patterns with groups)
            if match.groups():
                print(match.group(1))  # captures the substring after the pattern
            return response.format(reflect(match.group(1))) if match.groups() else response
    
    return "Can you tell me more?"

print("ELIZA: Hello! How can I help you today?")
print("(Type 'quit' or 'exit' to end the conversation, or press Ctrl+C to interrupt)\n")

try:
    while True:
        user_input = input("You: ")
        if user_input.lower() in ["quit", "exit"]:
            print("ELIZA: Goodbye!")
            break
        print(f"ELIZA: {eliza_response(user_input)}")
except KeyboardInterrupt:
    print("\n\nELIZA: Goodbye! (Interrupted)")

ELIZA: Hello! How can I help you today?
(Type 'quit' or 'exit' to end the conversation, or press Ctrl+C to interrupt)

shit
ELIZA: Tell me more about feeling shit.
ELIZA: Can you tell me more?
ELIZA: Can you tell me more?
sleep
ELIZA: Why do you need sleep?
ELIZA: Stress can be overwhelming. What specific situation is causing you stress?
ELIZA: Your feelings are valid and deserve to be respected. Why do you think they're being dismissed?
ELIZA: Goodbye!


2. Implementing RegEx on NLP

a. Extract all of the words starting with an upper case letter
from the text:

In [2]:
import re

# Text from Alice in Wonderland
text = """Alice was beginning to get very tired of sitting by her sister on the bank,
and of having nothing to do. Once or twice she had peeped into the book
her sister was reading, but it had no pictures or conversations in it, "and
what is the use of a book," thought Alice, "without pictures or
conversations?"""

# RegEx pattern to extract words starting with uppercase letter
# \b[A-Z]\w* matches:
# \b - word boundary
# [A-Z] - uppercase letter at the start
# \w* - followed by zero or more word characters (letters, digits, underscore)
pattern = r'\b[A-Z]\w*'

# Extract all matches
uppercase_words = re.findall(pattern, text)

print("RegEx Pattern: r'\\b[A-Z]\\w*'")
print("\nWords starting with uppercase letter:")
print(uppercase_words)
print(f"\nTotal count: {len(uppercase_words)}")
print(f"Unique words: {set(uppercase_words)}")

RegEx Pattern: r'\b[A-Z]\w*'

Words starting with uppercase letter:
['Alice', 'Once', 'Alice']

Total count: 3
Unique words: {'Alice', 'Once'}


b. Read the “melville-moby_dick.txt” into a Python program 
and extract all of the instances of the word Whale, Whales, whale 
and whales in said source. Then replace the first 10 instances with the word “leviathan”.

In [3]:
import re
from pathlib import Path

# --- Locate and read the source text ---
def find_moby_dick_path() -> Path:
    candidates = [
        Path("melville-moby_dick.txt"),
        Path("BSCS 3A") / "STEVEN_KEN_PONTILLAS" / "melville-moby_dick.txt",
    ]
    for p in candidates:
        if p.exists():
            return p.resolve()

    # Fallback: search within the current working directory
    for p in Path.cwd().rglob("melville-moby_dick.txt"):
        return p.resolve()

    raise FileNotFoundError("Could not find melville-moby_dick.txt")

txt_path = find_moby_dick_path()
raw_text = txt_path.read_text(encoding="utf-8", errors="replace")

# Normalize newlines so paragraph detection is consistent
text = raw_text.replace("\r\n", "\n")
print(f"Loaded: {txt_path}")

# --- RegEx pattern ---
# Matches: Whale, Whales, whale, whales (whole words only)
pattern = r"\bwhales?\b"
print(f"RegEx pattern: r'{pattern}' with re.IGNORECASE")

# Extract all match objects before replacement (to locate the 10th)
match_iter = list(re.finditer(pattern, text, flags=re.IGNORECASE))
print(f"Total matches found: {len(match_iter):,}")

# Replace the first 10 instances with "leviathan"
replaced_text, replacements_made = re.subn(
    pattern,
    "leviathan",
    text,
    count=10,
    flags=re.IGNORECASE,
)
print(f"Replacements made: {replacements_made}\n")

def paragraph_bounds(full_text: str, index_start: int, index_end: int) -> tuple[int, int]:
    """Return (start, end) indices of the paragraph containing [index_start, index_end)."""
    start = full_text.rfind("\n\n", 0, index_start)
    start = 0 if start == -1 else (start + 2)
    end = full_text.find("\n\n", index_end)
    end = len(full_text) if end == -1 else end
    return start, end

def print_paragraph(label: str, s: str, max_chars: int = 10_000) -> None:
    """Print the full paragraph unless it's extremely large."""
    print(label)
    if len(s) <= max_chars:
        print(s)
        return
    head = 4000
    tail = 4000
    print(f"(Paragraph is {len(s):,} chars; showing first {head:,} and last {tail:,} chars)\n")
    print(s[:head])
    print("\n...\n")
    print(s[-tail:])

if len(match_iter) < 10:
    print("Less than 10 matches were found.")
    if match_iter:
        m = match_iter[0]
        p_start, p_end = paragraph_bounds(text, m.start(), m.end())
        paragraph_full = replaced_text[p_start:p_end].strip()
        print_paragraph("\nParagraph containing the 1st instance (after replacement):\n", paragraph_full)
else:
    tenth = match_iter[9]  # 10th match (0-based index, from original text)
    p_start, p_end = paragraph_bounds(text, tenth.start(), tenth.end())

    # Show the paragraph up to the 10th match from the REPLACED text
    paragraph_up_to_10th = replaced_text[p_start:tenth.end()].strip()
    print_paragraph("Paragraph up to the 10th instance (with replacements):\n", paragraph_up_to_10th)

Loaded: C:\Users\acer\Downloads\Github\CCS-249_25-26_Activities\BSCS 3A\STEVEN_KEN_PONTILLAS\melville-moby_dick.txt
RegEx pattern: r'\bwhales?\b' with re.IGNORECASE
Total matches found: 1,492
Replacements made: 10

Paragraph up to the 10th instance (with replacements):

The Project Gutenberg eBook of Moby Dick; Or, The leviathan This ebook is for the use of anyone
anywhere in the United States and most other parts of the world at no cost and with almost no
restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project
Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not
located in the United States, you will have to check the laws of the country where you are
located before using this eBook. Title: Moby Dick; Or, The leviathan Author: Herman Melville
Release date: July 1, 2001 [eBook #2701] Most recently updated: January 19, 2025 Language:
English Credits: Daniel Lazarus, Jonesey, and David Widger *** START OF THE


c. Extract Jack Sparrow's Lines from pirates.txt
Using NLTK's webtext corpus to extract all lines spoken by Jack Sparrow using RegEx pattern matching.

In [4]:
# Install NLTK package
import sys
!{sys.executable} -m pip install nltk -q

In [5]:
# Download the webtext corpus
import nltk
nltk.download('webtext')

[nltk_data] Downloading package webtext to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\webtext.zip.


True

In [6]:
import re
from nltk.corpus import webtext

# Load pirates.txt
pirates_text = webtext.raw('pirates.txt')

# RegEx pattern to extract Jack Sparrow's lines
# Pattern explanation:
# JACK\s*SPARROW\s*: - Matches "JACK SPARROW:" with optional whitespace
# \s* - Zero or more whitespace after the colon
# (.*?) - Non-greedy capture group for the dialogue content
# (?=\n[A-Z\[]) - Lookahead: stops at newline followed by uppercase letter or '[' (next speaker/action)
# (?=\n*$) - OR stops at the end of text
pattern = r'JACK\s*SPARROW\s*:\s*(.*?)(?=\n[A-Z\[]|\n*$)'

# Extract all Jack Sparrow's lines
jack_lines = re.findall(pattern, pirates_text, re.DOTALL | re.IGNORECASE)

# Clean up the lines (remove extra whitespace and newlines)
jack_lines_cleaned = [re.sub(r'\s+', ' ', line.strip()) for line in jack_lines if line.strip()]

print(f"RegEx Pattern: r'JACK\\s*SPARROW\\s*:\\s*(.*?)(?=\\n[A-Z\\[]|\\n*$)'")
print(f"\nTotal lines spoken by Jack Sparrow: {len(jack_lines_cleaned)}\n")
print("=" * 70)
print("JACK SPARROW'S LINES:")
print("=" * 70)

for i, line in enumerate(jack_lines_cleaned, 1):
    print(f"\n{i}. {line}")

RegEx Pattern: r'JACK\s*SPARROW\s*:\s*(.*?)(?=\n[A-Z\[]|\n*$)'

Total lines spoken by Jack Sparrow: 193

JACK SPARROW'S LINES:

1. Sorry, mate.

2. Mind if we make a little side trip? I didn't think so.

3. Complications arose, ensued, were overcome.

4. Mm-hmm!

5. Shiny?

6. Is that how you're all feeling, then? Perhaps dear old Jack is not serving your best interests as captain?

7. What did the bird say?

8. Ohhh!

9. It does me.

10. No! Much more better. It is a *drawing* of a key.

11. Gentlemen, what do keys do?

12. No! If we don't have the key, we can't open whatever it is we don't have that it unlocks. So what purpose would be served in finding whatever need be unlocked, which we don't have, without first having found the key what unlocks it?

13. You're not making any sense at all. Any more questions?

14. Hah! A heading. Set sail in a... mmm... a general... in *that* way - direction.

15. Come on, snap to and make sail, you know how this works. Come on, oy/quick, oy/quick,