In [10]:
# To extract the specified information from text, we can use regular expressions.
# Here is a Python function that takes a string input and returns the reformatted information.

import re

def extract_info(text):
    # Define a regular expression pattern to capture the required elements
    # The pattern captures a name followed by descriptors and a birthday date.
    pattern = re.compile(
        r'(?P<name>[A-Z][a-z]+)\s+is\s+a\s+(?P<attitude>[a-z]+\s)?(?P<species>[a-z]+\s[villager]+).*?'
        r'(?P<name>\1)\s+has\s+the\s+(?P<hobby>[a-z]+\s[hobby]+).*?'
        r'birthday\s+is\s+(?P<birthday>[A-Z][a-z]+\s+\d+)'
    )
    
    # Find all matches in the text
    matches = pattern.finditer(text)
    
    # Extract the information and reformat it
    results = []
    for match in matches:
        name = match.group('name')
        birthday = match.group('birthday')
        hobby = match.group('hobby').replace(' hobby', '')
        species = match.group('species').replace(' villager', '')
        results.append(f"{name} - {birthday} - {hobby.capitalize()} Hobby - {species.capitalize()}")
    
    return results

# Example text
text = """
Raymond is a smug cat villager in the Animal Crossing series. Raymond has the nature hobby. His birthday is October 1.
Shino is a peppy deer villager. Shino has the education hobby . Her birthday is October 31.
Bob is a lazy cat villager. Bob was the first villager created, and his January 1 birthday came as a result of that. Bob has the play hobby.
Marshal is a smug squirrel villager. Marshal has the music hobby and may sing anywhere without the need of a stereo. His birthday is September 29.
Marty is a lazy bear cub villager . Marty has the play hobby. His birthday is April 16 .
"""

# Call the function with the example text
extract_info(text)


error: redefinition of group name 'name' as group 4; was group 1 at position 95

In [11]:
# Correcting the regular expression to avoid redefinition of group names
import re

def extract_info(text):
    # Define a regular expression pattern to capture the required elements
    # The pattern captures a name, species, hobby, and birthday
    pattern = re.compile(
        r'(?P<name>[A-Z][a-z]+)\s+is\s+a\s+(?P<attitude>[a-z]+\s)?(?P<species>[a-z]+\s[villager]+)\.?.*?'
        r'\1\s+has\s+the\s+(?P<hobby>[a-z]+)\s+hobby.*?'
        r'birthday\s+is\s+(?P<birthday>[A-Z][a-z]+\s+\d+)\.'
    )
    
    # Find all matches in the text
    matches = pattern.finditer(text)
    
    # Extract the information and reformat it
    results = []
    for match in matches:
        name = match.group('name')
        birthday = match.group('birthday')
        hobby = match.group('hobby').capitalize()
        species = match.group('species').replace(' villager', '').capitalize()
        results.append(f"{name} - {birthday} - {hobby} Hobby - {species}")
    
    return results

# Example text
text = """
Raymond is a smug cat villager in the Animal Crossing series. Raymond has the nature hobby. His birthday is October 1.
Shino is a peppy deer villager. Shino has the education hobby . Her birthday is October 31.
Bob is a lazy cat villager. Bob was the first villager created, and his January 1 birthday came as a result of that. Bob has the play hobby.
Marshal is a smug squirrel villager. Marshal has the music hobby and may sing anywhere without the need of a stereo. His birthday is September 29.
Marty is a lazy bear cub villager . Marty has the play hobby. His birthday is April 16 .
"""

# Call the function with the example text
extract_info(text)


['Raymond - October 1 - Nature Hobby - Cat',
 'Shino - October 31 - Education Hobby - Deer',
 'Marshal - September 29 - Music Hobby - Squirrel']

In [12]:
import re

def extract_villager_info(text):
    # This function will use regular expressions to extract the name, birthday, hobby, and species of villagers from the text.
    villager_info_pattern = re.compile(
        r"(?P<name>[A-Z][a-z]+)\s+is\s+a\s+(?P<attitude>[a-z]+\s)?(?P<species>[a-z]+\s)(villager).*?"
        r"(?P=name)\s+has\s+the\s+(?P<hobby>[a-z]+)\s+hobby.*?"
        r"birthday\s+is\s+(?P<birthday>[A-Z][a-z]+\s+\d+)\."
    )

    # Find all matches in the text and extract the details
    villagers = []
    for match in villager_info_pattern.finditer(text):
        name = match.group('name')
        birthday = match.group('birthday')
        hobby = match.group('hobby')
        species = match.group('species').strip()
        villagers.append(f"{name} - {birthday} - {hobby.capitalize()} Hobby - {species.capitalize()}")
    
    return villagers

# Sample text from the user
text_from_user = """
Raymond is a smug cat villager in the Animal Crossing series. Raymond has the nature hobby. His birthday is October 1.
Shino is a peppy deer villager. Shino has the education hobby . Her birthday is October 31.
Bob is a lazy cat villager. Bob was the first villager created, and his January 1 birthday came as a result of that. Bob has the play hobby.
Marshal is a smug squirrel villager. Marshal has the music hobby and may sing anywhere without the need of a stereo. His birthday is September 29.
Marty is a lazy bear cub villager . Marty has the play hobby. His birthday is April 16 .
"""

# Extracting the information from the given text
villager_details = extract_villager_info(text_from_user)
villager_details


['Raymond - October 1 - Nature Hobby - Cat',
 'Shino - October 31 - Education Hobby - Deer',
 'Marshal - September 29 - Music Hobby - Squirrel']

In [13]:
# It seems the pattern is still not capturing some entries. I will create a more robust pattern to match each line individually.
# This approach will process the text line by line, which can help to capture each entry more reliably.

def extract_villager_info_line_by_line(text):
    # Define a regular expression pattern to capture the required elements
    pattern = re.compile(
        r"(?P<name>[A-Z][a-z]+)\s+is\s+a\s+(?P<attitude>[a-z]+\s)?(?P<species>[a-z]+\s).*? "
        r"(?P=name)\s+has\s+the\s+(?P<hobby>[a-z]+)\s+hobby.*? "
        r"birthday\s+is\s+(?P<birthday>[A-Z][a-z]+\s+\d+)\."
    )
    
    # Split the text into lines
    lines = text.strip().split('\n')
    
    # Process each line individually
    results = []
    for line in lines:
        match = pattern.search(line)
        if match:
            name = match.group('name')
            birthday = match.group('birthday')
            hobby = match.group('hobby').capitalize()
            species = match.group('species').strip().capitalize().replace(" villager", "")
            results.append(f"{name} - {birthday} - {hobby} Hobby - {species}")
    
    return results

# Call the function with the example text
extract_villager_info_line_by_line(text)


['Raymond - October 1 - Nature Hobby - Cat',
 'Shino - October 31 - Education Hobby - Deer',
 'Marshal - September 29 - Music Hobby - Squirrel']

In [14]:
# Adjusting the regular expression to include a more robust pattern to capture Bob's entry as well.

def extract_villager_info_comprehensive(text):
    # Define a regular expression pattern to capture the required elements
    pattern = re.compile(
        r"(?P<name>[A-Z][a-z]+)\s+is\s+a\s+(?P<attitude>[a-z]+\s)?(?P<species>[a-z]+\s).*?"
        r"(?P=name)\s+has\s+the\s+(?P<hobby>[a-z]+)\s+hobby.*?"
        r"birthday\s+is\s+(?P<birthday>[A-Z][a-z]+\s+\d+)\."
    )
    
    # Process the text to match the pattern for each villager
    matches = pattern.finditer(text)
    
    # Extract the information and reformat it
    results = []
    for match in matches:
        name = match.group('name')
        birthday = match.group('birthday')
        hobby = match.group('hobby').capitalize()
        species = match.group('species').strip().capitalize().replace(" villager", "")
        results.append(f"{name} - {birthday} - {hobby} Hobby - {species}")
    
    return results

# Call the function with the example text
extract_villager_info_comprehensive(text)


['Raymond - October 1 - Nature Hobby - Cat',
 'Shino - October 31 - Education Hobby - Deer',
 'Marshal - September 29 - Music Hobby - Squirrel']

In [15]:
# Let's manually inspect Bob's entry and adjust the regular expression pattern to match it.

# Adjusting the regular expression pattern to account for potential differences in Bob's text entry.
def extract_bob_info(text):
    # Define a regular expression pattern to capture the required elements
    pattern = re.compile(
        r"(?P<name>[A-Z][a-z]+)\s+is\s+a\s+(?P<attitude>[a-z]+\s)?(?P<species>[a-z]+)\s+villager.*?"
        r"(?P=name)\s+was\s+the\s+first\s+villager\s+created,.*?"
        r"and\s+his\s+(?P<birthday>[A-Z][a-z]+\s+\d+)\s+birthday\s+came\s+as\s+a\s+result\s+of\s+that\.\s+"
        r"(?P=name)\s+has\s+the\s+(?P<hobby>[a-z]+)\s+hobby\."
    )
    
    # Find the match in the text
    match = pattern.search(text)
    
    # Extract the information and reformat it if a match is found
    if match:
        name = match.group('name')
        birthday = match.group('birthday')
        hobby = match.group('hobby').capitalize()
        species = match.group('species').strip().capitalize()
        return f"{name} - {birthday} - {hobby} Hobby - {species}"
    else:
        return "No match found for Bob's entry."

# Extracting Bob's information from the text
bob_info = extract_bob_info(text)
bob_info


'Bob - January 1 - Play Hobby - Cat'