# Regular Expressions
- Regular expressions (regex) are sequences of characters that form search patterns used for matching and manipulating strings.
- They are commonly used in text processing tasks to find, extract, replace, or split substrings within text based on specific patterns.

In [3]:
import re

In [4]:
# Sample sentence that the regex examples in the following cells operate on
my_string = "Let's write RegEx!"

In [5]:
# Pull out every run of one or more word characters (the "words")
re.compile(r"\w+").findall(my_string)

['Let', 's', 'write', 'RegEx']

In [6]:
# Pull out each word character on its own, one list entry per character
[hit.group() for hit in re.finditer(r"\w", my_string)]

['L', 'e', 't', 's', 'w', 'r', 'i', 't', 'e', 'R', 'e', 'g', 'E', 'x']

In [7]:
# Pull out every run of consecutive whitespace characters
re.findall(pattern=r"\s+", string=my_string)

[' ', ' ']

In [8]:
# Pull out each lowercase ASCII letter as its own match
re.compile(r"[a-z]").findall(my_string)

['e', 't', 's', 'w', 'r', 'i', 't', 'e', 'e', 'g', 'x']

#### Practicing regular expressions: re.split() and re.findall()
- The re.split() function in Python is used to split a string by the occurrences of a pattern.
- The re.findall() function in Python is used to find all occurrences of a pattern in a given string. It returns a list of all non-overlapping matches of the pattern in the string.

![q2](question2.png)

In [9]:
# Character class covering the sentence-ending marks . ? and !: sentence_endings
sentence_endings = r"[.?!]"
# Cut my_string at every sentence ending, then print the resulting pieces
print(re.compile(sentence_endings).split(my_string))

["Let's write RegEx", '']


In [10]:
# Find all capitalized words in my_string and print the result
# \b anchors the uppercase letter to the start of a word, and \w* (rather than
# \w+) also accepts one-letter capitalized words such as "I" or "A".
capitalized_words = r"\b[A-Z]\w*"
print(re.findall(capitalized_words, my_string))

['Let', 'RegEx']


In [11]:
# Split my_string on spaces and print the result
spaces = r"\s+"
# re.split cuts the string at each whitespace run; re.findall would only
# return the whitespace itself.
print(re.split(spaces, my_string))

["Let's", 'write', 'RegEx!']


In [12]:
# Collect every run of digits in my_string and print the list of matches
digits = r"\d+"
print([hit.group() for hit in re.finditer(digits, my_string)])

[]


# Tokenization
- Breaking text down into smaller pieces (tokens), such as sentences or words.

![Question3](Question3.png)

In [13]:
# Import necessary modules
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [14]:
# Short multi-sentence text used by the tokenization and re.search examples
scene_one = "Hello there! This is a test string. How are you today? I hope you are doing well."

In [15]:
# Split scene_one into sentences: sentences
# sent_tokenize returns a list with one string per detected sentence
sentences = sent_tokenize(scene_one)
sentences

['Hello there!',
 'This is a test string.',
 'How are you today?',
 'I hope you are doing well.']

In [16]:
# Use word_tokenize to tokenize the fourth sentence: tokenized_sent
# Index the list of sentences, not the raw string: scene_one[3] is only the
# single character "l".
tokenized_sent = word_tokenize(sentences[3])
tokenized_sent

['I', 'hope', 'you', 'are', 'doing', 'well', '.']

In [17]:
# Collect the distinct tokens found anywhere in scene_one: unique_tokens
unique_tokens = {token for token in word_tokenize(scene_one)}
unique_tokens

{'!',
 '.',
 '?',
 'Hello',
 'How',
 'I',
 'This',
 'a',
 'are',
 'doing',
 'hope',
 'is',
 'string',
 'test',
 'there',
 'today',
 'well',
 'you'}

![Question4.png](Question4.png)

In [22]:
# Look for the first "coconuts" in scene_one (search gives None when absent): match
match = re.compile("coconuts").search(scene_one)
match

In [24]:
# Locate the first "How" in scene_one and display the resulting match object
match = re.compile("How").search(scene_one)
match

<re.Match object; span=(36, 39), match='How'>

In [31]:
# Slice scene_one with the match's own offsets (instead of hard-coded indexes)
# to confirm that the reported span covers the matched text.
scene_one[match.start():match.end()]

'How'

In [33]:
# Show where the match begins and ends inside scene_one
print(*match.span())

36 39


![Question5](Question5.png)

In [40]:
# Rebind my_string to a sentence containing two bracketed spans for the bracket-matching examples
my_string = "Here is some text with [something] in brackets and [another thing] in brackets."

In [46]:
# Write a regular expression to search for anything in square brackets: pattern1
pattern1 = r"\[.*\]"
temp = re.findall(pattern1, my_string)
temp

['[something] in brackets and [another thing]']

In [47]:
# Use re.search to find the first text in square brackets
print(re.search(pattern1, my_string))

<re.Match object; span=(23, 66), match='[something] in brackets and [another thing]'>


![Question6.png](Question6.png)

In [48]:
# Find the script notation at the beginning of the fourth sentence and print it
# pattern2 matches a run of word/whitespace characters followed by a colon
# (for example a "SPEAKER:" prefix).
pattern2 = r"[\w\s]+:"
# Match against the fourth tokenized sentence rather than a single character
# of a string; that sentence has no such prefix, so this prints None.
print(re.match(pattern2, sentences[3]))

None


In [50]:
# Check the fourth sentence of scene_one (sentences[3]) for a leading "NAME:"
# prefix; indexing scene_one[3] would test only the single character "l".
print(re.match(pattern2, sentences[3]))

None
