# Regular Expressions
- Regular expressions (regex) are sequences of characters that form search patterns used for matching and manipulating strings.
- They are commonly used in text processing tasks to find, extract, replace, or split substrings within text based on specific patterns.

In [1]:
import re

In [2]:
my_string = "Let's write RegEx!"

In [3]:
# Extract all words from a string
re.findall(r"\w+", my_string)

['Let', 's', 'write', 'RegEx']

In [4]:
# Extract all single word characters from a string
re.findall(r"\w", my_string)

['L', 'e', 't', 's', 'w', 'r', 'i', 't', 'e', 'R', 'e', 'g', 'E', 'x']

In [5]:
# Extract all sequences of whitespace characters from a string
re.findall(r"\s+", my_string)

[' ', ' ']

In [6]:
# Extract all individual lowercase letters from a string
re.findall(r"[a-z]", my_string)

['e', 't', 's', 'w', 'r', 'i', 't', 'e', 'e', 'g', 'x']

#### Practicing regular expressions: re.split() and re.findall()
- The re.split() function in Python is used to split a string by the occurrences of a pattern.
- The re.findall() function in Python is used to find all occurrences of a pattern in a given string. It returns a list of all non-overlapping matches of the pattern in the string.

![q2](question2.png)

In [7]:
# Write a pattern to match sentence endings: sentence_endings
sentence_endings = r"[.?!]"
# Split my_string on sentence endings and print the result
print(re.split(sentence_endings, my_string))

["Let's write RegEx", '']


In [8]:
# Find all capitalized words in my_string and print the result
capitalized_words = r"[A-Z]\w+"
print(re.findall(capitalized_words, my_string))

['Let', 'RegEx']


In [9]:
# Split my_string on spaces and print the result
spaces = r"\s+"
print(re.findall(spaces, my_string))

[' ', ' ']


In [10]:
# Find all digits in my_string and print the result
digits = r"\d+"
print(re.findall(digits, my_string))

[]


# Tokenization 
-  Breaking down a sentence into smaller pieces

!["Question2"](Question3.png)

In [11]:
# Import necessary modules
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [12]:
scene_one = "Hello there! This is a test string. How are you today? I hope you are doing well."

In [13]:
# Split scene_one into sentences: sentences
sentences = sent_tokenize(scene_one)
sentences

['Hello there!',
 'This is a test string.',
 'How are you today?',
 'I hope you are doing well.']

In [14]:
# Use word_tokenize to tokenize the fourth sentence: tokenized_sent
tokenized_sent = word_tokenize(scene_one[3])
tokenized_sent

['l']

In [15]:
# Make a set of unique tokens in the entire scene: unique_tokens
unique_tokens = set(word_tokenize(scene_one))
unique_tokens

{'!',
 '.',
 '?',
 'Hello',
 'How',
 'I',
 'This',
 'a',
 'are',
 'doing',
 'hope',
 'is',
 'string',
 'test',
 'there',
 'today',
 'well',
 'you'}

![Question4.png](Question4.png)

In [16]:
# Search for the first occurrence of "coconuts" in scene_one: match
match = re.search("coconuts", scene_one)
match

In [17]:
match = re.search("How", scene_one)
match

<re.Match object; span=(36, 39), match='How'>

In [18]:
scene_one[36:39]

'How'

In [19]:
# Print the start and end indexes of match
print(match.start(), match.end())

36 39


![Question4](Question5.png)

In [20]:
my_string = "Here is some text with [something] in brackets and [another thing] in brackets."

In [21]:
# Write a regular expression to search for anything in square brackets: pattern1
pattern1 = r"\[.*\]"
temp = re.findall(pattern1, my_string)
temp

['[something] in brackets and [another thing]']

In [22]:
# Use re.search to find the first text in square brackets
print(re.search(pattern1, my_string))

<re.Match object; span=(23, 66), match='[something] in brackets and [another thing]'>


![Question6.png](Question6.png)

In [23]:
# Find the script notation at the beginning of the fourth sentence and print it
pattern2 = r"[\w\s]+:"
print(re.match(pattern2, my_string[3]))

None


In [24]:
print(re.match(pattern2, scene_one[3]))

None


# Advanced tokenization with regex

![Question7.png](Question7.png)

In [25]:
my_string = "SOLDIER #1: Found them? In Mercea? The coconut's tropical!"

In [30]:
re.findall(r"(\w)", my_string)

['S',
 'O',
 'L',
 'D',
 'I',
 'E',
 'R',
 '1',
 'F',
 'o',
 'u',
 'n',
 'd',
 't',
 'h',
 'e',
 'm',
 'I',
 'n',
 'M',
 'e',
 'r',
 'c',
 'e',
 'a',
 'T',
 'h',
 'e',
 'c',
 'o',
 'c',
 'o',
 'n',
 'u',
 't',
 's',
 't',
 'r',
 'o',
 'p',
 'i',
 'c',
 'a',
 'l']

In [31]:
re.findall(r"(\w+)", my_string)

['SOLDIER',
 '1',
 'Found',
 'them',
 'In',
 'Mercea',
 'The',
 'coconut',
 's',
 'tropical']

In [32]:
re.findall(r"(\w+|)", my_string)

['SOLDIER',
 '',
 '',
 '1',
 '',
 '',
 'Found',
 '',
 'them',
 '',
 '',
 'In',
 '',
 'Mercea',
 '',
 '',
 'The',
 '',
 'coconut',
 '',
 's',
 '',
 'tropical',
 '',
 '']

In [34]:
re.findall(r"(\w+|\?)", my_string)

['SOLDIER',
 '1',
 'Found',
 'them',
 '?',
 'In',
 'Mercea',
 '?',
 'The',
 'coconut',
 's',
 'tropical']

### Common Regular Expressions Rules and Usage in NLP

1. **Matching Characters**:
   - `.`: Matches any character except a newline.
     ```python
     re.findall(r'.', 'Hello')  # Output: ['H', 'e', 'l', 'l', 'o']
     ```

2. **Character Classes**:
   - `\d`: Matches any digit (0-9).
     ```python
     re.findall(r'\d', 'There are 2 apples and 5 bananas.')  # Output: ['2', '5']
     ```
   - `\w`: Matches any word character (alphanumeric + underscore).
     ```python
     re.findall(r'\w', 'Hello_World!')  # Output: ['H', 'e', 'l', 'l', 'o', '_', 'W', 'o', 'r', 'l', 'd']
     ```
   - `\s`: Matches any whitespace character (spaces, tabs, line breaks).
     ```python
     re.findall(r'\s', 'Hello World\n')  # Output: [' ', '\n']
     ```

3. **Quantifiers**:
   - `*`: Matches 0 or more repetitions.
     ```python
     re.findall(r'\d*', '123abc')  # Output: ['123', '', '', '', '', '']
     ```
   - `+`: Matches 1 or more repetitions.
     ```python
     re.findall(r'\d+', '123abc')  # Output: ['123']
     ```
   - `?`: Matches 0 or 1 repetition.
     ```python
     re.findall(r'\d?', '123abc')  # Output: ['1', '2', '3', '', '', '']
     ```

4. **Anchors**:
   - `^`: Matches the start of the string.
     ```python
     re.findall(r'^Hello', 'Hello World')  # Output: ['Hello']
     ```
   - `$`: Matches the end of the string.
     ```python
     re.findall(r'World$', 'Hello World')  # Output: ['World']
     ```

5. **Groups and Alternation**:
   - `(...)`: Groups a pattern.
     ```python
     re.findall(r'(ab)+', 'ababab')  # Output: ['ab']
     ```
   - `|`: Acts like a boolean OR.
     ```python
     re.findall(r'cat|dog', 'cat and dog')  # Output: ['cat', 'dog']
     ```

6. **Escape Characters**:
   - `\`: Escapes special characters.
     ```python
     re.findall(r'\.', '3.14')  # Output: ['.']
     ```

### Examples in NLP

1. **Tokenization**:
   - Split a text into words:
     ```python
     import re
     text = "Hello, world! This is NLP."
     tokens = re.findall(r'\w+', text)
     print(tokens)  # Output: ['Hello', 'world', 'This', 'is', 'NLP']
     ```

2. **Removing Punctuation**:
   - Remove punctuation from a text:
     ```python
     import re
     text = "Hello, world! This is NLP."
     clean_text = re.sub(r'[^\w\s]', '', text)
     print(clean_text)  # Output: 'Hello world This is NLP'
     ```

3. **Extracting Email Addresses**:
   - Find all email addresses in a text:
     ```python
     import re
     text = "Contact us at support@example.com or sales@example.com."
     emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b', text)
     print(emails)  # Output: ['support@example.com', 'sales@example.com']
     ```

4. **Finding Dates**:
   - Extract dates in the format `DD/MM/YYYY`:
     ```python
     import re
     text = "Today's date is 31/05/2024."
     dates = re.findall(r'\b\d{2}/\d{2}/\d{4}\b', text)
     print(dates)  # Output: ['31/05/2024']
     ```

### Practice Example

Let's create a pattern to extract hashtags from a tweet:

```python
    import re

    tweet = "Loving the sunny weather! #sunnyday #beautiful"
    hashtags = re.findall(r'#\w+', tweet)
    print(hashtags)  # Output: ['#sunnyday', '#beautiful']
```

In this example:
- `#\w+` matches a hashtag, where `#` is the literal character and `\w+` matches one or more word characters.

In [36]:
print(re.findall(r"(\w+|\?|!)", my_string))

print(re.findall(r"(\w+|#\d|\?|!)", my_string))

print(re.findall(r"(#\d\w+\?!)", my_string))

print(re.findall(r"\s+", my_string))

['SOLDIER', '1', 'Found', 'them', '?', 'In', 'Mercea', '?', 'The', 'coconut', 's', 'tropical', '!']
['SOLDIER', '#1', 'Found', 'them', '?', 'In', 'Mercea', '?', 'The', 'coconut', 's', 'tropical', '!']
[]
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']


# Regex with NLTK tokenization

![Question8.png](Question8.png)

In [37]:
# Import the necessary module
from nltk.tokenize import regexp_tokenize, TweetTokenizer

![Question9.png](Question9.png)

In [40]:
tweets = [
    "Loving the sunny weather! #sunnyday #beautiful",
    "Just finished a great workout. #fitness #health",
    "Can't wait for the weekend! #TGIF #relaxation",
]


In [41]:
# Import the necessary modules
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer

# Define a regex pattern to find hashtags: pattern1
pattern1 = r"#\w+"

# Use the pattern on the first tweet in the tweets list
hashtags = regexp_tokenize(tweets[0], pattern1)

print(hashtags)

['#sunnyday', '#beautiful']


![Question10.png](Question10.png)

In [42]:
# Import the necessary modules
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer

# Write a pattern that matches both mentions (@) and hashtags
pattern2 = r"[@#]\w+"

# Use the pattern on the last tweet in the tweets list
mentions_hashtags = regexp_tokenize(tweets[-1], pattern2)

print(mentions_hashtags)

['#TGIF', '#relaxation']


![Question11.png](Question11.png)

In [43]:
# Import the necessary modules
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer

# Use the TweetTokenizer to tokenize all tweets into one list
tknzr = TweetTokenizer()

all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)

[['Loving', 'the', 'sunny', 'weather', '!', '#sunnyday', '#beautiful'], ['Just', 'finished', 'a', 'great', 'workout', '.', '#fitness', '#health'], ["Can't", 'wait', 'for', 'the', 'weekend', '!', '#TGIF', '#relaxation']]


# Non-ascii tokenization

![Question12.png](Question12.png)

In [50]:
german_text = "Ich liebe NLP! 🎉 NLP ist faszinierend. Übung macht den Meister. 😄"

In [51]:
# Tokenize and print all words in german_text
all_words = word_tokenize(german_text)
print(all_words)

# Tokenize and print only capital words
capital_words = r"[A-ZÜ]\w+"
print(regexp_tokenize(german_text, capital_words))

# Tokenize and print only emoji
emoji = "['\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF']"
print(regexp_tokenize(german_text, emoji))

['Ich', 'liebe', 'NLP', '!', '🎉', 'NLP', 'ist', 'faszinierend', '.', 'Übung', 'macht', 'den', 'Meister', '.', '😄']
['Ich', 'NLP', 'NLP', 'Übung', 'Meister']
['🎉', '😄']


# Charting practice

![Question13.png](Question13.png)

In [52]:
holy_grail = """
ARTHUR: What is your name?
SOLDIER #1: My name is Arthur, King of the Britons.
ARTHUR: What is your quest?
SOLDIER #1: To seek the Holy Grail.
"""