In [3]:
# A Regular Expression (RegEx) is a sequence of characters that forms a search pattern.
# RegEx can be used to check if a string contains the specified search pattern.
# Python has a built-in package called re, which can be used to work with Regular Expressions.

import re

txt = "The rain in Spain"
# re.search() scans the whole string and returns a Match object for the first
# place the pattern matches, or None when it matches nowhere.
# This pattern requires the text to start with "The" and to end with "ra".
# txt ends with "Spain", not "ra", so there is no match and None is printed.
# NOTE(review): use '^The.*Spain$' if a successful match was the intent.
print(re.search(pattern='^The.*ra$', string=txt))


None


In [29]:
# Tour of the regex metacharacters. Each print shows the list returned by
# re.findall(), which contains every non-overlapping match in the subject.
txt = "hello planet"
string = "That will be 59 dollars 25 cents"

# ^ --> Starts with  --> "^hello"
x = re.findall("^hello", txt)  # findall returns a list containing all matches
print(x)  # ['hello']

# $ --> Ends with  --> "planet$"
print(re.findall("planet$", txt))  # ['planet']

# [] --> A set of characters  --> "[a-n]"
# Every single character between "a" and "n" is reported on its own.
print(re.findall("[a-n]", txt))  # ['h', 'e', 'l', 'l', 'l', 'a', 'n', 'e']

# . --> Any character (except newline character)  --> "Th.....l"
# "Th", followed by exactly five (any) characters, and an "l".
print(re.findall('Th.....l', string))  # ['That wil']

# \ --> Signals a special sequence (can also be used to escape special characters)  --> r"\d"
# The pattern is a raw string: in a normal string literal "\d" is an invalid
# escape sequence and makes modern Python emit a warning.
print(re.findall(r'\d', string))  # ['5', '9', '2', '5']  (all digit characters)

# * --> Zero or more occurrences  --> "he.*o"
# "he", followed by 0 or more (any) characters, and an "o".
print(re.findall('he.*o', txt))  # ['hello']

# + --> One or more occurrences  --> "p.+a"
# "p", followed by 1 or more (any) characters, and an "a".
print(re.findall('p.+a', txt))  # ['pla']
# No match here: there is no "l" with at least one character between it and the "p".
print(re.findall('p.+l', txt))  # []

# ? --> Zero or one occurrences  --> "he.?o"
# No match: there are two characters, not zero or one, between "he" and the "o".
print(re.findall('he.?o', txt))  # []

# {} --> Exactly the specified number of occurrences  --> "he.{2}o"
# "he", followed by exactly 2 (any) characters, and an "o".
print(re.findall('he.{2}o', txt))  # ['hello']

# | --> Either or  --> "dollars|cents"
print(re.findall('dollars|cents', string))  # ['dollars', 'cents']
print(re.findall('dollars|pound', string))  # ['dollars']
print(re.findall('taka|pound', string))  # []

# () --> Capture and group


['hello']
['planet']
['h', 'e', 'l', 'l', 'l', 'a', 'n', 'e']
['That wil']
['5', '9', '2', '5']
['hello']
['pla']
[]
[]
['hello']
['dollars', 'cents']
['dollars']
[]


In [30]:
# Special Sequences
# \A --> Returns a match if the specified characters are at the beginning of the string  --> "\AThe"
# \b --> Returns a match where the specified characters are at the beginning or at the end of a word (the "r" prefix makes Python treat the pattern as a "raw string", so "\b" is not read as a backspace character)  -->  r"\bain" or r"ai\b"
# \B --> Returns a match where the specified characters are present, but NOT at the beginning (or at the end) of a word (the "r" prefix makes Python treat the pattern as a "raw string", so the backslash is kept as written)  -->  r"\Bain" or r"ai\B"
# \d --> Matches any digit character (numbers from 0-9)  --> "\d"
# \D --> Matches any character that is NOT a digit  --> "\D"
# \s --> Matches any white space character (space, tab, newline, ...)  --> "\s"
# \S --> Matches any character that is NOT a white space character  --> "\S"
# \w --> Matches any word character (letters from a to z and A to Z, digits from 0-9, and the underscore _ character)  --> "\w"
# \W --> Matches any character that is NOT a word character  --> "\W"
# \Z --> Returns a match if the specified characters are at the end of the string  -->  "Spai\Z"

In [31]:
# Set of characters inside a pair of square brackets [] with a special meaning:
# [arn] --> Returns a match where one of the specified characters (a, r, or n) is present
# [a-n] --> Returns a match for any lower case character, alphabetically between a and n
# [^arn] --> Returns a match for any character EXCEPT a, r, and n
# [0123] --> Returns a match where any of the specified digits (0, 1, 2, or 3) are present
# [0-9] --> Returns a match for any digit between 0 and 9
# [0-5][0-9] --> Returns a match for any two-digit number from 00 to 59
# [a-zA-Z] --> Returns a match for any character alphabetically between a and z, lower case OR upper case
# [+] --> In sets, +, *, ., |, (), $, {} have no special meaning, so [+] means: return a match for any + character in the string

In [36]:
# re.split() returns a list where the string has been split at each match.
# The pattern is a raw string so that "\s" reaches the regex engine as written
# instead of being treated as an (invalid) string escape sequence.
txt = "The rain in Spain"
x = re.split(r"\s", txt)  # \s matches a single white-space character
print(x)  # ['The', 'rain', 'in', 'Spain']

# You can limit the number of splits with the maxsplit parameter. It is passed
# by keyword because passing it positionally is deprecated in recent Python.
x = re.split(r"\s", txt, maxsplit=1)
print(x)  # ['The', 'rain in Spain']

['The', 'rain', 'in', 'Spain']
['The', 'rain in Spain']


In [38]:
# re.sub() replaces every match of the pattern with the text of your choice.
# The pattern is a raw string so that "\s" is not treated as an (invalid)
# string escape sequence.
txt = "The rain in Spain"
x = re.sub(r"\s", "**", txt)  # replace each white-space character with "**"
print(x)  # The**rain**in**Spain

The**rain**in**Spain
