# **Regular Expression**

In [1]:
import re

In [2]:
#match - re.match() tries to match the pattern at the BEGINNING of the string only.
# On success it returns a Match object (an object containing information about
# the search and the result); otherwise it returns None.
# Matching is case-sensitive by default, so "ab" does not match "ABC".

print(re.match("ab","ABC"))

None


In [6]:
#match - the string starts with "ab", so a Match object is returned this time.

print(re.match("ab","abc"))

<re.Match object; span=(0, 2), match='ab'>


In [3]:
#match.span() returns a tuple (start, end) holding the start and end positions of the match.

re.match("ab","abc").span()

(0, 2)

In [5]:
#match.string is the original string that was passed into the matching function.

re.match("ab","abc").string

'abc'

In [4]:
#match.group() returns the part of the string that was matched by the pattern.

re.match("ab","abc").group()

'ab'

In [7]:
#exercise: check whether each telephone number in the list below begins with 02.
# (Left for the reader; only the input list and the pattern are defined here.)

phone_lists = ["02-123-4567", "0899876543", "0987654321"]
pattern = "02"



# **RegEx Functions**

In [8]:
#findall() returns a list containing every non-overlapping match of the pattern.
# `pattern` and `text` defined here are reused by the cells that follow.

pattern = "AIS"
text = '''Manage all your AIS numbers with one login. Manage things much easier with one login My AIS allows you to access all your AIS number accounts by logging in only once. 
          It is really convenient to manage your AIS postpaid, AIS 1-2-Call! And Fibre accounts all at one go.'''


re.findall(pattern, text)

['AIS', 'AIS', 'AIS', 'AIS', 'AIS']

In [9]:
#finditer() returns an iterator yielding one Match object per match of the regex pattern.
# Each Match exposes start()/end(), which are used here to slice the matched text back out.

for match in re.finditer(pattern, text):
  print(f"start index {match.start()}, end index {match.end()} ,{text[match.start():match.end()]}")

start index 16, end index 19 ,AIS
start index 88, end index 91 ,AIS
start index 122, end index 125 ,AIS
start index 217, end index 220 ,AIS
start index 231, end index 234 ,AIS


In [10]:
#search() scans the whole string for the pattern and returns a Match object if there is a match.
# If there is more than one match, only the first occurrence is returned.

re.search(pattern, text)

<re.Match object; span=(16, 19), match='AIS'>

In [11]:
#split() returns a list where the string has been split at each match.
# The text contains a single "!", so the result has two parts.

re.split("!", text)

['Manage all your AIS numbers with one login. Manage things much easier with one login My AIS allows you to access all your AIS number accounts by logging in only once. \n          It is really convenient to manage your AIS postpaid, AIS 1-2-Call',
 ' And Fibre accounts all at one go.']

In [12]:
#sub() returns a new string in which every match is replaced with the text of your choice.
# Here each "!" is replaced by "*".

re.sub("!", "*", text)

'Manage all your AIS numbers with one login. Manage things much easier with one login My AIS allows you to access all your AIS number accounts by logging in only once. \n          It is really convenient to manage your AIS postpaid, AIS 1-2-Call* And Fibre accounts all at one go.'

In [13]:
#subn() is similar to re.sub(), except that it returns a tuple of 2 items:
# (the new string, the number of substitutions made).

re.subn("!", "*", text)

('Manage all your AIS numbers with one login. Manage things much easier with one login My AIS allows you to access all your AIS number accounts by logging in only once. \n          It is really convenient to manage your AIS postpaid, AIS 1-2-Call* And Fibre accounts all at one go.',
 1)

In [14]:
#compile(pattern) Regular expressions are written as strings in Python. With compile(), you can
# compile a regular expression pattern into a regular expression object, whose methods
# (findall(), search(), ...) can then be called directly.

pattern = 'AIS'

AIS_pattern = re.compile(pattern)

AIS_pattern.findall(text)

['AIS', 'AIS', 'AIS', 'AIS', 'AIS']

In [None]:
#exercise: write a function that returns the position of each match.
# (Left for the reader; only the pattern and the sample text are defined here.)

pattern = "AIS"
text = '''Manage all your AIS numbers with one login. Manage things much easier with one login My AIS allows you to access all your AIS number accounts by logging in only once. 
          It is really convenient to manage your AIS postpaid, AIS 1-2-Call! And Fibre accounts all at one go.'''




# **Metacharacters**

In [15]:
# Sample text searched by the metacharacter examples in this section.
text = '''Manage all your AIS numbers with one login. Manage things much easier with one login My AIS allows you to access all your AIS number accounts by logging in only once. 
          It is really convenient to manage your AIS postpaid, AIS 1-2-Call! And Fibre accounts all at one go.'''

In [16]:
def patternFinder(pattern, text):
  """Print every match of `pattern` in `text`, numbered from 1.

  For each match one line is printed with the matched text and its start
  and end index. If the pattern does not match anywhere, "No matches" is
  printed instead.

  Args:
    pattern: regular expression (string or compiled pattern) to search for.
    text: string to search in.

  Returns:
    None; the results are written to standard output.
  """
  # `count` stays 0 when the loop body never runs, so the text only has to
  # be scanned once to know whether anything matched.
  count = 0
  for count, match in enumerate(re.finditer(pattern, text), start=1):
    print(f"{count}. match {match.group()} - start index {match.start()}, end index {match.end()}")
  if count == 0:
    print("No matches")

In [17]:
# A plain literal pattern: every occurrence of "AIS" in the text is reported.
pattern = "AIS"

patternFinder(pattern, text)

1. match AIS - start index 16, end index 19
2. match AIS - start index 88, end index 91
3. match AIS - start index 122, end index 125
4. match AIS - start index 217, end index 220
5. match AIS - start index 231, end index 234


In [18]:
#[] A set of characters: matches any ONE character listed between the brackets.
# Here: any single uppercase vowel.

pattern = "[AEIOU]"

patternFinder(pattern, text)

1. match A - start index 16, end index 17
2. match I - start index 17, end index 18
3. match A - start index 88, end index 89
4. match I - start index 89, end index 90
5. match A - start index 122, end index 123
6. match I - start index 123, end index 124
7. match I - start index 178, end index 179
8. match A - start index 217, end index 218
9. match I - start index 218, end index 219
10. match A - start index 231, end index 232
11. match I - start index 232, end index 233
12. match A - start index 245, end index 246


In [19]:
#. Any character (except newline character)
# Here: an uppercase vowel followed by any two characters.

pattern = "[AEIOU].."

patternFinder(pattern, text)

1. match AIS - start index 16, end index 19
2. match AIS - start index 88, end index 91
3. match AIS - start index 122, end index 125
4. match It  - start index 178, end index 181
5. match AIS - start index 217, end index 220
6. match AIS - start index 231, end index 234
7. match And - start index 245, end index 248


In [20]:
#^ Starts with: the pattern must match at the start of the string.
# The text does not begin with "AIS", so nothing is found.

pattern = "^AIS"

patternFinder(pattern, text)

No matches


In [21]:
#^ Starts with: the text begins with "Manage", so there is exactly one match at index 0.

pattern = "^Manage"

patternFinder(pattern, text)


1. match Manage - start index 0, end index 6


In [22]:
#$ Ends with: the pattern must match at the end of the string.
# The text does not end with "AIS", so nothing is found.

pattern = "AIS$"

patternFinder(pattern, text)

No matches


In [23]:
#* Zero or more occurrences of the preceding element.
# The quantifier applies only to the "e": "th" followed by zero or more "e".

pattern = "the*"
text = "they thou their thief thee"

patternFinder(pattern, text)

1. match the - start index 0, end index 3
2. match th - start index 5, end index 7
3. match the - start index 10, end index 13
4. match th - start index 16, end index 18
5. match thee - start index 22, end index 26


In [24]:
#+ One or more occurrences of the preceding element.
# Here: "th" followed by at least one "e".

pattern = "the+"
text = "they thou their thief thee"

patternFinder(pattern, text)

1. match the - start index 0, end index 3
2. match the - start index 10, end index 13
3. match thee - start index 22, end index 26


In [25]:
#? Zero or one occurrence of the preceding element.
# Here: "th" optionally followed by a single "e" (so "thee" only yields "the").

pattern = "the?"
text = "they thou their thief thee"

patternFinder(pattern, text)

1. match the - start index 0, end index 3
2. match th - start index 5, end index 7
3. match the - start index 10, end index 13
4. match th - start index 16, end index 18
5. match the - start index 22, end index 25


In [26]:
#{} Exactly the specified number of occurrences of the preceding element.
# Here: "th" followed by exactly two "e".

pattern = "the{2}"
text = "they thou their thief thee"

patternFinder(pattern, text)

1. match thee - start index 22, end index 26


In [27]:
#| Either or: matches the alternative on the left OR the one on the right.
# Here: "ei" or "ef".

pattern = "ei|ef"
text = "they thou their thief thee"

patternFinder(pattern, text)

1. match ei - start index 12, end index 14
2. match ef - start index 19, end index 21


In [31]:
#exercise: find words with the "sh" sound.
# The pattern below matches "sh" optionally followed by one "e"; it locates
# each occurrence of the sound rather than the whole surrounding word.

text = "Susie works in a shoeshine shop. Where she shines she sits, and where she sits she shines"


pattern = "she?"


patternFinder(pattern, text)

1. match sh - start index 17, end index 19
2. match sh - start index 21, end index 23
3. match sh - start index 27, end index 29
4. match she - start index 39, end index 42
5. match sh - start index 43, end index 45
6. match she - start index 50, end index 53
7. match she - start index 70, end index 73
8. match she - start index 79, end index 82
9. match sh - start index 83, end index 85


# **Special Sequences**

In [32]:
# Restore the long sample text; earlier cells rebound `text` to shorter example strings.
text = '''Manage all your AIS numbers with one login. Manage things much easier with one login My AIS allows you to access all your AIS number accounts by logging in only once. 
          It is really convenient to manage your AIS postpaid, AIS 1-2-Call! And Fibre accounts all at one go.'''

In [33]:
#\A Returns a match if the specified characters are at the beginning of the string
# The pattern is written as a raw string (r"...") so that the backslash reaches
# the regex engine unchanged; "\A" in a normal string is an invalid escape sequence.

pattern = r"\AManage"

patternFinder(pattern, text)

1. match Manage - start index 0, end index 6


In [34]:
#\b Returns a match where the specified characters are at the beginning or at the end of a word
# (the "r" prefix makes the literal a raw string, so "\b" is passed to the regex engine
# instead of being read by Python as a backspace character).
# Here: "AIS" must start at a word boundary.

pattern = r"\bAIS"

patternFinder(pattern, text)

1. match AIS - start index 16, end index 19
2. match AIS - start index 88, end index 91
3. match AIS - start index 122, end index 125
4. match AIS - start index 217, end index 220
5. match AIS - start index 231, end index 234


In [35]:
#\B Returns a match where the specified characters are present, but NOT at the beginning (or at the end) of a word
# (the "r" prefix makes the literal a raw string).
# Here: "AIS" must NOT start at a word boundary; every "AIS" in the text starts a word, so nothing is found.

pattern = r"\BAIS"

patternFinder(pattern, text)

No matches


In [36]:
#\d Returns a match where the string contains digits (numbers from 0-9)
#\D Returns a match where the string DOES NOT contain digits
# Raw string: "\d" in a normal string literal is an invalid escape sequence.

pattern = r"\d"

patternFinder(pattern, text)

1. match 1 - start index 235, end index 236
2. match 2 - start index 237, end index 238


In [None]:
#\s Returns a match where the string contains a white space character
#\S Returns a match where the string DOES NOT contain a white space character
# Raw string: "\s" in a normal string literal is an invalid escape sequence.

pattern = r"\s"

patternFinder(pattern, text)

In [39]:
#\w Returns a match where the string contains any word characters (letters, digits from 0-9, and the underscore _ character)
#\W Returns a match where the string DOES NOT contain any word characters
# Here: a whitespace character, exactly ten word characters, then another whitespace character.
# Raw string: "\s" and "\w" in a normal string literal are invalid escape sequences.

pattern = r"\s\w{10}\s"

patternFinder(pattern, text)
# Slice the text at the reported indices to show the surrounding spaces are part of the match.
print(text[190:202])

1. match  convenient  - start index 190, end index 202
 convenient 


In [None]:
#\Z Returns a match if the specified characters are at the end of the string
# Here: "g" followed by any two characters, located at the very end of the text.
# Raw string: "\Z" in a normal string literal is an invalid escape sequence.

pattern = r"g..\Z"

patternFinder(pattern, text)

1. match go. - start index 275, end index 278


In [None]:
#exercise: extract the storage size (e.g. "128 GB") of each handset
HS_List = ["iPhone 13 mini (128 GB)", "iPhone 13 mini (256 GB)", "iPhone 13 mini (512 GB)", "iPhone 13 Pro (128 GB)", "iPhone 13 Pro (256 GB)", "iPhone 13 Pro (512 GB)"]

# One or more digits, a single space, then the literal "GB".
# The model number "13" is skipped because it is not followed by " GB".
pattern = r"\d+ GB"

# Iterate with `handset` rather than `text`, so that the long sample `text`
# searched by the cells that follow is not overwritten by this loop.
for handset in HS_List:
  patternFinder(pattern, handset)

1. match 128 GB - start index 16, end index 22
1. match 256 GB - start index 16, end index 22
1. match 512 GB - start index 16, end index 22
1. match 128 GB - start index 15, end index 21
1. match 256 GB - start index 15, end index 21
1. match 512 GB - start index 15, end index 21


# **Sets**

In [None]:
#[arn] Returns a match where one of the specified characters (a, r, or n) is present
#[^arn] Returns a match for any character EXCEPT a, r, and n
# Here the set lists the uppercase letters A through H explicitly.

pattern = "[ABCDEFGH]"

patternFinder(pattern, text)

1. match A - start index 16, end index 17
2. match A - start index 88, end index 89
3. match A - start index 122, end index 123
4. match A - start index 217, end index 218
5. match A - start index 231, end index 232
6. match C - start index 239, end index 240
7. match A - start index 245, end index 246
8. match F - start index 249, end index 250


In [None]:
#[a-n] Returns a match for any lower case character, alphabetically between a and n
# Here the range is upper case instead: [A-H] matches any single character from A to H,
# i.e. the same characters as the explicit set [ABCDEFGH].

pattern = "[A-H]"

patternFinder(pattern, text)

1. match A - start index 16, end index 17
2. match A - start index 88, end index 89
3. match A - start index 122, end index 123
4. match A - start index 217, end index 218
5. match A - start index 231, end index 232
6. match C - start index 239, end index 240
7. match A - start index 245, end index 246
8. match F - start index 249, end index 250


In [None]:
#[0123] Returns a match where any of the specified digits (0, 1, 2, or 3) is present
# Here the set lists the digits 1 through 5 explicitly.

pattern = "[12345]"

patternFinder(pattern, text)

1. match 1 - start index 235, end index 236
2. match 2 - start index 237, end index 238


In [None]:
#[0-9] Returns a match for any digit between 0 and 9
# Here the range is narrower: [1-5] matches any single digit from 1 to 5.

pattern = "[1-5]"

patternFinder(pattern, text)

1. match 1 - start index 235, end index 236
2. match 2 - start index 237, end index 238


In [None]:
#[0-5][0-9] Returns a match for any two-digit number from 00 to 59
# The pattern used here, [0-9][0-9], matches any two consecutive digits (00 to 99).
# The digits in the text ("1-2-Call") are separated by "-", so nothing is found.

pattern = "[0-9][0-9]"

patternFinder(pattern, text)

No matches


In [None]:
#[a-zA-Z] Returns a match for any character alphabetically between a and z, lower case OR upper case
# Here: one or more letters at the very start of the string, followed by one whitespace character.
# Raw string: "\A" and "\s" in a normal string literal are invalid escape sequences.

pattern = r"\A[a-zA-Z]+\s"

patternFinder(pattern, text)

1. match Manage  - start index 0, end index 7


In [42]:
#exercise: write patterns to extract the price and the price unit of each package

sim_List = ["ซิม Disney+ Hotstar 499 บาท/ปี", "ซิม Disney+ Hotstar 49 บาท/เดือน", "เน็ต มาราธอน 499 บาท", "เน็ต มาราธอน 535 บาท", "เน็ต มาราธอน 799 บาท", "เน็ต มาราธอน 880"]


# Price: one or more consecutive digits (raw string, because "\d" in a normal
# string literal is an invalid escape sequence).
price_pattern = r"\d+"
# Price unit: the currency word followed by everything up to the end of the line.
priceunit_pattern = "บาท.*"

# NOTE: this loop rebinds the notebook-level name `text` to each package string.
for text in sim_List:
  patternFinder(price_pattern, text)
  patternFinder(priceunit_pattern, text)

1. match 499 - start index 20, end index 23
1. match บาท/ปี - start index 24, end index 30
1. match 49 - start index 20, end index 22
1. match บาท/เดือน - start index 23, end index 32
1. match 499 - start index 13, end index 16
1. match บาท - start index 17, end index 20
1. match 535 - start index 13, end index 16
1. match บาท - start index 17, end index 20
1. match 799 - start index 13, end index 16
1. match บาท - start index 17, end index 20
1. match 880 - start index 13, end index 16
No matches
