## Matching patterns without using Regex

In [10]:
# phone number style : 234-232-4334
def isPhoneNumber(text):
    """Report whether *text* is laid out exactly as ``ddd-ddd-dddd``.

    The check is purely positional: the value must be 12 items long, with a
    hyphen at index 3 and index 7 and a decimal digit (``str.isdecimal``)
    at every other index.

    Returns:
        True when every position matches the layout, otherwise False.
    """
    if len(text) != 12:
        return False

    hyphen_slots = (3, 7)
    for position in range(12):
        symbol = text[position]
        if position in hyphen_slots:
            if symbol != '-':
                return False
        elif not symbol.isdecimal():
            return False
    return True

In [18]:
test_phoneno = '232-423-2323'
isPhoneNumber(test_phoneno)

True

In [33]:
def detect_ph_no(message):
    """Print every ``ddd-ddd-dddd`` phone number found in *message*.

    Slides a 12-character window over the text and prints
    ``Phone number found: <number>`` for each window that ``isPhoneNumber``
    accepts.

    Args:
        message: The text to scan. It is expected to be a string; the
            TypeError/AttributeError that non-string inputs trigger is
            reported as a ``provide a string input::: ...`` line instead
            of being raised.

    Returns:
        None. Results are written to standard output.
    """
    window = 12
    try:
        # Only start positions that leave room for a full window can hold
        # a number; for inputs shorter than the window the range is empty.
        for start in range(len(message) - window + 1):
            chunk = message[start:start + window]
            if isPhoneNumber(chunk):
                print('Phone number found: ' + chunk)
    except (TypeError, AttributeError) as e:
        # Non-string inputs fail in len()/slicing/concatenation (TypeError)
        # or when a non-str item is asked for .isdecimal() (AttributeError).
        # Anything else is a genuine bug and is allowed to propagate.
        print(f'provide a string input::: {e}')


In [34]:
message = 'Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.'
detect_ph_no(message)

Phone number found: 415-555-1011
Phone number found: 415-555-9999


In [36]:
detect_ph_no(232)

provide a string input::: object of type 'int' has no len()


# USING REGEX 

In [37]:
import re 

In [38]:
phoneNumberRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

In [39]:
phoneNumberRegex

re.compile(r'\d\d\d-\d\d\d-\d\d\d\d', re.UNICODE)

In [45]:
print('\\n\n\\n')

\n
\n


#### Raw string:
By putting an r before the first quote of the string value, you can mark the string as a raw string, which does not escape characters.

In [47]:
print(r'\n')

\n


In [50]:
print('df\ne')

df
e


In [51]:
print(r'df\ne')

df\ne


## Regex compile/search/group()

A Regex object’s search() method searches the string it is passed for any
matches to the regex. The search() method will return None if the regex
pattern is not found in the string. If the pattern is found, the search() method
returns a Match object. Match objects have a group() method that will return
the actual matched text from the searched string. 

In [54]:
phoneNumberRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo = phoneNumberRegex.search('my number is 234-234-2341')
mo

<re.Match object; span=(13, 25), match='234-234-2341'>

In [55]:
mo.group()

'234-234-2341'

## Grouping with parentheses

 * Suppose you want to separate the area code from the rest of the phone number. Adding parentheses will create groups in the regex: (\d\d\d)-(\d\d\d-\d\d\d\d).
 * The first set of parentheses in a regex string will be group 1 and the second set will be group 2. By passing the integer 1 or 2 to the group() match object method, you can grab different groups of the matched text; passing 0 to group() returns the entire match.

In [68]:
phoneNumberRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)') 
mo = phoneNumberRegex.search('my number is 234-234-2341')

print('matching object :', mo)
print(f'\ngroup one : {mo.group(1)}')
print('grop two : ',mo.group(2))
print('entire match: ',mo.group(0))
print('\ngroups = ', mo.groups())

matching object : <re.Match object; span=(13, 25), match='234-234-2341'>

group one : 234
grop two :  234-2341
entire match:  234-234-2341

groups =  ('234', '234-2341')


In [69]:
areacode, main_no = mo.groups()

In [71]:
print(areacode)
print(main_no)

234
234-2341


In [82]:
# (234) 232-3843  # new format 
ph_no_regex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')
mo = ph_no_regex.search('my number is (234) 234-2233')
mo.groups()

('(234)', '234-2233')

## Matching Multiple Groups with the Pipe 
 * The | character is called a pipe. You can use it anywhere you want to match one
of many expressions. For example, the regular expression r'Batman|Tina Fey'
will match either 'Batman' or 'Tina Fey'.
 * When both Batman and Tina Fey occur in the searched string, the first
occurrence of matching text will be returned as the Match object. 

In [89]:
heroRegex = re.compile(r'Batman|Tina Fey') 
mo1 = heroRegex.search('Batman and Tina Fey')
mo1.group()

'Batman'

In [90]:
mo2 = heroRegex.search(' Tina Fey and Batman')
# search() returns None when nothing matches, so test for that explicitly
# instead of hiding every possible error behind a bare except.
if mo2 is None:
    print('no matching found')
else:
    print(mo2.group())

Tina Fey


In [91]:
# case sensitive matching 
mo2 = heroRegex.search(' tina fey and batman')
# The pattern is case sensitive, so the lowercase text yields no match and
# search() returns None; check for that instead of using a bare except.
if mo2 is None:
    print('no matching found')
else:
    print(mo2.group())

no matching found


## Matching with prefixes 
You can also use the pipe to match one of several patterns as part of
your regex. For example, say you wanted to match any of the strings 'Batman',
'Batmobile', 'Batcopter', and 'Batbat'. Since all these strings start with Bat, it
would be nice if you could specify that prefix only once. This can be done
with parentheses.  
 * If you need to match an actual pipe character, escape it with a backslash,
like \|.


In [95]:
batRegex = re.compile(r'Bat(man|mobile|copter|bat)') 
mo = batRegex.search('Batmobile is Batman') 
mo.group()

'Batmobile'

In [96]:
mo.groups()

('mobile',)

## Optional Matching with the question mark

In [99]:
batRegex = re.compile(r'Bat(wo)?man')
mo1 = batRegex.search('The adventures of Batwoman')
mo1.group()

'Batwoman'

In [100]:
mo1 = batRegex.search('The adventures of Batman')
mo1.group()

'Batman'

In [105]:
ph_regex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
mo1 = ph_regex.search('my number is 234-233-5421')
mo1.group()

'234-233-5421'

In [106]:
mo1 = ph_regex.search('my number is 233-5421')
mo1.group()

'233-5421'

## Matching zero or more with the star
The * (called the star or asterisk) means “match zero or more”—the group
that precedes the star can occur any number of times in the text. It can be
completely absent or repeated over and over again.

In [107]:
bat_regex = re.compile(r'Bat(wo)*man')
mo = bat_regex.search('The adventures of Batman')
mo.group()

'Batman'

In [109]:
mo = bat_regex.search('The adventures of Batwoman')
mo.group()

'Batwoman'

In [110]:
mo = bat_regex.search('The adventures of Batwowowowowowowoman')
mo.group()

'Batwowowowowowowoman'

## Matching one or more with the plus 
While * means “match zero or more,” the + (or plus) means “match one or
more.” Unlike the star, which does not require its group to appear in the
matched string, the group preceding a plus must appear at least once. It is
not optional.

In [111]:
batRegex = re.compile(r'Bat(wo)+man')
mo1 = batRegex.search('The Adventures of Batwoman')
mo1.group()

'Batwoman'

In [113]:
mo1 = batRegex.search('The Adventures of Batman')
# (wo)+ needs at least one 'wo', so 'Batman' does not match and search()
# returns None; test for None directly rather than with a bare except.
if mo1 is None:
    print('No matches found')
else:
    print(mo1.group())

No matches found


In [115]:
mo1 = batRegex.search('The Adventures of Batwowowowowowowowoman')
mo1.group()

'Batwowowowowowowowoman'

## Matching Specific Repetitions with Curly Brackets
 * If you have a group that you want to repeat a specific number of times, follow the group in your regex with a number in curly brackets.
 * For example, the regex (Ha){3} will match the string 'HaHaHa', but it will not match 'HaHa', since the latter has only two repeats of the (Ha) group.
 * Instead of one number, you can specify a range by writing a minimum, a comma, and a maximum in between the curly brackets. For example, the regex (Ha){3,5} will match 'HaHaHa', 'HaHaHaHa', and 'HaHaHaHaHa'.
 * You can also leave out the first or second number in the curly brackets to leave the minimum or maximum unbounded. For example, (Ha){3,} will match three or more instances of the (Ha) group, while (Ha){,5} will match zero to five instances. Curly brackets can help make your regular expressions shorter.

In [116]:
haRegex = re.compile(r'(Ha){3}')
mo1 = haRegex.search('HaHaHa')
mo1.group()

'HaHaHa'

In [117]:
mo2 = haRegex.search('Ha')
# A single 'Ha' cannot satisfy (Ha){3}, so search() returns None; check for
# None explicitly rather than catching everything with a bare except.
if mo2 is None:
    print('No matches found')
else:
    print(mo2.group())

No matches found


## Greedy and Nongreedy Matching
Python’s regular expressions are greedy by default, which means that in
ambiguous situations they will match the longest string possible. The
non-greedy version of the curly brackets, which matches the shortest string
possible, has the closing curly bracket followed by a question mark.


In [118]:
greedy_regex = re.compile(r'(Ha){3,5}')
mo1 = greedy_regex.search('HaHaHaHaHa')
mo1.group()

'HaHaHaHaHa'

In [119]:
non_greedy_regex = re.compile(r'(Ha){3,5}?')
mo1 = non_greedy_regex.search('HaHaHaHaHa')
mo1.group()

'HaHaHa'

## Findall() method 
While search() will return a Match object of the first matched text
in the searched string, the findall() method will return the strings of every
match in the searched string. 

 * When called on a regex with no groups, such as \d\d\d-\d\d\d-\d\d\d\d, the method findall() returns a list of string matches, such as ['415-555-9999', '212-555-0000'].
 
 * When called on a regex that has groups, such as (\d\d\d)-(\d\d\d)-(\d\d\d\d), the method findall() returns a list of tuples of strings (one string for each group), such as [('415', '555', '1122'), ('212', '555', '0000')].


In [120]:
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo = phoneNumRegex.search('Cell: 415-555-9999 Work: 212-555-0000')
mo.group()

'415-555-9999'

In [121]:
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') # has no groups
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')


['415-555-9999', '212-555-0000']

In [124]:
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') # has groups
mo = phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')
mo

[('415', '555', '9999'), ('212', '555', '0000')]

## Character Classes

In [125]:
reg = re.compile(r'\W')

In [129]:
reg.findall('&5%______23ldjfALSDF?????//|\\\\^')

['&', '%', '?', '?', '?', '?', '?', '/', '/', '|', '\\', '\\', '^']

<img src='charactor_class_regex.png'>

The regular expression \d+\s\w+ will match text that has one or more
numeric digits (\d+), followed by a whitespace character (\s), followed by
one or more letter/digit/underscore characters (\w+). The findall() method
returns all matching strings of the regex pattern in a list.

In [133]:
xmasRegex = re.compile(r'\d+\s\w+')
xmasRegex.findall('''
12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 2314142,;laskdjf234 las2324,
7,swans, 6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge''')

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '234 las2324',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

## Making character classes 
 * You can define your own character class using square brackets. For example, the character class [aeiouAEIOU] will match any vowel, both lowercase and uppercase.
 * You can also include ranges of letters or numbers by using a hyphen. For example, [a-zA-Z0-9] will match all lowercase letters, uppercase letters, and digits.
 * By placing a caret character (^) just after the character class’s opening bracket, you can make a negative character class. A negative character class will match all the characters that are not in the character class. 


In [136]:
vowelRegex = re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('thomaskutty reji thomaskutty')

['o', 'a', 'u', 'e', 'i', 'o', 'a', 'u']

In [140]:
consonantRegex = re.compile(r'[^aeiouAEIOU\s\.]')
consonantRegex.findall('Roaby food. BD.')


['R', 'b', 'y', 'f', 'd', 'B', 'D']

## The Caret and Dollar Sign Characters
You can also use the caret symbol (^) at the start of a regex to indicate that
a match must occur at the beginning of the searched text. Likewise, you can
put a dollar sign at the end of the regex to indicate the string must end
with this regex pattern. And you can use the ^ and $ together to indicate
that the entire string must match the regex—that is, it’s not enough for a
match to be made on some subset of the string.


In [141]:
beginsWithHello = re.compile(r'^Hello')
beginsWithHello.search('Hello world!')

<re.Match object; span=(0, 5), match='Hello'>

In [142]:
# 'He said hello.' does not start with 'Hello', so search() returns None.
beginsWithHello.search('He said hello.') is None

True

In [143]:
endsWithNumber = re.compile(r'\d$')
endsWithNumber.search('Your number is 42')

<re.Match object; span=(16, 17), match='2'>

In [144]:
# The text ends with '.', not a digit, so search() returns None.
endsWithNumber.search('Your number is forty two.') is None

True

In [145]:
wholeStringIsNum = re.compile(r'^\d+$')
wholeStringIsNum.search('1234567890')

<re.Match object; span=(0, 10), match='1234567890'>

In [146]:
 # The letters in the middle break ^\d+$, so search() returns None.
 wholeStringIsNum.search('12345xyz67890') is None

True

In [148]:
# The embedded space is not a digit, so ^\d+$ fails and search() returns None.
wholeStringIsNum.search('12 34567890') is None

True

## The Wildcard Character
The . (or dot) character in a regular expression is called a wildcard and will
match any character except for a newline. 

In [160]:
atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'lat', 'mat']

Remember that the dot character will match just one character, which
is why the match for the text flat in the previous example matched only lat.
To match an actual dot, escape the dot with a backslash: \..

In [159]:
atRegex = re.compile(r'\w*at')
atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'flat', 'mat']

## Matching Everything with Dot-Star
Suppose you want to match the string 'First Name:', followed by any and all text,
followed by 'Last Name:', and then followed by anything again. You can
use the dot-star (.*) to stand in for that “anything.” 

In [166]:
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: Al Last Name: Sweigart') 
mo.groups()

('Al', 'Sweigart')

The dot-star uses greedy mode: It will always try to match as much text as
possible. To match any and all text in a nongreedy fashion, use the dot, star,
and question mark (.*?). Like with curly brackets, the question mark tells
Python to match in a nongreedy way.


In [169]:
nongreedyRegex = re.compile(r'<.*?>')
mo = nongreedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man>'

In [171]:
greedyRegex = re.compile(r'<.*>')
mo = greedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man> for dinner.>'

## Matching Newlines with the Dot Character

In [173]:
nonewline_reg = re.compile('.*') 
nonewline_reg.search('this is the first line.\nThis is the second line').group()

'this is the first line.'

In [174]:
newline_reg = re.compile('.*',re.DOTALL)
newline_reg.search('this is the first line. \nThis is the second line').group()

'this is the first line. \nThis is the second line'

## Regex symbols 
 * The ? matches zero or one of the preceding group.
 * The * matches zero or more of the preceding group.
 * The + matches one or more of the preceding group.
 * The {n} matches exactly n of the preceding group.
 * The {n,} matches n or more of the preceding group.
 * The {,m} matches 0 to m of the preceding group.
 * The {n,m} matches at least n and at most m of the preceding group.
 * {n,m}? or *? or +? performs a nongreedy match of the preceding group.
 * ^spam means the string must begin with spam.
 * `spam$` means the string must end with spam.
 
 * The . matches any character, except newline characters.
 * \d, \w, and \s match a digit, word, or space character, respectively.
 * \D, \W, and \S match anything except a digit, word, or space character, respectively.
 * [abc] matches any character between the brackets (such as a, b, or c).
 * [^abc] matches any character that isn’t between the brackets.

## Case insensitive matching 

In [177]:
robocop = re.compile(r'(robocop)', re.IGNORECASE)
robocop.search('asldkf asdfas f roboCoP asldj asfj').group()

'roboCoP'

## Substituting strings with the sub() method 

In [179]:
names_reg = re.compile(r'thomaskutty \w+' )
names_reg.sub("system_1", "thomaskutty reji gave secret code to the system2 server")

'system_1 gave secret code to the system2 server'

Suppose you want to censor the names of the secret agents by
showing just the first letters of their names. To do this, you could use the
regex Agent (\w)\w* and pass r'\1****' as the first argument to sub(). The \1
in that string will be replaced by whatever text was matched by group 1—
that is, the (\w) group of the regular expression.


In [182]:
agentNamesRegex = re.compile(r'Agent (\w)\w*')
agentNamesRegex.sub(r'\1****',
'Agent Alice told Agent Carol that AgentEve knew Agent Bob was a double agent.')


'A**** told C**** that AgentEve knew B**** was a double agent.'