## Regular Expression

https://www.tutorialspoint.com/python3/python_reg_expressions.htm


Except for the regex metacharacters (+ ? . * ^ $ ( ) [ ] { } | \), every character matches itself. To match a metacharacter literally, escape it by preceding it with a backslash.
```
. any single character (except newline by default)
? 0 or 1 occurrence
+ 1 or more occurrences
* 0 or more occurrences
{n,m} between n and m occurrences
() grouping
[] character set
```

In [40]:
import re

### match

In [26]:
line = "I think I understand regular expressions"

# re.match only attempts the pattern at the very beginning of the string;
# `line` begins with "I ", so no match object comes back here.
rexResult = re.match('think', line, re.M|re.I)
if rexResult is None:
    print("No match was found")
else:
    print("Match Found: " + rexResult.group())


No match was found


In [29]:
# Wrapping the keyword in two capturing groups lets the pattern consume the
# text before and after it, so the match can start at position 0.
rexResult = re.match(r'(.*)think(.*)', line, re.M|re.I)
if rexResult is None:
    print("No match was found")
else:
    captured = rexResult.groups()
    print(captured, len(captured))
    # group(0) is the whole match; 1 and 2 are the two captures.
    for idx in range(3):
        print("Match Found: " + rexResult.group(idx))

('I ', ' I understand regular expressions') 2
Match Found: I think I understand regular expressions
Match Found: I 
Match Found:  I understand regular expressions


#### greedy match

In [42]:
# Sample input containing two '>' characters, so where a quantifier stops matters.
line = "<python>perl>"

In [45]:
# `.*` is greedy: it consumes as much as possible and only gives back what is
# needed for the trailing '>' to match.
rexResult = re.match(r'<(.*)>', line, re.M|re.I)   # greedy

if rexResult is None:
    print("No match was found")
else:
    captured = rexResult.groups()
    print(captured, len(captured))
    for idx in (0, 1):
        print("Match Found: " + rexResult.group(idx))

('python>perl',) 1
Match Found: <python>perl>
Match Found: python>perl


In [46]:
# `.*?` is the lazy form: it consumes as little as possible, so the capture
# ends at the first '>' that lets the pattern succeed.
rexResult = re.match(r'<(.*?)>', line, re.M|re.I)   # non-greedy

if not rexResult:
    print("No match was found")
else:
    print(rexResult.groups(), len(rexResult.groups()))
    whole, inner = rexResult.group(0), rexResult.group(1)
    print("Match Found: " + whole)
    print("Match Found: " + inner)

('python',) 1
Match Found: <python>
Match Found: python


### backreferences

In [52]:
# Two candidate inputs; the first assignment is overwritten immediately,
# so only the capitalised variant is used by the cell that follows.
line = "python&pails" 
line = "Python&pails"

In [53]:
# `\1` has to repeat whatever group 1 captured. With re.I set the repeat is
# compared case-insensitively, which is why "Python&pails" (capital P, then
# lowercase p) is accepted.
rexResult = re.match(r'([Pp])ython&\1ails', line, re.M|re.I)

if rexResult is None:
    print("No match was found")
else:
    captured = rexResult.groups()
    print(captured, len(captured))
    for idx in (0, 1):
        print("Match Found: " + rexResult.group(idx))

('P',) 1
Match Found: Python&pails
Match Found: P


In [60]:
# Sample input: the double quotes are part of the string itself.
line = '"Halo Python"'

In [61]:
# Match a string wrapped in a pair of identical quotes (either " or ').
# Group 1 captures the opening quote and the trailing \1 requires the same
# character to close the string. A backreference is not recognised inside a
# character class -- [^\1] means "any character except chr(1)" -- so the body
# is matched with a lazy .*? that stops at the first closing quote instead.
# re.S keeps newlines matchable inside the quotes, as the negated class did.
rexResult = re.match(r'''(["'])(.*?)\1''', line, re.M|re.I|re.S)   # match any quoted string

if rexResult:
    print(rexResult.groups(), len(rexResult.groups()))
    # group(0) is the full quoted text, 1 the quote character, 2 the content.
    for idx in range(3):
        print("Match Found: " + rexResult.group(idx))
else:
    print("No match was found")

('"', 'Halo Python') 2
Match Found: "Halo Python"
Match Found: "
Match Found: Halo Python


### search

In [35]:
# Re-establish the sentence this example is meant to search: earlier cells
# reassign `line`, so relying on the leftover value makes the result depend on
# execution order.
line = "I think I understand regular expressions"

# Unlike re.match, re.search scans the whole string, so the word is found even
# though it is not at the start; re.I lets 'Think' match 'think'.
rexResult = re.search('Think', line, re.M|re.I)
if rexResult:
    # The pattern has no capturing groups, so groups() is an empty tuple.
    print(rexResult.groups(), len(rexResult.groups()))
    print("Search Found: "+rexResult.group(0))
else:
    print("Nothing found in search")

() 0
Search Found: think


### search and replace

In [38]:
phone = "200-959-4559 # This is Phone Number"

# Delete Python-style comments: drop everything from '#' to the end of the
# string (whitespace in front of the '#' is left in place).
num = re.compile(r'#.*$').sub("", phone)
print("Phone Num : ", num)

Phone Num :  200-959-4559 


In [39]:
# Remove anything other than digits: split on every non-digit character and
# glue the remaining pieces back together.
num = "".join(re.split(r'\D', phone))
print("Phone Num : ", num)

Phone Num :  2009594559


### parse weblog

https://www.cloudera.com/developers/get-started-with-hadoop-tutorial/exercise-2.html
```
CREATE EXTERNAL TABLE intermediate_access_logs (
    ip STRING,
    date STRING,
    method STRING,
    url STRING,
    http_version STRING,
    code1 STRING,
    code2 STRING,
    dash STRING,
    user_agent STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    'input.regex' = '([^ ]*) - - \\[([^\\]]*)\\] "([^\ ]*) ([^\ ]*) ([^\ ]*)" (\\d*) (\\d*) "([^"]*)" "([^"]*)"',
    'output.format.string' = "%1$$s %2$$s %3$$s %4$$s %5$$s %6$$s %7$$s %8$$s %9$$s")
LOCATION '/user/hive/warehouse/original_access_logs';
```

In [7]:
# Sample access-log line used by the parsing example.
web_log = 'local - - [24/Oct/1994:14:41:24 -0600] "GET 36.html HTTP/1.0" 200 822'

# Capture groups, in order:
#   1 source host   2 timestamp   3 request verb   4 request path
#   5 protocol      6 status      7 response size
# The host is matched with \S+ so dotted hostnames and IP addresses are
# accepted as well as bare words, and the size accepts '-' (logged when no
# body was sent) in addition to a number.
rex = r'(\S+) - - \[([^\]]*)\] "(\w+) (.*) (.*)" (\d+) (\d+|-)'

In [23]:
# Apply the log pattern and print each captured field with its label.
matchResult = re.match(rex, web_log, re.M|re.I)
if matchResult:
    print("Match Found: " + matchResult.group(0))
    # Label prefixes in capture-group order (group 1 first).
    field_prefixes = (
        "  source: ",
        "  datetime: ",
        "  req verb: ",
        "  req path: ",
        "  req proto: ",
        "  status: ",
        "  bytes: ",
    )
    for position, prefix in enumerate(field_prefixes, start=1):
        print(prefix + matchResult.group(position))

Match Found: local - - [24/Oct/1994:14:41:24 -0600] "GET 36.html HTTP/1.0" 200 822
  source: local
  datetime: 24/Oct/1994:14:41:24 -0600
  req verb: GET
  req path: 36.html
  req proto: HTTP/1.0
  status: 200
  bytes: 822
