In [1]:
#Parsing text using regular expression allows to get rid of unnecessary data
#The basic flags are I, L, M, S, U, X.
#•re.I ignores casing.
#•re.L finds a local dependent.
#•re.M finds patterns throughout multiple lines.
#•re.S finds dot matches.
#•re.U works for Unicode data.
#•re.X writes regex in a more readable format.
#The following describes regular expressions’ functionalities.
#•Find a single occurrence of characters a and b: [ab]
#•Find characters except for a and b: [^ab]
#•Find the character range of a to z: [a-z]
#•Find a character range except a to z: [^a-z]
#•Find all the characters from both a to z and A to Z: [a-zA-Z]
#•Find any single character: []
#•Find any whitespace character: \s
#•Find any non-whitespace character: \S
#•Find any digit: \d
#•Find any non-digit: \D
#•Find any non-words: \W
#•Find any words: \w
#•Find either a or b: (a|b)
#•The occurrence of a is either zero or one
#•Matches zero or not more than one occurrence: a? ; ?
#•The occurrence of a is zero or more times: a* ; * matches zero or more than tha
#The occurrence of a is one or more times: a+ ; + matches occurrences one or more than one time
#•Match three simultaneous occurrences of a: a{3}
#•Match three or more simultaneous occurrences of a: a{3,}
#•Match three to six simultaneous occurrences of a: a{3,6}
#•Start of a string: ^
#•End of a string: $
#•Match word boundary: \b
#•Non-word boundary: \B
#re.match() only checks for match the beginning of the string
#re.search ()checks for match everywehre in the string

In [2]:
#Tokenizing is splitting sentence into words using re.split()
import re
re.split('\s+','I like this book')

['I', 'like', 'this', 'book']

In [6]:
#Extracting Email IDs using re.findall()
doc="For more details please mail us at:xyz@abc.com, pqr@mno.com "
address=re.findall(r'[\w\.-]+@[\w\,-]+',doc)
for mails in address:
    print(mails)

xyz@abc
pqr@mno


In [9]:
#Replacing Email IDs use re.sub()
doc = "For more details please mail us at xyz@abc.com"
new_email_ids= re.sub(r'([\w\.-]+@[\w\,-]+)',r'pqr@mno.com',doc)
print(new_email_ids)

For more details please mail us at pqr@mno.com.com


In [33]:
import re
import requests
#url you want to extract
url = 'https://www.gutenberg.org/files/2638/2638-0.txt'
#function to extract
def get_book(url):
# Sends a http request to get the text from project Gutenberg
    raw = requests.get(url).text
# Discards the metadata from the beginning of the book
#    start = re.search(r" START OF THIS PROJECT GUTENBERG EBOOK",raw ).end()
# Discards the metadata from the end of the book
#    stop = re.search(r"II", raw).start()
# Keeps the relevant text
#    text = raw[start:stop]
    return raw
#processing
def preprocess(sentence):
    return re.sub('[^A-Za-z0-9.]+' , ' ', sentence).lower()
book=get_book(url)
processed_book=preprocess(book)
print(processed_book)



In [41]:
# Count number of times "the" is appeared in the book
len(re.findall(r"the",processed_book))

15904

In [45]:
#find all occurance of text in the format "abc--xyz"
re.findall(r'[a-zA-Z0-9]*--[a-zA-Z0-9]*', book)

['one--the', 'away--you']