In [1]:
# 8.12.3 Other Search Functions
# Function search looks in a string for the first 
# occurrence of a substring that matches a regular 
# expression and returns a match object that contains
# the matching substring. 
# The match object’s group method returns that string.

In [2]:
import re
# Scan the string for the first occurrence of 'Python'. search returns
# None when nothing matches, so fall back to a message in that case.
result = re.search(pattern='Python', string='Python is Fun')
'Not found' if result is None else result.group()

'Python'

In [3]:
# 'Fun!' (with the exclamation mark) does not occur in the string,
# so search returns None and the fallback message is shown.
result = re.search(pattern='Fun!', string='Python is Fun')
'Not found' if result is None else result.group()

'Not found'

In [5]:
# Ignore case with the optional flags keyword argument:
# re.IGNORECASE lets the pattern 'Sam' match the upper-case 'SAM'.
result2 = re.search(pattern='Sam', string='SAM WHITE', flags=re.IGNORECASE)
'Not found' if result2 is None else result2.group()

'SAM'

In [6]:
# The ^ metacharacter at the beginning of a regular
# expression (and not inside square brackets) is
# an anchor indicating that expression matches only
# the beginning of a string

In [7]:
# The leading ^ anchors the pattern to the start of the string,
# where 'Python' is indeed found.
result = re.search(pattern='^Python', string='Python is Fun')
'Not found' if result is None else result.group()

'Python'

In [8]:
# The string does not start with 'fun', so the anchored search fails.
result = re.search(pattern='^fun', string='Python is Fun')
'Not found' if result is None else result.group()

'Not found'

In [9]:
# The $ metacharacter at the end of a regular expression
# is an anchor indicating that expression matches
# only the end of a string.

In [10]:
# The trailing $ anchors the pattern to the end of the string;
# 'Python' appears only at the start, so nothing matches.
result = re.search(pattern='Python$', string='Python is Fun')
'Not found' if result is None else result.group()

'Not found'

In [12]:
# Here the string really does end with 'fun', so the anchored search succeeds.
result = re.search(pattern='fun$', string='Python is fun')
'Not found' if result is None else result.group()

'fun'

In [13]:
# Function [findall] finds every matching substring
# in a string and returns a list of the matching substrings.

In [14]:
# A phone number here is three digits, three digits and four digits,
# separated by hyphens; collect every occurrence in the contact string.
contact = 'Wally White, Home: 555-555-1234, Work: 555-555-4321'
re.compile(r'\d{3}-\d{3}-\d{4}').findall(contact)

['555-555-1234', '555-555-4321']

In [15]:
# Data munging or data wrangling
# Two important steps in data munging are
# Data Cleaning and Transforming Data

In [16]:
# Cleaning Your Data
# Data Validation: Use match method

In [18]:
import pandas as pd
# Build the Series from a list of ZIP strings labelled by city.
# Miami's value is deliberately one digit short.
zips = pd.Series(['00215', '3310'], index=['Boston', 'Miami'])
zips

Boston    00215
Miami      3310
dtype: object

In [19]:
# Use regular expressions with pandas to validate data.
# The str attribute of a Series provides string-processing
# and various regular-expression methods.
# Use the str attribute's fullmatch method to check whether each ZIP code
# is valid. fullmatch requires the ENTIRE string to match the pattern,
# whereas match only anchors at the start of the string, so a longer value
# such as '002155' would wrongly be reported as a valid five-digit ZIP code.
# (Series.str.fullmatch requires pandas 1.1 or later.)
zips.str.fullmatch(r'\d{5}')

Boston     True
Miami     False
dtype: bool

In [20]:
# Reformatting Your Data
# Assume that an application requires U.S. phone numbers 
# in the format ###-###-####, with hyphens separating 
# each group of digits. The phone numbers have been 
# provided to us as 10-digit strings without hyphens.

In [21]:
# Reformatting Data
# Sample contact records: name, e-mail address and a phone number given
# as a 10-digit string with no separators.
contacts = [
    ['Mike Green', 'demo1@deitel.com', '5555555555'],
    ['Sue Brown', 'demo2@deitel.com', '5555551234'],
]
contactsdf = pd.DataFrame(data=contacts, columns=['Name', 'Email', 'Phone'])
contactsdf

Unnamed: 0,Name,Email,Phone
0,Mike Green,demo1@deitel.com,5555555555
1,Sue Brown,demo2@deitel.com,5555551234


In [22]:
# Inspect the raw Phone column on its own (a Series) before reformatting it.
contactsdf['Phone']

0    5555555555
1    5555551234
Name: Phone, dtype: object

In [23]:
# Function get_formatted_phone maps 10 consecutive digits
# into format ###-###-####
# A match object's [groups] method returns a tuple of the
# captured substrings. The regular expression in the block's
# first statement matches only 10 consecutive digits.
# It captures substrings containing the first three digits,
# next three digits, and the last four digits.

In [24]:
# The return statement operates as follows:
# If result is None, we simply return value unmodified.
# Otherwise, we call result.groups() to get a tuple 
# containing the captured substrings and pass that 
# tuple to string method join to concatenate the 
# elements, separating each from the next with '-' to
# form the mapped phone number.

In [25]:
# String join method: the string the method is called on ('#') is
# inserted between consecutive items of the iterable.
myTuple = 'John', 'Peter', 'Vicky'
x = str.join('#', myTuple)
print(x)

John#Peter#Vicky


In [30]:
import re
def get_formatted_phone(value):
    """Return a 10-digit phone string reformatted as ###-###-####.

    Parameters
    ----------
    value : object
        A single phone value. Only a str consisting of exactly
        10 digits is reformatted.

    Returns
    -------
    object
        The hyphenated phone number when value is a 10-digit string;
        otherwise value itself, unmodified.
    """
    # re.fullmatch raises TypeError for non-string input (for example None,
    # or the float NaN pandas uses for missing data). Pass such values
    # through unchanged so one bad cell does not abort a whole Series.map.
    if not isinstance(value, str):
        return value

    # Three capture groups: first three digits, next three, last four.
    result = re.fullmatch(r'(\d{3})(\d{3})(\d{4})', value)
    return '-'.join(result.groups()) if result else value

In [31]:
# Call get_formatted_phone on every element of the Phone column.
# The result is kept in its own Series; contactsdf itself is only
# updated when that Series is assigned back to the column.
formatted_phone = contactsdf['Phone'].map(get_formatted_phone)
formatted_phone

0    555-555-5555
1    555-555-1234
Name: Phone, dtype: object

In [32]:
# Replace the Phone column with the reformatted numbers, then display
# the updated DataFrame.
contactsdf['Phone']= formatted_phone
contactsdf

Unnamed: 0,Name,Email,Phone
0,Mike Green,demo1@deitel.com,555-555-5555
1,Sue Brown,demo2@deitel.com,555-555-1234
