# Basic string cleanup and re module
## Typical data cleaning activities



What are typical data cleanup activities?

- Find and fill missing values (NA), drop duplicates (**DataFrame.drop_duplicates**), drop empty entries (**DataFrame.dropna**)
- outliers
- Columns with multiple values
- values in a column or across the columns not having same units
- non ascii characters, whitespaces
    + Natural-language text needs more cleanup (nltk, gensim, etc.)


# String cleaning in python

In [1]:
keywords = '\t data  science    \n'

In [2]:
# strip() removes only the leading and trailing whitespace; the repeated
# spaces between the words are left untouched
keywords.strip()

'data  science'

# Regular Expressions, Python re module for text manipulation
regex describes a pattern to locate in the text

- pattern matching
- substitution
- splitting

# quick review
- \$ end of line
- ^ beginning of line
- [] for specifying character class like [aez], [a-z]
    + negate using ^ like [^aez]
    + Special characters lose their special meaning inside character classes, e.g. [ab$] matches 'a', 'b' or a literal '$'
    
- a backslash followed by a character signals various special sequences
    - **\d** digits \d=[0-9]
    - **\s** whitespace character \s=[ \t\n\r\f\v]
    - **\w** alphanumeric [a-zA-Z0-9_]
- **.** matches anything except a newline character.

# Repeating qualifier
- **\*** previous character can be matched zero or more times
- **+** repeat at least once
- **?** matches either once or zero times (optional).
- **.** matches anything except a newline character.
- **{m,n}** at least m repetitions, and at most n (no space after the comma). c/{1,2}b will match 'c/b' and 'c//b' but not 'cb'
 
<font color="red" size="7"> If you need to match a literal [, \ or $, precede it with a backslash to remove its special meaning: `\[`, `\\` or `\$`.</font>

#  Removing variable number of white spaces

In [3]:
# Sample text with tabs, newlines and spaces between the words
str_keywords = "data\t \n\n science job"
print(str_keywords)
# strip() has nothing to remove here: all the whitespace is in the interior
str_keywords.strip()

data	 

 science job


'data\t \n\n science job'

# re module in python

provides an interface to the regular expressions engine


# Call Module  functions or compile the pattern

In [4]:
import re
# \s+ matches each maximal run of one or more whitespace characters
re.findall(r'\s+' ,str_keywords)
#str_keywords


['\t \n\n ', ' ']

or compile

In [5]:
# Compile the pattern into a pattern object, then call findall on that object
regex = re.compile(r'\s+')
regex.findall(str_keywords)

['\t \n\n ', ' ']

### or to split based on regex

In [7]:
# The module-level re.split() takes the pattern string directly and
# compiles it internally before splitting
import re
re.split(r'\s+', str_keywords)

['data', 'science', 'job']

In [8]:
import re
# Raw string: '\s' in a normal string literal is an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning since Python 3.12)
regex = re.compile(r'\s+')
# Split on every run of whitespace using the precompiled pattern
regex.split(str_keywords)

['data', 'science', 'job']

re.compile() also accepts an optional flags argument, such as re.IGNORECASE.

# using findall, search and match



In [9]:
import pandas as pd
# Use the full option name: the bare 'max_colwidth' only works through pandas'
# pattern matching of option names and can become ambiguous if similarly
# named options are added
pd.set_option('display.max_colwidth', 100)

In [10]:
# Parse the HTML tables on the regex HOWTO page into DataFrames (needs
# network access) and pick the one at index 4, the table of match-object
# methods; the index depends on the page's current layout
pd.read_html('https://docs.python.org/3/howto/regex.html')[4]

Unnamed: 0,Method/Attribute,Purpose
0,group(),Return the string matched by the RE
1,start(),Return the starting position of the match
2,end(),Return the ending position of the match
3,span(),"Return a tuple containing the (start, end) positions of the match"


# Removing whitespaces anywhere

In [11]:
str_keywords = "\r\ndata\t \n\n science\n"

In [12]:
# Splitting on whitespace runs alone leaves empty strings at both ends,
# because the text starts and ends with whitespace
re.split(r'\s+', str_keywords)

['', 'data', 'science', '']

In [13]:
# Strip the leading/trailing whitespace first so no empty strings are produced
re.split(r'\s+', str_keywords.strip())

['data', 'science']

# grouping
Groups are marked by the '(' and ')' metacharacters.

With findall, the groups of each match are returned together as a tuple.

In [14]:
text = """From: author@example.com
User-Agent: Thunderbird 1.5.0.9 (X11/20061227)
MIME-Version: 1.0"""
print(text)

From: author@example.com
User-Agent: Thunderbird 1.5.0.9 (X11/20061227)
MIME-Version: 1.0


In [15]:
# Build a regex with two groups so that findall returns (name, value) tuples,
# e.g. ('User-Agent', ' Thunderbird 1.5.0.9 (X11/20061227)').
# '.' does not match a newline, so each header line is matched on its own.
regex = re.compile(r'(.*):(.*)')
regex.findall(text)

[('From', ' author@example.com'),
 ('User-Agent', ' Thunderbird 1.5.0.9 (X11/20061227)'),
 ('MIME-Version', ' 1.0')]

# Matching email addresses

In [16]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
print(text)

Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com



In [17]:
# One or more letters/digits, '@', one or more letters/digits, a literal dot,
# then a 2-3 letter suffix. Only upper-case ranges are listed, so the
# IGNORECASE flag below is what lets lower-case addresses match.
pattern = r'[A-Z0-9]+@[A-Z0-9]+[.][A-Z]{2,3}'

# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)

In [18]:
# findall returns the matched strings as a list
print(regex.findall(text))

# finditer yields one match object per match instead
for email in regex.finditer(text):
    print(email)
# 'email' still refers to the last match object after the loop;
# list its attributes and methods
dir(email)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
<re.Match object; span=(5, 20), match='dave@google.com'>
<re.Match object; span=(27, 42), match='steve@gmail.com'>
<re.Match object; span=(47, 60), match='rob@gmail.com'>
<re.Match object; span=(66, 80), match='ryan@yahoo.com'>


['__class__',
 '__class_getitem__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'end',
 'endpos',
 'expand',
 'group',
 'groupdict',
 'groups',
 'lastgroup',
 'lastindex',
 'pos',
 're',
 'regs',
 'span',
 'start',
 'string']

# Methods supported by *match* object

- group(): Return the string matched by the RE
- start(): Return the starting position of the match
- end(): Return the ending position of the match
- span(): Return a tuple containing the (start, end) positions of the match

In [19]:
# Slice the original text with each match's start and end positions;
# this yields the same string as the match's group()
for email in regex.finditer(text):
    print(text[email.start():email.end()])

dave@google.com
steve@gmail.com
rob@gmail.com
ryan@yahoo.com




# First email address

In [20]:
# search() scans through the string and returns the first match it finds
regex.search(text)

<re.Match object; span=(5, 20), match='dave@google.com'>

# email at the start

In [21]:
# match() only succeeds at the very start of the string; the text begins
# with 'Dave ', not an email address, so this prints None
print(regex.match(text))

None


# Replacing string (split, sub, subn) (modify)

In [23]:
# Replace every email address in the text with the word 'removed'
regex.sub('removed', text)


'Dave removed\nSteve removed\nRob removed\nRyan removed\n'

# Vectorized String Functions in pandas (str to skip NA)

In [24]:
import numpy as np
# Append a NaN so the Series contains a missing value to experiment with
email_addresses = pd.Series(regex.findall(text)+[np.nan])
email_addresses

0    dave@google.com
1    steve@gmail.com
2      rob@gmail.com
3     ryan@yahoo.com
4                NaN
dtype: object

check if email address contains gmail

In [25]:
# Naive attempt: a plain `in` test per element. This raises TypeError on the
# NaN entry, because NaN is a float and cannot be searched with `in`
email_addresses.map(lambda x: 'gmail' in x)
                    

TypeError: argument of type 'float' is not iterable

Use the **str** accessor: it skips NaN values instead of raising an error

In [27]:
# email_addresses.dtype
# .str.contains skips missing values: the NaN entry yields NaN, not an error
match = email_addresses.str.contains( 'gmail')
match

0    False
1     True
2     True
3    False
4      NaN
dtype: object

# searching and segmenting

Let's split email address into username, domain name, and domain suffix

In [28]:
# Three capture groups: username, domain name, and a 2-4 letter domain suffix
pattern = r'([A-Z0-9]+)@([A-Z0-9]+)\.([A-Z]{2,4})'

In [29]:
# Each element becomes a list of (username, domain, suffix) tuples;
# the NaN entry stays NaN
emails=email_addresses.str.findall(pattern, flags=re.IGNORECASE)
emails

0    [(dave, google, com)]
1    [(steve, gmail, com)]
2      [(rob, gmail, com)]
3     [(ryan, yahoo, com)]
4                      NaN
dtype: object

# vectorized element retrieval

In [30]:
print(email_addresses)
# .str[...] slices every string element-wise: here the first four characters
email_addresses.str[:4]

0    dave@google.com
1    steve@gmail.com
2      rob@gmail.com
3     ryan@yahoo.com
4                NaN
dtype: object


0    dave
1    stev
2    rob@
3    ryan
4     NaN
dtype: object

In [62]:
# NOTE: despite the "_clean" name, this boolean mask keeps only the missing
# (NaN) entries
email_addresses_clean=email_addresses[email_addresses.isnull()]
email_addresses_clean

4    NaN
dtype: object

In [58]:
# dropna() discards the missing entries, leaving only the real addresses
email_addresses_clean=email_addresses.dropna()
email_addresses_clean

0    dave@google.com
1    steve@gmail.com
2      rob@gmail.com
3     ryan@yahoo.com
dtype: object

In [35]:
match

0    False
1     True
2     True
3    False
4      NaN
dtype: object

In [43]:
# .str.get(0) returns the first character of each string; NaN stays NaN
email_addresses.str.get(0)

0      d
1      s
2      r
3      r
4    NaN
dtype: object

In [69]:
# Pass na=False so missing entries count as "no match" directly. This yields a
# plain boolean Series usable as a mask, without a separate fillna step that
# depends on downcasting an object-dtype result.
match = email_addresses.str.contains('gmail', na=False)
match

0    False
1     True
2     True
3    False
4    False
dtype: bool

In [70]:
email_addresses[match]

1    steve@gmail.com
2      rob@gmail.com
dtype: object